==> DDRTree/src/DDRTree.cpp <==

#include "DDRTree.h"

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/prim_minimum_spanning_tree.hpp>
#include <vector>

//using namespace boost;
//using boost::functional;
using namespace Rcpp;
using namespace Eigen;

typedef Eigen::SparseMatrix<double> SpMat;

// This is a simple example of exporting a C++ function to R. You can
// source this function into an R session using the Rcpp::sourceCpp
// function (or via the Source button on the editor toolbar). Learn
// more about Rcpp at:
//
//   http://www.rcpp.org/
//   http://adv-r.had.co.nz/Rcpp.html
//   http://gallery.rcpp.org/
//

// [[Rcpp::export]]
SEXP pca_projection(SEXP R_C, int dimensions){
    NumericMatrix Rcpp_C(R_C);
    const int n = Rcpp_C.nrow(), p = Rcpp_C.ncol();
    Map<MatrixXd> C(Rcpp_C.begin(), n, p);

    MatrixXd W;
    pca_projection_cpp(C, dimensions, W);
    return wrap(W);
}

void pca_projection_cpp(const MatrixXd& C, int dimensions, MatrixXd& W){
    EigenSolver<MatrixXd> es(C, true);
    MatrixXd eVecs = es.eigenvectors().real();
    VectorXd eVals = es.eigenvalues().real();

    // Sort so the eigenvalues come out in descending order:
    std::vector<std::pair<double, long> > D;
    D.reserve(eVals.size());
    for (MatrixXd::Index i = 0; i < eVals.size(); i++)
        D.push_back(std::pair<double, long>((double)eVals.coeff(i, 0), (long)i));
    std::sort(D.rbegin(), D.rend());

    // Keep the eigenvectors for the top `dimensions` eigenvalues
    MatrixXd sortedEigs;
    sortedEigs.resize(eVecs.rows(), dimensions);
    for (int i = 0; i < eVals.size() && i < dimensions; i++) {
        eVals.coeffRef(i, 0) = D[i].first;
        sortedEigs.col(i) = eVecs.col(D[i].second);
    }
    W = sortedEigs;
}

// [[Rcpp::export]]
SEXP sqdist(SEXP R_a, SEXP R_b){
    NumericMatrix Rcpp_a(R_a);
    const int a_n = Rcpp_a.nrow(), a_p = Rcpp_a.ncol();
    Map<MatrixXd> a(Rcpp_a.begin(), a_n, a_p);

    NumericMatrix Rcpp_b(R_b);
    const int b_n = Rcpp_b.nrow(), b_p = Rcpp_b.ncol();
    Map<MatrixXd> b(Rcpp_b.begin(), b_n, b_p);

    MatrixXd W;
    sq_dist_cpp(a, b, W);
    return wrap(W);
}

// Pairwise squared distances between the columns of a and the columns of b.
// Reference R implementation:
//   aa <- colSums(a^2)
//   bb <- colSums(b^2)
//   ab <- t(a) %*% b
//   aa_repmat <- matrix(rep(aa, times = ncol(b)), ncol = ncol(b), byrow = F)
//   bb_repmat <- matrix(rep(bb, times = ncol(a)), nrow = ncol(a), byrow = T)
//   dist <- abs(aa_repmat + bb_repmat - 2 * ab)
void sq_dist_cpp(const MatrixXd& a, const MatrixXd& b, MatrixXd& W){
    VectorXd aa = (a.array() * a.array()).colwise().sum();
    VectorXd bb = (b.array() * b.array()).colwise().sum();
    MatrixXd ab = a.transpose() * b;

    MatrixXd aa_repmat;
    aa_repmat.resize(a.cols(), b.cols());
    for (int i = 0; i < aa_repmat.cols(); i++) {
        aa_repmat.col(i) = aa;
    }

    MatrixXd bb_repmat;
    bb_repmat.resize(a.cols(), b.cols());
    for (int i = 0; i < bb_repmat.rows(); i++) {
        bb_repmat.row(i) = bb;
    }

    W = aa_repmat + bb_repmat - 2 * ab;
    W = W.array().abs().matrix();
}

void DDRTree_reduce_dim_cpp(const MatrixXd& X_in, const MatrixXd& Z_in, const MatrixXd& Y_in, const MatrixXd& W_in,
                            int dimensions, int maxIter, int num_clusters,
                            double sigma, double lambda, double gamma, double eps, bool verbose,
                            MatrixXd& Y_out, SpMat& stree, MatrixXd& Z_out, MatrixXd& W_out,
                            std::vector<double>& objective_vals){
    Y_out = Y_in;
    W_out = W_in;
    Z_out = Z_in;

    int N_cells = X_in.cols();

    /*
    typedef boost::property<boost::edge_weight_t, double> EdgeWeightProperty;
    typedef boost::adjacency_matrix< boost::undirectedS, boost::no_property, EdgeWeightProperty> Graph;
    typedef boost::graph_traits < Graph >::edge_descriptor Edge;
    typedef boost::graph_traits < Graph >::vertex_descriptor Vertex;

    if (verbose)
        Rcpp::Rcout << "setting up adjacency matrix" << std::endl;
    Graph g(Y_in.cols());
    for (std::size_t j = 0; j < Y_in.cols(); ++j) {
        for (std::size_t i = 0; i < Y_in.cols() && i <= j; ++i) {
            Edge e; bool inserted;
            tie(e, inserted) = add_edge(i, j, g);
        }
    }
    */

    using namespace boost;

    typedef boost::property<edge_weight_t, double> EdgeWeightProperty;
    typedef boost::adjacency_list < vecS, vecS, undirectedS,
            property<vertex_distance_t, double>, property < edge_weight_t, double > > Graph;
    typedef boost::graph_traits < Graph >::edge_descriptor Edge;
    typedef boost::graph_traits < Graph >::vertex_descriptor Vertex;
    typedef boost::graph_traits<Graph>::edge_iterator edge_iter;

    // Fully connected graph over the columns of Y (the tree nodes); Prim's
    // algorithm extracts the MST from it on every iteration below.
    Graph g(Y_in.cols());
    //property_map<Graph, edge_weight_t>::type weightmap = get(edge_weight, g);
    for (std::size_t j = 0; j < Y_in.cols(); ++j) {
        for (std::size_t i = 0; i < Y_in.cols() && i <= j; ++i) {
            if (i != j){
                Edge e; bool inserted;
                tie(e, inserted) = add_edge(i, j, g);
            }
        }
    }
    boost::property_map<Graph, boost::edge_weight_t>::type EdgeWeightMap = get(boost::edge_weight_t(), g);

    MatrixXd B = MatrixXd::Zero(Y_in.cols(), Y_in.cols());
    std::vector < graph_traits < Graph >::vertex_descriptor > old_spanning_tree(num_vertices(g));
    // std::vector<double> objective_vals;

    MatrixXd distsqMU;
    MatrixXd L;
    MatrixXd distZY;     distZY.resize(X_in.cols(), num_clusters);
    MatrixXd min_dist;   min_dist.resize(X_in.cols(), num_clusters);
    MatrixXd tmp_distZY; tmp_distZY.resize(X_in.cols(), num_clusters);
    MatrixXd tmp_R;      tmp_R.resize(X_in.cols(), num_clusters);
    MatrixXd R;          R.resize(tmp_R.rows(), num_clusters);
    MatrixXd Gamma = MatrixXd::Zero(R.cols(), R.cols());
    SpMat tmp(Gamma.rows(), Gamma.cols());
    MatrixXd tmp_dense;  tmp_dense.resize(Gamma.rows(), Gamma.cols());
    MatrixXd Q;          Q.resize(tmp_dense.rows(), R.rows());
    MatrixXd C;          C.resize(X_in.rows(), Q.cols());
    MatrixXd tmp1;       tmp1.resize(C.rows(), X_in.rows());

    Environment stats("package:DDRTree");
    Function pca_projection_R = stats["pca_projection_R"];
    Function get_major_eigenvalue = stats["get_major_eigenvalue"];

    for (int iter = 0; iter < maxIter; ++iter){
        if (verbose) Rcpp::Rcout << "**************************************" << std::endl;
        if (verbose) Rcpp::Rcout << "Iteration: " << iter << std::endl;

        sq_dist_cpp(Y_out, Y_out, distsqMU);

        std::pair<edge_iter, edge_iter> edgePair;
        if (verbose) Rcpp::Rcout << "updating weights in graph" << std::endl;
        for (edgePair = edges(g); edgePair.first != edgePair.second; ++edgePair.first) {
            if (source(*edgePair.first, g) != target(*edgePair.first, g)){
                EdgeWeightMap[*edgePair.first] = distsqMU(source(*edgePair.first, g), target(*edgePair.first, g));
            }
        }

        std::vector < graph_traits < Graph >::vertex_descriptor > spanning_tree(num_vertices(g));
        if (verbose) Rcpp::Rcout << "Finding MST" << std::endl;
        prim_minimum_spanning_tree(g, &spanning_tree[0]);

        if (verbose) Rcpp::Rcout << "Refreshing B matrix" << std::endl;
        // update the adjacency matrix. First, erase the old edges
        for (size_t ei = 0; ei < old_spanning_tree.size(); ++ei) {
            B(ei, old_spanning_tree[ei]) = 0;
            B(old_spanning_tree[ei], ei) = 0;
        }
        // now add the new edges
        for (size_t ei = 0; ei < spanning_tree.size(); ++ei) {
            if (ei != spanning_tree[ei]){
                B(ei, spanning_tree[ei]) = 1;
                B(spanning_tree[ei], ei) = 1;
            }
        }
        if (verbose) Rcpp::Rcout << " B : (" << B.rows() << " x " << B.cols() << ")" << std::endl;
        old_spanning_tree = spanning_tree;

        // Graph Laplacian of the MST: L = diag(colSums(B)) - B
        L = B.colwise().sum().asDiagonal();
        L = L - B;

        sq_dist_cpp(Z_out, Y_out, distZY);
        if (verbose) Rcpp::Rcout << " distZY : (" << distZY.rows() << " x " << distZY.cols() << ")" << std::endl;
        if (verbose) Rcpp::Rcout << " min_dist : (" << min_dist.rows() << " x " << min_dist.cols() << ")" << std::endl;

        //min_dist <- matrix(rep(apply(distZY, 1, min), times = K), ncol = K, byrow = F)
        VectorXd distZY_minCoeff = distZY.rowwise().minCoeff();
        if (verbose) Rcpp::Rcout << "distZY_minCoeff = " << std::endl;
        for (int i = 0; i < min_dist.cols(); i++) {
            min_dist.col(i) = distZY_minCoeff;
        }

        //tmp_distZY <- distZY - min_dist
        tmp_distZY = distZY - min_dist;

        if (verbose) Rcpp::Rcout << " tmp_R : (" << tmp_R.rows() << " x " << tmp_R.cols() << ")" << std::endl;
        //tmp_R <- exp(-tmp_distZY / params$sigma)
        tmp_R = tmp_distZY.array() / (-1.0 * sigma);
        tmp_R = tmp_R.array().exp().matrix();

        if (verbose) Rcpp::Rcout << " R : (" << R.rows() << " x " << R.cols() << ")" << std::endl;
        //R <- tmp_R / matrix(rep(rowSums(tmp_R), times = K), byrow = F, ncol = K)
        VectorXd tmp_R_rowsums = tmp_R.rowwise().sum();
        for (int i = 0; i < R.cols(); i++) {
            R.col(i) = tmp_R_rowsums;
        }
        R = (tmp_R.array() / R.array()).matrix();

        if (verbose) Rcpp::Rcout << " Gamma : (" << Gamma.rows() << " x " << Gamma.cols() << ")" << std::endl;
        //Gamma <- matrix(rep(0, ncol(R) ^ 2), nrow = ncol(R))
        Gamma = MatrixXd::Zero(R.cols(), R.cols());
        //diag(Gamma) <- colSums(R)
        Gamma.diagonal() = R.colwise().sum();

        // termination condition
        //obj1 <- - params$sigma * sum(log(rowSums(exp(-tmp_distZY / params$sigma))) - min_dist[, 1] / params$sigma)
        VectorXd x1 = (tmp_distZY.array() / -sigma).exp().rowwise().sum().log();
        double obj1 = -sigma * (x1 - min_dist.col(0) / sigma).sum();

        //obj2 <- (norm(X - W %*% Z, '2'))^2 + params$lambda * sum(diag(Y %*% L %*% t(Y))) + params$gamma * obj1
        if (verbose){
            Rcpp::Rcout << " X : (" << X_in.rows() << " x " << X_in.cols() << ")" << std::endl;
            Rcpp::Rcout << " W : (" << W_out.rows() << " x " << W_out.cols() << ")" << std::endl;
            Rcpp::Rcout << " Z : (" << Z_out.rows() << " x " << Z_out.cols() << ")" << std::endl;
        }
        double major_eigen_value = as<double>(get_major_eigenvalue(X_in - W_out * Z_out, dimensions));
        double obj2 = major_eigen_value;
        obj2 = obj2 * obj2;
        if (verbose){
            Rcpp::Rcout << " L : (" << L.rows() << " x " << L.cols() << ")" << std::endl;
        }
        obj2 = obj2 + lambda * (Y_out * L * Y_out.transpose()).diagonal().sum() + gamma * obj1;
        objective_vals.push_back(obj2);

        if (verbose) Rcpp::Rcout << "Checking termination criterion" << std::endl;
        if (iter >= 1) {
            // stop when the relative change of the objective drops below eps
            double delta_obj = std::abs(objective_vals[iter] - objective_vals[iter - 1]);
            delta_obj /= std::abs(objective_vals[iter - 1]);
            if (verbose) Rcpp::Rcout << "delta_obj: " << delta_obj << std::endl;
            if (delta_obj < eps) {
                break;
            }
        }

        if (verbose) Rcpp::Rcout << "Computing tmp" << std::endl;
        //tmp <- t(solve( ( ( (params$gamma + 1) / params$gamma) * ((params$lambda / params$gamma) * L + Gamma) - t(R) %*% R), t(R)))
        if (verbose) Rcpp::Rcout << "... stage 1" << std::endl;
        tmp = ((Gamma + (L * (lambda / gamma))) * ((gamma + 1.0) / gamma)).sparseView();
        if (verbose) Rcpp::Rcout << "... stage 2" << std::endl;
        SparseMatrix<double> R_sp = R.sparseView();
        tmp = tmp - (R_sp.transpose() * R_sp);

        if (verbose){
            Rcpp::Rcout << "Pre-computing LLT analysis" << std::endl;
            Rcpp::Rcout << "tmp is (" << tmp.rows() << "x" << tmp.cols() << "), " << tmp.nonZeros() << " non-zero values" << std::endl;
        }
        SimplicialLLT<SparseMatrix<double>, Lower, AMDOrdering<int> > solver;
        solver.compute(tmp);
        if (solver.info() != Success) {
            // decomposition failed; fall back to a dense partial-pivoting LU solve
            Rcpp::Rcout << "Error!" << std::endl;
            tmp_dense = tmp;
            tmp_dense = tmp_dense.partialPivLu().solve(R.transpose()).transpose();
            Rcpp::Rcout << tmp_dense << std::endl;
        } else {
            if (verbose) Rcpp::Rcout << "Computing LLT" << std::endl;
            tmp_dense = solver.solve(R.transpose()).transpose();
            if (solver.info() != Success) {
                // solving failed
                Rcpp::Rcout << "Error!" << std::endl;
            }
        }
        if (verbose) Rcpp::Rcout << "tmp_dense (" << tmp_dense.rows() << "x" << tmp_dense.cols() << ")" << std::endl;
        if (verbose) Rcpp::Rcout << "Computing Q (" << Q.rows() << "x" << Q.cols() << ")" << std::endl;

        //Q <- 1 / (params$gamma + 1) * (diag(1, N) + tmp %*% t(R))
        /*
        double gamma_coeff = 1.0 / (1 + gamma);
        SpMat Q_id(tmp_dense.rows(), R.rows());
        Q_id.setIdentity();
        tmp1 = gamma_coeff * (X_in * tmp_dense.sparseView());
        tmp1 = tmp1 * R_sp.transpose();
        tmp1 += gamma_coeff * X_in;
        tmp1 = tmp1 * X_in.transpose();
        */
        /*
        Q = ((MatrixXd::Identity(X_in.cols(), X_in.cols()) + (tmp_dense * R.transpose())).array() / (gamma + 1.0));
        // C <- X %*% Q
        C = X_in * Q;
        tmp1 = C * X_in.transpose();
        */
        // Form X %*% Q directly, without materializing the N x N matrix Q
        Q = (X_in + ((X_in * tmp_dense) * R.transpose())).array() / (gamma + 1.0);
        if (verbose){
            Rcpp::Rcout << "gamma: " << gamma << std::endl;
            Rcpp::Rcout << " X_in : (" << X_in.rows() << " x " << X_in.cols() << ")" << std::endl;
            Rcpp::Rcout << " Q : (" << Q.rows() << " x " << Q.cols() << ")" << std::endl;
        }
        // C <- X %*% Q  (Q above already stores X %*% Q, so C is just Q)
        C = Q;
        tmp1 = Q * X_in.transpose();

        if (verbose) Rcpp::Rcout << "Computing W" << std::endl;
        //W <- pca_projection_R((tmp1 + t(tmp1)) / 2, params$dim)
        NumericMatrix W_R = pca_projection_R((tmp1 + tmp1.transpose()) / 2, dimensions);
        const int X_n = W_R.nrow(), X_p = W_R.ncol();
        Map<MatrixXd> W(W_R.begin(), X_n, X_p);
        W_out = W;
        //pca_projection_cpp((tmp1 + tmp1.transpose()) / 2, dimensions, W_out);

        if (verbose) Rcpp::Rcout << "Computing Z" << std::endl;
        //Z <- t(W) %*% C
        Z_out = W_out.transpose() * C;

        if (verbose) Rcpp::Rcout << "Computing Y" << std::endl;
        //Y <- t(solve((params$lambda / params$gamma * L + Gamma), t(Z %*% R)))
        Y_out = L * (lambda / gamma) + Gamma;
        Y_out = Y_out.llt().solve((Z_out * R).transpose()).transpose();
    }

    if (verbose) Rcpp::Rcout << "Clearing MST sparse matrix" << std::endl;
    stree.setZero();
    if (verbose){
        Rcpp::Rcout << "Setting up MST sparse matrix with " << old_spanning_tree.size() << std::endl;
    }
    typedef Eigen::Triplet<double> T;
    std::vector<T> tripletList;
    tripletList.reserve(2 * old_spanning_tree.size());
    // Send back the weighted MST as a sparse matrix
    for (size_t ei = 0; ei < old_spanning_tree.size(); ++ei) {
        tripletList.push_back(T(ei, old_spanning_tree[ei], distsqMU(ei, old_spanning_tree[ei])));
        tripletList.push_back(T(old_spanning_tree[ei], ei, distsqMU(old_spanning_tree[ei], ei)));
    }
    stree = SpMat(N_cells, N_cells);
    stree.setFromTriplets(tripletList.begin(), tripletList.end());
}

// [[Rcpp::export]]
Rcpp::List DDRTree_reduce_dim(SEXP R_X, SEXP R_Z, SEXP R_Y, SEXP R_W, SEXP R_dimensions, SEXP R_maxiter,
                              SEXP R_num_clusters, SEXP R_sigma, SEXP R_lambda, SEXP R_gamma,
                              SEXP R_eps, SEXP R_verbose){
    bool verbose = as<bool>(R_verbose);

    if (verbose) Rcpp::Rcout << "Mapping X" << std::endl;
    NumericMatrix Rcpp_X(R_X);
    const int X_n = Rcpp_X.nrow(), X_p = Rcpp_X.ncol();
    Map<MatrixXd> X(Rcpp_X.begin(), X_n, X_p);

    if (verbose) Rcpp::Rcout << "Mapping Z" << std::endl;
    NumericMatrix Rcpp_Z(R_Z);
    const int Z_n = Rcpp_Z.nrow(), Z_p = Rcpp_Z.ncol();
    Map<MatrixXd> Z(Rcpp_Z.begin(), Z_n, Z_p);

    if (verbose) Rcpp::Rcout << "Mapping Y" << std::endl;
    NumericMatrix Rcpp_Y(R_Y);
    const int Y_n = Rcpp_Y.nrow(), Y_p = Rcpp_Y.ncol();
    Map<MatrixXd> Y(Rcpp_Y.begin(), Y_n, Y_p);

    if (verbose) Rcpp::Rcout << "Mapping W" << std::endl;
    NumericMatrix Rcpp_W(R_W);
    const int W_n = Rcpp_W.nrow(), W_p = Rcpp_W.ncol();
    Map<MatrixXd> W(Rcpp_W.begin(), W_n, W_p);

    if (verbose) Rcpp::Rcout << "Mapping dimensions" << std::endl;
    int dimensions = as<int>(R_dimensions);
    if (verbose) Rcpp::Rcout << "Mapping maxIter" << std::endl;
    int maxiter = as<int>(R_maxiter);
    if (verbose) Rcpp::Rcout << "Mapping num_clusters" << std::endl;
    int num_clusters = as<int>(R_num_clusters);
    if (verbose) Rcpp::Rcout << "Mapping sigma" << std::endl;
    double sigma = as<double>(R_sigma);
    if (verbose) Rcpp::Rcout << "Mapping lambda" << std::endl;
    double lambda = as<double>(R_lambda);
    if (verbose) Rcpp::Rcout << "Mapping gamma" << std::endl;
    double gamma = as<double>(R_gamma);
    if (verbose) Rcpp::Rcout << "Mapping eps" << std::endl;
    double eps = as<double>(R_eps);

    MatrixXd Y_res;
    SpMat stree_res;
    MatrixXd Z_res;
    MatrixXd W_out;
    std::vector<double> objective_vals; // value of the objective function at each iteration

    DDRTree_reduce_dim_cpp(X, Z, Y, W, dimensions, maxiter, num_clusters, sigma, lambda, gamma, eps, verbose,
                           Y_res, stree_res, Z_res, W_out, objective_vals);

    return Rcpp::List::create(Rcpp::Named("W") = W_out, // W can be used for reverse embedding of missing data
                              Rcpp::Named("Z") = Z_res,
                              Rcpp::Named("stree") = wrap(stree_res),
                              Rcpp::Named("Y") = wrap(Y_res),
                              Rcpp::Named("X") = X,
                              Rcpp::Named("objective_vals") = objective_vals);
}

// You can include R code blocks in C++ files processed with sourceCpp
// (useful for testing and development). The R code will be automatically
// run after the compilation.
//
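/*** R
# Usage sketch (added for illustration; not part of the shipped sources): run
# the package end-to-end and inspect the weighted MST returned as `stree`.
# The Matrix package is an assumption here, used only to list the nonzero
# entries of the returned dgCMatrix.
X <- t(as.matrix(iris[c(1, 2, 52, 103), 1:4]))
res <- DDRTree::DDRTree(X, dimensions = 2, maxIter = 5, sigma = 1e-2,
                        lambda = 1, ncenter = 3, param.gamma = 10, tol = 1e-2)
Matrix::summary(res$stree)  # nonzero (i, j) entries are squared distances between tree nodes
*/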
==> DDRTree/src/DDRTree.h <==

#ifndef _DDRTree_DDRTREE_H
#define _DDRTree_DDRTREE_H

#include <RcppEigen.h>
#include <Rcpp.h>

using namespace Rcpp;
using namespace Eigen;

void pca_projection_cpp(const MatrixXd& R_C, int dimensions, MatrixXd& W);
void sq_dist_cpp(const MatrixXd& a, const MatrixXd& b, MatrixXd& W);

#endif

==> DDRTree/src/RcppExports.cpp <==

// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

// pca_projection
SEXP pca_projection(SEXP R_C, int dimensions);
RcppExport SEXP DDRTree_pca_projection(SEXP R_CSEXP, SEXP dimensionsSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type R_C(R_CSEXP);
    Rcpp::traits::input_parameter< int >::type dimensions(dimensionsSEXP);
    rcpp_result_gen = Rcpp::wrap(pca_projection(R_C, dimensions));
    return rcpp_result_gen;
END_RCPP
}
// sqdist
SEXP sqdist(SEXP R_a, SEXP R_b);
RcppExport SEXP DDRTree_sqdist(SEXP R_aSEXP, SEXP R_bSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type R_a(R_aSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_b(R_bSEXP);
    rcpp_result_gen = Rcpp::wrap(sqdist(R_a, R_b));
    return rcpp_result_gen;
END_RCPP
}
// DDRTree_reduce_dim
Rcpp::List DDRTree_reduce_dim(SEXP R_X, SEXP R_Z, SEXP R_Y, SEXP R_W, SEXP R_dimensions, SEXP R_maxiter, SEXP R_num_clusters, SEXP R_sigma, SEXP R_lambda, SEXP R_gamma, SEXP R_eps, SEXP R_verbose);
RcppExport SEXP DDRTree_DDRTree_reduce_dim(SEXP R_XSEXP, SEXP R_ZSEXP, SEXP R_YSEXP, SEXP R_WSEXP, SEXP R_dimensionsSEXP, SEXP R_maxiterSEXP, SEXP R_num_clustersSEXP, SEXP R_sigmaSEXP, SEXP R_lambdaSEXP, SEXP R_gammaSEXP, SEXP R_epsSEXP, SEXP R_verboseSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type R_X(R_XSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_Z(R_ZSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_Y(R_YSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_W(R_WSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_dimensions(R_dimensionsSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_maxiter(R_maxiterSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_num_clusters(R_num_clustersSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_sigma(R_sigmaSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_lambda(R_lambdaSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_gamma(R_gammaSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_eps(R_epsSEXP);
    Rcpp::traits::input_parameter< SEXP >::type R_verbose(R_verboseSEXP);
    rcpp_result_gen = Rcpp::wrap(DDRTree_reduce_dim(R_X, R_Z, R_Y, R_W, R_dimensions, R_maxiter, R_num_clusters, R_sigma, R_lambda, R_gamma, R_eps, R_verbose));
    return rcpp_result_gen;
END_RCPP
}

==> DDRTree/src/DDRTree_init.c <==

#include <R.h>
#include <Rinternals.h>
#include <stdlib.h> // for NULL
#include <R_ext/Rdynload.h>

/* FIXME:
   Check these declarations against the C/Fortran source code.
*/

/* .Call calls */
extern SEXP DDRTree_DDRTree_reduce_dim(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP DDRTree_pca_projection(SEXP, SEXP);
extern SEXP DDRTree_sqdist(SEXP, SEXP);

static const R_CallMethodDef CallEntries[] = {
    {"DDRTree_DDRTree_reduce_dim", (DL_FUNC) &DDRTree_DDRTree_reduce_dim, 12},
    {"DDRTree_pca_projection",     (DL_FUNC) &DDRTree_pca_projection,      2},
    {"DDRTree_sqdist",             (DL_FUNC) &DDRTree_sqdist,              2},
    {NULL, NULL, 0}
};

void R_init_DDRTree(DllInfo *dll)
{
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);
}
==> DDRTree/NAMESPACE <==

# Generated by roxygen2: do not edit by hand

export(DDRTree)
export(get_major_eigenvalue)
export(pca_projection_R)
export(sqdist_R)
import(irlba)
importFrom(Rcpp,evalCpp)
importFrom(stats,kmeans)
importFrom(stats,qnorm)
useDynLib(DDRTree, .registration = TRUE)

==> DDRTree/NEWS <==

DDRTree 0.0.0 Series NEWS
================================================================================

Version 0.1.5
--------------------------------------------------------------------------------

BUGFIXES

    o Fixed a problem where DDRTree would return different results on repeated
      runs given the same inputs. The problem was actually in two places:
      kmeans and irlba. We now call irlba with deterministically initialized
      eigenvectors and kmeans with deterministically selected rows of the
      input.

Version 0.1.4
--------------------------------------------------------------------------------

BUGFIXES

    o Fixed a build error triggered by recent versions of GCC using the C++14
      standard

==> DDRTree/R/DDRTree.R <==

#' Compute the PCA projection
#'
#' @param C data matrix used for PCA projection
#' @param L number of top principal components to keep
#' @import irlba
#' @importFrom stats qnorm
#' @export
pca_projection_R <- function(C, L) {
  if (L >= min(dim(C))) {
    eigen_res <- eigen(C)
    U <- eigen_res$vector
    V <- eigen_res$value
    eig_sort <- sort(V, decreasing = T, index.return = T)
    eig_idx <- eig_sort$ix
    W <- U[, eig_idx[1:L]]
    return(W)
  } else {
    # deterministic starting vector so repeated runs give identical results
    initial_v <- as.matrix(qnorm(1:(ncol(C) + 1)/(ncol(C) + 1))[1:ncol(C)])
    eigen_res <- irlba::irlba(C, nv = L, v = initial_v)
    U <- eigen_res$u
    V <- eigen_res$v
    return(V)
  }
}

#' Get the top L eigenvalues
#' @param C data matrix used for eigendecomposition
#' @param L number of top eigenvalues
#' @import irlba
#' @export
get_major_eigenvalue <- function(C, L) {
  if (L >= min(dim(C))) {
    return(base::norm(C, '2')^2)
  } else {
    #message("using irlba")
    initial_v <- as.matrix(qnorm(1:(ncol(C) + 1)/(ncol(C) + 1))[1:ncol(C)])
    eigen_res <- irlba(C, nv = L, v = initial_v)
    return(max(abs(eigen_res$v)))
  }
  # eig_sort <- sort(V, decreasing = T, index.return = T)
  # eig_idx <- eig_sort$ix
  # W <- U[, eig_idx[1:L]]
}

#' Calculate the pairwise squared distances between the columns of a and b
#' @param a a matrix with \eqn{D \times N} dimension
#' @param b a matrix with \eqn{D \times N} dimension
#' @return a matrix whose (i, j) entry is the squared Euclidean distance
#' between column i of a and column j of b
#' @export
sqdist_R <- function(a, b) {
  aa <- colSums(a^2)
  bb <- colSums(b^2)
  ab <- t(a) %*% b

  aa_repmat <- matrix(rep(aa, times = ncol(b)), ncol = ncol(b), byrow = F)
  bb_repmat <- matrix(rep(bb, times = ncol(a)), nrow = ncol(a), byrow = T)
  dist <- abs(aa_repmat + bb_repmat - 2 * ab)
}
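# Worked check (added for illustration, kept as a comment so nothing runs at
# package load): sqdist_R agrees with base R's dist() applied to t(a), since
# columns are the observations here.
#   a  <- matrix(rnorm(20), nrow = 4)              # D = 4 variables, N = 5 columns
#   d1 <- sqdist_R(a, a)
#   d2 <- as.matrix(dist(t(a)))^2                  # pairwise squared Euclidean distances
#   all.equal(d1, d2, check.attributes = FALSE)    # TRUE (up to numerical error)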
#' Perform DDRTree construction
#' @param X a matrix with \eqn{\mathbf{D \times N}} dimension which is needed to perform DDRTree construction
#' @param initial_method a function that takes the data matrix X as input and returns a
#' reduced-dimension matrix with observations in rows; the row count of the result should be
#' no larger than the number of observations and the column count no larger than the number
#' of variables (methods like isomap may only return a matrix on valid sample sets).
#' Sample names of the returned reduced dimension should be preserved.
#' @param dimensions reduced dimension
#' @param maxIter maximum number of iterations
#' @param sigma bandwidth parameter
#' @param lambda regularization parameter for inverse graph embedding
#' @param ncenter number of nodes allowed in the regularization graph
#' @param param.gamma regularization parameter for k-means (the prefix 'param' is used to avoid a name collision with gamma)
#' @param tol relative objective difference
#' @param verbose emit extensive debug output
#' @param ... additional arguments passed to DDRTree
#' @importFrom stats kmeans
#' @return a list with W, Z, stree, Y, history
#' W is the orthogonal set of d (dimensions) linear basis vectors
#' Z is the reduced dimension space
#' stree is the smooth tree graph embedded in the low dimension space
#' Y represents the latent points as the centers of Z
#' @examples
#' data('iris')
#' subset_iris_mat <- as.matrix(t(iris[c(1, 2, 52, 103), 1:4])) #subset the data
#' #run DDRTree with ncenter equal to the species number
#' DDRTree_res <- DDRTree(subset_iris_mat, dimensions = 2, maxIter = 5, sigma = 1e-2,
#'                        lambda = 1, ncenter = 3, param.gamma = 10, tol = 1e-2, verbose = FALSE)
#' Z <- DDRTree_res$Z #obtain the reduced-dimension matrix
#' Y <- DDRTree_res$Y
#' stree <- DDRTree_res$stree
#' plot(Z[1, ], Z[2, ], col = iris[c(1, 2, 52, 103), 'Species']) #reduced dimension
#' legend("center", legend = unique(iris[c(1, 2, 52, 103), 'Species']), cex=0.8,
#'        col=unique(iris[c(1, 2, 52, 103), 'Species']), pch = 1) #legend
#' title(main="DDRTree reduced dimension", col.main="red", font.main=4)
#' dev.off()
#' plot(Y[1, ], Y[2, ], col = 'blue', pch = 17) #centers of the Z
#' title(main="DDRTree smooth principal curves", col.main="red", font.main=4)
#'
#' #run DDRTree with ncenter = NULL, so every point is its own center
#' DDRTree_res <- DDRTree(subset_iris_mat, dimensions = 2, maxIter = 5, sigma = 1e-3,
#'                        lambda = 1, ncenter = NULL, param.gamma = 10, tol = 1e-2, verbose = FALSE)
#' Z <- DDRTree_res$Z #obtain the reduced-dimension matrix
#' Y <- DDRTree_res$Y
#' stree <- DDRTree_res$stree
#' plot(Z[1, ], Z[2, ], col = iris[c(1, 2, 52, 103), 'Species']) #reduced dimension
#' legend("center", legend = unique(iris[c(1, 2, 52, 103), 'Species']), cex=0.8,
#'        col=unique(iris[c(1, 2, 52, 103), 'Species']), pch = 1) #legend
#' title(main="DDRTree reduced dimension", col.main="red", font.main=4)
#' dev.off()
#' plot(Y[1, ], Y[2, ], col = 'blue', pch = 2) #centers of the Z
#' title(main="DDRTree smooth principal graphs", col.main="red", font.main=4)
#' @export
#'
DDRTree <- function(X, dimensions = 2, initial_method = NULL, maxIter = 20, sigma = 1e-3,
                    lambda = NULL, ncenter = NULL, param.gamma = 10, tol = 1e-3, verbose = F, ...) {
  D <- nrow(X)
  N <- ncol(X)

  # initialization
  W <- pca_projection_R(X %*% t(X), dimensions)
  if (is.null(initial_method)) {
    Z <- t(W) %*% X
  } else {
    tmp <- initial_method(X, ...) # a user-supplied function returning reduced-dimension data
    if (ncol(tmp) > D | nrow(tmp) > N)
      stop('The dimension reduction method passed needs to return correct dimensions')
    Z <- tmp[, 1:dimensions]
    Z <- t(Z)
  }

  if (is.null(ncenter)) {
    K <- N
    Y <- Z[, 1:K]
  } else {
    K <- ncenter
    if (K > ncol(Z))
      stop("Error: ncenter must be less than or equal to ncol(X)")
    # deterministically chosen centers so repeated runs give identical results
    centers = t(Z)[seq(1, ncol(Z), length.out = K), ]
    kmean_res <- kmeans(t(Z), K, centers = centers)
    Y <- kmean_res$centers
    Y <- t(Y)
  }

  if (is.null(lambda)) {
    lambda = 5 * ncol(X)
  }

  ddrtree_res <- DDRTree_reduce_dim(X, Z, Y, W, dimensions, maxIter, K, sigma,
                                    lambda, param.gamma, tol, verbose)

  return(list(W = ddrtree_res$W, Z = ddrtree_res$Z, stree = ddrtree_res$stree,
              Y = ddrtree_res$Y, X = ddrtree_res$X,
              objective_vals = ddrtree_res$objective_vals, history = NULL))
}
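# Illustration (added; `pca_init` is a hypothetical helper, not part of the
# package): a prcomp-based `initial_method`. It receives X (D variables x N
# observations) and must return a matrix with observations in rows, keeping
# their names, as documented above.
#   pca_init <- function(X, ...) {
#     pcs <- prcomp(t(X))$x          # rows = observations, columns = PCs
#     rownames(pcs) <- colnames(X)   # preserve sample names
#     pcs
#   }
#   res <- DDRTree(subset_iris_mat, dimensions = 2, initial_method = pca_init)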
==> DDRTree/R/RcppExports.R <==

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

pca_projection <- function(R_C, dimensions) {
    .Call('DDRTree_pca_projection', PACKAGE = 'DDRTree', R_C, dimensions)
}

sqdist <- function(R_a, R_b) {
    .Call('DDRTree_sqdist', PACKAGE = 'DDRTree', R_a, R_b)
}

DDRTree_reduce_dim <- function(R_X, R_Z, R_Y, R_W, R_dimensions, R_maxiter, R_num_clusters, R_sigma, R_lambda, R_gamma, R_eps, R_verbose) {
    .Call('DDRTree_DDRTree_reduce_dim', PACKAGE = 'DDRTree', R_X, R_Z, R_Y, R_W, R_dimensions, R_maxiter, R_num_clusters, R_sigma, R_lambda, R_gamma, R_eps, R_verbose)
}
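# Sanity check (added for illustration): the internal C++ wrapper above can be
# compared against base R's eigen(); it is not exported from the NAMESPACE,
# hence the ::: below. Eigenvector columns may differ by sign, so compare
# absolute values.
#   M  <- matrix(rnorm(16), 4, 4); C <- M %*% t(M)   # symmetric PSD input
#   W1 <- DDRTree:::pca_projection(C, 2)             # top 2 eigenvectors (C++)
#   W2 <- eigen(C)$vectors[, 1:2]
#   all.equal(abs(W1), abs(W2))                      # TRUE up to sign/rounding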
==> DDRTree/R/help.R <==

#'DDRTree - An algorithm to reduce dimensionality and learn principal graphs simultaneously
#'
#' This is an R and C code implementation of the DDRTree algorithm from Qi Mao, Li Wang et al. \cr \cr
#' Qi Mao, Li Wang, Steve Goodison, and Yijun Sun. Dimensionality Reduction via Graph Structure Learning.
#' The 21st ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD'15), 2015 \cr
#' \url{http://dl.acm.org/citation.cfm?id=2783309} \cr \cr
#' to perform dimension reduction and principal graph
#' learning simultaneously. Please cite this package and the KDD'15 paper if you find DDRTree useful for your research.
#'
#'@section Introduction:
#'The unprecedented increase in big data causes huge difficulty in data visualization and downstream analysis.
#'Conventional dimension reduction approaches (for example, PCA, ICA, Isomap, LLE, etc.) are limited in their ability
#'to explicitly recover the intrinsic structure from the data as well as the discriminative feature representation,
#'both of which are important for scientific discovery. The DDRTree algorithm is a new algorithm that performs the following
#'three tasks in one setting: \cr
#'\cr
#'1. Reduce high-dimension data into a low-dimension space \cr
#'\cr
#'2. Recover an explicit smooth graph structure with local geometry only captured by distances of data
#'points in the low-dimension space. \cr
#'\cr
#'3. Obtain clustering structures of data points in the reduced dimension \cr
#'
#'@section Dimensionality reduction via graph structure learning:
#'Reverse graph embedding was previously applied to learn the intrinsic graph structure in the original dimension.
#'The optimization of graph inference can be represented as:
#'\deqn{\mathop{min}_{f_g \in \mathcal{F}} \mathop{min}_{\{\mathbf{z}_1, ..., \mathbf{z}_M\}} \sum_{(V_i, V_j) \in
#'\mathcal{E}} b_{i,j}||f_g(\mathbf{z}_i) - f_g(\mathbf{z}_j)||^2} where
#'\eqn{f_g} is a function to map the intrinsic data space \eqn{\mathcal{Z} = \{\mathbf{z}_1, ..., \mathbf{z}_M\}} back to the input data space (reverse embedding)
#'\eqn{\mathcal{X} = \{ \mathbf{x}_1, ..., \mathbf{x}_N\}}. \eqn{V_i} is the vertex of the intrinsic undirected graph
#'\eqn{\mathcal{G} = (\mathcal{V}, \mathcal{E})}. \eqn{b_{ij}} is
#'the edge weight associated with the edge set \eqn{\mathcal{E}}.
#'In order to learn the intrinsic structure from a reduced dimension, we also need to consider a term which includes
#'the error incurred while learning the intrinsic structure. This strategy is incorporated as the following:
#'\deqn{\mathop{min}_{\mathcal{G} \in \hat{\mathcal{G}}_b}\mathop{min}_{f_g \in \mathcal{F}} \mathop{min}_{\{\mathbf{z}_1, ...,
#'\mathbf{z}_M\}} \sum_{i = 1}^N ||\mathbf{x}_i - f_g (\mathbf{z}_i)||^2 + \frac{\lambda}{2} \sum_{(V_i, V_j) \in \mathcal{E}}
#'b_{i,j}||f_g(\mathbf{z}_i) - f_g(\mathbf{z}_j)||^2} where \eqn{\lambda} is a non-negative parameter which controls the tradeoff between the data
#'reconstruction error and the reverse graph embedding.
#'
#'@section Dimensionality reduction via learning a tree:
#'The general framework for reducing dimension by learning an intrinsic structure in a low dimension requires a feasible set
#'\eqn{\hat{\mathcal{G}}_b} of graphs and a mapping function \eqn{f_\mathcal{G}}. The algorithm uses a minimum spanning tree as the feasible tree
#'graph structure, which can be solved by Kruskal's algorithm. A linear projection model \eqn{f_g (\mathbf{z}) = \mathbf{Wz}} is used as the mapping function.
#'These settings result in the following specific form of the previous framework:
#'\deqn{\mathop{min}_{\mathbf{W}, \mathbf{Z}, \mathbf{B}} \sum_{i = 1}^N ||\mathbf{x}_i - \mathbf{W}\mathbf{z}_i||^2 + \frac{\lambda}{2} \sum_{i,j}b_{i,j}||\mathbf{W} \mathbf{z}_i - \mathbf{W} \mathbf{z}_j||^2}
#'where \eqn{\mathbf{W} = [\mathbf{w}_1, ..., \mathbf{w}_d] \in
#'\mathcal{R}^{D \times d}} is an orthogonal set of \eqn{d} linear basis vectors. We can group the tree graph \eqn{\mathbf{B}} and the orthogonal set of linear basis vectors
#'with the projected points in the reduced dimension \eqn{\mathbf{W}, \mathbf{Z}} as two groups and apply alternating structure optimization to optimize the tree graph.
#'This method is defined as DRtree (Dimension Reduction tree) by the authors.
#'
#'@section Discriminative dimensionality reduction via learning a tree:
#'In order to avoid the issue where data points are scattered into different branches (which leads to loss of cluster information) and to
#'incorporate the discriminative information, another set of points \eqn{\{\mathbf{y}_k\}_{k = 1}^K}, the centers of \eqn{\{\mathbf{z}_i\}^N_{i = 1}}, can also be introduced.
#'By doing so, the objective functions of K-means and DRtree can be minimized simultaneously. The authors further proposed a soft partition method
#'to overcome the limitations of K-means and proposed the following objective function:
#'\deqn{\mathop{min}_{\mathbf{W}, \mathbf{Z}, \mathbf{B}, \mathbf{Y}, \mathbf{R}} \sum_{i = 1}^N ||\mathbf{x}_i - \mathbf{W} \mathbf{z}_i||^2 +
#'\frac{\lambda}{2} \sum_{k, k'}b_{k, k'}||\mathbf{W} \mathbf{y}_k - \mathbf{W} \mathbf{y}_k'||^2 +
#'\gamma\Big[\sum_{k = 1}^K \sum_{i = 1}^N r_{i, k} ||\mathbf{z}_i - \mathbf{y}_k||^2 + \sigma \Omega (\mathbf{R})\Big]}
#'\deqn{s.t.\ \mathbf{W}^T \mathbf{W} = \mathbf{I}, \mathbf{B} \in \mathcal{B}, \sum_{k = 1}^K r_{i, k} = 1,
#'r_{i, k} \geq 0, \forall i, \forall k} where \eqn{\mathbf{R} \in \mathcal{R}^{N \times K}, \Omega(\mathbf{R}) = \sum_{i = 1}^N \sum_{k = 1}^K r_{i, k} log\ r_{i, k}} is the negative
#'entropy regularization which transforms the hard assignments used in K-means into soft assignments, and \eqn{\sigma > 0} is the regularization parameter.
#'Alternating structure optimization is again used to solve the above problem by separately optimizing the two groups \eqn{{\mathbf{W}, \mathbf{Z}, \mathbf{Y}}, {\mathbf{B}, \mathbf{R}}} until convergence.
#'
#'@section The actual algorithm of DDRTree:
#'\eqn{1.} \eqn{\mathbf{Input}}: Data matrix \eqn{\mathbf{X}}, parameters \eqn{\lambda, \sigma, \gamma} \cr
#'\eqn{2.} Initialize \eqn{\mathbf{Z}} by PCA \cr
#'\eqn{3.} \eqn{K = N, \mathbf{Y} = \mathbf{Z}} \cr
#'\eqn{4.} \eqn{\mathbf{repeat}}: \cr
#' \eqn{\ 5.} \eqn{d_{k,k'} = ||\mathbf{y}_k - \mathbf{y}_{k'}||^2, \forall k, \forall k'} \cr
#' \eqn{\ 6.} Obtain \eqn{\mathbf{B}} via Kruskal's algorithm \cr
#' \eqn{\ 7.} \eqn{\mathbf{L} = diag(\mathbf{B1}) - \mathbf{B}} \cr
#' \eqn{\ 8.} Compute \eqn{\mathbf{R}} with each element \eqn{r_{i, k} = \frac{exp(-||\mathbf{z}_i - \mathbf{y}_k||^2 / \sigma)}{\sum_{k' = 1}^K exp(-||\mathbf{z}_i - \mathbf{y}_{k'}||^2 / \sigma)}} \cr
#' \eqn{\ 9.} \eqn{\tau = diag(\mathbf{1}^T \mathbf{R})} \cr
#' \eqn{\ 10.} \eqn{\mathbf{Q} = \frac{1}{1 + \gamma} \Big[\mathbf{I} + \mathbf{R} (\frac{1 + \gamma}{\gamma}(\frac{\lambda}{\gamma} \mathbf{L} +
#' \tau) - \mathbf{R}^T \mathbf{R})^{-1} \mathbf{R}^T\Big]} \cr
#' \eqn{\ 11.} \eqn{\mathbf{C} = \mathbf{X Q X}^T} \cr
#' \eqn{\ 12.} Perform eigen-decomposition on \eqn{\mathbf{C}} such that \eqn{\mathbf{C} = \mathbf{U} \wedge \mathbf{U}^T} and \eqn{diag(\wedge)} is sorted in a descending order \cr
#' \eqn{\ 13.} \eqn{\mathbf{W} = \mathbf{U}(:, 1:d)} \cr
#' \eqn{\ 14.} \eqn{\mathbf{Z} = \mathbf{W}^T \mathbf{X Q}} \cr
#' \eqn{\ 15.} \eqn{\mathbf{Y} = \mathbf{Z R} (\frac{\lambda}{\gamma} \mathbf{L} + \tau)^{-1}} \cr
#'\eqn{16.} \eqn{\mathbf{Until}} Convergence \cr
#'
#'@section Implementation of DDRTree algorithm:
#'We implemented the algorithm mostly in Rcpp for efficiency. It also has extensive optimization
#'for sparse input data. This implementation is originally based on the matlab code provided by the authors of the DDRTree paper.
#'
#'@docType package
#'@name DDRTree
#'@import irlba
#'@importFrom Rcpp evalCpp
#'@useDynLib DDRTree
#'@aliases DDRTree DDRTree-package
NULL

==> DDRTree/README.md <==

# DDRTree

An R implementation of the DDRTree algorithm for learning principal graphs

==> DDRTree/MD5 <==

e67f2627c03a0d7979652fb80721ce63 *DESCRIPTION
b64834af2d27fcc7d8a30a3f3a226e0f *NAMESPACE
e4c7bcc142c306e1e1e7a3599b84b574 *NEWS
7e7622a65ab2fd5d3f6b3d3d7ee24658 *R/DDRTree.R
594c45ffae1e8e09b9719b82004f7e1f *R/RcppExports.R
899472cf6b7066cd91e7105457e6a69a *R/help.R
0f1c7f0af7f1e0508e25dc7ebaceab6e *README.md
b27828be449ab44d6fd8bfce8fcfeff3 *man/DDRTree.Rd
74d05da10b7c097a32228b374987abec *man/get_major_eigenvalue.Rd
e6d67d586cadef08fd1a2ccc03109c82 *man/pca_projection_R.Rd
2b8df45de21f703421ead7da611a0e84 *man/sqdist_R.Rd
6878b7f1ad90e349abed8a3812208e6f *src/DDRTree.cpp
81730e37e4737419ff3d5415482b9632 *src/DDRTree.h
dea28e7f699249858dbb6a5de6ec1b4b *src/DDRTree_init.c
746742dcacdb529d121b19c45e53291f *src/RcppExports.cpp

==> DDRTree/DESCRIPTION <==

Package: DDRTree
Type: Package
Title: Learning Principal Graphs with DDRTree
Version: 0.1.5
Date: 2017-4-14
Author: Xiaojie Qiu, Cole Trapnell, Qi Mao, Li Wang
Depends: irlba
Imports: Rcpp
LinkingTo: Rcpp, RcppEigen, BH
Maintainer: Xiaojie Qiu
Description: Provides an implementation of the framework of reversed graph
        embedding (RGE) which projects data into a reduced dimensional space
        while constructing a principal tree which passes through the middle of
        the data simultaneously. DDRTree shows superiority to alternatives
        (Wishbone, DPT) for inferring the ordering as well as the intrinsic
        structure of single cell genomics data. In general, it could be used
        to reconstruct the temporal progression as well as the bifurcation
        structure of any data type.
License: Artistic License 2.0
RoxygenNote: 6.0.1
SystemRequirements: C++11
NeedsCompilation: yes
Packaged: 2017-04-14 17:38:16 UTC; xqiu
Repository: CRAN
Date/Publication: 2017-04-30 20:54:17 UTC

==> DDRTree/man/DDRTree.Rd <==

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DDRTree.R, R/help.R
\docType{package}
\name{DDRTree}
\alias{DDRTree}
\alias{DDRTree-package}
\title{Perform DDRTree construction}
\usage{
DDRTree(X, dimensions = 2, initial_method = NULL, maxIter = 20,
  sigma = 0.001, lambda = NULL, ncenter = NULL, param.gamma = 10,
  tol = 0.001, verbose = F, ...)
}
\arguments{
\item{X}{a matrix with \eqn{\mathbf{D \times N}} dimension which is needed to perform DDRTree construction}

\item{dimensions}{reduced dimension}

\item{initial_method}{a function that takes the data matrix X as input and returns a
reduced-dimension matrix with observations in rows; the row count of the result should be
no larger than the number of observations and the column count no larger than the number
of variables (methods like isomap may only return a matrix on valid sample sets).
Sample names of the returned reduced dimension should be preserved.}

\item{maxIter}{maximum number of iterations}

\item{sigma}{bandwidth parameter}

\item{lambda}{regularization parameter for inverse graph embedding}

\item{ncenter}{number of nodes allowed in the regularization graph}

\item{param.gamma}{regularization parameter for k-means (the prefix 'param' is used to avoid a name collision with gamma)}

\item{tol}{relative objective difference}

\item{verbose}{emit extensive debug output}

\item{...}{additional arguments passed to DDRTree}
}
\value{
a list with W, Z, stree, Y, history
W is the orthogonal set of d (dimensions) linear basis vectors
Z is the reduced dimension space
stree is the smooth tree graph embedded in the low dimension space
Y represents the latent points as the centers of Z
}
\description{
Perform DDRTree construction

This is an R and C code implementation of the DDRTree algorithm from Qi Mao, Li Wang et al. \cr \cr
Qi Mao, Li Wang, Steve Goodison, and Yijun Sun. Dimensionality Reduction via Graph Structure Learning.
The 21st ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD'15), 2015 \cr
\url{http://dl.acm.org/citation.cfm?id=2783309} \cr \cr
to perform dimension reduction and principal graph
learning simultaneously. Please cite this package and the KDD'15 paper if you find DDRTree useful for your research.
}
\section{Introduction}{

The unprecedented increase in big data causes huge difficulty in data visualization and downstream analysis.
Conventional dimension reduction approaches (for example, PCA, ICA, Isomap, LLE, etc.) are limited in their ability
to explicitly recover the intrinsic structure from the data as well as the discriminative feature representation,
both of which are important for scientific discovery. The DDRTree algorithm is a new algorithm that performs the following
three tasks in one setting: \cr
\cr
1. Reduce high-dimension data into a low-dimension space \cr
\cr
2. Recover an explicit smooth graph structure with local geometry only captured by distances of data
points in the low-dimension space. \cr
\cr
3. Obtain clustering structures of data points in the reduced dimension \cr
}

\section{Dimensionality reduction via graph structure learning}{

Reverse graph embedding was previously applied to learn the intrinsic graph structure in the original dimension.
The optimization of graph inference can be represented as:
\deqn{\mathop{min}_{f_g \in \mathcal{F}} \mathop{min}_{\{\mathbf{z}_1, ..., \mathbf{z}_M\}} \sum_{(V_i, V_j) \in
\mathcal{E}} b_{i,j}||f_g(\mathbf{z}_i) - f_g(\mathbf{z}_j)||^2} where
\eqn{f_g} is a function to map the intrinsic data space \eqn{\mathcal{Z} = \{\mathbf{z}_1, ..., \mathbf{z}_M\}} back to the input data space (reverse embedding)
\eqn{\mathcal{X} = \{ \mathbf{x}_1, ..., \mathbf{x}_N\}}. \eqn{V_i} is the vertex of the intrinsic undirected graph
\eqn{\mathcal{G} = (\mathcal{V}, \mathcal{E})}. \eqn{b_{ij}} is
the edge weight associated with the edge set \eqn{\mathcal{E}}.
In order to learn the intrinsic structure from a reduced dimension, we also need to consider a term which includes
the error incurred while learning the intrinsic structure. This strategy is incorporated as the following:
\deqn{\mathop{min}_{\mathcal{G} \in \hat{\mathcal{G}}_b}\mathop{min}_{f_g \in \mathcal{F}} \mathop{min}_{\{\mathbf{z}_1, ...,
\mathbf{z}_M\}} \sum_{i = 1}^N ||\mathbf{x}_i - f_g (\mathbf{z}_i)||^2 + \frac{\lambda}{2} \sum_{(V_i, V_j) \in \mathcal{E}}
b_{i,j}||f_g(\mathbf{z}_i) - f_g(\mathbf{z}_j)||^2} where \eqn{\lambda} is a non-negative parameter which controls the tradeoff between the data
reconstruction error and the reverse graph embedding.
}

\section{Dimensionality reduction via learning a tree}{

The general framework for reducing dimension by learning an intrinsic structure in a low dimension requires a feasible set
\eqn{\hat{\mathcal{G}}_b} of graphs and a mapping function \eqn{f_\mathcal{G}}. The algorithm uses a minimum spanning tree as the feasible tree
graph structure, which can be solved by Kruskal's algorithm. A linear projection model \eqn{f_g (\mathbf{z}) = \mathbf{Wz}} is used as the mapping function.
These settings result in the following specific form of the previous framework:
\deqn{\mathop{min}_{\mathbf{W}, \mathbf{Z}, \mathbf{B}} \sum_{i = 1}^N ||\mathbf{x}_i - \mathbf{W}\mathbf{z}_i||^2 + \frac{\lambda}{2} \sum_{i,j}b_{i,j}||\mathbf{W} \mathbf{z}_i - \mathbf{W} \mathbf{z}_j||^2}
where \eqn{\mathbf{W} = [\mathbf{w}_1, ..., \mathbf{w}_d] \in
\mathcal{R}^{D \times d}} is an orthogonal set of \eqn{d} linear basis vectors. We can group the tree graph \eqn{\mathbf{B}} and the orthogonal set of linear basis vectors
with the projected points in the reduced dimension \eqn{\mathbf{W}, \mathbf{Z}} as two groups and apply alternating structure optimization to optimize the tree graph.
This method is defined as DRtree (Dimension Reduction tree) by the authors.
}

\section{Discriminative dimensionality reduction via learning a tree}{

In order to avoid the issue where data points are scattered into different branches (which leads to loss of cluster information) and to
incorporate the discriminative information, another set of points \eqn{\{\mathbf{y}_k\}_{k = 1}^K}, the centers of \eqn{\{\mathbf{z}_i\}^N_{i = 1}}, can also be introduced.
By doing so, the objective functions of K-means and DRtree can be minimized simultaneously. The authors further proposed a soft partition method
to overcome the limitations of K-means and proposed the following objective function:
\deqn{\mathop{min}_{\mathbf{W}, \mathbf{Z}, \mathbf{B}, \mathbf{Y}, \mathbf{R}} \sum_{i = 1}^N ||\mathbf{x}_i - \mathbf{W} \mathbf{z}_i||^2 +
\frac{\lambda}{2} \sum_{k, k'}b_{k, k'}||\mathbf{W} \mathbf{y}_k - \mathbf{W} \mathbf{y}_k'||^2 +
\gamma\Big[\sum_{k = 1}^K \sum_{i = 1}^N r_{i, k} ||\mathbf{z}_i - \mathbf{y}_k||^2 + \sigma \Omega (\mathbf{R})\Big]}
\deqn{s.t.\ \mathbf{W}^T \mathbf{W} = \mathbf{I}, \mathbf{B} \in \mathcal{B}, \sum_{k = 1}^K r_{i, k} = 1,
r_{i, k} \geq 0, \forall i, \forall k} where \eqn{\mathbf{R} \in \mathcal{R}^{N \times K}, \Omega(\mathbf{R}) = \sum_{i = 1}^N \sum_{k = 1}^K r_{i, k} log\ r_{i, k}} is the negative
entropy regularization which transforms the hard assignments used in K-means into soft assignments, and \eqn{\sigma > 0} is the regularization parameter.
Alternating structure optimization is again used to solve the above problem by separately optimizing the two groups \eqn{{\mathbf{W}, \mathbf{Z}, \mathbf{Y}}, {\mathbf{B}, \mathbf{R}}} until convergence.
}

\section{The actual algorithm of DDRTree}{

\eqn{1.} \eqn{\mathbf{Input}}: Data matrix \eqn{\mathbf{X}}, parameters \eqn{\lambda, \sigma, \gamma} \cr
\eqn{2.} Initialize \eqn{\mathbf{Z}} by PCA \cr
\eqn{3.} \eqn{K = N, \mathbf{Y} = \mathbf{Z}} \cr
\eqn{4.} \eqn{\mathbf{repeat}}: \cr
 \eqn{\ 5.} \eqn{d_{k,k'} = ||\mathbf{y}_k - \mathbf{y}_{k'}||^2, \forall k, \forall k'} \cr
 \eqn{\ 6.} Obtain \eqn{\mathbf{B}} via Kruskal's algorithm \cr
 \eqn{\ 7.} \eqn{\mathbf{L} = diag(\mathbf{B1}) - \mathbf{B}} \cr
 \eqn{\ 8.} Compute \eqn{\mathbf{R}} with each element \eqn{r_{i, k} = \frac{exp(-||\mathbf{z}_i - \mathbf{y}_k||^2 / \sigma)}{\sum_{k' = 1}^K exp(-||\mathbf{z}_i - \mathbf{y}_{k'}||^2 / \sigma)}} \cr
 \eqn{\ 9.} \eqn{\tau = diag(\mathbf{1}^T \mathbf{R})} \cr
 \eqn{\ 10.} \eqn{\mathbf{Q} = \frac{1}{1 + \gamma} \Big[\mathbf{I} + \mathbf{R} (\frac{1 + \gamma}{\gamma}(\frac{\lambda}{\gamma} \mathbf{L} +
 \tau) - \mathbf{R}^T \mathbf{R})^{-1} \mathbf{R}^T\Big]} \cr
 \eqn{\ 11.} \eqn{\mathbf{C} = \mathbf{X Q X}^T} \cr
 \eqn{\ 12.} Perform eigen-decomposition on \eqn{\mathbf{C}} such that \eqn{\mathbf{C} = \mathbf{U} \wedge \mathbf{U}^T} and \eqn{diag(\wedge)} is sorted in a descending order \cr
 \eqn{\ 13.} \eqn{\mathbf{W} = \mathbf{U}(:, 1:d)} \cr
 \eqn{\ 14.} \eqn{\mathbf{Z} = \mathbf{W}^T \mathbf{X Q}} \cr
 \eqn{\ 15.} \eqn{\mathbf{Y} = \mathbf{Z R} (\frac{\lambda}{\gamma} \mathbf{L} + \tau)^{-1}} \cr
\eqn{16.} \eqn{\mathbf{Until}} Convergence \cr
}

\section{Implementation of DDRTree algorithm}{

We implemented the algorithm mostly in Rcpp for efficiency. It also has extensive optimization
for sparse input data. This implementation is originally based on the matlab code provided by the authors of the DDRTree paper.
}

\examples{
data('iris')
subset_iris_mat <- as.matrix(t(iris[c(1, 2, 52, 103), 1:4])) #subset the data
#run DDRTree with ncenter equal to the species number
DDRTree_res <- DDRTree(subset_iris_mat, dimensions = 2, maxIter = 5, sigma = 1e-2,
                       lambda = 1, ncenter = 3, param.gamma = 10, tol = 1e-2, verbose = FALSE)
Z <- DDRTree_res$Z #obtain the reduced-dimension matrix
Y <- DDRTree_res$Y
stree <- DDRTree_res$stree
plot(Z[1, ], Z[2, ], col = iris[c(1, 2, 52, 103), 'Species']) #reduced dimension
legend("center", legend = unique(iris[c(1, 2, 52, 103), 'Species']), cex=0.8,
       col=unique(iris[c(1, 2, 52, 103), 'Species']), pch = 1) #legend
title(main="DDRTree reduced dimension", col.main="red", font.main=4)
dev.off()
plot(Y[1, ], Y[2, ], col = 'blue', pch = 17) #centers of the Z
title(main="DDRTree smooth principal curves", col.main="red", font.main=4)

#run DDRTree with ncenter = NULL, so every point is its own center
DDRTree_res <- DDRTree(subset_iris_mat, dimensions = 2, maxIter = 5, sigma = 1e-3,
                       lambda = 1, ncenter = NULL, param.gamma = 10, tol = 1e-2, verbose = FALSE)
Z <- DDRTree_res$Z #obtain the reduced-dimension matrix
Y <- DDRTree_res$Y
stree <- DDRTree_res$stree
plot(Z[1, ], Z[2, ], col = iris[c(1, 2, 52, 103), 'Species']) #reduced dimension
legend("center", legend = unique(iris[c(1, 2, 52, 103), 'Species']), cex=0.8,
       col=unique(iris[c(1, 2, 52, 103), 'Species']), pch = 1) #legend
title(main="DDRTree reduced dimension", col.main="red", font.main=4)
dev.off()
plot(Y[1, ], Y[2, ], col = 'blue', pch = 2) #centers of the Z
title(main="DDRTree smooth principal graphs", col.main="red", font.main=4)
}
==> DDRTree/man/sqdist_R.Rd <==

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DDRTree.R
\name{sqdist_R}
\alias{sqdist_R}
\title{Calculate the pairwise squared distances between the columns of a and b}
\usage{
sqdist_R(a, b)
}
\arguments{
\item{a}{a matrix with \eqn{D \times N} dimension}

\item{b}{a matrix with \eqn{D \times N} dimension}
}
\value{
a matrix whose (i, j) entry is the squared Euclidean distance between
column i of a and column j of b
}
\description{
Calculate the pairwise squared distances between the columns of a and b
}

==> DDRTree/man/get_major_eigenvalue.Rd <==

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DDRTree.R
\name{get_major_eigenvalue}
\alias{get_major_eigenvalue}
\title{Get the top L eigenvalues}
\usage{
get_major_eigenvalue(C, L)
}
\arguments{
\item{C}{data matrix used for eigendecomposition}

\item{L}{number of top eigenvalues}
}
\description{
Get the top L eigenvalues
}

==> DDRTree/man/pca_projection_R.Rd <==

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DDRTree.R
\name{pca_projection_R}
\alias{pca_projection_R}
\title{Compute the PCA projection}
\usage{
pca_projection_R(C, L)
}
\arguments{
\item{C}{data matrix used for PCA projection}

\item{L}{number of top principal components to keep}
}
\description{
Compute the PCA projection
}
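# ---- Appendix (added for illustration; not part of the shipped package) ----
# A minimal R sketch of one inner iteration of the pseudocode documented in
# DDRTree.Rd (steps 5-15), assuming dense inputs throughout. `mst_adjacency`
# is a hypothetical helper standing in for step 6 (the 0/1 adjacency matrix of
# an MST over the pairwise distances, e.g. via Kruskal's or Prim's algorithm);
# the shipped code does all of this in C++ with boost's Prim MST and sparse
# Cholesky solves.
ddrtree_iteration_sketch <- function(X, W, Z, Y, lambda, sigma, gamma, d,
                                     mst_adjacency) {
  K <- ncol(Y); N <- ncol(Z)
  B <- mst_adjacency(sqdist_R(Y, Y))                  # steps 5-6: MST over centers
  L <- diag(colSums(B)) - B                           # step 7: graph Laplacian
  distZY <- sqdist_R(Z, Y)
  min_dist <- matrix(apply(distZY, 1, min), N, K)     # stabilize the exponentials
  tmp_R <- exp(-(distZY - min_dist) / sigma)
  R <- tmp_R / rowSums(tmp_R)                         # step 8: soft assignments
  tau <- diag(colSums(R))                             # step 9
  M <- ((1 + gamma) / gamma) * ((lambda / gamma) * L + tau) - t(R) %*% R
  Q <- (diag(N) + R %*% solve(M, t(R))) / (1 + gamma) # step 10
  C <- X %*% Q %*% t(X)                               # step 11
  W <- pca_projection_R((C + t(C)) / 2, d)            # steps 12-13
  Z <- t(W) %*% X %*% Q                               # step 14
  Y <- Z %*% R %*% solve((lambda / gamma) * L + tau)  # step 15
  list(W = W, Z = Z, Y = Y, B = B)
}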