ranger/NAMESPACE

# Generated by roxygen2: do not edit by hand

S3method(importance,ranger)
S3method(predict,ranger)
S3method(predict,ranger.forest)
S3method(predictions,ranger)
S3method(predictions,ranger.prediction)
S3method(print,ranger)
S3method(print,ranger.forest)
S3method(print,ranger.prediction)
S3method(timepoints,ranger)
S3method(timepoints,ranger.prediction)
export(csrf)
export(getTerminalNodeIDs)
export(holdoutRF)
export(importance)
export(importance_pvalues)
export(predictions)
export(ranger)
export(timepoints)
export(treeInfo)
import(stats)
import(utils)
importFrom(Matrix,Matrix)
importFrom(Rcpp,evalCpp)
useDynLib(ranger, .registration = TRUE)

ranger/man/getTerminalNodeIDs.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/getTerminalNodeIDs.R
\name{getTerminalNodeIDs}
\alias{getTerminalNodeIDs}
\title{Get terminal node IDs (deprecated)}
\usage{
getTerminalNodeIDs(rf, dat)
}
\arguments{
\item{rf}{\code{ranger} object.}
\item{dat}{New dataset. Terminal node IDs for this dataset are obtained.}
}
\value{
Matrix with terminal node IDs for all observations in the dataset and all trees.
}
\description{
This function is deprecated. Please use predict() with \code{type = "terminalNodes"} instead. This function calls predict() now.
}
\examples{
rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
getTerminalNodeIDs(rf, iris)
}

ranger/man/predict.ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.R
\name{predict.ranger}
\alias{predict.ranger}
\title{Ranger prediction}
\usage{
\method{predict}{ranger}(object, data = NULL, predict.all = FALSE,
  num.trees = object$num.trees, type = "response", se.method = "infjack",
  quantiles = c(0.1, 0.5, 0.9), what = NULL, seed = NULL,
  num.threads = NULL, verbose = TRUE, ...)
}
\arguments{
\item{object}{Ranger \code{ranger} object.}
\item{data}{New test data of class \code{data.frame} or \code{gwaa.data} (GenABEL).}
\item{predict.all}{Return individual predictions for each tree instead of aggregated predictions for all trees. Returns a matrix (sample x tree) for classification and regression, a 3d array for probability estimation (sample x class x tree) and survival (sample x time x tree).}
\item{num.trees}{Number of trees used for prediction. The first \code{num.trees} in the forest are used.}
\item{type}{Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details.}
\item{se.method}{Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if \code{type = 'se'}. See below for details.}
\item{quantiles}{Vector of quantiles for quantile prediction. Set \code{type = 'quantiles'} to use.}
\item{what}{User-specified function for quantile prediction used instead of \code{quantile}. Must return a numeric vector, see examples.}
\item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.}
\item{num.threads}{Number of threads. Default is number of CPUs available.}
\item{verbose}{Verbose output on or off.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Object of class \code{ranger.prediction} with elements
\tabular{ll}{
\code{predictions} \tab Predicted classes/values (only for classification and regression). \cr
\code{unique.death.times} \tab Unique death times (only for survival). \cr
\code{chf} \tab Estimated cumulative hazard function for each sample (only for survival). \cr
\code{survival} \tab Estimated survival function for each sample (only for survival). \cr
\code{num.trees} \tab Number of trees. \cr
\code{num.independent.variables} \tab Number of independent variables. \cr
\code{treetype} \tab Type of forest/tree. Classification, regression or survival. \cr
\code{num.samples} \tab Number of samples.
}
}
\description{
Prediction with new data and a saved forest from Ranger.
}
\details{
For \code{type = 'response'} (the default), the predicted classes (classification), predicted numeric values (regression), predicted probabilities (probability estimation) or survival probabilities (survival) are returned.
For \code{type = 'se'}, the standard error of the predictions is returned (regression only). The jackknife-after-bootstrap or infinitesimal jackknife for bagging is used to estimate the standard errors based on out-of-bag predictions. See Wager et al. (2014) for details.
For \code{type = 'terminalNodes'}, the IDs of the terminal node in each tree for each observation in the given dataset are returned.
For \code{type = 'quantiles'}, the selected quantiles for each observation are estimated. See Meinshausen (2006) for details.

If \code{type = 'se'} is selected, the method to estimate the variances can be chosen with \code{se.method}. Set \code{se.method = 'jack'} for jackknife-after-bootstrap and \code{se.method = 'infjack'} for the infinitesimal jackknife for bagging. A standard error example is sketched at the end of the examples below.

For classification and \code{predict.all = TRUE}, factor levels are returned as numerics. To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object.
}
\examples{
## Classification forest
ranger(Species ~ ., data = iris)
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]
rg.iris <- ranger(Species ~ ., data = iris.train)
pred.iris <- predict(rg.iris, data = iris.test)
table(iris.test$Species, pred.iris$predictions)

## Quantile regression forest
rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE)
pred <- predict(rf, mtcars[27:32, ], type = "quantiles", quantiles = c(0.1, 0.5, 0.9))
pred$predictions

## Quantile regression forest with user-specified function
rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE)
pred <- predict(rf, mtcars[27:32, ], type = "quantiles",
                what = function(x) sample(x, 10, replace = TRUE))
pred$predictions
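## Standard errors of regression predictions (editor's sketch, not in the
## original examples; the 'infjack' default is assumed, which needs inbag
## counts, hence keep.inbag = TRUE in the ranger call)
rf <- ranger(mpg ~ ., mtcars[1:26, ], num.trees = 100, keep.inbag = TRUE)
pred <- predict(rf, mtcars[27:32, ], type = "se")
data.frame(fit = pred$predictions, se = pred$se)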
}
\references{
\itemize{
\item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}.
\item Wager, S., Hastie, T. & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}.
\item Meinshausen, N. (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}.
}
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/parse.formula.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/formula.R
\name{parse.formula}
\alias{parse.formula}
\title{Parse formula}
\usage{
parse.formula(formula, data, env = parent.frame())
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit.}
\item{data}{Training data of class \code{data.frame}.}
\item{env}{The environment in which the left hand side of \code{formula} is evaluated.}
}
\value{
Dataset including selected columns and interactions.
}
\description{
Parse formula and return dataset containing selected columns. Interactions are supported for numerical columns only. An interaction column is the product of all interacting columns.
}
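% Editor's sketch (the original file has no examples): parse.formula is an
% internal helper, so it is called via ranger::: here; the name and position
% of the generated interaction column are assumptions.
\examples{
## Interaction of two numeric columns becomes a single product column
head(ranger:::parse.formula(Sepal.Length ~ Sepal.Width * Petal.Length, iris))
}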
ranger/man/ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ranger.R
\name{ranger}
\alias{ranger}
\title{Ranger}
\usage{
ranger(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
  importance = "none", write.forest = TRUE, probability = FALSE,
  min.node.size = NULL, max.depth = NULL, replace = TRUE,
  sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL,
  class.weights = NULL, splitrule = NULL, num.random.splits = 1,
  alpha = 0.5, minprop = 0.1, split.select.weights = NULL,
  always.split.variables = NULL, respect.unordered.factors = NULL,
  scale.permutation.importance = FALSE, local.importance = FALSE,
  regularization.factor = 1, regularization.usedepth = FALSE,
  keep.inbag = FALSE, inbag = NULL, holdout = FALSE, quantreg = FALSE,
  oob.error = TRUE, num.threads = NULL, save.memory = FALSE,
  verbose = TRUE, seed = NULL, dependent.variable.name = NULL,
  status.variable.name = NULL, classification = NULL, x = NULL, y = NULL,
  ...)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables.}
\item{data}{Training data of class \code{data.frame}, \code{matrix}, \code{dgCMatrix} (Matrix) or \code{gwaa.data} (GenABEL).}
\item{num.trees}{Number of trees.}
\item{mtry}{Number of variables to possibly split at in each node. Default is the (rounded down) square root of the number of variables. Alternatively, a single-argument function returning an integer, given the number of independent variables.}
\item{importance}{Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'. The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival.}
\item{write.forest}{Save \code{ranger.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended.}
\item{probability}{Grow a probability forest as in Malley et al. (2012).}
\item{min.node.size}{Minimal node size. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability.}
\item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree).}
\item{replace}{Sample with replacement.}
\item{sample.fraction}{Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values.}
\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
\item{class.weights}{Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.}
\item{splitrule}{Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".}
\item{num.random.splits}{For "extratrees" splitrule: Number of random splits to consider for each candidate splitting variable.}
\item{alpha}{For "maxstat" splitrule: Significance threshold to allow splitting.}
\item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.}
\item{split.select.weights}{Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree, can be used.}
\item{always.split.variables}{Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.}
\item{respect.unordered.factors}{Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition"; for all other splitrules it is 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details.}
\item{scale.permutation.importance}{Scale permutation importance by standard error as in (Breiman 2001). Only applicable if permutation variable importance mode selected.}
\item{local.importance}{Calculate and return local importance values as in (Breiman 2001). Only applicable if \code{importance} is set to 'permutation'.}
\item{regularization.factor}{Regularization factor (gain penalization), either a vector of length p or one value for all variables.}
\item{regularization.usedepth}{Consider the depth in regularization.}
\item{keep.inbag}{Save how often observations are in-bag in each tree.}
\item{inbag}{Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling.}
\item{holdout}{Hold-out mode. Hold-out all samples with case weight 0 and use these for variable importance and prediction error.}
\item{quantreg}{Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction.}
\item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.}
\item{num.threads}{Number of threads. Default is number of CPUs available.}
\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems.}
\item{verbose}{Show computation status and estimated runtime.}
\item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.}
\item{dependent.variable.name}{Name of dependent variable, needed if no formula given. For survival forests this is the time variable.}
\item{status.variable.name}{Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring.}
\item{classification}{Set to \code{TRUE} to grow a classification forest. Only needed if the data is a matrix or the response numeric.}
\item{x}{Predictor data (independent variables), alternative interface to data with formula or dependent.variable.name.}
\item{y}{Response vector (dependent variable), alternative interface to data with formula or dependent.variable.name. For survival use a \code{Surv()} object or a matrix with time and status.}
\item{...}{Further arguments passed to or from other methods (currently ignored).}
}
\value{
Object of class \code{ranger} with elements
\item{\code{forest}}{Saved forest (if write.forest set to TRUE). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.}
\item{\code{predictions}}{Predicted classes/values, based on out-of-bag samples (classification and regression only).}
\item{\code{variable.importance}}{Variable importance for each independent variable.}
\item{\code{variable.importance.local}}{Variable importance for each independent variable and each sample, if \code{local.importance} is set to TRUE and \code{importance} is set to 'permutation'.}
\item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is the fraction of misclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.}
\item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out-of-bag data.}
\item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).}
\item{\code{unique.death.times}}{Unique death times (survival only).}
\item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).}
\item{\code{survival}}{Estimated survival function for each sample (survival only).}
\item{\code{call}}{Function call.}
\item{\code{num.trees}}{Number of trees.}
\item{\code{num.independent.variables}}{Number of independent variables.}
\item{\code{mtry}}{Value of mtry used.}
\item{\code{min.node.size}}{Value of minimal node size used.}
\item{\code{treetype}}{Type of forest/tree. Classification, regression or survival.}
\item{\code{importance.mode}}{Importance mode used.}
\item{\code{num.samples}}{Number of samples.}
\item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.}
}
\description{
Ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high dimensional data. Classification, regression, and survival forests are supported. Classification and regression forests are implemented as in the original Random Forest (Breiman 2001), survival forests as in Random Survival Forests (Ishwaran et al. 2008). Includes implementations of extremely randomized trees (Geurts et al. 2006) and quantile regression forests (Meinshausen 2006).
}
\details{
The tree type is determined by the type of the dependent variable. For factors classification trees are grown, for numeric values regression trees and for survival objects survival trees.
The Gini index is used as default splitting rule for classification. For regression, the estimated response variances or maximally selected rank statistics (Wright et al. 2017) can be used. For survival, the log-rank test, a C-index based splitting rule (Schmid et al. 2016) and maximally selected rank statistics (Wright et al. 2017) are available. For all tree types, forests of extremely randomized trees (Geurts et al. 2006) can be grown.

With the \code{probability} option and factor dependent variable a probability forest is grown. Here, the node impurity is used for splitting, as in classification forests. Predictions are class probabilities for each sample. In contrast to other implementations, each tree returns a probability estimate and these estimates are averaged for the forest probability estimate. For details see Malley et al. (2012).

Note that for classification and regression nodes with size smaller than \code{min.node.size} can occur, as in the original Random Forests. For survival all nodes contain at least \code{min.node.size} samples. Variables selected with \code{always.split.variables} are tried in addition to the \code{mtry} variables randomly selected. In \code{split.select.weights}, weights do not need to sum up to 1, they will be normalized later. The weights are assigned to the variables in the order they appear in the formula, or in the data if no formula is used. Names of the \code{split.select.weights} vector are ignored. The usage of \code{split.select.weights} can increase the computation times for large forests.

Unordered factor covariates can be handled in 3 different ways by using \code{respect.unordered.factors}: For 'ignore' all factors are regarded ordered, for 'partition' all possible 2-partitions are considered for splitting. For 'order' and 2-class classification the factor levels are ordered by their proportion falling in the second class, for regression by their mean response, as described in Hastie et al. (2009), chapter 9.2.4. For multiclass classification the factor levels are ordered by the first principal component of the weighted covariance matrix of the contingency table (Coppersmith et al. 1999), for survival by the median survival (or the largest available quantile if the median is not available). The use of 'order' is recommended, as it is computationally fast and can handle an unlimited number of factor levels. Note that the factors are only reordered once and not again in each split.

The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. See Nembrini et al. (2018) for details. This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}.

Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model.

For a large number of variables and data frames as input data the formula interface can be slow or impossible to use.
Alternatively \code{dependent.variable.name} (and \code{status.variable.name} for survival) or \code{x} and \code{y} can be used. Use \code{x} and \code{y} with a matrix for \code{x} to avoid conversions and save memory. Consider setting \code{save.memory = TRUE} if you encounter memory problems for very large datasets, but be aware that this option slows down the tree growing.

For GWAS data consider combining \code{ranger} with the \code{GenABEL} package. See the Examples section below for a demonstration using \code{Plink} data. All SNPs in the \code{GenABEL} object will be used for splitting. To use only the SNPs without sex or other covariates from the phenotype file, use \code{0} on the right hand side of the formula. Note that missing values are treated as an extra category while splitting.

See \url{https://github.com/imbs-hl/ranger} for the development version.

With recent R versions, multithreading on Windows platforms should just work. If you compile yourself, the new RTools toolchain is required.
}
\examples{
## Classification forest with default settings
ranger(Species ~ ., data = iris)

## Prediction
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]
rg.iris <- ranger(Species ~ ., data = iris.train)
pred.iris <- predict(rg.iris, data = iris.test)
table(iris.test$Species, pred.iris$predictions)

## Quantile regression forest
rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE)
pred <- predict(rf, mtcars[27:32, ], type = "quantiles")
pred$predictions

## Variable importance
rg.iris <- ranger(Species ~ ., data = iris, importance = "impurity")
rg.iris$variable.importance

## Survival forest
require(survival)
rg.veteran <- ranger(Surv(time, status) ~ ., data = veteran)
plot(rg.veteran$unique.death.times, rg.veteran$survival[1, ])

## Alternative interfaces (same results)
ranger(dependent.variable.name = "Species", data = iris)
ranger(y = iris[, 5], x = iris[, -5])

\dontrun{
## Use GenABEL interface to read Plink data into R and grow a classification forest
## The ped and map files are not included
library(GenABEL)
convert.snp.ped("data.ped", "data.map", "data.raw")
dat.gwaa <- load.gwaa.data("data.pheno", "data.raw")
phdata(dat.gwaa)$trait <- factor(phdata(dat.gwaa)$trait)
ranger(trait ~ ., data = dat.gwaa)
}
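## Probability forest (editor's sketch, not in the original examples):
## class probabilities per sample instead of hard class labels
rf.prob <- ranger(Species ~ ., data = iris, probability = TRUE)
head(predict(rf.prob, data = iris)$predictions)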
}
\references{
\itemize{
\item Wright, M. N. & Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}.
\item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \doi{10.1016/j.eswa.2016.07.018}.
\item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \doi{10.1002/sim.7212}.
\item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini importance? Bioinformatics. \doi{10.1093/bioinformatics/bty373}.
\item Breiman, L. (2001). Random forests. Mach Learn 45:5-32. \doi{10.1023/A:1010933404324}.
\item Ishwaran, H., Kogalur, U. B., Blackstone, E. H. & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \doi{10.1214/08-AOAS169}.
\item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G. & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \doi{10.3414/ME00-01-0052}.
\item Hastie, T., Tibshirani, R. & Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition.
\item Geurts, P., Ernst, D. & Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \doi{10.1007/s10994-006-6226-1}.
\item Meinshausen, N. (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}.
\item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat 17:611-628. \doi{10.1198/106186008X344522}.
\item Coppersmith, D., Hong, S. J. & Hosking, J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}.
\item Deng, H. & Runger, G. (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}.
}
}
\seealso{
\code{\link{predict.ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/holdoutRF.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/holdoutRF.R
\name{holdoutRF}
\alias{holdoutRF}
\title{Hold-out random forests}
\usage{
holdoutRF(...)
}
\arguments{
\item{...}{All arguments are passed to \code{\link{ranger}()} (except \code{importance}, \code{case.weights}, \code{replace} and \code{holdout}).}
}
\value{
Hold-out random forests with variable importance.
}
\description{
Grow two random forests on two cross-validation folds. Instead of out-of-bag data, the other fold is used to compute permutation importance. Related to the novel permutation variable importance by Janitza et al. (2016).
}
\references{
Janitza, S., Celik, E. & Boulesteix, A.-L. (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif. \doi{10.1007/s11634-016-0276-4}.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/print.ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/print.R
\name{print.ranger}
\alias{print.ranger}
\title{Print Ranger}
\usage{
\method{print}{ranger}(x, ...)
}
\arguments{
\item{x}{Object of class 'ranger'.}
\item{...}{Further arguments passed to or from other methods.}
}
\description{
Print contents of Ranger object.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/print.ranger.prediction.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/print.R
\name{print.ranger.prediction}
\alias{print.ranger.prediction}
\title{Print Ranger prediction}
\usage{
\method{print}{ranger.prediction}(x, ...)
}
\arguments{
\item{x}{Object of class 'ranger.prediction'.}
\item{...}{Further arguments passed to or from other methods.}
}
\description{
Print contents of Ranger prediction object.
}
\author{
Marvin N. Wright
}

ranger/man/print.ranger.forest.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/print.R
\name{print.ranger.forest}
\alias{print.ranger.forest}
\title{Print Ranger forest}
\usage{
\method{print}{ranger.forest}(x, ...)
}
\arguments{
\item{x}{Object of class 'ranger.forest'.}
\item{...}{Further arguments passed to or from other methods.}
}
\description{
Print contents of Ranger forest object.
}
\author{
Marvin N. Wright
}

ranger/man/csrf.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/csrf.R
\name{csrf}
\alias{csrf}
\title{Case-specific random forests.}
\usage{
csrf(formula, training_data, test_data, params1 = list(), params2 = list(),
  verbose = FALSE)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit.}
\item{training_data}{Training data of class \code{data.frame}.}
\item{test_data}{Test data of class \code{data.frame}.}
\item{params1}{Parameters for the proximity random forest grown in the first step.}
\item{params2}{Parameters for the prediction random forests grown in the second step.}
\item{verbose}{Logical indicating whether or not to print computation progress.}
}
\value{
Predictions for the test dataset.
}
\description{
In case-specific random forests (CSRF), random forests are built specific to the cases of interest. Instead of using equal probabilities, the cases are weighted according to their difference to the case of interest.
}
\details{
The algorithm consists of 3 steps:
\enumerate{
\item Grow a random forest on the training data.
\item For each observation of interest (test data), compute the weights of all training observations by counting the number of trees in which both observations are in the same terminal node.
\item For each test observation, grow a weighted random forest on the training data, using the weights obtained in step 2. Predict the outcome of the test observation as usual.
}
In total, n+1 random forests are grown, where n is the number of observations in the test dataset. For details, see Xu et al. (2014). A sketch of the weight computation in step 2 is shown after the example below.
}
\examples{
## Split in training and test data
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]

## Run case-specific RF
csrf(Species ~ ., training_data = iris.train, test_data = iris.test,
     params1 = list(num.trees = 50, mtry = 4),
     params2 = list(num.trees = 5))
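## Editor's sketch of step 2 (not part of the original example): similarity
## weights for the first test observation, counted over shared terminal nodes
rf <- ranger(Species ~ ., iris.train, num.trees = 50)
nodes.train <- predict(rf, iris.train, type = "terminalNodes")$predictions
nodes.test <- predict(rf, iris.test, type = "terminalNodes")$predictions
w1 <- apply(nodes.train, 1, function(nodes) sum(nodes == nodes.test[1, ]))
## Step 3 for this test observation: weighted forest and prediction
rf.w1 <- ranger(Species ~ ., iris.train, num.trees = 5, case.weights = w1)
predict(rf.w1, iris.test[1, , drop = FALSE])$predictions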
}
\references{
Xu, R., Nettleton, D. & Nordman, D. J. (2014). Case-specific random forests. J Comput Graph Stat 25:49-65. \doi{10.1080/10618600.2014.983641}.
}
\author{
Marvin N. Wright
}

ranger/man/predictions.ranger.prediction.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predictions.R
\name{predictions.ranger.prediction}
\alias{predictions.ranger.prediction}
\alias{predictions}
\title{Ranger predictions}
\usage{
\method{predictions}{ranger.prediction}(x, ...)
}
\arguments{
\item{x}{Ranger prediction object.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Predictions: classes for classification forests, numerical values for regression forests and the estimated survival functions for all individuals for survival forests.
}
\description{
Extract predictions of Ranger prediction object.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/importance_pvalues.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/importance.R
\name{importance_pvalues}
\alias{importance_pvalues}
\title{ranger variable importance p-values}
\usage{
importance_pvalues(x, method = c("janitza", "altmann"),
  num.permutations = 100, formula = NULL, data = NULL, ...)
}
\arguments{
\item{x}{\code{ranger} or \code{holdoutRF} object.}
\item{method}{Method to compute p-values. Use "janitza" for the method by Janitza et al. (2016) or "altmann" for the non-parametric method by Altmann et al. (2010).}
\item{num.permutations}{Number of permutations. Used in the "altmann" method only.}
\item{formula}{Object of class formula or character describing the model to fit. Used in the "altmann" method only.}
\item{data}{Training data of class data.frame or matrix. Used in the "altmann" method only.}
\item{...}{Further arguments passed to \code{ranger()}. Used in the "altmann" method only.}
}
\value{
Variable importance and p-value for each variable.
}
\description{
Compute variable importance with p-values. For high dimensional data, the fast method of Janitza et al. (2016) can be used. The permutation approach of Altmann et al. (2010) is computationally intensive but can be used with all kinds of data. See below for details.
}
\details{
The method of Janitza et al. (2016) uses a clever trick: With an unbiased variable importance measure, the importance values of non-associated variables vary randomly around zero. Thus, all non-positive importance values are assumed to correspond to these non-associated variables and they are used to construct a distribution of the importance under the null hypothesis of no association to the response. Since only the non-positive values of this distribution can be observed, the positive values are created by mirroring the negative distribution. See Janitza et al. (2016) for details.

The method of Altmann et al. (2010) uses a simple permutation test: The distribution of the importance under the null hypothesis of no association to the response is created by several replications of permuting the response, growing an RF and computing the variable importance. The authors recommend 50-100 permutations. However, much larger numbers have to be used to estimate more precise p-values. We add 1 to the numerator and denominator to avoid zero p-values.
}
\examples{
## Janitza's p-values with corrected Gini importance
n <- 50
p <- 400
dat <- data.frame(y = factor(rbinom(n, 1, .5)), replicate(p, runif(n)))
rf.sim <- ranger(y ~ ., dat, importance = "impurity_corrected")
importance_pvalues(rf.sim, method = "janitza")

## Permutation p-values
\dontrun{
rf.iris <- ranger(Species ~ ., data = iris, importance = 'permutation')
importance_pvalues(rf.iris, method = "altmann", formula = Species ~ ., data = iris)
}
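## Hold-out permutation importance as input (editor's sketch; that a
## holdoutRF object combines with method = "janitza" is assumed from the
## documented x argument above)
\dontrun{
rf.hold <- holdoutRF(y ~ ., dat)
importance_pvalues(rf.hold, method = "janitza")
}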
}
\references{
Janitza, S., Celik, E. & Boulesteix, A.-L. (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif. \doi{10.1007/s11634-016-0276-4}. \cr
Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure. Bioinformatics 26:1340-1347.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/treeInfo.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/treeInfo.R
\name{treeInfo}
\alias{treeInfo}
\title{Tree information in human readable format}
\usage{
treeInfo(object, tree = 1)
}
\arguments{
\item{object}{\code{ranger} object.}
\item{tree}{Number of the tree of interest.}
}
\value{
A data.frame with the columns
\tabular{ll}{
\code{nodeID} \tab The nodeID, 0-indexed. \cr
\code{leftChild} \tab ID of the left child node, 0-indexed. \cr
\code{rightChild} \tab ID of the right child node, 0-indexed. \cr
\code{splitvarID} \tab ID of the splitting variable, 0-indexed. Caution, the variable order changes if the formula interface is used. \cr
\code{splitvarName} \tab Name of the splitting variable. \cr
\code{splitval} \tab The splitting value. For numeric or ordinal variables, all values smaller or equal go to the left, larger values to the right. For unordered factor variables see above. \cr
\code{terminal} \tab Logical, TRUE for terminal nodes. \cr
\code{prediction} \tab One column with the predicted class (factor) for classification and the predicted numerical value for regression. One probability per class for probability estimation in several columns. Nothing for survival, refer to \code{object$forest$chf} for the CHF node predictions. \cr
}
}
\description{
Extract tree information of a \code{ranger} object.
}
\details{
Node and variable IDs are 0-indexed, i.e., node 0 is the root node. If the formula interface is used in the \code{ranger} call, the variable IDs are usually different from those in the original data used to grow the tree. Refer to the variable name instead to be sure.

Splitting at unordered factors (nominal variables) depends on the option \code{respect.unordered.factors} in the \code{ranger} call. For the "ignore" and "order" approaches, all values smaller or equal the \code{splitval} value go to the left and all values larger go to the right, as usual. However, with "order" the values correspond to the order in \code{object$forest$covariate.levels} instead of the original order (usually alphabetical). In the "partition" mode, the \code{splitval} values for unordered factors are comma separated lists of values, representing the factor levels (in the original order) going to the right.
}
\examples{
rf <- ranger(Species ~ ., data = iris)
treeInfo(rf, 1)
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/importance.ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/importance.R
\name{importance.ranger}
\alias{importance.ranger}
\alias{importance}
\title{ranger variable importance}
\usage{
\method{importance}{ranger}(x, ...)
}
\arguments{
\item{x}{ranger object.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Variable importance measures.
}
\description{
Extract variable importance of ranger object.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/predictions.ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predictions.R
\name{predictions.ranger}
\alias{predictions.ranger}
\title{Ranger predictions}
\usage{
\method{predictions}{ranger}(x, ...)
}
\arguments{
\item{x}{Ranger object.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Predictions: classes for classification forests, numerical values for regression forests and the estimated survival functions for all individuals for survival forests.
}
\description{
Extract training data predictions of Ranger object.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/predict.ranger.forest.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.R
\name{predict.ranger.forest}
\alias{predict.ranger.forest}
\title{Ranger prediction}
\usage{
\method{predict}{ranger.forest}(object, data, predict.all = FALSE,
  num.trees = object$num.trees, type = "response", se.method = "infjack",
  seed = NULL, num.threads = NULL, verbose = TRUE, inbag.counts = NULL, ...)
}
\arguments{
\item{object}{Ranger \code{ranger.forest} object.}
\item{data}{New test data of class \code{data.frame} or \code{gwaa.data} (GenABEL).}
\item{predict.all}{Return individual predictions for each tree instead of aggregated predictions for all trees. Returns a matrix (sample x tree) for classification and regression, a 3d array for probability estimation (sample x class x tree) and survival (sample x time x tree).}
\item{num.trees}{Number of trees used for prediction. The first \code{num.trees} in the forest are used.}
\item{type}{Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details.}
\item{se.method}{Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if \code{type = 'se'}. See below for details.}
\item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.}
\item{num.threads}{Number of threads. Default is number of CPUs available.}
\item{verbose}{Verbose output on or off.}
\item{inbag.counts}{Number of times the observations are in-bag in the trees.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Object of class \code{ranger.prediction} with elements
\tabular{ll}{
\code{predictions} \tab Predicted classes/values (only for classification and regression). \cr
\code{unique.death.times} \tab Unique death times (only for survival). \cr
\code{chf} \tab Estimated cumulative hazard function for each sample (only for survival). \cr
\code{survival} \tab Estimated survival function for each sample (only for survival). \cr
\code{num.trees} \tab Number of trees. \cr
\code{num.independent.variables} \tab Number of independent variables. \cr
\code{treetype} \tab Type of forest/tree. Classification, regression or survival. \cr
\code{num.samples} \tab Number of samples.
}
}
\description{
Prediction with new data and a saved forest from Ranger.
}
\details{
For \code{type = 'response'} (the default), the predicted classes (classification), predicted numeric values (regression), predicted probabilities (probability estimation) or survival probabilities (survival) are returned.
For \code{type = 'se'}, the standard error of the predictions is returned (regression only). The jackknife-after-bootstrap or infinitesimal jackknife for bagging is used to estimate the standard errors based on out-of-bag predictions. See Wager et al. (2014) for details.
For \code{type = 'terminalNodes'}, the IDs of the terminal node in each tree for each observation in the given dataset are returned.

If \code{type = 'se'} is selected, the method to estimate the variances can be chosen with \code{se.method}. Set \code{se.method = 'jack'} for jackknife-after-bootstrap and \code{se.method = 'infjack'} for the infinitesimal jackknife for bagging.

For classification and \code{predict.all = TRUE}, factor levels are returned as numerics. To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object.
}
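% Editor's sketch (the original file has no examples); it is assumed that the
% forest component of a fitted ranger object can be passed to this method.
\examples{
rf <- ranger(Species ~ ., data = iris, num.trees = 5)
predict(rf$forest, data = iris)$predictions[1:5]
}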
\references{
\itemize{
\item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}.
\item Wager, S., Hastie, T. & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}.
}
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/timepoints.ranger.prediction.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/timepoints.R
\name{timepoints.ranger.prediction}
\alias{timepoints.ranger.prediction}
\title{Ranger timepoints}
\usage{
\method{timepoints}{ranger.prediction}(x, ...)
}
\arguments{
\item{x}{Ranger Survival prediction object.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Unique death times
}
\description{
Extract unique death times of Ranger Survival prediction object.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/man/timepoints.ranger.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/timepoints.R
\name{timepoints.ranger}
\alias{timepoints.ranger}
\alias{timepoints}
\title{Ranger timepoints}
\usage{
\method{timepoints}{ranger}(x, ...)
}
\arguments{
\item{x}{Ranger Survival forest object.}
\item{...}{Further arguments passed to or from other methods.}
}
\value{
Unique death times
}
\description{
Extract unique death times of Ranger Survival forest.
}
\seealso{
\code{\link{ranger}}
}
\author{
Marvin N. Wright
}

ranger/DESCRIPTION

Package: ranger
Type: Package
Title: A Fast Implementation of Random Forests
Version: 0.13.1
Date: 2021-07-14
Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
Maintainer: Marvin N. Wright
Description: A fast implementation of Random Forests, particularly suited for high
        dimensional data. Ensembles of classification, regression, survival and
        probability prediction trees are supported. Data from genome-wide association
        studies can be analyzed efficiently. In addition to data frames, datasets of
        class 'gwaa.data' (R package 'GenABEL') and 'dgCMatrix' (R package 'Matrix')
        can be directly analyzed.
License: GPL-3
Imports: Rcpp (>= 0.11.2), Matrix
LinkingTo: Rcpp, RcppEigen
Depends: R (>= 3.1)
Suggests: covr, survival, testthat
Encoding: UTF-8
RoxygenNote: 7.1.1
URL: https://github.com/imbs-hl/ranger
BugReports: https://github.com/imbs-hl/ranger/issues
NeedsCompilation: yes
Packaged: 2021-07-14 10:10:00 UTC; wright
Repository: CRAN
Date/Publication: 2021-07-14 11:20:02 UTC

ranger/build/partial.rdb
[binary R data file; contents not recoverable as text and omitted]
ranger/tests/testthat/test_maxstat.R

library(ranger)
library(survival)
context("ranger_maxstat")

test_that("maxstat splitting works for survival", {
  rf <- ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat")
  expect_is(rf, "ranger")
  expect_lt(rf$prediction.error, 0.4)
})

test_that("maxstat splitting works for regression", {
  rf <- ranger(Sepal.Length ~ ., iris, splitrule = "maxstat")
  expect_is(rf, "ranger")
  expect_gt(rf$r.squared, 0.5)
})

test_that("maxstat splitting, alpha or minprop out of range throws error", {
  expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat", alpha = -1))
  expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat", alpha = 2))
  expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat", minprop = -1))
  expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat", minprop = 1))
})

test_that("maxstat splitting not working for classification", {
  expect_error(ranger(Species ~ ., iris, splitrule = "maxstat"))
})

test_that("maxstat impurity importance is positive", {
  rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5,
               splitrule = "maxstat", importance = "impurity")
  expect_gt(mean(rf$variable.importance), 0)

  rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5,
               splitrule = "maxstat", importance = "impurity")
  expect_gt(mean(rf$variable.importance), 0)
})

test_that("maxstat corrected impurity importance is positive (on average)", {
  rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 50,
               splitrule = "maxstat", importance = "impurity_corrected")
  expect_gt(mean(rf$variable.importance), 0)

  rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5,
               splitrule = "maxstat", importance = "impurity_corrected")
"impurity_corrected") expect_gt(mean(rf$variable.importance), 0) }) ranger/tests/testthat/test_importance_casewise.R0000755000176200001440000001107414027301520021740 0ustar liggesuserscontext("test_casewise_importances") test_that("casewise importance works, classification", { n <- 1000 data <- data.frame( x = round(runif(n), 1), y = round(rnorm(n, mean = 1), 1), z = round(rnorm(n, mean = 2), 1) ) rownames(data) <- paste0("case_", seq_len(nrow(data))) data$a <- factor(ifelse(ifelse(data$x < .5, data$y, data$z) > 1.5, "left", "right")) rf <- ranger( data = data, dependent.variable.name = "a", importance = "permutation", local.importance = TRUE, num.trees = 5 ) vic <- rf$variable.importance.local # should see clear pattern here: # pheatmap::pheatmap(vic[order(data$x),], cluster_cols = FALSE, cluster_rows = FALSE) expect_equal(rownames(vic), rownames(data)) expect_equal(colnames(vic), colnames(data)[1:3]) expect_lte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "greater")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "less")$p.value, .99) expect_lte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "less")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "greater")$p.value, .99) }) test_that("casewise importance works, regression", { n <- 1000 data <- data.frame( x = round(runif(n), 1), y = round(rnorm(n, mean = 1), 1), z = round(rnorm(n, mean = 2), 1) ) rownames(data) <- paste0("case_", seq_len(nrow(data))) data$a <- ifelse(data$x < .5, data$y, data$z) rf <- ranger( data = data, dependent.variable.name = "a", importance = "permutation", local.importance = TRUE, num.trees = 5 ) vic <- rf$variable.importance.local # should see clear pattern here: # pheatmap::pheatmap(vic[order(data$x),], cluster_cols = FALSE, cluster_rows = FALSE) expect_equal(rownames(vic), rownames(data)) expect_equal(colnames(vic), colnames(data)[1:3]) expect_lte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "greater")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "less")$p.value, .99) expect_lte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "less")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "greater")$p.value, .99) }) test_that("casewise importance works, probability", { n <- 1000 data <- data.frame( x = round(runif(n), 1), y = round(rnorm(n, mean = 1), 1), z = round(rnorm(n, mean = 2), 1) ) rownames(data) <- paste0("case_", seq_len(nrow(data))) # data$a <- ifelse(data$x < .5, data$y, data$z) data$a <- factor(ifelse(ifelse(data$x < .5, data$y, data$z) > 1.5, "left", "right")) rf <- ranger( data = data, dependent.variable.name = "a", importance = "permutation", probability = TRUE, local.importance = TRUE, num.trees = 5 ) vic <- rf$variable.importance.local # should see clear pattern here: # pheatmap::pheatmap(vic[order(data$x),], cluster_cols = FALSE, cluster_rows = FALSE) expect_equal(rownames(vic), rownames(data)) expect_equal(colnames(vic), colnames(data)[1:3]) expect_lte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "greater")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "less")$p.value, .99) expect_lte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "less")$p.value, .01) expect_gte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "greater")$p.value, .99) }) test_that("casewise importance works, survival", { n <- 1000 data <- data.frame( x = round(runif(n), 1), y = round(rnorm(n, mean = 1), 1), z = 
round(rnorm(n, mean = 2), 1), surv = rbinom(n, 1, .8) ) rownames(data) <- paste0("case_", seq_len(nrow(data))) data$a <- (ifelse(data$x < .5, data$y, data$z)) rf <- ranger( data = data, dependent.variable.name = "a", status.variable.name = "surv", importance = "permutation", local.importance = TRUE, num.trees = 5 ) vic <- rf$variable.importance.local # should see clear pattern here: # pheatmap::pheatmap(vic[order(data$x),], cluster_cols = FALSE, cluster_rows = FALSE) expect_equal(rownames(vic), rownames(data)) expect_equal(colnames(vic), colnames(data)[1:3]) expect_lte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "greater")$p.value, .1) expect_gte(wilcox.test(vic[data$x < .5, 2], vic[data$x >= .5, 2], "less")$p.value, .9) expect_lte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "less")$p.value, .1) expect_gte(wilcox.test(vic[data$x < .5, 3], vic[data$x >= .5, 3], "greater")$p.value, .9) }) ranger/tests/testthat/test_probability.R0000755000176200001440000001226014027301520020232 0ustar liggesusers## Tests for random forests for probability estimation library(ranger) context("ranger_prob") ## Initialize random forest train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) iris.train <- iris[train.idx, ] iris.test <- iris[-train.idx, ] rg.prob <- ranger(Species ~ ., data = iris.train, write.forest = TRUE, probability = TRUE) prob <- predict(rg.prob, iris.test) ## Tests test_that("probability estimations are a matrix with correct size", { expect_is(prob$predictions, "matrix") expect_equal(nrow(prob$predictions), nrow(iris.test)) expect_equal(ncol(prob$predictions), length(rg.prob$forest$levels)) }) test_that("growing works for single observations, probability prediction", { expect_warning(rf <- ranger(Species ~ ., iris[1, ], write.forest = TRUE, probability = TRUE), "Dropped unused factor level\\(s\\) in dependent variable\\: versicolor\\, virginica\\.") expect_is(rf$predictions, "matrix") }) test_that("probability estimations are between 0 and 1 and sum to 1", { expect_true(all(prob$predictions > -1e-5 & prob$predictions <= 1 + 1e-5)) expect_equal(rowSums(prob$predictions), rep(1, nrow(prob$predictions))) }) test_that("save.memory option works for probability", { rf <- ranger(Species ~ ., data = iris, probability = TRUE, save.memory = TRUE) expect_equal(rf$treetype, "Probability estimation") }) test_that("predict works for single observations, probability prediction", { rf <- ranger(Species ~ ., iris, write.forest = TRUE, probability = TRUE) pred <- predict(rf, head(iris, 1)) expect_is(pred$predictions, "matrix") expect_equal(names(which.max(pred$predictions[1, ])), as.character(iris[1,"Species"])) dat <- iris dat$Species <- as.numeric(dat$Species) rf <- ranger(Species ~ ., dat, write.forest = TRUE, probability = TRUE) pred <- predict(rf, head(dat, 1)) expect_is(pred$predictions, "matrix") expect_equal(which.max(pred$predictions[1, ]), as.numeric(iris[1,"Species"])) }) test_that("Probability estimation works correctly if labels are reversed", { ## Simulate data n <- 50 a1 <- c(rnorm(n, 3, sd = 2), rnorm(n, 8, sd = 2)) a2 <- c(rnorm(n, 8, sd = 2), rnorm(n, 3, sd = 2)) ## create labels for data labels <- as.factor(c(rep("0", n), rep("1", n))) dat <- data.frame(label = labels, a1, a2) labels.rev <- as.factor(c(rep("1", n), rep("0", n))) dat.rev <- data.frame(label = labels.rev, a1, a2) ## Train rf <- ranger(dependent.variable.name = "label", data = dat, probability = TRUE, write.forest = TRUE, num.trees = 5) rf.rev <- ranger(dependent.variable.name = "label", data = dat.rev, 
probability = TRUE, write.forest = TRUE, num.trees = 5) ## Check OOB predictions expect_gte(mean(rf$predictions[1:n, "0"], na.rm = TRUE), 0.5) expect_gte(mean(rf$predictions[(n+1):(2*n), "1"], na.rm = TRUE), 0.5) expect_gte(mean(rf.rev$predictions[1:n, "1"], na.rm = TRUE), 0.5) expect_gte(mean(rf.rev$predictions[(n+1):(2*n), "0"], na.rm = TRUE), 0.5) ## Check predict() predictions pred <- predict(rf, dat) expect_gte(mean(pred$predictions[1:n, "0"], na.rm = TRUE), 0.5) expect_gte(mean(pred$predictions[(n+1):(2*n), "1"], na.rm = TRUE), 0.5) pred.rev <- predict(rf.rev, dat.rev) expect_gte(mean(pred.rev$predictions[1:n, "1"], na.rm = TRUE), 0.5) expect_gte(mean(pred.rev$predictions[(n+1):(2*n), "0"], na.rm = TRUE), 0.5) }) test_that("Probability estimation works correctly if first or second factor level empty", { expect_warning(rf <- ranger(Species ~ ., iris[51:150, ], probability = TRUE), "^Dropped unused factor level\\(s\\) in dependent variable\\: setosa\\.") expect_silent(pred <- predict(rf, iris[101:150, ])) expect_gte(mean(pred$predictions[1:50, "virginica"], na.rm = TRUE), 0.9) expect_warning(rf <- ranger(Species ~ ., iris[c(101:150, 51:100), ], probability = TRUE), "^Dropped unused factor level\\(s\\) in dependent variable\\: setosa\\.") expect_silent(pred <- predict(rf, iris[c(101:150, 51:100), ])) expect_gte(mean(pred$predictions[1:50, "virginica"], na.rm = TRUE), 0.9) expect_gte(mean(pred$predictions[51:100, "versicolor"], na.rm = TRUE), 0.9) }) test_that("No error if unused factor levels in outcome", { expect_warning(rf <- ranger(Species ~ ., iris[1:100, ], num.trees = 5, probability = TRUE), "^Dropped unused factor level\\(s\\) in dependent variable\\: virginica\\.") pred <- predict(rf, iris) expect_equal(ncol(pred$predictions), 2) }) test_that("predict.all for probability returns 3d array of size samples x classes x trees", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE, probability = TRUE) pred <- predict(rf, iris, predict.all = TRUE) expect_is(pred$predictions, "array") expect_equal(dim(pred$predictions), c(nrow(iris), nlevels(iris$Species), rf$num.trees)) }) test_that("Mean of predict.all for probability is equal to forest prediction", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE, probability = TRUE) pred_forest <- predict(rf, iris, predict.all = FALSE) pred_trees <- predict(rf, iris, predict.all = TRUE) expect_equivalent(apply(pred_trees$predictions, 1:2, mean), pred_forest$predictions) }) ranger/tests/testthat/test_regression.R0000755000176200001440000001306714027301520020100 0ustar liggesusers## Tests for random forests for regression library(ranger) context("ranger_reg") ## Initialize the random forest for regression rg.reg <- ranger(Sepal.Length ~ ., data = iris) ## Basic tests (for all random forests equal) test_that("regression result is of class ranger with 14 elements", { expect_is(rg.reg, "ranger") expect_equal(length(rg.reg), 14) }) test_that("regression prediction returns numeric vector", { expect_is(rg.reg$predictions, "numeric") expect_null(dim(rg.reg$predictions)) pred <- predict(rg.reg, iris) expect_is(pred$predictions, "numeric") expect_null(dim(pred$predictions)) }) test_that("results have 500 trees", { expect_equal(rg.reg$num.trees, 500) }) test_that("results have right number of independent variables", { expect_equal(rg.reg$num.independent.variables, ncol(iris) - 1) }) test_that("Alternative interface works for regression", { rf <- ranger(dependent.variable.name = "Sepal.Length", data = iris) 
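# Note on the check below: with a numeric outcome passed via dependent.variable.name, ranger should infer a regression forest, matching the formula interface.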
expect_equal(rf$treetype, "Regression") }) test_that("Matrix interface works for regression", { rf <- ranger(dependent.variable.name = "Sepal.Length", data = data.matrix(iris), write.forest = TRUE) expect_equal(rf$treetype, "Regression") expect_equal(rf$forest$independent.variable.names, colnames(iris)[2:5]) }) test_that("Matrix interface prediction works for regression", { dat <- data.matrix(iris) rf <- ranger(dependent.variable.name = "Sepal.Length", data = dat, write.forest = TRUE) expect_silent(predict(rf, dat)) }) test_that("save.memory option works for regression", { rf <- ranger(Sepal.Length ~ ., data = iris, save.memory = TRUE) expect_equal(rf$treetype, "Regression") }) test_that("predict.all for regression returns numeric matrix of size n x trees", { rf <- ranger(Petal.Width ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, predict.all = TRUE) expect_is(pred$predictions, "matrix") expect_equal(dim(pred$predictions), c(nrow(iris), rf$num.trees)) }) test_that("Mean of predict.all for regression is equal to forest prediction", { rf <- ranger(Petal.Width ~ ., iris, num.trees = 5, write.forest = TRUE) pred_forest <- predict(rf, iris, predict.all = FALSE) pred_trees <- predict(rf, iris, predict.all = TRUE) expect_equal(rowMeans(pred_trees$predictions), pred_forest$predictions) }) test_that("Alternative interface regression prediction works if only independent variable given, one independent variable", { n <- 50 dt <- data.frame(x = runif(n), y = rbinom(n, 1, 0.5)) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt)) expect_silent(predict(rf, dt[, 1, drop = FALSE])) dt2 <- data.frame(y = rbinom(n, 1, 0.5), x = runif(n)) rf <- ranger(dependent.variable.name = "y", data = dt2, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt2)) expect_silent(predict(rf, dt2[, 2, drop = FALSE])) }) test_that("Alternative interface regression prediction works if only independent variable given, two independent variables", { n <- 50 dt <- data.frame(x1 = runif(n), x2 = runif(n), y = rbinom(n, 1, 0.5)) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt)) expect_silent(predict(rf, dt[, 1:2])) dt2 <- data.frame(y = rbinom(n, 1, 0.5), x1 = runif(n), x2 = runif(n)) rf <- ranger(dependent.variable.name = "y", data = dt2, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt2)) expect_silent(predict(rf, dt2[, 2:3])) }) test_that("Alternative interface regression prediction: Results not all the same", { n <- 50 dt <- data.frame(x = runif(n), y = rbinom(n, 1, 0.5)) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, write.forest = TRUE) expect_gt(diff(range(predict(rf, dt)$predictions)), 0) expect_gt(diff(range(predict(rf, dt[, 1, drop = FALSE])$predictions)), 0) dt2 <- data.frame(y = rbinom(n, 1, 0.5), x = runif(n)) rf <- ranger(dependent.variable.name = "y", data = dt2, num.trees = 5, write.forest = TRUE) expect_gt(diff(range(predict(rf, dt2)$predictions)), 0) expect_gt(diff(range(predict(rf, dt2[, 2, drop = FALSE])$predictions)), 0) }) ## Special tests for random forests for regression test_that("Variance splitting not working on classification data", { expect_error(ranger(Species ~ ., iris, splitrule = "variance")) }) ## Splitrule test_that("default splitrule is variance for regression", { set.seed(42) rf1 <- ranger(Sepal.Length ~ ., iris, num.trees = 5) set.seed(42) rf2 <- ranger(Sepal.Length ~ ., iris, 
num.trees = 5, splitrule = "variance") expect_equal(rf1$splitrule, "variance") expect_equal(rf2$splitrule, "variance") expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("splitrule extratrees is different from variance for regression", { set.seed(42) rf1 <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "extratrees") set.seed(42) rf2 <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "variance") expect_equal(rf1$splitrule, "extratrees") expect_equal(rf2$splitrule, "variance") expect_false(rf1$prediction.error == rf2$prediction.error) }) test_that("splitrule maxstat is different from variance for regression", { set.seed(42) rf1 <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "maxstat") set.seed(42) rf2 <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "variance") expect_equal(rf1$splitrule, "maxstat") expect_equal(rf2$splitrule, "variance") expect_false(rf1$prediction.error == rf2$prediction.error) }) ranger/tests/testthat/test_interface.R0000755000176200001440000001512714027301520017657 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_interface") ## Formula interface test_that("All variables included if . in formula", { rf <- ranger(Species ~ ., iris, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(colnames(iris)[1:4])) }) test_that("Variable excluded if - in formula", { rf <- ranger(Species ~ . -Petal.Length, iris, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("Sepal.Length", "Sepal.Width", "Petal.Width"))) }) test_that("Interaction included if : in formula", { rf <- ranger(Species ~ Petal.Length + Sepal.Length:Sepal.Width, iris, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("Petal.Length", "Sepal.Length:Sepal.Width"))) }) test_that("Interaction included if * in formula", { rf <- ranger(Species ~ Petal.Length + Sepal.Length*Sepal.Width, iris, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("Petal.Length", "Sepal.Length", "Sepal.Width", "Sepal.Length:Sepal.Width"))) }) ## Formula interface, survival test_that("All variables included if . in formula", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(colnames(veteran)[c(1:2, 5:8)])) }) test_that("Variable excluded if - in formula", { rf <- ranger(Surv(time, status) ~ . 
- celltype - age, veteran, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("trt", "karno", "diagtime", "prior"))) }) test_that("Interaction included if : in formula", { rf <- ranger(Surv(time, status) ~ celltype + age:prior, veteran, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("celltype", "age:prior"))) }) test_that("Interaction included if * in formula", { rf <- ranger(Surv(time, status) ~ celltype + age*prior, veteran, num.trees = 5) expect_equal(sort(rf$forest$independent.variable.names), sort(c("celltype", "age", "prior", "age:prior"))) }) test_that("Error if interaction of factor variable included", { expect_error(ranger(Surv(time, status) ~ celltype*prior, veteran, num.trees = 5), "Error: Only numeric columns allowed in interaction terms.") }) test_that("Working if dependent variable has attributes other than names", { iris2 <- iris attr(iris2$Sepal.Width, "aaa") <- "bbb" expect_silent(ranger(data = iris2, dependent.variable = "Sepal.Width")) }) test_that("Working if dependent variable is matrix with one column", { iris2 <- iris iris2$Sepal.Width = scale(iris$Sepal.Width) expect_silent(ranger(data = iris2, dependent.variable = "Sepal.Width")) }) test_that("Same result with x/y interface, classification", { set.seed(300) rf_formula <- ranger(Species ~ ., iris, num.trees = 5) set.seed(300) rf_xy <- ranger(y = iris[, 5], x = iris[, -5], num.trees = 5) expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) expect_equal(rf_formula$predictions, rf_xy$predictions) }) test_that("Same result with x/y interface, regression", { set.seed(300) rf_formula <- ranger(Sepal.Length ~ ., iris, num.trees = 5) set.seed(300) rf_xy <- ranger(y = iris[, 1], x = iris[, -1], num.trees = 5) expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) expect_equal(rf_formula$predictions, rf_xy$predictions) }) test_that("Same result with x/y interface, survival", { set.seed(300) rf_formula <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) set.seed(300) rf_xy <- ranger(y = veteran[, c(3, 4)], x = veteran[, c(-3, -4)], num.trees = 5) expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) expect_equal(rf_formula$predictions, rf_xy$predictions) }) test_that("Column order does not change prediction", { dat <- iris[, c(sample(1:4), 5)] rf <- ranger(dependent.variable.name = "Species", data = iris) set.seed(42) pred1 <- predict(rf, iris)$predictions set.seed(42) pred2 <- predict(rf, dat)$predictions expect_equal(pred1, pred2) }) # Tibbles # This is failing on Rdevel. Possible without suggesting tibble package? 
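# A possible direction for the question above (untested sketch): drop the
# top-level tb object and create the tibble inside each test after
# skip_if_not_installed("tibble"), e.g. tb <- tibble::as_tibble(iris), so
# that no tibble code runs at file level when the package is unavailable.
# The disabled tests below already use the skip_if_not_installed() guard.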
# if (requireNamespace("tibble", quietly = TRUE)) { # tb <- tibble::as_tibble(iris) # } # test_that("Training works with tibbles, formula interface", { # skip_if_not_installed("tibble") # set.seed(1000) # rf1 <- ranger(Species ~ ., tb, num.trees = 5) # # set.seed(1000) # rf2 <- ranger(Species ~ ., iris, num.trees = 5) # # expect_equal(rf1$prediction.error, rf2$prediction.error) # # pred1 <- levels(iris$Species)[rf1$predictions[!is.na(rf1$predictions)]] # pred2 <- as.character(rf2$predictions[!is.na(rf2$predictions)]) # expect_equal(pred1, pred2) # }) # # test_that("Training works with tibbles, alternative interface", { # skip_if_not_installed("tibble") # set.seed(1000) # rf1 <- ranger(dependent.variable.name = "Species", data = tb, num.trees = 5) # # set.seed(1000) # rf2 <- ranger(dependent.variable.name = "Species", data = iris, num.trees = 5) # # expect_equal(rf1$prediction.error, rf2$prediction.error) # # pred1 <- levels(iris$Species)[rf1$predictions[!is.na(rf1$predictions)]] # pred2 <- as.character(rf2$predictions[!is.na(rf2$predictions)]) # expect_equal(pred1, pred2) # }) # # test_that("Prediction works with tibbles, formula interface", { # skip_if_not_installed("tibble") # set.seed(1000) # rf1 <- ranger(Species ~ ., tb, num.trees = 5) # # set.seed(1000) # rf2 <- ranger(Species ~ ., iris, num.trees = 5) # # set.seed(1000) # pred1 <- predict(rf1, tb) # set.seed(1000) # pred2 <- predict(rf1, iris) # set.seed(1000) # pred3 <- predict(rf2, tb) # set.seed(1000) # pred4 <- predict(rf2, iris) # # expect_equal(pred1$predictions, pred2$predictions) # expect_equal(pred2$predictions, pred3$predictions) # expect_equal(pred3$predictions, pred4$predictions) # }) # # test_that("Prediction works with tibbles, alternative interface", { # skip_if_not_installed("tibble") # set.seed(1000) # rf1 <- ranger(dependent.variable.name = "Species", data = tb, num.trees = 5) # # set.seed(1000) # rf2 <- ranger(dependent.variable.name = "Species", data = iris, num.trees = 5) # # set.seed(1000) # pred1 <- predict(rf1, tb) # set.seed(1000) # pred2 <- predict(rf1, iris) # set.seed(1000) # pred3 <- predict(rf2, tb) # set.seed(1000) # pred4 <- predict(rf2, iris) # # expect_equal(pred1$predictions, pred2$predictions) # expect_equal(pred2$predictions, pred3$predictions) # expect_equal(pred3$predictions, pred4$predictions) # }) ranger/tests/testthat/test_seed.R0000755000176200001440000000144514027301520016635 0ustar liggesusers## Tests for using seeds library(ranger) context("ranger_seed") ## Initialize the random forests ind = 1:150 %in% sample(150, 100) set.seed(2) mod1 = ranger(Species ~ ., data = iris[ind, ], write.forest = TRUE, num.trees = 50) pred1 = predict(mod1, data = iris[!ind, ]) set.seed(2) mod2 = ranger(Species ~ ., data = iris[ind, ], write.forest = TRUE, num.trees = 50) pred2 = predict(mod2, data = iris[!ind, ]) set.seed(2) mod3 = ranger(dependent.variable.name = "Species", data = iris[ind, ], write.forest = TRUE, num.trees = 50) pred3 = predict(mod3, data = iris[!ind, ]) ## Tests test_that("same result with same seed", { expect_equal(pred1$predictions, pred2$predictions) }) test_that("same result with same seed, different interface", { expect_equal(pred1$predictions, pred3$predictions) }) ranger/tests/testthat/test_csrf.R0000755000176200001440000000163514027301520016653 0ustar liggesuserslibrary(ranger) library(survival) context("case-specific RF") test_that("csrf classification returns predictions", { train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) iris.train <- iris[train.idx, ] iris.test <- 
iris[-train.idx, ] pred <- csrf(Species ~ ., training_data = iris.train, test_data = iris.test, params1 = list(num.trees = 10), params2 = list(num.trees = 3)) expect_is(pred, "factor") expect_equal(length(pred), nrow(iris)/3) }) test_that("csrf regression returns predictions", { train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) iris.train <- iris[train.idx, ] iris.test <- iris[-train.idx, ] pred <- csrf(Sepal.Length ~ ., training_data = iris.train, test_data = iris.test, params1 = list(num.trees = 10), params2 = list(num.trees = 3)) expect_is(pred, "numeric") expect_equal(length(pred), nrow(iris)/3) }) ranger/tests/testthat/test_sparse.R0000755000176200001440000001334214027301520017211 0ustar liggesuserslibrary(ranger) library(survival) library(Matrix) library(methods) context("ranger_sparse") ## Iris sparse data iris_sparse <- Matrix(data.matrix(iris), sparse = TRUE) ## 0/1 sparse data n <- 100 p <- 5 x <- replicate(p, rbinom(n, 1, .1)) y <- rbinom(n, 1, .5) dat <- data.frame(y = y, x) dat_matrix <- data.matrix(dat) dat_sparse <- Matrix(dat_matrix, sparse = TRUE) # Survival sparse data dat_survival <- data.frame(x, time = round(runif(n, 0, 10)), status = rbinom(n, 1, .7)) dat_survival_matrix <- data.matrix(dat_survival) dat_survival_sparse <- Matrix(dat_survival_matrix, sparse = TRUE) test_that("Same result with sparse data for iris classification", { set.seed(56) rf1 <- ranger(data = iris_sparse, dependent.variable.name = "Species", classification = TRUE, num.trees = 5) set.seed(56) rf2 <- ranger(data = iris, dependent.variable.name = "Species", num.trees = 5) expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- levels(iris$Species)[rf1$predictions[!is.na(rf1$predictions)]] pred2 <- as.character(rf2$predictions[!is.na(rf2$predictions)]) expect_equal(pred1, pred2) }) test_that("Same result with sparse data for iris regression", { set.seed(56) rf1 <- ranger(data = iris_sparse, dependent.variable.name = "Sepal.Length", classification = FALSE, num.trees = 5) set.seed(56) rf2 <- ranger(data = iris, dependent.variable.name = "Sepal.Length", num.trees = 5) expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- rf1$predictions[!is.na(rf1$predictions)] pred2 <- rf2$predictions[!is.na(rf2$predictions)] expect_equal(pred1, pred2) }) test_that("Same result with sparse data for 0/1 classification", { set.seed(56) rf1 <- ranger(data = dat_sparse, dependent.variable.name = "y", classification = TRUE, num.trees = 5) set.seed(56) rf2 <- ranger(data = dat, dependent.variable.name = "y", classification = TRUE, num.trees = 5) expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- as.character(rf1$predictions[!is.na(rf1$predictions)]) pred2 <- as.character(rf2$predictions[!is.na(rf2$predictions)]) expect_equal(pred1, pred2) }) test_that("Same result with sparse data for 0/1 regression", { set.seed(56) rf1 <- ranger(data = dat_sparse, dependent.variable.name = "y", classification = FALSE, num.trees = 5) set.seed(56) rf2 <- ranger(data = dat, dependent.variable.name = "y", num.trees = 5) expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- rf1$predictions[!is.na(rf1$predictions)] pred2 <- rf2$predictions[!is.na(rf2$predictions)] expect_equal(pred1, pred2) }) test_that("Same result with sparse data for 0/1 probability prediction", { set.seed(56) rf1 <- ranger(data = dat_sparse, dependent.variable.name = "y", probability = TRUE, num.trees = 5) set.seed(56) rf2 <- ranger(data = dat, dependent.variable.name = "y", probability = TRUE, num.trees = 5) 
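# Grown from the same seed, the sparse and dense representations should produce identical forests, so OOB error and predictions are compared for exact equality below.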
expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- rf1$predictions[!is.na(rf1$predictions)] pred2 <- rf2$predictions[!is.na(rf2$predictions)] expect_equal(pred1, pred2) }) test_that("Same result with sparse data for survival", { set.seed(56) rf1 <- ranger(data = dat_survival_sparse, dependent.variable.name = "time", status.variable.name = "status", num.trees = 5) set.seed(56) rf2 <- ranger(data = dat_survival, dependent.variable.name = "time", status.variable.name = "status", num.trees = 5) expect_equal(rf1$prediction.error, rf2$prediction.error) pred1 <- rf1$survival[!is.na(rf1$survival)] pred2 <- rf2$survival[!is.na(rf2$survival)] expect_equal(pred1, pred2) }) test_that("Survival prediction is the same with or without outcome in prediction data", { rf <- ranger(data = dat_survival_sparse, dependent.variable.name = "time", status.variable.name = "status", num.trees = 5) pred1 <- predict(rf, dat_survival_sparse)$survival pred2 <- predict(rf, dat_survival_sparse[, c(-6, -7)])$survival expect_equal(pred1, pred2) }) test_that("Prediction is the same if training or testing data is sparse", { idx <- sample(nrow(iris), 2/3*nrow(iris)) train <- iris[idx, ] test <- iris[-idx, ] train_sparse <- Matrix(data.matrix(train), sparse = TRUE) test_sparse <- Matrix(data.matrix(test), sparse = TRUE) set.seed(42) rf1 <- ranger(data = train, dependent.variable.name = "Species", classification = TRUE, num.trees = 5) pred1 <- predict(rf1, test) pred1_sparse <- predict(rf1, test_sparse) set.seed(42) rf2 <- ranger(data = train_sparse, dependent.variable.name = "Species", classification = TRUE, num.trees = 5) pred2 <- predict(rf2, test) pred2_sparse <- predict(rf2, test_sparse) expect_equal(pred1$predictions, pred1_sparse$predictions) expect_equal(as.character(pred1$predictions), levels(iris$Species)[pred2$predictions]) expect_equal(pred2$predictions, pred2_sparse$predictions) }) test_that("Sparse probability prediction works correctly", { rf <- ranger(data = dat_sparse, dependent.variable.name = "y", classification = TRUE, probability = TRUE, num.trees = 5) pred <- predict(rf, dat_sparse) expect_equal(dim(pred$predictions), c(nrow(dat_sparse), 2)) }) test_that("Corrected importance working for sparse data", { rf <- ranger(data = dat_sparse, dependent.variable.name = "y", classification = TRUE, num.trees = 5, importance = "impurity_corrected") expect_equal(names(rf$variable.importance), colnames(dat_sparse)[-1]) }) test_that("Sample size output is correct for sparse data", { rf <- ranger(data = dat_sparse, dependent.variable.name = "y", classification = TRUE, num.trees = 5) expect_equal(rf$num.samples, nrow(dat_sparse)) rf <- ranger(x = dat_sparse[, -1], y = as.factor(y), num.trees = 5) expect_equal(rf$num.samples, nrow(dat_sparse)) }) ranger/tests/testthat/test_ranger.R0000755000176200001440000003404714027301520017177 0ustar liggesuserslibrary(ranger) library(survival) context("ranger") test_that("Matrix interface works for Probability estimation", { rf <- ranger(dependent.variable.name = "Species", data = data.matrix(iris), write.forest = TRUE, probability = TRUE) expect_equal(rf$treetype, "Probability estimation") expect_equal(rf$forest$independent.variable.names, colnames(iris)[1:4]) }) test_that("Matrix interface prediction works for Probability estimation", { dat <- data.matrix(iris) rf <- ranger(dependent.variable.name = "Species", data = dat, write.forest = TRUE, probability = TRUE) expect_silent(predict(rf, dat)) }) test_that("no warning if data.frame has two classes", { dat <- iris 
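# Mimic a data.table-like object by adding a second class to the data.frame, without taking a dependency on data.table itself.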
class(dat) <- c("data.frame", "data.table") expect_silent(ranger(Species ~ ., data = dat)) }) test_that("Error if sample fraction is 0 or >1", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = 0)) expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = 1.1)) }) test_that("Error if sample fraction is vector for regression", { expect_error(ranger(Sepal.Length ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2)), "Error: Invalid value for sample\\.fraction\\. Vector values only valid for classification forests\\.") }) test_that("Error if sample fraction is vector of wrong size", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2)), "Error: Invalid value for sample\\.fraction\\. Expecting 3 values, provided 2\\.") }) test_that("Error if element of sample fraction vector is <0 or >1", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 1.1, 0.3)), "Error: Invalid value for sample\\.fraction. Please give a value in \\(0,1\\] or a vector of values in \\[0,1\\]\\.") expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(-3, 0.5, 0.3)), "Error: Invalid value for sample.fraction. Please give a value in \\(0,1] or a vector of values in \\[0,1\\]\\.") }) test_that("Error if sum of sample fraction vector is 0", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0, 0, 0)), "Error: Invalid value for sample\\.fraction. Sum of values must be >0\\.") }) test_that("Error if replace=FALSE and not enough samples", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), replace = FALSE, keep.inbag = TRUE), "Error: Not enough samples in class virginica; available: 50, requested: 60.") expect_silent(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), replace = TRUE, keep.inbag = TRUE)) }) test_that("Error if sample.fraction and case.weights", { expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), case.weights = rbinom(nrow(iris), 1, 0.5)), "Error: Combination of case\\.weights and class-wise sampling not supported\\.") }) test_that("Inbag counts match sample fraction, classification", { ## With replacement rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), replace = TRUE, keep.inbag = TRUE) inbag <- do.call(cbind, rf$inbag.counts) expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 30) expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 45) expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 60) ## Without replacement rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), replace = FALSE, keep.inbag = TRUE) inbag <- do.call(cbind, rf$inbag.counts) expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 15) expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 30) expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 45) ## Different order, without replacement dat <- iris[c(51:100, 101:150, 1:50), ] rf <- ranger(Species ~ ., dat, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), replace = FALSE, keep.inbag = TRUE) inbag <- do.call(cbind, rf$inbag.counts) expect_equal(unique(colSums(inbag[dat$Species == "setosa", ])), 15) expect_equal(unique(colSums(inbag[dat$Species == "versicolor", ])), 30) expect_equal(unique(colSums(inbag[dat$Species == "virginica", ])), 45) }) test_that("Inbag counts match sample fraction, probability", { 
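# As in the classification test above, each class-wise fraction applies to
# the total sample size (n = 150): 0.2 * 150 = 30, 0.3 * 150 = 45 and
# 0.4 * 150 = 60 draws per tree; rows 1-50, 51-100 and 101-150 hold the
# three species.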
## With replacement rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), replace = TRUE, keep.inbag = TRUE, probability = TRUE) inbag <- do.call(cbind, rf$inbag.counts) expect_equal(unique(colSums(inbag[1:50, ])), 30) expect_equal(unique(colSums(inbag[51:100, ])), 45) expect_equal(unique(colSums(inbag[101:150, ])), 60) ## Without replacement rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), replace = FALSE, keep.inbag = TRUE, probability = TRUE) inbag <- do.call(cbind, rf$inbag.counts) expect_equal(unique(colSums(inbag[1:50, ])), 15) expect_equal(unique(colSums(inbag[51:100, ])), 30) expect_equal(unique(colSums(inbag[101:150, ])), 45) }) test_that("as.factor() in formula works", { n <- 20 dt <- data.frame(x = runif(n), y = rbinom(n, 1, 0.5)) expect_silent(ranger(as.factor(y) ~ ., data = dt, num.trees = 5, write.forest = TRUE)) }) test_that("holdout mode holding out data with 0 weight", { weights <- rbinom(nrow(iris), 1, 0.5) rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation", case.weights = weights, replace = FALSE, sample.fraction = 0.632*mean(weights), holdout = TRUE, keep.inbag = TRUE) inbag <- data.frame(rf$inbag.counts) expect_true(all(inbag[weights == 0, ] == 0)) }) test_that("holdout mode uses holdout OOB data", { weights <- rbinom(nrow(iris), 1, 0.5) rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation", case.weights = weights, replace = FALSE, sample.fraction = 0.632*mean(weights), holdout = TRUE, keep.inbag = TRUE) expect_false(any(is.na(rf$predictions[weights == 0]))) expect_true(all(is.na(rf$predictions[weights == 1]))) }) test_that("holdout mode not working if no weights", { expect_error(ranger(Species ~ ., iris, num.trees = 5, importance = "permutation", holdout = TRUE)) }) test_that("holdout mode: no OOB prediction if no 0 weights", { weights <- runif(nrow(iris)) rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation", case.weights = weights, replace = FALSE, holdout = TRUE, keep.inbag = TRUE) expect_true(all(is.na(rf$predictions))) }) test_that("OOB error is correct for 1 tree, classification", { n <- 50 dat <- data.frame(y = factor(rbinom(n, 1, .5)), x = rnorm(n)) rf <- ranger(y ~ ., dat, num.trees = 1) expect_equal(rf$prediction.error, mean(rf$predictions != dat$y, na.rm = TRUE)) }) test_that("OOB error is correct for 1 tree, probability prediction", { n <- 50 dat <- data.frame(y = factor(rbinom(n, 1, .5)), x = rnorm(n)) rf <- ranger(y ~ ., dat, num.trees = 1, probability = TRUE) prob <- c(rf$predictions[dat$y == "0", 1], rf$predictions[dat$y == "1", 2]) expect_equal(rf$prediction.error, mean((1 - prob)^2, na.rm = TRUE)) }) test_that("OOB error is correct for 1 tree, regression", { n <- 50 dat <- data.frame(y = rbinom(n, 1, .5), x = rnorm(n)) rf <- ranger(y ~ ., dat, num.trees = 1) expect_equal(rf$prediction.error, mean((dat$y - rf$predictions)^2, na.rm = TRUE)) }) test_that("Missing value columns detected in training", { dat <- iris dat[25, 1] <- NA expect_error(ranger(Species ~ ., dat, num.trees = 5), "Missing data in columns: Sepal.Length") dat <- iris dat[4, 5] <- NA expect_error(ranger(Species ~ ., dat, num.trees = 5), "Missing data in dependent variable.") }) test_that("No error if missing value in irrelevant column, training", { dat <- iris dat[1, "Sepal.Width"] <- NA expect_silent(ranger(Species ~ Sepal.Length, dat, num.trees = 5)) }) test_that("No error if missing value in irrelevant column, prediction", { rf <- ranger(Species ~ 
Sepal.Length, iris, num.trees = 5) dat <- iris dat[1, "Sepal.Width"] <- NA expect_silent(predict(rf, dat)) }) test_that("Split points are at (A+B)/2 for numeric features, regression variance splitting", { dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10) split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, regression maxstat splitting", { dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1) split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, classification", { dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10) split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, probability", { dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10, probability = TRUE) split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival logrank splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "logrank") split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival C-index splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "C") split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival maxstat splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1) split_points <- sapply(1:rf$num.trees, function(i) { res <- treeInfo(rf, i)$splitval res[!is.na(res)] }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("No error if variable named forest", { dat <- iris dat$forest <- rnorm(150) rf <- ranger(Species ~ ., dat, num.trees = 5) expect_silent(predict(rf, dat)) }) test_that("Prediction error not NA if oob.error=TRUE", { rf <- ranger(Species ~ ., iris, num.trees = 5) expect_false(is.na(rf$prediction.error)) rf <- ranger(Surv(time,status) ~ ., veteran, num.trees = 5) expect_false(is.na(rf$prediction.error)) }) test_that("Prediction error is NA if oob.error=FALSE", { rf <- ranger(Species ~ ., iris, num.trees = 5, oob.error = FALSE) expect_true(is.na(rf$prediction.error)) rf <- ranger(Surv(time,status) ~ ., veteran, num.trees = 5, oob.error = FALSE) 
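# The same should hold for survival forests: with oob.error = FALSE no OOB estimate is computed, so prediction.error stays NA.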
expect_true(is.na(rf$prediction.error)) }) test_that("Tree depth creates trees of correct size", { # Recursive function to get tree depth depth <- function(rf, tree, i) { left <- rf$forest$child.nodeIDs[[tree]][[1]][i] + 1 right <- rf$forest$child.nodeIDs[[tree]][[2]][i] + 1 if (left <= 1) { 0 } else { 1 + max(c(depth(rf, tree, left), depth(rf, tree, right))) } } forest_depth <- function(rf) { sapply(1:rf$num.trees, depth, rf = rf, i = 1) } # Depth 1 rf <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 1) expect_true(all(forest_depth(rf) <= 1)) # Depth 4 rf <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 4) expect_true(all(forest_depth(rf) <= 4)) # Random depth (deeper trees) max.depth <- round(runif(1, 1, 20)) dat <- data.frame(y = runif(100, 0, 1), x = runif(100, 0, 1)) rf <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 1, max.depth = max.depth) expect_true(all(forest_depth(rf) <= max.depth)) }) test_that("Tree depth 0 equivalent to unlimited", { set.seed(200) rf1 <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 0) set.seed(200) rf2 <- ranger(Species ~ ., iris, num.trees = 5) expect_equal(sapply(rf1$forest$split.varIDs, length), sapply(rf2$forest$split.varIDs, length)) }) test_that("Meaningful predictions with max.depth = 1", { rf <- ranger(Sepal.Length ~ ., iris, max.depth = 1, num.trees = 5) pred <- predict(rf, iris)$predictions expect_gte(min(pred), min(iris$Sepal.Length)) expect_lte(max(pred), max(iris$Sepal.Length)) }) test_that("Does not crash when variable named 'none'", { dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5), none = rbinom(100, 1, .5)) rf <- ranger(data = dat, dependent.variable.name = "y") expect_equal(rf$forest$independent.variable.names, c("x", "none")) expect_silent(predict(rf, dat)) }) test_that("mtry function input works as expected", { rf <- ranger(Species ~ ., data = iris, mtry = function(n) n - 1) expect_equal(3, rf$mtry) }) test_that("mtry function error halts the ranger function", { expect_error( ranger(Species ~ ., data = iris, mtry = function(n) stop("this is some error")), "mtry function evaluation resulted in an error.") }) ranger/tests/testthat/test_inbag.R0000755000176200001440000001031214027301520016766 0ustar liggesusers## Tests for inbag functions library(ranger) context("ranger_inbag") ## Tests test_that("Inbag count matrix if of right size, with replacement", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE) expect_equal(dim(data.frame(rf$inbag.counts)), c(nrow(iris), rf$num.trees)) }) test_that("Inbag count matrix if of right size, without replacement", { rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, keep.inbag = TRUE) expect_equal(dim(data.frame(rf$inbag.counts)), c(nrow(iris), rf$num.trees)) }) test_that("Inbag count matrix if of right size, with replacement, weighted", { rf <- ranger(Species ~ ., iris, num.trees = 5, case.weights = runif(nrow(iris)), keep.inbag = TRUE) expect_equal(dim(data.frame(rf$inbag.counts)), c(nrow(iris), rf$num.trees)) }) test_that("Inbag count matrix if of right size, without replacement, weighted", { rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, case.weights = runif(nrow(iris)), keep.inbag = TRUE) expect_equal(dim(data.frame(rf$inbag.counts)), c(nrow(iris), rf$num.trees)) }) test_that("Number of samples is right sample fraction, replace=FALSE, default", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = FALSE) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- 
mean(num.inbag/nrow(iris)) expect_gt(sample.fraction, 0.6) expect_lt(sample.fraction, 0.7) }) test_that("Number of samples is right sample fraction, replace=FALSE, 0.3", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = FALSE, sample.fraction = 0.3) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- mean(num.inbag/nrow(iris)) expect_gt(sample.fraction, 0.25) expect_lt(sample.fraction, 0.35) }) test_that("Number of samples is right sample fraction, replace=TRUE, default", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = TRUE) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- mean(num.inbag/nrow(iris)) expected.sample.fraction <- 1-exp(-1) expect_gt(sample.fraction, expected.sample.fraction-0.05) expect_lt(sample.fraction, expected.sample.fraction+0.05) }) test_that("Number of samples is right sample fraction, replace=TRUE, 0.5", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = TRUE, sample.fraction = 0.5) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- mean(num.inbag/nrow(iris)) expected.sample.fraction <- 1-exp(-0.5) expect_gt(sample.fraction, expected.sample.fraction-0.05) expect_lt(sample.fraction, expected.sample.fraction+0.05) }) test_that("Number of samples is right sample fraction, replace=FALSE, 0.3, weighted", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = FALSE, sample.fraction = 0.3, case.weights = runif(nrow(iris))) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- mean(num.inbag/nrow(iris)) expect_gt(sample.fraction, 0.25) expect_lt(sample.fraction, 0.35) }) test_that("Number of samples is right sample fraction, replace=TRUE, 0.5, weighted", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, replace = TRUE, sample.fraction = 0.5, case.weights = runif(nrow(iris))) num.inbag <- sapply(rf$inbag.counts, function(x) { sum(x > 0) }) sample.fraction <- mean(num.inbag/nrow(iris)) expected.sample.fraction <- 1-exp(-0.5) expect_gt(sample.fraction, expected.sample.fraction-0.05) expect_lt(sample.fraction, expected.sample.fraction+0.05) }) test_that("Manual inbag selection selects correct observations", { inbag <- replicate(5, rbinom(nrow(iris), 1, .5), simplify = FALSE) rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, keep.inbag = TRUE, inbag = inbag) expect_equal(rf$inbag.counts, inbag) inbag <- replicate(5, round(runif(nrow(iris), 0, 5)), simplify = FALSE) rf <- ranger(Species ~ ., iris, num.trees = 5, replace = TRUE, keep.inbag = TRUE, inbag = inbag) expect_equal(rf$inbag.counts, inbag) })ranger/tests/testthat/test_betasplit.R0000755000176200001440000000106314027301517017706 0ustar liggesuserslibrary(ranger) context("ranger_betasplit") # Generate data with 0..1 outcome n <- 100 p <- 4 beta <- c(0, 1, 2, 3) x <- replicate(p, runif(n)) y <- as.vector(x %*% beta) y <- (y-min(y))/(max(y)-min(y)) dat <- data.frame(y = y, x) test_that("beta splitting works for regression", { rf <- ranger(y ~ ., dat, splitrule = "beta", num.trees = 50) expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.2) }) test_that("beta splitting not working for non 0..1 outcome", { expect_error(ranger(Sepal.Length ~ ., iris, splitrule = "beta", num.trees = 50)) })ranger/tests/testthat/test_importance_pvalues.R0000755000176200001440000001602014027301520021610 0ustar liggesuserslibrary(ranger) library(survival) 
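# The GenABEL-based fixture below is loaded only when the package is available; the corresponding tests guard themselves with skip_if_not_installed("GenABEL").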
context("importance_pvalues") ## GenABEL data if (requireNamespace("GenABEL", quietly = TRUE)) { dat_gwaa <- readRDS("../test_gwaa.rds") } ## 0 noise variables rf_p0 <- ranger(Species ~., iris, num.trees = 5, importance = "permutation", write.forest = TRUE) holdout_p0 <- holdoutRF(Species ~., iris, num.trees = 5) ## 100 noise variables n <- nrow(iris) p <- 100 noise <- replicate(p, rnorm(n)) colnames(noise) <- paste0("noise", 1:p) dat_n100 <- cbind(iris, noise) rf_p100 <- ranger(Species ~., dat_n100, num.trees = 5, importance = "permutation", write.forest = TRUE) holdout_p100 <- holdoutRF(Species ~., dat_n100, num.trees = 5) ## General test_that("Importance p-values Janitza: Error if impurity importance", { rf <- ranger(Species ~., iris, num.trees = 5, importance = "impurity") expect_error(importance_pvalues(rf, method = "janitza")) }) ## Janitza test_that("Importance p-values Janitza: warning if few negative importance values", { expect_warning(importance_pvalues(rf_p100, method = "janitza")) }) test_that("Importance p-values Janitza: returns correct dimensions", { expect_warning(vimp <- importance_pvalues(rf_p100, method = "janitza")) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(104, 2)) }) test_that("Importance p-values Janitza: error if no importance", { rf_none <- ranger(Species ~., iris, num.trees = 5, importance = "none", write.forest = TRUE) expect_error(importance_pvalues(rf_none, method = "janitza")) }) test_that("Importance p-values Janitza: error if Gini importance", { rf_imp <- ranger(Species ~., iris, num.trees = 5, importance = "impurity", write.forest = TRUE) expect_error(importance_pvalues(rf_imp, method = "janitza")) }) test_that("Importance p-values Janitza: error if no unimportant variables", { # Error only when all importance values positive skip_if(any(rf_p0$variable.importance <= 0)) expect_warning(expect_error(importance_pvalues(rf_p0, method = "janitza"))) }) test_that("Importance p-values Janitza: warning for regression", { rf <- ranger(Sepal.Length ~., dat_n100, num.trees = 5, importance = "permutation", write.forest = TRUE) expect_warning(importance_pvalues(rf, method = "janitza")) }) test_that("Importance p-values Janitza-Holdout: returns correct dimensions", { expect_warning(vimp <- importance_pvalues(holdout_p100, method = "janitza")) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(104, 2)) }) ## Altmann test_that("Importance p-values Altmann: returns correct dimensions", { vimp <- importance_pvalues(rf_p0, method = "altmann", formula = Species ~ ., data = iris) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(4, 2)) }) test_that("Importance p-values Altmann: error if no importance", { rf_none <- ranger(Species ~., iris, num.trees = 5, importance = "none", write.forest = TRUE) expect_error(importance_pvalues(rf_none, method = "altmann", formula = Species ~ ., data = iris)) }) test_that("Importance p-values Altmann: not working for holdoutRF", { expect_error(importance_pvalues(holdout_p0, method = "altmann", formula = Species ~ ., data = iris)) }) test_that("Importance p-values Altmann: No zero p-values", { vimp <- importance_pvalues(rf_p0, method = "altmann", formula = Species ~ ., data = iris) expect_false(any(vimp[, "pvalue"] == 0)) }) test_that("Importance p-values Altmann: working with character formula", { vimp <- importance_pvalues(rf_p0, method = "altmann", formula = "Species ~ .", data = iris) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(4, 2)) }) ## Hold-out RF test_that("HoldoutRF working", { expect_is(holdout_p0, "holdoutRF") 
}) test_that("HoldoutRF working with GenABEL data", { skip_if_not_installed("GenABEL") holdout_gwaa <- holdoutRF(CHD ~., dat_gwaa, num.trees = 5) expect_is(holdout_p0, "holdoutRF") }) test_that("HoldoutRF ... argument working", { rf <- holdoutRF(Species ~., iris, num.trees = 5) expect_equal(rf$rf1$num.trees, 5) }) test_that("HoldoutRF working with formula", { rf <- holdoutRF(Species ~., iris, num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") rf <- holdoutRF(Species ~., data = iris, num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") rf <- holdoutRF(formula = Species ~., iris, num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") rf <- holdoutRF(data = iris, formula = Species ~., num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") }) test_that("HoldoutRF working with dependent.variable.name", { rf <- holdoutRF(dependent.variable.name = "Species", data = iris, num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") rf <- holdoutRF(data = iris, dependent.variable.name = "Species", num.trees = 5) expect_equal(rf$rf1$treetype, "Classification") }) test_that("HoldoutRF not working if importance argument used", { expect_error(holdoutRF(Species ~., iris, num.trees = 5, importance = "impurity"), "Error: Argument 'importance' not supported in holdoutRF.") }) test_that("HoldoutRF not working if replace argument used", { expect_error(holdoutRF(Species ~., iris, num.trees = 5, replace = TRUE), "Error: Argument 'replace' not supported in holdoutRF.") }) ## Survival, 0 noise variables rf_p0_surv <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, importance = "permutation", write.forest = TRUE) #holdout_p0_surv <- holdoutRF(Surv(time, status) ~ ., veteran, num.trees = 5) ## Survival, 100 noise variables p <- 100 noise <- replicate(p, rnorm(nrow(veteran))) colnames(noise) <- paste0("noise", 1:p) dat_n100_surv <- cbind(veteran, noise) rf_p100_surv <- ranger(Surv(time, status) ~., dat_n100_surv, num.trees = 5, importance = "permutation", write.forest = TRUE) #holdout_p100_surv <- holdoutRF(Surv(time, status) ~., dat_n100_surv, num.trees = 5) test_that("Survival importance p-values Janitza: returns correct dimensions", { expect_warning(vimp <- importance_pvalues(rf_p100_surv, method = "janitza")) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(106, 2)) }) test_that("Survival importance p-values Altmann: returns correct dimensions", { vimp <- importance_pvalues(rf_p0_surv, method = "altmann", formula = Surv(time, status) ~ ., data = veteran) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(6, 2)) }) test_that("Survival importance p-values Altmann working with corrected impurity importance", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, importance = "impurity_corrected") vimp <- importance_pvalues(rf, method = "altmann", formula = Surv(time, status) ~ ., data = veteran) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(6, 2)) }) test_that("Survival importance p-values Janitza working with corrected impurity importance", { rf <- ranger(Surv(time, status) ~ ., dat_n100_surv, num.trees = 5, importance = "impurity_corrected") expect_warning(vimp <- importance_pvalues(rf, method = "janitza")) expect_is(vimp, "matrix") expect_equal(dim(vimp), c(106, 2)) }) ranger/tests/testthat/test_unordered.R0000755000176200001440000003533114027301520017705 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_unordered") test_that("Old parameters still work", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = 
TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) rf.false <- ranger(y ~ ., data = dt, num.trees = 5, write.forest = TRUE, respect.unordered.factors = FALSE) rf.true <- ranger(y ~ ., data = dt, num.trees = 5, write.forest = TRUE, respect.unordered.factors = TRUE) expect_null(rf.false$forest$covariate.levels) expect_equal(length(rf.true$forest$covariate.levels), 1) }) test_that("If respect.unordered.factors='partition', regard characters as unordered", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) set.seed(2) rf.char <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'partition') dt$x <- factor(dt$x, ordered = FALSE) set.seed(2) rf.fac <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'partition') expect_equal(rf.char$prediction.error, rf.fac$prediction.error) }) test_that("If respect.unordered.factors='ignore', regard characters as ordered", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) set.seed(2) rf.char <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'ignore') dt$x <- factor(dt$x, ordered = FALSE) set.seed(2) rf.fac <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'ignore') expect_equal(rf.char$prediction.error, rf.fac$prediction.error) }) test_that("Error if other value for respect.unordered.factors", { expect_error(ranger(Species ~ ., iris, num.trees = 5, respect.unordered.factors = NULL)) }) test_that("Same results if no unordered factors", { set.seed(100) rf1 <- ranger(Species ~ ., iris, num.trees = 5, respect.unordered.factors = 'ignore') set.seed(100) rf2 <- ranger(Species ~ ., iris, num.trees = 5, respect.unordered.factors = 'order') set.seed(100) rf3 <- ranger(Species ~ ., iris, num.trees = 5, respect.unordered.factors = 'partition') expect_equal(rf1$confusion.matrix, rf2$confusion.matrix) expect_equal(rf1$confusion.matrix, rf3$confusion.matrix) }) test_that("Unordered splitting working for classification", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(rbinom(n, 1, 0.5)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'partition') expect_true(any(!rf$forest$is.ordered)) }) test_that("Unordered splitting working for probability", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(rbinom(n, 1, 0.5)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dt, num.trees = 5, min.node.size = n/2, respect.unordered.factors = 'partition', probability = TRUE) expect_true(any(!rf$forest$is.ordered)) }) test_that("Order splitting working for multiclass classification", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dt, num.trees = 5, respect.unordered.factors = 'order', probability = FALSE) expect_true(all(rf$forest$is.ordered)) }) test_that("Order splitting working for multiclass probability", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dt, num.trees = 5, respect.unordered.factors = 'order', probability = 
TRUE) expect_true(all(rf$forest$is.ordered)) }) test_that("Order splitting working with alternative interface", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE)), stringsAsFactors = FALSE) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, respect.unordered.factors = 'order') expect_true(all(rf$forest$is.ordered)) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, respect.unordered.factors = 'order', probability = TRUE) expect_true(all(rf$forest$is.ordered)) }) test_that("Order splitting working with single level factor", { n <- 20 # Binary classification dt_class <- data.frame(x = sample(c("A"), n, replace = TRUE), y = factor(sample(c("A", "B"), n, replace = TRUE)), stringsAsFactors = FALSE) expect_silent(ranger(y ~ ., data = dt_class, num.trees = 5, respect.unordered.factors = 'order', probability = FALSE)) expect_silent(ranger(y ~ ., data = dt_class, num.trees = 5, respect.unordered.factors = 'order', probability = TRUE)) # Multiclass classification dt_mult <- data.frame(x = sample(c("A"), n, replace = TRUE), y = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE)), stringsAsFactors = FALSE) expect_silent(ranger(y ~ ., data = dt_mult, num.trees = 5, respect.unordered.factors = 'order', probability = FALSE)) expect_silent(ranger(y ~ ., data = dt_mult, num.trees = 5, respect.unordered.factors = 'order', probability = TRUE)) # Regression dt_cont <- data.frame(x = sample(c("A"), n, replace = TRUE), y = rnorm(n), stringsAsFactors = FALSE) expect_silent(ranger(y ~ ., data = dt_cont, num.trees = 5, respect.unordered.factors = 'order')) # Survival dt_surv <- data.frame(x = sample(c("A"), n, replace = TRUE), time = rnorm(n), status = rbinom(n, 1, .5), stringsAsFactors = FALSE) expect_silent(ranger(Surv(time, status) ~ ., data = dt_surv, num.trees = 5, respect.unordered.factors = 'order')) }) test_that("Unordered splitting working for survival", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, min.node.size = 50, respect.unordered.factors = 'partition') expect_true(any(!rf$forest$is.ordered)) }) test_that("Order splitting working for survival", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, min.node.size = 50, respect.unordered.factors = 'order') expect_true(all(rf$forest$is.ordered)) }) test_that("Order splitting working for survival with alternative interface", { rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran, num.trees = 5, min.node.size = 50, respect.unordered.factors = 'order') expect_true(all(rf$forest$is.ordered)) }) test_that("Error if too many factors in 'partition' mode", { n <- 100 dt <- data.frame(x = factor(1:100, ordered = FALSE), y = rbinom(n, 1, 0.5)) expect_error(ranger(y ~ ., data = dt, num.trees = 5, respect.unordered.factors = 'partition')) }) test_that("Survival forest with 'order' mode works", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, write.forest = TRUE, respect.unordered.factors = 'order') expect_equal(sort(rf$forest$covariate.levels$celltype), sort(levels(veteran$celltype))) pred <- predict(rf, veteran) expect_is(pred, "ranger.prediction") }) test_that("maxstat splitting not working with unordered factors", { expect_error(ranger(Sepal.Length ~ ., iris, splitrule = "maxstat", respect.unordered.factors = "partition")) expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "maxstat", respect.unordered.factors = "partition")) }) 
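## Illustrative sketch, not one of the original tests: with
## respect.unordered.factors = 'order', factor levels are ordered once per
## forest (for regression, by their mean outcome) and then treated as ordered
## for splitting. A rough manual analogue of that ordering step on fixed toy
## data (x.toy/y.toy are hypothetical names introduced here):
x.toy <- c("A", "A", "B", "B", "C", "C")
y.toy <- c(5, 6, 1, 2, 9, 10)
lvl.order <- names(sort(tapply(y.toy, x.toy, mean)))  # "B" "A" "C"
x.ordered <- factor(x.toy, levels = lvl.order, ordered = TRUE)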
test_that("C splitting not working with unordered factors", { expect_error(ranger(Surv(time, status) ~ ., veteran, splitrule = "C", respect.unordered.factors = "partition")) }) test_that("Warning for maxstat with 'order' mode", { expect_warning(ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "maxstat", respect.unordered.factors = 'order')) }) test_that("No warning for multiclass classification/probability and survival with 'order' mode", { expect_silent(ranger(Species ~ ., iris, num.trees = 5, respect.unordered.factors = 'order')) expect_silent(ranger(Species ~ ., iris, num.trees = 5, probability = TRUE, respect.unordered.factors = 'order')) expect_silent(ranger(Surv(time, status) ~ ., veteran, num.trees = 5, respect.unordered.factors = 'order')) }) test_that("No error if new levels in predict, 1 column", { set.seed(1) n <- 20 train <- data.frame(x = sample(c("A", "B", "C"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) test <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) ## ignore rf.ignore <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'ignore') expect_silent(predict(rf.ignore, test)) ## partition rf.partition <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'partition') expect_silent(predict(rf.partition, test)) ## order rf.order <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'order') expect_silent(predict(rf.order, test)) }) test_that("No error if new levels in predict, 2 columns", { set.seed(1) n <- 20 train <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = sample(c("A", "B", "C"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) test <- data.frame(x1 = sample(c("A", "B", "C", "D"), n, replace = TRUE), x2 = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) ## ignore rf.ignore <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'ignore') expect_silent(predict(rf.ignore, test)) ## partition rf.partition <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'partition') expect_silent(predict(rf.partition, test)) ## order rf.order <- ranger(y ~ ., data = train, num.trees = 5, respect.unordered.factors = 'order') expect_silent(predict(rf.order, test)) }) test_that("No error if NA factor levels and order", { df <- data.frame(x = addNA(factor(c("a", "a", NA, NA, "b", "b"))), y = c(1, 2, 3, 4, 5, 6)) expect_silent(ranger(dependent.variable.name = "y", data = df, respect.unordered.factors = "order")) }) test_that("Order splitting working when numerics in data", { n <- 20 # Binary classification dt_class <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = sample(1:3, n, replace = TRUE), y = factor(sample(c("A", "B"), n, replace = TRUE)), stringsAsFactors = FALSE) rf <- expect_silent(ranger(y ~ ., data = dt_class, num.trees = 5, respect.unordered.factors = 'order', probability = FALSE)) expect_silent(predict(rf, dt_class)) rf <- expect_silent(ranger(y ~ ., data = dt_class, num.trees = 5, respect.unordered.factors = 'order', probability = TRUE)) expect_silent(predict(rf, dt_class)) # Multiclass classification dt_mult <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = sample(1:3, n, replace = TRUE), y = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE)), stringsAsFactors = FALSE) rf <- expect_silent(ranger(y ~ ., data = dt_class, num.trees = 5, 
respect.unordered.factors = 'order', probability = FALSE)) expect_silent(predict(rf, dt_mult)) rf <- expect_silent(ranger(y ~ ., data = dt_mult, num.trees = 5, respect.unordered.factors = 'order', probability = TRUE)) expect_silent(predict(rf, dt_mult)) # Regression dt_cont <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = sample(1:3, n, replace = TRUE), y = rnorm(n), stringsAsFactors = FALSE) rf <- expect_silent(ranger(y ~ ., data = dt_cont, num.trees = 5, respect.unordered.factors = 'order')) expect_silent(predict(rf, dt_cont)) # Survival dt_surv <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = as.numeric(sample(1:3, n, replace = TRUE)), time = rnorm(n), status = rbinom(n, 1, .5), stringsAsFactors = FALSE) rf <- expect_silent(ranger(Surv(time, status) ~ ., data = dt_surv, num.trees = 5, respect.unordered.factors = 'order')) expect_silent(predict(rf, dt_surv)) # Survival with Surv() in data dt_surv <- data.frame(x1 = sample(c("A", "B", "C"), n, replace = TRUE), x2 = as.numeric(sample(1:3, n, replace = TRUE)), y = Surv(rnorm(n), rbinom(n, 1, .5)), stringsAsFactors = FALSE) rf <- expect_silent(ranger(y ~ ., data = dt_surv, num.trees = 5, respect.unordered.factors = 'order')) expect_silent(predict(rf, dt_surv)) }) test_that("Partition splitting working for large number of levels", { n <- 43 dt <- data.frame(x = factor(1:n, ordered = FALSE), y = rbinom(n, 1, 0.5)) rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") max_split <- max(sapply(1:rf$num.trees, function(i) { max(log2(rf$forest$split.values[[i]]), na.rm = TRUE) })) expect_lte(max_split, n) }) test_that("Order splitting working for quantreg forests", { n <- 20 dt <- data.frame(x = sample(c("A", "B", "C"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = TRUE) expect_silent(ranger(y ~ ., dt, num.trees = 5, quantreg = TRUE, respect.unordered.factors = 'order')) }) ranger/tests/testthat/test_genabel.R0000755000176200001440000000702014027301520017305 0ustar liggesuserslibrary(ranger) library(survival) context("genabel") test_that("classification gwaa rf is of class ranger with 14 elements", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat.gwaa <- readRDS("../test_gwaa.rds") rf <- ranger(CHD ~ ., data = dat.gwaa) expect_is(rf, "ranger") expect_equal(length(rf), 14) }) test_that("GenABEL prediction works if no covariates and formula used", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat <- readRDS("../test_gwaa.rds") dat@phdata$Age <- NULL rf <- ranger(CHD ~ .-Sex, data = dat, num.trees = 5) expect_silent(predict(rf, dat)) }) test_that("SNP ordering working for binary classification", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat.gwaa <- readRDS("../test_gwaa.rds") rf <- ranger(CHD ~ ., data = dat.gwaa, num.trees = 5, respect.unordered.factors = "order") expect_is(rf$forest$snp.order, "list") expect_length(rf$forest$snp.order, nsnps(dat.gwaa)) }) test_that("SNP ordering working for regression", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat.gwaa <- readRDS("../test_gwaa.rds") rf <- ranger(Age ~ ., data = dat.gwaa, num.trees = 5, respect.unordered.factors = "order") expect_is(rf$forest$snp.order, "list") expect_length(rf$forest$snp.order, nsnps(dat.gwaa)) }) test_that("SNP ordering not working for multiclass", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat <- 
readRDS("../test_gwaa.rds") dat@phdata$mc <- factor(round(runif(nids(dat), 1, 5))) expect_error(ranger(mc ~ ., data = dat, num.trees = 5, respect.unordered.factors = "order"), "Error: Ordering of SNPs currently only implemented for regression and binary outcomes.") }) test_that("SNP ordering not working for survival", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat <- readRDS("../test_gwaa.rds") dat@phdata$time <- runif(nids(dat), 1, 100) dat@phdata$status <- rbinom(nids(dat), 1, 0.5) expect_error(ranger(Surv(time, status) ~ ., data = dat, num.trees = 5, respect.unordered.factors = "order"), "Error: Ordering of SNPs currently only implemented for regression and binary outcomes.") }) test_that("SNP ordering working with corrected importance", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat.gwaa <- readRDS("../test_gwaa.rds") rf <- ranger(CHD ~ ., data = dat.gwaa, num.trees = 5, respect.unordered.factors = "order", importance = "impurity_corrected") expect_is(rf$forest$snp.order, "list") expect_length(rf$forest$snp.order, nsnps(dat.gwaa)) rf <- ranger(CHD ~ 0, data = dat.gwaa, num.trees = 5, respect.unordered.factors = "order", importance = "impurity_corrected") expect_is(rf$forest$snp.order, "list") expect_length(rf$forest$snp.order, nsnps(dat.gwaa)) }) test_that("SNP ordering not working with corrected importance for survival", { skip_if_not_installed("GenABEL") skip_if_not_installed("MASS") library(GenABEL) dat <- readRDS("../test_gwaa.rds") dat@phdata$time <- runif(nids(dat), 1, 100) dat@phdata$status <- rbinom(nids(dat), 1, 0.5) expect_error(ranger(Surv(time, status) ~ 0, data = dat, num.trees = 5, respect.unordered.factors = "order", importance = "impurity_corrected"), "Error: Ordering of SNPs currently only implemented for regression and binary outcomes.") }) ranger/tests/testthat/test_classification.R0000755000176200001440000001714114027301517020716 0ustar liggesusers## Tests for random forests for classification library(ranger) context("ranger_class") ## Initialize the random forest for classification dat <- data.matrix(iris) rg.class <- ranger(Species ~ ., data = iris) rg.mat <- ranger(dependent.variable.name = "Species", data = dat, classification = TRUE) ## Basic tests (for all random forests equal) test_that("classification result is of class ranger with 14 elements", { expect_is(rg.class, "ranger") expect_equal(length(rg.class), 14) }) test_that("classification prediction returns factor", { expect_is(rg.class$predictions, "factor") expect_null(dim(rg.class$predictions)) pred <- predict(rg.class, iris) expect_is(pred$predictions, "factor") expect_null(dim(pred$predictions)) }) test_that("results have 500 trees", { expect_equal(rg.class$num.trees, 500) }) test_that("results have right number of independent variables", { expect_equal(rg.class$num.independent.variables, ncol(iris) - 1) }) test_that("Alternative interface works for classification", { rf <- ranger(dependent.variable.name = "Species", data = iris) expect_equal(rf$treetype, "Classification") }) test_that("Matrix interface works for classification", { expect_equal(rg.mat$treetype, "Classification") expect_equal(rg.mat$forest$independent.variable.names, colnames(iris)[1:4]) }) test_that("Matrix interface prediction works for classification", { expect_silent(predict(rg.mat, dat)) }) test_that("save.memory option works for classification", { rf <- ranger(Species ~ ., data = iris, save.memory = TRUE) expect_equal(rf$treetype, "Classification") }) 
test_that("predict.all for classification returns numeric matrix of size trees x n", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, predict.all = TRUE) expect_is(pred$predictions, "matrix") expect_equal(dim(pred$predictions), c(nrow(iris), rf$num.trees)) }) test_that("Majority vote of predict.all for classification is equal to forest prediction", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred_forest <- predict(rf, iris, predict.all = FALSE) pred_trees <- predict(rf, iris, predict.all = TRUE) ## Majority vote, NA for ties pred_num <- apply(pred_trees$predictions, 1, function(x) { res <- which(tabulate(x) == max(tabulate(x))) if (length(res) == 1) { res } else { NA } }) pred <- integer.to.factor(pred_num, rf$forest$levels) idx <- !is.na(pred) expect_equal(pred[idx], pred_forest$predictions[idx]) }) test_that("Alternative interface classification prediction works if only independent variable given, one independent variable", { n <- 50 dt <- data.frame(x = runif(n), y = factor(rbinom(n, 1, 0.5))) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt)) expect_silent(predict(rf, dt[, 1, drop = FALSE])) dt2 <- data.frame(y = factor(rbinom(n, 1, 0.5)), x = runif(n)) rf <- ranger(dependent.variable.name = "y", data = dt2, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt2)) expect_silent(predict(rf, dt2[, 2, drop = FALSE])) }) test_that("Alternative interface classification prediction works if only independent variable given, two independent variables", { n <- 50 dt <- data.frame(x1 = runif(n), x2 = runif(n), y = factor(rbinom(n, 1, 0.5))) rf <- ranger(dependent.variable.name = "y", data = dt, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt)) expect_silent(predict(rf, dt[, 1:2])) dt2 <- data.frame(y = factor(rbinom(n, 1, 0.5)), x1 = runif(n), x2 = runif(n)) rf <- ranger(dependent.variable.name = "y", data = dt2, num.trees = 5, write.forest = TRUE) expect_silent(predict(rf, dt2)) expect_silent(predict(rf, dt2[, 2:3])) }) ## Special tests for random forests for classification test_that("predict works for single observations, classification", { pred <- predict(rg.class, head(iris, 1)) expect_equal(pred$predictions, iris[1,"Species"]) }) test_that("confusion matrix is of right dimension", { expect_equal(dim(rg.class$confusion.matrix), rep(nlevels(iris$Species), 2)) }) test_that("confusion matrix has right dimnames", { expect_equal(dimnames(rg.class$confusion.matrix), list(true = levels(iris$Species), predicted = levels(iris$Species))) }) test_that("confusion matrix rows are the true classes", { expect_equal(as.numeric(rowSums(rg.class$confusion.matrix)), as.numeric(table(iris$Species))) }) test_that("confusion matrix rows are the true classes if using case weights", { rf <- ranger(Species ~ ., data = iris, num.trees = 5, case.weights = c(rep(100, 5), rep(5, 145))) expect_equal(as.numeric(rowSums(rf$confusion.matrix)), as.numeric(table(iris$Species))) }) ## Splitrule test_that("default splitrule is Gini for classification", { set.seed(42) rf1 <- ranger(Species ~ ., iris, num.trees = 5) set.seed(42) rf2 <- ranger(Species ~ ., iris, num.trees = 5, splitrule = "gini") expect_equal(rf1$splitrule, "gini") expect_equal(rf2$splitrule, "gini") expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("default splitrule is Gini for probability", { set.seed(42) rf1 <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE) 
set.seed(42) rf2 <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE, splitrule = "gini") expect_equal(rf1$splitrule, "gini") expect_equal(rf2$splitrule, "gini") expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("splitrule extratrees is different from Gini for classification", { set.seed(42) rf1 <- ranger(Species ~ ., iris, num.trees = 5, splitrule = "extratrees") set.seed(42) rf2 <- ranger(Species ~ ., iris, num.trees = 5, splitrule = "gini") expect_equal(rf1$splitrule, "extratrees") expect_equal(rf2$splitrule, "gini") expect_false(rf1$prediction.error == rf2$prediction.error) }) test_that("splitrule extratrees is different from Gini for probability", { set.seed(42) rf1 <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE, splitrule = "extratrees") set.seed(42) rf2 <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE, splitrule = "gini") expect_equal(rf1$splitrule, "extratrees") expect_equal(rf2$splitrule, "gini") expect_false(rf1$prediction.error == rf2$prediction.error) }) test_that("Working with numerically almost exact splitting values", { dat <- data.frame(a = factor(1:2), z = c(1.7629414498915687570246291215880773, 1.7629414498915689790692340466193854)) expect_silent(ranger(a ~ ., data = dat, num.threads = 1, num.trees = 1)) }) test_that("No error if unused factor levels in outcome", { expect_warning(rf <- ranger(Species ~ ., iris[1:100, ], num.trees = 5), "^Dropped unused factor level\\(s\\) in dependent variable\\: virginica\\.") pred <- predict(rf, iris) expect_equal(levels(pred$predictions), levels(iris$Species)) }) test_that("Predictions with unused factor levels are not NA", { expect_warning(rf <- ranger(Species ~ ., iris[51:150, ], num.trees = 5), "^Dropped unused factor level\\(s\\) in dependent variable\\: setosa\\.") pred <- predict(rf, iris) expect_equal(sum(is.na(pred$predictions)), 0) }) test_that("logical response returns numeric predictions", { rf <- ranger(Species=="setosa" ~ ., data = iris, num.trees = 5) expect_is(rf$predictions, "numeric") expect_null(dim(rf$predictions)) pred <- predict(rf, iris) expect_is(pred$predictions, "numeric") expect_null(dim(pred$predictions)) }) ranger/tests/testthat/test_hellinger.R0000755000176200001440000000245014027301520017663 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_hellinger") test_that("Hellinger splitting works for classification", { rf <- ranger(Species ~ ., droplevels(iris[1:100, ]), splitrule = "hellinger", num.trees = 5) expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.3) }) test_that("Hellinger splitting works for classification with non-factor response", { dat <- iris[1:100, ] dat$Species <- as.numeric(dat$Species) rf <- ranger(Species ~ ., dat, num.trees = 5, classification = TRUE, splitrule = "hellinger") expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.3) }) test_that("Hellinger splitting works for probability estimation", { rf <- ranger(Species ~ ., droplevels(iris[1:100, ]), splitrule = "hellinger", num.trees = 5, probability = TRUE) expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.3) }) test_that("Hellinger splitting not working for multiclass", { expect_error(ranger(Species ~ ., iris, splitrule = "hellinger"), "Error: Hellinger splitrule only implemented for binary classification\\.") }) test_that("Hellinger splitting not working for regression", { expect_error(ranger(Sepal.Length ~ ., iris, splitrule = "hellinger"), "Error: Hellinger splitrule only implemented for binary classification\\.") }) 
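## Illustrative sketch, not code from the package: the Hellinger distance
## split criterion for a binary outcome (following Cieslak & Chawla, 2008)
## compares the class-conditional probabilities of a candidate split sending
## an observation to the left child; hellinger_distance is a hypothetical
## helper assuming that form of the criterion.
hellinger_distance <- function(p_left_pos, p_left_neg) {
  sqrt((sqrt(p_left_pos) - sqrt(p_left_neg))^2 +
         (sqrt(1 - p_left_pos) - sqrt(1 - p_left_neg))^2)
}
hellinger_distance(0.9, 0.2)  # well-separating split, large distance
hellinger_distance(0.5, 0.5)  # uninformative split, distance 0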
ranger/tests/testthat/test_print.R0000755000176200001440000000115314027301520017045 0ustar liggesusers## Tests for print function library(ranger) context("ranger_print") ## Initialize the random forest rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) ## Test print ranger function expect_that(print(rf), prints_text("Ranger result")) ## Test print forest function expect_that(print(rf$forest), prints_text("Ranger forest object")) ## Test print prediction function expect_that(print(predict(rf, iris)), prints_text("Ranger prediction")) ## Test str ranger function expect_that(str(rf), prints_text("List of 14")) ## Test str forest function expect_that(str(rf$forest), prints_text("List of 9")) ranger/tests/testthat/test_jackknife.R0000755000176200001440000001636214027301520017646 0ustar liggesusers## Tests for the (infinitesimal) jackknife for standard error prediction library(ranger) library(survival) library(methods) context("ranger_jackknife") test_that("jackknife standard error prediction working for regression", { idx <- sample(nrow(iris), 10) test <- iris[idx, ] train <- iris[-idx, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) pred <- predict(rf, test, type = "se", se.method = "jack") expect_equal(length(pred$predictions), nrow(test)) }) test_that("jackknife standard error prediction not working for other tree types", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE) expect_error(predict(rf, iris, type = "se", se.method = "jack"), "Error: Jackknife standard error prediction currently only available for regression.") rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE, probability = TRUE) expect_error(predict(rf, iris, type = "se", se.method = "jack"), "Error: Jackknife standard error prediction currently only available for regression.") rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, keep.inbag = TRUE) expect_error(predict(rf, veteran, type = "se", se.method = "jack"), "Error: Jackknife standard error prediction currently only available for regression.") }) test_that("IJ standard error prediction working for regression", { idx <- sample(nrow(iris), 21) test <- iris[idx, ] train <- iris[-idx, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) pred <- predict(rf, test, type = "se", se.method = "infjack") expect_equal(length(pred$predictions), nrow(test)) }) test_that("IJ standard error prediction working for probability", { idx <- sample(nrow(iris), 25) test <- iris[idx, ] train <- iris[-idx, ] rf <- ranger(Species ~ ., train, num.trees = 5, keep.inbag = TRUE, probability = TRUE) pred <- predict(rf, test, type = "se", se.method = "infjack") expect_equal(nrow(pred$predictions), nrow(test)) }) test_that("IJ standard error prediction not working for other tree types", { rf <- ranger(Species ~ ., iris, num.trees = 5, keep.inbag = TRUE) expect_error(predict(rf, iris, type = "se", se.method = "infjack"), "Error: Not a probability forest. 
Set probability=TRUE to use the infinitesimal jackknife standard error prediction for classification.") rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, keep.inbag = TRUE) expect_error(predict(rf, veteran, type = "se", se.method = "infjack"), "Error: Infinitesimal jackknife standard error prediction not yet available for survival.") }) test_that("standard error prediction not working if keep.inbag = FALSE", { rf <- ranger(Petal.Length ~ ., iris, num.trees = 5) expect_error(predict(rf, iris, type = "se"), "Error: No saved inbag counts in ranger object. Please set keep.inbag=TRUE when calling ranger.") }) test_that("standard error prediction not working if no OOB observations", { test <- iris[-1, ] train <- iris[1, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) expect_error(predict(rf, iris, type = "se"), "Error: No OOB observations found, consider increasing num.trees or reducing sample.fraction.") }) test_that("standard error prediction working for single testing observation", { test <- iris[1, ] train <- iris[-1, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) pred_jack <- predict(rf, test, type = "se", se.method = "jack") expect_equal(length(pred_jack$predictions), nrow(test)) expect_warning(pred_ij <- predict(rf, test, type = "se", se.method = "infjack"), "Sample size <=20, no calibration performed\\.") expect_equal(length(pred_ij$predictions), nrow(test)) }) test_that("standard error response prediction is the same as response prediction", { idx <- sample(nrow(iris), 25) test <- iris[idx, ] train <- iris[-idx, ] set.seed(100) rf_se <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) pred_se <- predict(rf_se, test, type = "se") set.seed(100) rf_resp <- ranger(Petal.Length ~ ., train, num.trees = 5) pred_resp <- predict(rf_resp, test, type = "response") expect_equal(pred_se$predictions, pred_resp$predictions) }) test_that("standard error response prediction is the same as response prediction, probability", { n <- 100 p <- 5 x <- replicate(p, rbinom(n, 1, .1)) y <- as.factor(rbinom(n, 1, .5)) dat <- data.frame(y = y, x) idx <- sample(nrow(dat), 25) test <- dat[idx, ] train <- dat[-idx, ] set.seed(100) rf_se <- ranger(y ~ ., train, num.trees = 5, keep.inbag = TRUE, probability = TRUE) pred_se <- predict(rf_se, test, type = "se", se.method = "infjack") colnames(pred_se$predictions) <- levels(train$y) set.seed(100) rf_resp <- ranger(y ~ ., train, num.trees = 5, probability = TRUE) pred_resp <- predict(rf_resp, test, type = "response") expect_equal(pred_se$predictions, pred_resp$predictions) }) test_that("Warning for few observations with IJ", { idx <- sample(nrow(iris), 10) test <- iris[idx, ] train <- iris[-idx, ] rf5 <- ranger(Species ~ ., train, num.trees = 5, keep.inbag = TRUE, probability = TRUE) expect_warning(predict(rf5, test, type = "se", se.method = "infjack"), "Sample size <=20, no calibration performed.") }) test_that("standard error is working for tree subsets, jack", { idx <- sample(nrow(iris), 25) test <- iris[idx, ] train <- iris[-idx, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 50, keep.inbag = TRUE) pred5 <- predict(rf, test, type = "se", se.method = "jack", num.trees = 5) pred50 <- predict(rf, test, type = "se", se.method = "jack") expect_lt(mean(pred50$se), mean(pred5$se)) }) test_that("standard error is working for tree subsets, infjack", { idx <- sample(nrow(iris), 25) test <- iris[idx, ] train <- iris[-idx, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 50, keep.inbag = TRUE) 
pred5 <- predict(rf, test, type = "se", se.method = "infjack", num.trees = 5) pred50 <- predict(rf, test, type = "se", se.method = "infjack") expect_lt(mean(pred50$se), mean(pred5$se)) }) test_that("No error for se estimation for many observations", { n <- 60000 dat <- data.frame(y = rbinom(n, 1, .5), x = rbinom(n, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 2, keep.inbag = TRUE) expect_silent(predict(rf, dat, type = "se", se.method = "infjack")) }) test_that("Standard error prediction working for single observation, regression", { test <- iris[1, , drop = FALSE] train <- iris[-1, ] rf <- ranger(Petal.Length ~ ., train, num.trees = 5, keep.inbag = TRUE) # Jackknife pred <- predict(rf, test, type = "se", se.method = "jack") expect_length(pred$se, 1) # IJ pred <- expect_warning(predict(rf, test, type = "se", se.method = "infjack")) expect_length(pred$se, 1) }) test_that("Standard error prediction working for single observation, probability", { test <- iris[134, , drop = FALSE] train <- iris[-134, ] rf <- ranger(Species ~ ., train, num.trees = 5, keep.inbag = TRUE, probability = TRUE) # IJ pred <- expect_warning(predict(rf, test, type = "se", se.method = "infjack")) expect_equal(dim(pred$se), c(1, 3)) }) ranger/tests/testthat/test_treeInfo.R0000755000176200001440000003350214027301520017467 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_treeInfo") ## Classification rf.class.formula <- ranger(Species ~ ., iris, num.trees = 5) rf.class.first <- ranger(dependent.variable.name = "Species", data = iris[, c(5, 1:4)], num.trees = 5) rf.class.mid <- ranger(dependent.variable.name = "Species", data = iris[, c(1:2, 5, 3:4)], num.trees = 5) rf.class.last <- ranger(dependent.variable.name = "Species", data = iris, num.trees = 5) ti.class.formula <- treeInfo(rf.class.formula) ti.class.first <- treeInfo(rf.class.first) ti.class.mid <- treeInfo(rf.class.mid) ti.class.last <- treeInfo(rf.class.last) test_that("Terminal nodes have only prediction, non-terminal nodes all others, classification formula", { expect_true(all(is.na(ti.class.formula[ti.class.formula$terminal, 2:6]))) expect_true(all(!is.na(ti.class.formula[ti.class.formula$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.class.formula[!ti.class.formula$terminal, -8]))) expect_true(all(is.na(ti.class.formula[!ti.class.formula$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, classification depvarname first", { expect_true(all(is.na(ti.class.first[ti.class.first$terminal, 2:6]))) expect_true(all(!is.na(ti.class.first[ti.class.first$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.class.first[!ti.class.first$terminal, -8]))) expect_true(all(is.na(ti.class.first[!ti.class.first$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, classification depvarname mid", { expect_true(all(is.na(ti.class.mid[ti.class.mid$terminal, 2:6]))) expect_true(all(!is.na(ti.class.mid[ti.class.mid$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.class.mid[!ti.class.mid$terminal, -8]))) expect_true(all(is.na(ti.class.mid[!ti.class.mid$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, classification depvarname last", { expect_true(all(is.na(ti.class.last[ti.class.last$terminal, 2:6]))) expect_true(all(!is.na(ti.class.last[ti.class.last$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.class.last[!ti.class.last$terminal, -8]))) expect_true(all(is.na(ti.class.last[!ti.class.last$terminal, 8]))) }) test_that("Names in 
treeInfo match, classification", { varnames <- colnames(iris)[1:4] expect_true(all(is.na(ti.class.formula$splitvarName) | ti.class.formula$splitvarName %in% varnames)) expect_true(all(is.na(ti.class.first$splitvarName) | ti.class.first$splitvarName %in% varnames)) expect_true(all(is.na(ti.class.mid$splitvarName) | ti.class.mid$splitvarName %in% varnames)) expect_true(all(is.na(ti.class.last$splitvarName) | ti.class.last$splitvarName %in% varnames)) }) test_that("Prediction for classification is factor with correct levels", { expect_is(ti.class.formula$prediction, "factor") expect_equal(levels(ti.class.formula$prediction), levels(iris$Species)) }) test_that("Prediction for matrix classification is integer with correct values", { rf <- ranger(dependent.variable.name = "Species", data = data.matrix(iris), num.trees = 5, classification = TRUE) ti <- treeInfo(rf, 1) expect_is(ti$prediction, "numeric") expect_equal(sort(unique(ti$prediction)), 1:3) }) ## Regression n <- 20 dat <- data.frame(y = rnorm(n), replicate(2, runif(n)), replicate(2, rbinom(n, size = 1, prob = .5))) rf.regr.formula <- ranger(y ~ ., dat, num.trees = 5) rf.regr.first <- ranger(dependent.variable.name = "y", data = dat, num.trees = 5) rf.regr.mid <- ranger(dependent.variable.name = "y", data = dat[, c(2:3, 1, 4:5)], num.trees = 5) rf.regr.last <- ranger(dependent.variable.name = "y", data = dat[, c(2:5, 1)], num.trees = 5) ti.regr.formula <- treeInfo(rf.regr.formula) ti.regr.first <- treeInfo(rf.regr.first) ti.regr.mid <- treeInfo(rf.regr.mid) ti.regr.last <- treeInfo(rf.regr.last) test_that("Terminal nodes have only prediction, non-terminal nodes all others, regression formula", { expect_true(all(is.na(ti.regr.formula[ti.regr.formula$terminal, 2:6]))) expect_true(all(!is.na(ti.regr.formula[ti.regr.formula$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.regr.formula[!ti.regr.formula$terminal, -8]))) expect_true(all(is.na(ti.regr.formula[!ti.regr.formula$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, regression depvarname first", { expect_true(all(is.na(ti.regr.first[ti.regr.first$terminal, 2:6]))) expect_true(all(!is.na(ti.regr.first[ti.regr.first$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.regr.first[!ti.regr.first$terminal, -8]))) expect_true(all(is.na(ti.regr.first[!ti.regr.first$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, regression depvarname mid", { expect_true(all(is.na(ti.regr.mid[ti.regr.mid$terminal, 2:6]))) expect_true(all(!is.na(ti.regr.mid[ti.regr.mid$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.regr.mid[!ti.regr.mid$terminal, -8]))) expect_true(all(is.na(ti.regr.mid[!ti.regr.mid$terminal, 8]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, regression depvarname last", { expect_true(all(is.na(ti.regr.last[ti.regr.last$terminal, 2:6]))) expect_true(all(!is.na(ti.regr.last[ti.regr.last$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.regr.last[!ti.regr.last$terminal, -8]))) expect_true(all(is.na(ti.regr.last[!ti.regr.last$terminal, 8]))) }) test_that("Names in treeInfo match, regression", { varnames <- c("X1", "X2", "X1.1", "X2.1") expect_true(all(is.na(ti.regr.formula$splitvarName) | ti.regr.formula$splitvarName %in% varnames)) expect_true(all(is.na(ti.regr.first$splitvarName) | ti.regr.first$splitvarName %in% varnames)) expect_true(all(is.na(ti.regr.mid$splitvarName) | ti.regr.mid$splitvarName %in% varnames)) expect_true(all(is.na(ti.regr.last$splitvarName) | 
ti.regr.last$splitvarName %in% varnames)) }) test_that("Prediction for regression is numeric in correct range", { expect_is(ti.regr.formula$prediction, "numeric") expect_true(all(is.na(ti.regr.formula$prediction) | ti.regr.formula$prediction >= min(dat$y))) expect_true(all(is.na(ti.regr.formula$prediction) | ti.regr.formula$prediction <= max(dat$y))) }) ## Probability estimation rf.prob.formula <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE) rf.prob.first <- ranger(dependent.variable.name = "Species", data = iris[, c(5, 1:4)], num.trees = 5, probability = TRUE) rf.prob.mid <- ranger(dependent.variable.name = "Species", data = iris[, c(1:2, 5, 3:4)], num.trees = 5, probability = TRUE) rf.prob.last <- ranger(dependent.variable.name = "Species", data = iris, num.trees = 5, probability = TRUE) ti.prob.formula <- treeInfo(rf.prob.formula) ti.prob.first <- treeInfo(rf.prob.first) ti.prob.mid <- treeInfo(rf.prob.mid) ti.prob.last <- treeInfo(rf.prob.last) test_that("Terminal nodes have only prediction, non-terminal nodes all others, probability formula", { expect_true(all(is.na(ti.prob.formula[ti.prob.formula$terminal, 2:6]))) expect_true(all(!is.na(ti.prob.formula[ti.prob.formula$terminal, c(1, 7:10)]))) expect_true(all(!is.na(ti.prob.formula[!ti.prob.formula$terminal, c(-8, -9, -10)]))) expect_true(all(is.na(ti.prob.formula[!ti.prob.formula$terminal, 8:10]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, probability depvarname first", { expect_true(all(is.na(ti.prob.first[ti.prob.first$terminal, 2:6]))) expect_true(all(!is.na(ti.prob.first[ti.prob.first$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.prob.first[!ti.prob.first$terminal, c(-8, -9, -10)]))) expect_true(all(is.na(ti.prob.first[!ti.prob.first$terminal, 8:10]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, probability depvarname mid", { expect_true(all(is.na(ti.prob.mid[ti.prob.mid$terminal, 2:6]))) expect_true(all(!is.na(ti.prob.mid[ti.prob.mid$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.prob.mid[!ti.prob.mid$terminal, c(-8, -9, -10)]))) expect_true(all(is.na(ti.prob.mid[!ti.prob.mid$terminal, 8:10]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, probability depvarname last", { expect_true(all(is.na(ti.prob.last[ti.prob.last$terminal, 2:6]))) expect_true(all(!is.na(ti.prob.last[ti.prob.last$terminal, c(1, 7:8)]))) expect_true(all(!is.na(ti.prob.last[!ti.prob.last$terminal, c(-8, -9, -10)]))) expect_true(all(is.na(ti.prob.last[!ti.prob.last$terminal, 8:10]))) }) test_that("Names in treeInfo match, probability", { varnames <- colnames(iris)[1:4] expect_true(all(is.na(ti.prob.formula$splitvarName) | ti.prob.formula$splitvarName %in% varnames)) expect_true(all(is.na(ti.prob.first$splitvarName) | ti.prob.first$splitvarName %in% varnames)) expect_true(all(is.na(ti.prob.mid$splitvarName) | ti.prob.mid$splitvarName %in% varnames)) expect_true(all(is.na(ti.prob.last$splitvarName) | ti.prob.last$splitvarName %in% varnames)) }) test_that("Prediction for probability is one probability per class, sum to 1", { expect_equal(ncol(ti.prob.formula), 10) expect_is(ti.prob.formula$pred.setosa, "numeric") expect_true(all(!ti.prob.formula$terminal | rowSums(ti.prob.formula[, 8:10]) == 1)) }) test_that("Prediction for probability has correct factor levels", { dat <- iris[c(101:150, 1:100), ] rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 5, probability = TRUE) # Predict pred_rf <- predict(rf, dat, 
num.trees = 1)$predictions # Predict with treeInfo ti <- treeInfo(rf) terminal_nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] pred_ti <- as.matrix(ti[terminal_nodes + 1, grep("pred", colnames(ti))]) colnames(pred_ti) <- gsub("pred\\.", "", colnames(pred_ti)) rownames(pred_ti) <- NULL expect_equal(pred_rf, pred_ti) }) ## Survival rf.surv.formula <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) rf.surv.first <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran[, c(3:4, 1:2, 5:8)], num.trees = 5) rf.surv.mid <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran, num.trees = 5) rf.surv.last <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran[, c(2, 1, 5:8, 3:4)], num.trees = 5) ti.surv.formula <- treeInfo(rf.surv.formula) ti.surv.first <- treeInfo(rf.surv.first) ti.surv.mid <- treeInfo(rf.surv.mid) ti.surv.last <- treeInfo(rf.surv.last) test_that("Terminal nodes have only nodeID, non-terminal nodes all, survival formula", { expect_true(all(is.na(ti.surv.formula[ti.surv.formula$terminal, 2:6]))) expect_true(all(!is.na(ti.surv.formula[ti.surv.formula$terminal, c(1, 7)]))) expect_true(all(!is.na(ti.surv.formula[!ti.surv.formula$terminal, ]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, survival depvarname first", { expect_true(all(is.na(ti.surv.first[ti.surv.first$terminal, 2:6]))) expect_true(all(!is.na(ti.surv.first[ti.surv.first$terminal, c(1, 7)]))) expect_true(all(!is.na(ti.surv.first[!ti.surv.first$terminal, ]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, survival depvarname mid", { expect_true(all(is.na(ti.surv.mid[ti.surv.mid$terminal, 2:6]))) expect_true(all(!is.na(ti.surv.mid[ti.surv.mid$terminal, c(1, 7)]))) expect_true(all(!is.na(ti.surv.mid[!ti.surv.mid$terminal, ]))) }) test_that("Terminal nodes have only prediction, non-terminal nodes all others, survival depvarname last", { expect_true(all(is.na(ti.surv.last[ti.surv.last$terminal, 2:6]))) expect_true(all(!is.na(ti.surv.last[ti.surv.last$terminal, c(1, 7)]))) expect_true(all(!is.na(ti.surv.last[!ti.surv.last$terminal, ]))) }) test_that("Names in treeInfo match, survival", { varnames <- colnames(veteran)[c(1:2, 5:8)] expect_true(all(is.na(ti.surv.formula$splitvarName) | ti.surv.formula$splitvarName %in% varnames)) expect_true(all(is.na(ti.surv.first$splitvarName) | ti.surv.first$splitvarName %in% varnames)) expect_true(all(is.na(ti.surv.mid$splitvarName) | ti.surv.mid$splitvarName %in% varnames)) expect_true(all(is.na(ti.surv.last$splitvarName) | ti.surv.last$splitvarName %in% varnames)) }) test_that("No prediction for Survival", { expect_equal(ncol(ti.surv.formula), 7) }) ## General test_that("Error if no saved forest", { expect_error(treeInfo(ranger(Species ~ ., iris, write.forest = FALSE)), "Error\\: No saved forest in ranger object\\. 
Please set write.forest to TRUE when calling ranger\\.") }) ## Unordered splitting test_that("Splitting value is comma separated list for partition splitting", { n <- 50 dat <- data.frame(x = sample(c("A", "B", "C", "D", "E"), n, replace = TRUE), y = rbinom(n, 1, 0.5), stringsAsFactors = FALSE) rf.partition <- ranger(y ~ ., dat, num.trees = 5, respect.unordered.factors = "partition") ti.partition <- treeInfo(rf.partition) expect_is(ti.partition$splitval, "character") expect_true(all(is.na(ti.partition$splitval) | grepl("^\\d+(?:,\\d+)*$", ti.partition$splitval))) }) test_that("Splitting value is numeric for order splitting", { set.seed(100) rf.order <- ranger(Sepal.Length ~ ., iris, num.trees = 5, respect.unordered.factors = "order") ti.order <- treeInfo(rf.order) expect_is(ti.order$splitval[!ti.order$terminal & ti.order$splitvarName == "Species"], "numeric") }) test_that("treeInfo works for 31 unordered factor levels but not for 32", { n <- 31 dt <- data.frame(x = factor(1:n, ordered = FALSE), y = rbinom(n, 1, 0.5)) rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") expect_silent(treeInfo(rf)) n <- 32 dt <- data.frame(x = factor(1:n, ordered = FALSE), y = rbinom(n, 1, 0.5)) rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") expect_warning(treeInfo(rf), "Unordered splitting levels can only be shown for up to 31 levels.") }) ranger/tests/testthat/test_splitweights.R0000755000176200001440000000374414027301520020447 0ustar liggesusers## Tests for split select weights library(ranger) context("ranger_splitweights") ## Tests test_that("split select weights work", { expect_silent(ranger(Species ~ ., iris, num.trees = 5, split.select.weights = c(0.1, 0.2, 0.3, 0.4))) expect_error(ranger(Species ~ ., iris, num.trees = 5, split.select.weights = c(0.1, 0.2, 0.3))) }) test_that("split select weights work with 0s and 1s", { num.trees <- 5 weights <- replicate(num.trees, sample(c(0, 0, 1, 1)), simplify = FALSE) rf <- ranger(Species ~ ., iris, num.trees = num.trees, split.select.weights = weights) selected_correctly <- sapply(1:rf$num.trees, function(i) { all(treeInfo(rf, i)[,"splitvarID"] %in% c(which(weights[[i]] > 0) - 1, NA)) }) expect_true(all(selected_correctly)) }) test_that("Tree-wise split select weights work", { num.trees <- 5 weights <- replicate(num.trees, runif(ncol(iris)-1), simplify = FALSE) expect_silent(ranger(Species ~ ., iris, num.trees = num.trees, split.select.weights = weights)) weights <- replicate(num.trees+1, runif(ncol(iris)-1), simplify = FALSE) expect_error(ranger(Species ~ ., iris, num.trees = num.trees, split.select.weights = weights)) }) test_that("always split variables work", { expect_silent(ranger(Species ~ ., iris, num.trees = 10, always.split.variables = c("Petal.Length", "Petal.Width"), mtry = 2)) expect_silent(ranger(dependent.variable.name = "Species", data = iris, num.trees = 10, always.split.variables = c("Petal.Length", "Petal.Width"), mtry = 2)) }) test_that("Tree-wise split select weights work with 0s", { num.trees <- 5 weights <- replicate(num.trees, sample(c(0, 0, 0.5, 0.5)), simplify = FALSE) rf <- ranger(Species ~ ., iris, mtry = 2, num.trees = num.trees, split.select.weights = weights) selected_correctly <- sapply(1:num.trees, function(i) { all(treeInfo(rf, i)[,"splitvarID"] %in% c(which(weights[[i]] > 0) - 1, NA)) }) expect_true(all(selected_correctly)) }) ranger/tests/testthat/test_survival.R0000755000176200001440000001203714027301520017567 0ustar liggesusers## Tests for random forests for survival analysis 
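## Illustrative sketch, not one of the original tests: a survival forest
## predicts a cumulative hazard function (chf) and a survival curve per
## observation; assuming a Nelson-Aalen-type estimate, the two are linked
## (up to estimator details) by S(t) = exp(-CHF(t)). chf_to_survival is a
## hypothetical helper based on that assumption.
chf_to_survival <- function(chf) exp(-chf)
chf_to_survival(c(0, 0.5, 1.2))  # approximately 1.000 0.607 0.301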
library(ranger) library(survival) context("ranger_surv") ## Initialize the random forest for survival analysis rg.surv <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 10) ## Basic tests (for all random forests equal) test_that("survival result is of class ranger with 15 elements", { expect_is(rg.surv, "ranger") expect_equal(length(rg.surv), 15) }) test_that("results have right number of trees", { expect_equal(rg.surv$num.trees, 10) }) test_that("results have right number of independent variables", { expect_equal(rg.surv$num.independent.variables, ncol(veteran) - 2) }) test_that("Alternative interface works for survival", { rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran, num.trees = 10) expect_equal(rf$treetype, "Survival") }) test_that("Alternative interface prediction works for survival", { rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran, num.trees = 10) expect_equal(predict(rf, veteran)$num.independent.variables, ncol(veteran) - 2) expect_equal(predict(rf, veteran[, setdiff(names(veteran), c("time", "status"))])$num.independent.variables, ncol(veteran) - 2) }) test_that("Matrix interface works for survival", { rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = data.matrix(veteran), write.forest = TRUE, num.trees = 10) expect_equal(rf$treetype, "Survival") expect_equal(rf$forest$independent.variable.names, colnames(veteran)[c(1:2, 5:8)]) }) test_that("Matrix interface prediction works for survival", { dat <- data.matrix(veteran) rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = dat, write.forest = TRUE, num.trees = 10) expect_silent(predict(rf, dat)) }) test_that("growing works for single observations, survival", { rf <- ranger(Surv(time, status) ~ ., veteran[1, ], write.forest = TRUE, num.trees = 10) expect_is(rf$survival, "matrix") }) test_that("predict works for single observations, survival", { rf <- ranger(Surv(time, status) ~ ., veteran, write.forest = TRUE, num.trees = 10) pred <- predict(rf, head(veteran, 1)) expect_equal(length(pred$survival), length(rf$unique.death.times)) }) ## Special tests for random forests for survival analysis test_that("unique death times in survival result is right", { expect_equal(rg.surv$unique.death.times, sort(unique(veteran$time))) }) test_that("C-index splitting works", { rf <- ranger(Surv(time, status) ~ ., data = veteran, splitrule = "C", num.trees = 10) expect_equal(rf$treetype, "Survival") }) test_that("C-index splitting not working on classification data", { expect_error(ranger(Species ~ ., iris, splitrule = "C", num.trees = 10)) }) test_that("Logrank splitting not working on classification data", { expect_error(ranger(Species ~ ., iris, splitrule = "logrank", num.trees = 10)) }) test_that("No error if survival tree without OOB observations", { dat <- data.frame(time = c(1,2), status = c(0,1), x = c(1,2)) expect_silent(ranger(Surv(time, status) ~ ., dat, num.trees = 1, num.threads = 1)) }) test_that("predict.all for survival returns 3d array of size samples x times x trees", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) pred <- predict(rf, veteran, predict.all = TRUE) expect_is(pred$survival, "array") expect_equal(dim(pred$survival), c(nrow(veteran), length(pred$unique.death.times), rf$num.trees)) expect_is(pred$chf, "array") expect_equal(dim(pred$chf), c(nrow(veteran), length(pred$unique.death.times), rf$num.trees)) }) test_that("Mean of 
predict.all for survival is equal to forest prediction", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) pred_forest <- predict(rf, veteran, predict.all = FALSE) pred_trees <- predict(rf, veteran, predict.all = TRUE) expect_equal(apply(pred_trees$chf, 1:2, mean), pred_forest$chf) }) test_that("timepoints() function returns timepoints", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) expect_equal(timepoints(rf), rf$unique.death.times) pred <- predict(rf, veteran) expect_equal(timepoints(pred), rf$unique.death.times) }) test_that("timepoints() working on survival forest only", { rf <- ranger(Species ~ ., iris, num.trees = 5) expect_error(timepoints(rf), "No timepoints found. Object is no Survival forest.") pred <- predict(rf, iris) expect_error(timepoints(pred), "No timepoints found. Object is no Survival prediction object.") }) test_that("Survival error without covariates", { expect_error(ranger(Surv(time, status) ~ ., veteran[, c("time", "status")], num.trees = 5), "Error: No covariates found.") }) test_that("Survival error for competing risk data", { sobj <- Surv(veteran$time, factor(sample(1:3, nrow(veteran), replace = TRUE))) expect_error(ranger(y = sobj, x = veteran[, 1:2], num.trees = 5), "Error: Competing risks not supported yet\\. Use status=1 for events and status=0 for censoring\\.") }) ranger/tests/testthat/test_extratrees.R0000755000176200001440000000457214027301520020107 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_extratrees") test_that("extratrees splitting works for classification", { rf <- ranger(Species ~ ., iris, splitrule = "extratrees") expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.2) }) test_that("extratrees unordered splitting works for classification", { n <- 20 dat <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(rbinom(n, 1, 0.5)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dat, num.trees = 5, min.node.size = n/2, splitrule = "extratrees", respect.unordered.factors = 'partition') expect_is(rf, "ranger") }) test_that("extratrees splitting works for probability estimation", { rf <- ranger(Species ~ ., iris, probability = TRUE, splitrule = "extratrees") expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.2) }) test_that("extratrees unordered splitting works for probability estimation", { n <- 20 dat <- data.frame(x = sample(c("A", "B", "C", "D"), n, replace = TRUE), y = factor(rbinom(n, 1, 0.5)), stringsAsFactors = FALSE) rf <- ranger(y ~ ., data = dat, num.trees = 5, min.node.size = n/2, splitrule = "extratrees", respect.unordered.factors = 'partition', probability = TRUE) expect_is(rf, "ranger") }) test_that("extratrees splitting works for regression", { rf <- ranger(Sepal.Length ~ ., iris, splitrule = "extratrees") expect_is(rf, "ranger") expect_gt(rf$r.squared, 0.5) }) test_that("extratrees unordered splitting works for regression", { rf <- ranger(Sepal.Length ~ ., iris, splitrule = "extratrees", respect.unordered.factors = "partition") expect_is(rf, "ranger") expect_gt(rf$r.squared, 0.5) }) test_that("extratrees splitting works for survival", { rf <- ranger(Surv(time, status) ~ ., veteran, splitrule = "extratrees") expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.4) }) test_that("extratrees unordered splitting works for survival", { rf <- ranger(Surv(time, status) ~ ., veteran, splitrule = "extratrees", respect.unordered.factors = "partition") expect_is(rf, "ranger") expect_lt(rf$prediction.error, 0.4) }) test_that("extratrees splitting works for 
large number of random splits", { expect_silent(ranger(Species ~ ., iris, splitrule = "extratrees", num.random.splits = 100)) }) ranger/tests/testthat/test_formula.R0000755000176200001440000000154414027301520017362 0ustar liggesuserslibrary(ranger) library(survival) test_that("LHS formula", { callRanger <- function() { myTransformation <- function(x) { x } ranger(myTransformation(Species) ~ ., data = iris) } expect_error(callRanger(), NA) }) test_that("Formula works with matrix data", { set.seed(10) rf1 <- ranger(Species ~ ., iris, num.trees = 5) pred1 <- rf1$predictions set.seed(10) rf2 <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, classification = TRUE) pred2 <- factor(levels(iris$Species)[rf2$predictions], levels = levels(iris$Species)) expect_equal(pred1, pred2) }) test_that("Formula works with matrix data, survival", { set.seed(10) rf1 <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) set.seed(10) rf2 <- ranger(Surv(time, status) ~ ., data.matrix(veteran), num.trees = 5) expect_equal(rf1$chf, rf2$chf) }) ranger/tests/testthat/test_predict.R0000755000176200001440000001205314027301520017344 0ustar liggesusers## Tests for predictions library(ranger) library(survival) context("ranger_pred") ## Tests test_that("predict returns good prediction", { rf <- ranger(Species ~ ., iris, write.forest = TRUE) pred <- predict(rf, iris) expect_gt(mean(iris$Species == predictions(pred)), 0.9) }) test_that("case weights work", { expect_silent(ranger(Species ~ ., iris, num.trees = 5, case.weights = rep(1, nrow(iris)))) ## Should only predict setosa now weights <- c(rep(1, 50), rep(0, 100)) rf <- ranger(Species ~ ., iris, num.trees = 5, case.weights = weights, write.forest = TRUE) pred <- predict(rf, iris)$predictions expect_true(all(pred == "setosa")) }) test_that("Prediction works correctly if dependent variable is not first or last", { dat <- iris[, c(1:2, 5, 3:4)] rf <- ranger(Species ~ ., dat, num.trees = 5, write.forest = TRUE) expect_gte(mean(predictions(predict(rf, dat)) == dat$Species), 0.9) ## No response column expect_gte(mean(predictions(predict(rf, dat[, -3])) == dat$Species), 0.9) }) test_that("Prediction works correctly if dependent variable is not first or last, alternative interface", { dat <- iris[, c(1:2, 5, 3:4)] rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 5, write.forest = TRUE) expect_gte(mean(predictions(predict(rf, dat)) == dat$Species), 0.9) ## No response column expect_gte(mean(predictions(predict(rf, dat[, -3])) == dat$Species), 0.9) }) test_that("Missing value columns detected in predict", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) dat <- iris dat[4, 4] <- NA dat[25, 1] <- NA expect_error(predict(rf, dat), "Missing data in columns: Sepal.Length, Petal.Width.") }) test_that("If num.trees set, these number is used for predictions", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, predict.all = TRUE, num.trees = 3) expect_equal(pred$num.trees, 3) expect_equal(dim(pred$predictions), c(nrow(iris), 3)) }) test_that("If num.trees not set, all trees are used for prediction", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, predict.all = TRUE) expect_equal(pred$num.trees, 5) expect_equal(dim(pred$predictions), c(nrow(iris), 5)) }) test_that("Error if unknown value for type", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) expect_error(predict(rf, iris, type = "class")) }) test_that("Terminal nodes 
returned by predict are node ids, classification", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, type = "terminalNodes") expect_equal(dim(pred$predictions), c(nrow(iris), rf$num.trees)) expect_true(all(pred$predictions > 0)) expect_true(all(pred$predictions < max(sapply(rf$forest$split.varIDs, length)))) }) test_that("Terminal nodes returned by predict are node ids, probability", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE, probability = TRUE) pred <- predict(rf, iris, type = "terminalNodes") expect_equal(dim(pred$predictions), c(nrow(iris), rf$num.trees)) expect_true(all(pred$predictions > 0)) expect_true(all(pred$predictions < max(sapply(rf$forest$split.varIDs, length)))) }) test_that("Terminal nodes returned by predict are node ids, regression", { rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris, type = "terminalNodes") expect_equal(dim(pred$predictions), c(nrow(iris), rf$num.trees)) expect_true(all(pred$predictions > 0)) expect_true(all(pred$predictions < max(sapply(rf$forest$split.varIDs, length)))) }) test_that("Terminal nodes returned by predict are node ids, survival", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, write.forest = TRUE) pred <- predict(rf, veteran, type = "terminalNodes") expect_equal(dim(pred$predictions), c(nrow(veteran), rf$num.trees)) expect_true(all(pred$predictions > 0)) expect_true(all(pred$predictions < max(sapply(rf$forest$split.varIDs, length)))) }) test_that("Same result with warning if getTerminalNodeIDs() used", { rf <- ranger(Species ~ ., iris, num.trees = 5) pred <- predict(rf, iris, type = "terminalNodes") expect_warning(expect_equal(getTerminalNodeIDs(rf, iris), pred$predictions)) }) test_that("predict.all works for single observation", { rf <- ranger(Species ~ ., iris, num.trees = 5, write.forest = TRUE) pred <- predict(rf, iris[1, ], predict.all = TRUE) expect_equal(dim(pred$predictions), c(1, rf$num.trees)) }) test_that("predict.all factor probabilities in correct order", { rf <- ranger(Species ~ ., iris[c(51:100, 101:150, 1:50), ], probability = TRUE, num.trees = 5) pred_all <- rowMeans(predict(rf, iris, predict.all = TRUE)$predictions[139,,]) pred_mean <- predict(rf, iris, predict.all = FALSE)$predictions[139,] expect_equal(pred_all, pred_mean, tolerance = .001) }) test_that("Warning if predicting with corrected impurity importance", { rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "impurity_corrected") expect_warning(predict(rf, iris)) }) ranger/tests/testthat/test_classweights.R0000755000176200001440000000356614027301517020431 0ustar liggesusers## Tests for class weights library(ranger) context("ranger_classweights") test_that("No error if class weights used", { expect_silent(ranger(Species ~ ., iris, num.trees = 5, class.weights = c(0.5, 1, 0.1))) }) test_that("Prediction accuracy for minority class increases with higher weight", { n <- 100 x <- rnorm(n) beta0 <- -3 beta <- 1 y <- factor(rbinom(n, 1, plogis(beta0 + beta * x))) dat <- data.frame(y = y, x) rf <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 50, class.weights = c(1, 1)) acc_major <- mean((rf$predictions == dat$y)[dat$y == "0"], na.rm = TRUE) acc_minor <- mean((rf$predictions == dat$y)[dat$y == "1"], na.rm = TRUE) rf <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 50, class.weights = c(0.01, 0.99)) acc_major_weighted <- mean((rf$predictions == dat$y)[dat$y == "0"], na.rm = TRUE) acc_minor_weighted <- 
mean((rf$predictions == dat$y)[dat$y == "1"], na.rm = TRUE) expect_gt(acc_minor_weighted, acc_minor) }) # test_that("Prediction error worse if class weights 0", { # rf <- ranger(Species ~ ., iris, num.trees = 5) # rf_weighted <- ranger(Species ~ ., iris, num.trees = 5, class.weights = c(1, 0, 0)) # expect_lt(rf$prediction.error, rf_weighted$prediction.error) # }) test_that("Error if class weights of wrong size", { expect_error(ranger(Species ~ ., iris, num.trees = 5, class.weights = c(0.5, 1)), "Error: Number of class weights not equal to number of classes.") }) test_that("Error if class weights NA", { expect_error(ranger(Species ~ ., iris, num.trees = 5, class.weights = c(0.5, 1, NA)), "missing value where TRUE/FALSE needed") }) test_that("Error if class weights not numeric", { expect_error(ranger(Species ~ ., iris, num.trees = 5, class.weights = c(0.5, 1, "a")), "Error: Invalid value for class.weights. Please give a vector of non-negative values.") }) ranger/tests/testthat/test_quantreg.R0000755000176200001440000000416614027301520017546 0ustar liggesuserslibrary(ranger) context("ranger_quantreg") rf.quant <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE, keep.inbag = TRUE, num.trees = 50) pred.quant <- predict(rf.quant, mtcars[27:32, ], type = "quantiles") test_that("Quantile prediction is of correct size", { expect_equal(dim(pred.quant$predictions), c(pred.quant$num.samples, 3)) }) test_that("Quantile OOB prediction is of correct size", { pred <- predict(rf.quant, data = NULL, type = "quantiles") expect_equal(dim(pred$predictions), c(rf.quant$num.samples, 3)) }) test_that("Lower quantile smaller central smaller upper", { expect_true(all(pred.quant$predictions[, 1] < pred.quant$predictions[, 2])) expect_true(all(pred.quant$predictions[, 2] < pred.quant$predictions[, 3])) }) test_that("Working for single quantile", { expect_silent(pred <- predict(rf.quant, data = mtcars[27:32, ], type = "quantiles", quantiles = .5)) expect_silent(pred <- predict(rf.quant, data = NULL, type = "quantiles", quantiles = .5)) }) test_that("Working for single new observation", { expect_silent(pred <- predict(rf.quant, mtcars[27, ], type = "quantiles")) }) test_that("Working for constant variables", { dat <- data.frame(x1 = 1, x2 = seq(1,10), y = seq(1,10)) rf <- ranger(y ~ ., dat, quantreg = TRUE) expect_silent(predict(rf, dat, type = "quantiles")) }) test_that("Error message if no regression forest", { rf <- ranger(Species ~ ., iris, num.trees = 5) expect_error(predict(rf, iris, type = "quantiles"), "Error\\: Quantile prediction implemented only for regression outcomes\\.") }) test_that("Error message if not grown with quantreg=TRUE", { rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = FALSE, num.trees = 50) expect_error(predict(rf, mtcars[27:32, ], type = "quantiles"), "Error\\: Set quantreg\\=TRUE in ranger\\(\\.\\.\\.\\) for quantile prediction\\.") }) test_that("User specified function works as expected", { pred_sample <- predict(rf.quant, mtcars[27:32, ], type = "quantiles", what = function(x) sample(x, 10, replace = TRUE)) expect_equal(dim(pred_sample$predictions), c(pred_sample$num.samples, 10)) }) ranger/tests/testthat/test_importance.R0000755000176200001440000001140114027301520020047 0ustar liggesusers## Tests for importance measures library(ranger) context("ranger_imp") set.seed(123) ## Classification rg.imp.class <- ranger(Species ~ ., data = iris, num.trees = 5, importance = "impurity") rg.perm.class <- ranger(Species ~ ., data = iris, num.trees = 5, importance = "permutation") rg.scale.perm.class 
<- ranger(Species ~ ., data = iris, num.trees = 5, importance = "permutation", scale.permutation.importance = TRUE) ## Probability estimation rg.imp.prob <- ranger(Species ~ ., data = iris, num.trees = 5, importance = "impurity", probability = TRUE) rg.perm.prob <- ranger(Species ~ ., data = iris, num.trees = 5, importance = "permutation", probability = TRUE) rg.scale.perm.prob <- ranger(Species ~ ., data = iris, num.trees = 5, importance = "permutation", scale.permutation.importance = TRUE, probability = TRUE) ## Regression rg.imp.regr <- ranger(Sepal.Length ~ ., data = iris, num.trees = 5, importance = "impurity") rg.perm.regr <- ranger(Sepal.Length ~ ., data = iris, num.trees = 5, importance = "permutation") rg.scale.perm.regr <- ranger(Sepal.Length ~ ., data = iris, num.trees = 5, importance = "permutation", scale.permutation.importance = TRUE) ## Survival rg.perm.surv <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 5, importance = "permutation") rg.scale.perm.surv <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 5, importance = "permutation", scale.permutation.importance = TRUE) ## Tests test_that("importance measures work, classification", { expect_is(rg.imp.class$variable.importance, "numeric") expect_is(rg.perm.class$variable.importance, "numeric") expect_is(rg.scale.perm.class$variable.importance, "numeric") }) test_that("importance measures work, probability", { expect_is(rg.imp.prob$variable.importance, "numeric") expect_is(rg.perm.prob$variable.importance, "numeric") expect_is(rg.scale.perm.prob$variable.importance, "numeric") }) test_that("importance measures work, regression", { expect_is(rg.imp.regr$variable.importance, "numeric") expect_is(rg.perm.regr$variable.importance, "numeric") expect_is(rg.scale.perm.regr$variable.importance, "numeric") }) test_that("importance measures work, survival", { expect_is(rg.perm.surv$variable.importance, "numeric") expect_is(rg.scale.perm.surv$variable.importance, "numeric") }) test_that("impurity importance is larger than 1", { expect_gt(rg.imp.class$variable.importance[1], 1) expect_gt(rg.imp.prob$variable.importance[1], 1) expect_gt(rg.imp.regr$variable.importance[1], 1) }) test_that("unscaled importance is smaller than 1", { expect_lt(rg.perm.class$variable.importance[1], 1) expect_lt(rg.perm.prob$variable.importance[1], 1) expect_lt(rg.perm.regr$variable.importance[1], 1) expect_lt(rg.perm.surv$variable.importance[3], 1) }) test_that("scaled importance is larger than unscaled importance", { expect_gt(abs(rg.scale.perm.class$variable.importance[1]), abs(rg.perm.class$variable.importance[1])) expect_gt(abs(rg.scale.perm.prob$variable.importance[1]), abs(rg.perm.prob$variable.importance[1])) expect_gt(abs(rg.scale.perm.regr$variable.importance[1]), abs(rg.perm.regr$variable.importance[1])) expect_gt(abs(rg.scale.perm.surv$variable.importance[1]), abs(rg.perm.surv$variable.importance[1])) }) test_that("error thrown if no importance in object", { rf <- ranger(Species ~ ., data = iris, num.trees = 5) expect_error(importance(rf), "No variable importance found. 
Please use 'importance' option when growing the forest.") }) test_that("Survival permutation importance is smaller than 1", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, importance = "permutation") expect_lt(rf$variable.importance[1], 1) }) test_that("Survival impurity importance is larger than 1", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, importance = "impurity") expect_gt(rf$variable.importance[1], 1) }) test_that("Survival corrected impurity importance is smaller than 1", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 20, importance = "impurity_corrected") expect_lt(min(abs(rf$variable.importance)), 1) }) test_that("Gini importance non-negative with class weights", { rf <- ranger(Species ~ ., data = iris, class.weights = c(.2, .3, .8), num.trees = 5, importance = "impurity") expect_true(all(rf$variable.importance >= 0)) rf <- ranger(Species ~ ., data = iris, class.weights = c(.2, .3, .8), num.trees = 5, importance = "impurity", probability = TRUE) expect_true(all(rf$variable.importance >= 0)) }) ranger/tests/testthat/test_char.R0000755000176200001440000000121614027301517016634 0ustar liggesusers## Tests for character data library(ranger) context("ranger_char") ## Initialize random forests dat <- iris dat$Test <- paste0("AA",as.character(1:nrow(dat))) ## Tests test_that("no warning or error if character vector in data", { expect_silent(rf <- ranger(Species ~ ., dat, num.trees = 5, write.forest = TRUE)) expect_silent(predict(rf, dat)) }) test_that("no warning or error if character vector in data, alternative interface", { expect_silent(rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 5, write.forest = TRUE)) expect_silent(predict(rf, dat)) }) ranger/tests/testthat/test_regularization.R0000755000176200001440000001662414027301520020761 0ustar liggesuserslibrary(ranger) library(survival) context("ranger_regularization") n <- 50 p <- 4 dat_reg <- data.frame(y = rnorm(n), x = replicate(p, runif(n))) dat_class <- data.frame(y = factor(rbinom(n, 1, .5)), x = replicate(p, runif(n))) dat_surv <- data.frame(time = runif(n), status = rbinom(n, 1, .5), x = replicate(p, runif(n))) get_num_splitvars <- function(rf) { all_splitvars <- do.call(c, lapply(1:rf$num.trees, function(t) { treeInfo(rf, t)[, "splitvarID"] })) length(unique(all_splitvars[!is.na(all_splitvars)])) } test_that("same results with 1 and p regularization coefficients, regression", { seed <- runif(1 , 0, .Machine$integer.max) set.seed(seed) rf1 <- ranger(y ~ ., dat_reg, num.trees = 5, num.threads = 1, regularization.factor = .1) set.seed(seed) rf2 <- ranger(y ~ ., dat_reg, num.trees = 5, num.threads = 1, regularization.factor = rep(.1, p)) expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("same results with 1 and p regularization coefficients, classification", { seed <- runif(1 , 0, .Machine$integer.max) set.seed(seed) rf1 <- ranger(y ~ ., dat_class, num.trees = 5, num.threads = 1, regularization.factor = .1) set.seed(seed) rf2 <- ranger(y ~ ., dat_class, num.trees = 5, num.threads = 1, regularization.factor = rep(.1, p)) expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("Error if maxstat splitrule and regularization", { expect_error(ranger(y ~ ., dat_reg, num.trees = 5, splitrule = "maxstat", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE), "Error: Regularization cannot be used with 'maxstat' splitrule\\.") }) # Regression test_that("Fewer variables used with regularization, regression", { 
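# Regularization multiplies the split gain of variables not yet used in the
# current tree by regularization.factor (and, with regularization.usedepth =
# TRUE, penalizes deeper splits more strongly), so a small factor such as
# .0001 should leave each tree reusing fewer distinct split variables.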
rf_noreg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, regression extratrees", { rf_noreg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, regression beta", { dat <- data.frame(y = rbinom(n, 1, .5), x = replicate(p, runif(n))) rf_noreg <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "beta") rf_reg <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "beta", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Classification test_that("Fewer variables used with regularization, classification", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, classification extratrees", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, classification hellinger", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "hellinger") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "hellinger", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Probability test_that("Fewer variables used with regularization, probability", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE) rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, probability extratrees", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, probability hellinger", { rf_noreg <- ranger(y ~ ., dat_class, 
num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "hellinger") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "hellinger", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Survival test_that("Fewer variables used with regularization, survival", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, survival extratrees", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, survival C", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "C") rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "C", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) ranger/tests/test_gwaa.rds0000755000176200001440000005362214027301517015373 0ustar liggesusers [binary RDS test data omitted]
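As a minimal usage sketch of the regularization feature exercised in test_regularization.R above (assuming only the built-in iris data; this snippet is illustrative and not part of the package sources), the effect can be reproduced by counting distinct split variables via treeInfo():

library(ranger)
# Unregularized forest for comparison.
rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, num.threads = 1)
# Strong gain penalization: the split gain of variables not yet used in a
# tree is multiplied by regularization.factor, so each tree tends to reuse
# a small set of variables.
rf_reg <- ranger(Sepal.Length ~ ., iris, num.trees = 5, num.threads = 1,
                 regularization.factor = 0.0001, regularization.usedepth = TRUE)
# Count distinct split variables in the first tree of each forest; the
# regularized tree is expected to use no more than the unregularized one.
length(unique(na.omit(treeInfo(rf, 1)$splitvarID)))
length(unique(na.omit(treeInfo(rf_reg, 1)$splitvarID)))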
ranger/src/ForestProbability.h /*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef FORESTPROBABILITY_H_ #define FORESTPROBABILITY_H_ #include <iostream> #include <vector> #include "globals.h" #include "Forest.h" #include "TreeProbability.h" namespace ranger { class ForestProbability: public Forest { public: ForestProbability() = default; ForestProbability(const ForestProbability&) = delete; ForestProbability& operator=(const ForestProbability&) = delete; virtual ~ForestProbability() override = default; void loadForest(size_t num_trees, std::vector<std::vector<std::vector<size_t>> >& forest_child_nodeIDs, std::vector<std::vector<size_t>>& forest_split_varIDs, std::vector<std::vector<double>>& forest_split_values, std::vector<double>& class_values, std::vector<std::vector<std::vector<double>>>& forest_terminal_class_counts, std::vector<bool>& is_ordered_variable); std::vector<std::vector<std::vector<double>>> getTerminalClassCounts() const; const std::vector<double>& getClassValues() const { return class_values; } void setClassWeights(std::vector<double>& class_weights) { this->class_weights = class_weights; } protected: void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; void computePredictionErrorInternal() override; void writeOutputInternal() override; void writeConfusionFile() override; void writePredictionFile() override; void saveToFileInternal(std::ofstream& outfile) override; void loadFromFileInternal(std::ifstream& infile) override; // Classes of the dependent variable and classIDs for responses std::vector<double> class_values; std::vector<uint> response_classIDs; std::vector<std::vector<size_t>> sampleIDs_per_class; // Splitting weights std::vector<double> class_weights; private: const std::vector<double>& getTreePrediction(size_t tree_idx, size_t sample_idx) const; size_t getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const; }; } // namespace ranger #endif /* FORESTPROBABILITY_H_ */ ranger/src/ForestRegression.h0000755000176200001440000000342014027301517015775 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license.
Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef FORESTREGRESSION_H_ #define FORESTREGRESSION_H_ #include <iostream> #include <vector> #include "globals.h" #include "Forest.h" namespace ranger { class ForestRegression: public Forest { public: ForestRegression() = default; ForestRegression(const ForestRegression&) = delete; ForestRegression& operator=(const ForestRegression&) = delete; virtual ~ForestRegression() override = default; void loadForest(size_t num_trees, std::vector<std::vector<std::vector<size_t>> >& forest_child_nodeIDs, std::vector<std::vector<size_t>>& forest_split_varIDs, std::vector<std::vector<double>>& forest_split_values, std::vector<bool>& is_ordered_variable); private: void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; void computePredictionErrorInternal() override; void writeOutputInternal() override; void writeConfusionFile() override; void writePredictionFile() override; void saveToFileInternal(std::ofstream& outfile) override; void loadFromFileInternal(std::ifstream& infile) override; private: double getTreePrediction(size_t tree_idx, size_t sample_idx) const; size_t getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const; }; } // namespace ranger #endif /* FORESTREGRESSION_H_ */ ranger/src/DataSparse.h0000755000176200001440000000440214027301517014522 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of Ranger. Ranger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Ranger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Ranger. If not, see <http://www.gnu.org/licenses/>. Written by: Marvin N.
Wright Institut für Medizinische Biometrie und Statistik Universität zu Lübeck Ratzeburger Allee 160 23562 Lübeck http://www.imbs-luebeck.de #-------------------------------------------------------------------------------*/ #ifndef DATASPARSE_H_ #define DATASPARSE_H_ #include <RcppEigen.h> #include "globals.h" #include "utility.h" #include "Data.h" namespace ranger { class DataSparse: public Data { public: DataSparse() = default; DataSparse(Eigen::SparseMatrix<double>& x, Rcpp::NumericMatrix& y, std::vector<std::string> variable_names, size_t num_rows, size_t num_cols); DataSparse(const DataSparse&) = delete; DataSparse& operator=(const DataSparse&) = delete; virtual ~DataSparse() override = default; double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } return x.coeff(row, col); } double get_y(size_t row, size_t col) const override { return y[col * num_rows + row]; } // #nocov start void reserveMemory(size_t y_cols) override { // Not needed } void set_x(size_t col, size_t row, double value, bool& error) override { x.coeffRef(row, col) = value; } void set_y(size_t col, size_t row, double value, bool& error) override { y[col * num_rows + row] = value; } // #nocov end private: Eigen::SparseMatrix<double> x; Rcpp::NumericMatrix y; }; } // namespace ranger #endif /* DATASPARSE_H_ */ ranger/src/Forest.h0000755000176200001440000002212414027301517013736 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/ #ifndef FOREST_H_ #define FOREST_H_ #include #include #include #include #include #ifndef OLD_WIN_R_BUILD #include #include #include #include #endif #include "globals.h" #include "Tree.h" #include "Data.h" namespace ranger { class Forest { public: Forest(); Forest(const Forest&) = delete; Forest& operator=(const Forest&) = delete; virtual ~Forest() = default; // Init from c++ main or Rcpp from R void initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, std::string split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void init(std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); virtual void initInternal() = 0; // Grow or predict void run(bool verbose, bool compute_oob_error); // Write results to output files void writeOutput(); virtual void writeOutputInternal() = 0; virtual void writeConfusionFile() = 0; virtual void writePredictionFile() = 0; void writeImportanceFile(); // Save forest to file void saveToFile(); virtual void saveToFileInternal(std::ofstream& outfile) = 0; std::vector>> getChildNodeIDs() { std::vector>> result; for (auto& tree : trees) { result.push_back(tree->getChildNodeIDs()); } return result; } std::vector> getSplitVarIDs() { std::vector> result; for (auto& tree : trees) { result.push_back(tree->getSplitVarIDs()); } return result; } std::vector> getSplitValues() { std::vector> result; for (auto& tree : trees) { result.push_back(tree->getSplitValues()); } return result; } const std::vector& getVariableImportance() const { return variable_importance; } const std::vector& getVariableImportanceCasewise() 
const { return variable_importance_casewise; } double getOverallPredictionError() const { return overall_prediction_error; } const std::vector>>& getPredictions() const { return predictions; } size_t getNumTrees() const { return num_trees; } uint getMtry() const { return mtry; } uint getMinNodeSize() const { return min_node_size; } size_t getNumIndependentVariables() const { return num_independent_variables; } const std::vector& getIsOrderedVariable() const { return data->getIsOrderedVariable(); } std::vector> getInbagCounts() const { std::vector> result; for (auto& tree : trees) { result.push_back(tree->getInbagCounts()); } return result; } const std::vector>& getSnpOrder() const { return data->getSnpOrder(); } protected: void grow(); virtual void growInternal() = 0; // Predict using existing tree from file and data as prediction data void predict(); virtual void allocatePredictMemory() = 0; virtual void predictInternal(size_t sample_idx) = 0; void computePredictionError(); virtual void computePredictionErrorInternal() = 0; void computePermutationImportance(); // Multithreading methods for growing/prediction/importance, called by each thread void growTreesInThread(uint thread_idx, std::vector* variable_importance); void predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction); void predictInternalInThread(uint thread_idx); void computeTreePermutationImportanceInThread(uint thread_idx, std::vector& importance, std::vector& variance, std::vector& importance_casewise); // Load forest from file void loadFromFile(std::string filename); virtual void loadFromFileInternal(std::ifstream& infile) = 0; void loadDependentVariableNamesFromFile(std::string filename); // Load data from file std::unique_ptr loadDataFromFile(const std::string& data_path); // Set split select weights and variables to be always considered for splitting void setSplitWeightVector(std::vector>& split_select_weights); void setAlwaysSplitVariables(const std::vector& always_split_variable_names); // Show progress every few seconds #ifdef OLD_WIN_R_BUILD void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); #else void showProgress(std::string operation, size_t max_progress); #endif // Verbose output stream, cout if verbose==true, logfile if not std::ostream* verbose_out; std::vector dependent_variable_names; // time,status for survival size_t num_trees; uint mtry; uint min_node_size; size_t num_independent_variables; uint seed; size_t num_samples; bool prediction_mode; MemoryMode memory_mode; bool sample_with_replacement; bool memory_saving_splitting; SplitRule splitrule; bool predict_all; bool keep_inbag; std::vector sample_fraction; bool holdout; PredictionType prediction_type; uint num_random_splits; uint max_depth; // MAXSTAT splitrule double alpha; double minprop; // Multithreading uint num_threads; std::vector thread_ranges; #ifndef OLD_WIN_R_BUILD std::mutex mutex; std::condition_variable condition_variable; #endif std::vector> trees; std::unique_ptr data; std::vector>> predictions; double overall_prediction_error; // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable // Deterministic variables are always selected std::vector deterministic_varIDs; std::vector> split_select_weights; // Bootstrap weights std::vector case_weights; // Pre-selected bootstrap samples (per tree) std::vector> manual_inbag; // Random number generator std::mt19937_64 random_number_generator; std::string 
output_prefix; ImportanceMode importance_mode; // Regularization std::vector regularization_factor; bool regularization_usedepth; std::vector split_varIDs_used; // Variable importance for all variables in forest std::vector variable_importance; // Casewise variable importance for all variables in forest std::vector variable_importance_casewise; // Computation progress (finished trees) size_t progress; #ifdef R_BUILD size_t aborted_threads; bool aborted; #endif }; } // namespace ranger #endif /* FOREST_H_ */ ranger/src/utility.cpp0000755000176200001440000004733114027301517014541 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #include #include #include #include #include #include #include #include #include #include #include #include "utility.h" #include "globals.h" #include "Data.h" namespace ranger { void equalSplit(std::vector& result, uint start, uint end, uint num_parts) { result.reserve(num_parts + 1); // Return range if only 1 part if (num_parts == 1) { result.push_back(start); result.push_back(end + 1); return; } // Return vector from start to end+1 if more parts than elements if (num_parts > end - start + 1) { for (uint i = start; i <= end + 1; ++i) { result.push_back(i); } return; } uint length = (end - start + 1); uint part_length_short = length / num_parts; uint part_length_long = (uint) ceil(length / ((double) num_parts)); uint cut_pos = length % num_parts; // Add long ranges for (uint i = start; i < start + cut_pos * part_length_long; i = i + part_length_long) { result.push_back(i); } // Add short ranges for (uint i = start + cut_pos * part_length_long; i <= end + 1; i = i + part_length_short) { result.push_back(i); } } void loadDoubleVectorFromFile(std::vector& result, std::string filename) { // #nocov start // Open input file std::ifstream input_file; input_file.open(filename); if (!input_file.good()) { throw std::runtime_error("Could not open file: " + filename); } // Read the first line, ignore the rest std::string line; getline(input_file, line); std::stringstream line_stream(line); double token; while (line_stream >> token) { result.push_back(token); } } // #nocov end void drawWithoutReplacement(std::vector& result, std::mt19937_64& random_number_generator, size_t max, size_t num_samples) { if (num_samples < max / 10) { drawWithoutReplacementSimple(result, random_number_generator, max, num_samples); } else { //drawWithoutReplacementKnuth(result, random_number_generator, max, skip, num_samples); drawWithoutReplacementFisherYates(result, random_number_generator, max, num_samples); } } void drawWithoutReplacementSkip(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { if (num_samples < max / 10) { drawWithoutReplacementSimple(result, random_number_generator, max, skip, num_samples); } else { //drawWithoutReplacementKnuth(result, random_number_generator, max, skip, num_samples); drawWithoutReplacementFisherYates(result, random_number_generator, max, skip, num_samples); } } void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& random_number_generator, size_t max, size_t num_samples) { 
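// Rejection sampling: draw uniformly from [0, max-1] and redraw on collisions, marking already-drawn indices in a temporary flag vector. drawWithoutReplacement() above selects this variant only when num_samples < max / 10, where collisions are rare; otherwise the Fisher-Yates variant below is used.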
result.reserve(num_samples); // Set all to not selected std::vector temp; temp.resize(max, false); std::uniform_int_distribution unif_dist(0, max - 1); for (size_t i = 0; i < num_samples; ++i) { size_t draw; do { draw = unif_dist(random_number_generator); } while (temp[draw]); temp[draw] = true; result.push_back(draw); } } void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { result.reserve(num_samples); // Set all to not selected std::vector temp; temp.resize(max, false); std::uniform_int_distribution unif_dist(0, max - 1 - skip.size()); for (size_t i = 0; i < num_samples; ++i) { size_t draw; do { draw = unif_dist(random_number_generator); for (auto& skip_value : skip) { if (draw >= skip_value) { ++draw; } } } while (temp[draw]); temp[draw] = true; result.push_back(draw); } } void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937_64& random_number_generator, size_t max, size_t num_samples) { // Create indices result.resize(max); std::iota(result.begin(), result.end(), 0); // Draw without replacement using Fisher Yates algorithm std::uniform_real_distribution distribution(0.0, 1.0); for (size_t i = 0; i < num_samples; ++i) { size_t j = i + distribution(random_number_generator) * (max - i); std::swap(result[i], result[j]); } result.resize(num_samples); } void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { // Create indices result.resize(max); std::iota(result.begin(), result.end(), 0); // Skip indices for (size_t i = 0; i < skip.size(); ++i) { result.erase(result.begin() + skip[skip.size() - 1 - i]); } // Draw without replacement using Fisher Yates algorithm std::uniform_real_distribution distribution(0.0, 1.0); for (size_t i = 0; i < num_samples; ++i) { size_t j = i + distribution(random_number_generator) * (max - skip.size() - i); std::swap(result[i], result[j]); } result.resize(num_samples); } void drawWithoutReplacementWeighted(std::vector& result, std::mt19937_64& random_number_generator, size_t max_index, size_t num_samples, const std::vector& weights) { result.reserve(num_samples); // Set all to not selected std::vector temp; temp.resize(max_index + 1, false); std::discrete_distribution<> weighted_dist(weights.begin(), weights.end()); for (size_t i = 0; i < num_samples; ++i) { size_t draw; do { draw = weighted_dist(random_number_generator); } while (temp[draw]); temp[draw] = true; result.push_back(draw); } } double mostFrequentValue(const std::unordered_map& class_count, std::mt19937_64 random_number_generator) { std::vector major_classes; // Find maximum count size_t max_count = 0; for (auto& class_value : class_count) { if (class_value.second > max_count) { max_count = class_value.second; major_classes.clear(); major_classes.push_back(class_value.first); } else if (class_value.second == max_count) { major_classes.push_back(class_value.first); } } if (major_classes.size() == 1) { return major_classes[0]; } else { // Choose randomly std::uniform_int_distribution unif_dist(0, major_classes.size() - 1); return major_classes[unif_dist(random_number_generator)]; } } double computeConcordanceIndex(const Data& data, const std::vector& sum_chf, const std::vector& sample_IDs, std::vector* prediction_error_casewise) { // Compute concordance index double concordance = 0; double permissible = 0; std::vector concordance_casewise; std::vector permissible_casewise; if 
(prediction_error_casewise) { concordance_casewise.resize(prediction_error_casewise->size(), 0); permissible_casewise.resize(prediction_error_casewise->size(), 0); } for (size_t i = 0; i < sum_chf.size(); ++i) { size_t sample_i = i; if (!sample_IDs.empty()) { sample_i = sample_IDs[i]; } double time_i = data.get_y(sample_i, 0); double status_i = data.get_y(sample_i, 1); double conc, perm; if (prediction_error_casewise) { conc = concordance_casewise[i]; perm = permissible_casewise[i]; } else { conc = 0; perm = 0; } for (size_t j = i + 1; j < sum_chf.size(); ++j) { size_t sample_j = j; if (!sample_IDs.empty()) { sample_j = sample_IDs[j]; } double time_j = data.get_y(sample_j, 0); double status_j = data.get_y(sample_j, 1); if (time_i < time_j && status_i == 0) { continue; } if (time_j < time_i && status_j == 0) { continue; } if (time_i == time_j && status_i == status_j) { continue; } double co; if (time_i < time_j && sum_chf[i] > sum_chf[j]) { co = 1; } else if (time_j < time_i && sum_chf[j] > sum_chf[i]) { co = 1; } else if (sum_chf[i] == sum_chf[j]) { co = 0.5; } else { co = 0; } conc += co; perm += 1; if (prediction_error_casewise) { concordance_casewise[j] += co; permissible_casewise[j] += 1; } } concordance += conc; permissible += perm; if (prediction_error_casewise) { concordance_casewise[i] = conc; permissible_casewise[i] = perm; } } if (prediction_error_casewise) { for (size_t i = 0; i < prediction_error_casewise->size(); ++i) { (*prediction_error_casewise)[i] = 1 - concordance_casewise[i] / permissible_casewise[i]; } } return (concordance / permissible); } std::string uintToString(uint number) { #if WIN_R_BUILD == 1 std::stringstream temp; temp << number; return temp.str(); #else return std::to_string(number); #endif } std::string beautifyTime(uint seconds) { // #nocov start std::string result; // Add seconds, minutes, hours, days if larger than zero uint out_seconds = (uint) seconds % 60; result = uintToString(out_seconds) + " seconds"; uint out_minutes = (seconds / 60) % 60; if (seconds / 60 == 0) { return result; } else if (out_minutes == 1) { result = "1 minute, " + result; } else { result = uintToString(out_minutes) + " minutes, " + result; } uint out_hours = (seconds / 3600) % 24; if (seconds / 3600 == 0) { return result; } else if (out_hours == 1) { result = "1 hour, " + result; } else { result = uintToString(out_hours) + " hours, " + result; } uint out_days = (seconds / 86400); if (out_days == 0) { return result; } else if (out_days == 1) { result = "1 day, " + result; } else { result = uintToString(out_days) + " days, " + result; } return result; } // #nocov end // #nocov start size_t roundToNextMultiple(size_t value, uint multiple) { if (multiple == 0) { return value; } size_t remainder = value % multiple; if (remainder == 0) { return value; } return value + multiple - remainder; } // #nocov end void splitString(std::vector& result, const std::string& input, char split_char) { // #nocov start std::istringstream ss(input); std::string token; while (std::getline(ss, token, split_char)) { result.push_back(token); } } // #nocov end void splitString(std::vector& result, const std::string& input, char split_char) { // #nocov start std::istringstream ss(input); std::string token; while (std::getline(ss, token, split_char)) { result.push_back(std::stod(token)); } } // #nocov end void shuffleAndSplit(std::vector& first_part, std::vector& second_part, size_t n_all, size_t n_first, std::mt19937_64 random_number_generator) { // Reserve space first_part.resize(n_all); // Fill with 
0..n_all-1 and shuffle std::iota(first_part.begin(), first_part.end(), 0); std::shuffle(first_part.begin(), first_part.end(), random_number_generator); // Copy to second part second_part.resize(n_all - n_first); std::copy(first_part.begin() + n_first, first_part.end(), second_part.begin()); // Resize first part first_part.resize(n_first); } void shuffleAndSplitAppend(std::vector& first_part, std::vector& second_part, size_t n_all, size_t n_first, const std::vector& mapping, std::mt19937_64 random_number_generator) { // Old end is start position for new data size_t first_old_size = first_part.size(); size_t second_old_size = second_part.size(); // Reserve space first_part.resize(first_old_size + n_all); std::vector::iterator first_start_pos = first_part.begin() + first_old_size; // Fill with 0..n_all-1 and shuffle std::iota(first_start_pos, first_part.end(), 0); std::shuffle(first_start_pos, first_part.end(), random_number_generator); // Mapping for (std::vector::iterator j = first_start_pos; j != first_part.end(); ++j) { *j = mapping[*j]; } // Copy to second part second_part.resize(second_part.size() + n_all - n_first); std::vector::iterator second_start_pos = second_part.begin() + second_old_size; std::copy(first_start_pos + n_first, first_part.end(), second_start_pos); // Resize first part first_part.resize(first_old_size + n_first); } std::string checkUnorderedVariables(const Data& data, const std::vector& unordered_variable_names) { // #nocov start size_t num_rows = data.getNumRows(); std::vector sampleIDs(num_rows); std::iota(sampleIDs.begin(), sampleIDs.end(), 0); // Check for all unordered variables for (auto& variable_name : unordered_variable_names) { size_t varID = data.getVariableID(variable_name); std::vector all_values; data.getAllValues(all_values, sampleIDs, varID, 0, sampleIDs.size()); // Check level count size_t max_level_count = 8 * sizeof(size_t) - 1; if (all_values.size() > max_level_count) { return "Too many levels in unordered categorical variable " + variable_name + ". 
Only " + uintToString(max_level_count) + " levels allowed on this system."; } // Check positive integers if (!checkPositiveIntegers(all_values)) { return "Not all values in unordered categorical variable " + variable_name + " are positive integers."; } } return ""; } // #nocov end bool checkPositiveIntegers(const std::vector& all_values) { // #nocov start for (auto& value : all_values) { if (value < 1 || !(floor(value) == value)) { return false; } } return true; } // #nocov end double maxstatPValueLau92(double b, double minprop, double maxprop) { if (b < 1) { return 1.0; } // Compute only once (minprop/maxprop don't change during runtime) static double logprop = log((maxprop * (1 - minprop)) / ((1 - maxprop) * minprop)); double db = dstdnorm(b); double p = 4 * db / b + db * (b - 1 / b) * logprop; if (p > 0) { return p; } else { return 0; } } double maxstatPValueLau94(double b, double minprop, double maxprop, size_t N, const std::vector& m) { double D = 0; for (size_t i = 0; i < m.size() - 1; ++i) { double m1 = m[i]; double m2 = m[i + 1]; double t = sqrt(1.0 - m1 * (N - m2) / ((N - m1) * m2)); D += 1 / M_PI * exp(-b * b / 2) * (t - (b * b / 4 - 1) * (t * t * t) / 6); } return 2 * (1 - pstdnorm(b)) + D; } double maxstatPValueUnadjusted(double b) { return 2 * pstdnorm(-b); } double dstdnorm(double x) { return exp(-0.5 * x * x) / sqrt(2 * M_PI); } double pstdnorm(double x) { return 0.5 * (1 + erf(x / sqrt(2.0))); } std::vector adjustPvalues(std::vector& unadjusted_pvalues) { size_t num_pvalues = unadjusted_pvalues.size(); std::vector adjusted_pvalues(num_pvalues, 0); // Get order of p-values std::vector indices = order(unadjusted_pvalues, true); // Compute adjusted p-values adjusted_pvalues[indices[0]] = unadjusted_pvalues[indices[0]]; for (size_t i = 1; i < indices.size(); ++i) { size_t idx = indices[i]; size_t idx_last = indices[i - 1]; adjusted_pvalues[idx] = std::min(adjusted_pvalues[idx_last], (double) num_pvalues / (double) (num_pvalues - i) * unadjusted_pvalues[idx]); } return adjusted_pvalues; } std::vector logrankScores(const std::vector& time, const std::vector& status) { size_t n = time.size(); std::vector scores(n); // Get order of timepoints std::vector indices = order(time, false); // Compute scores double cumsum = 0; size_t last_unique = -1; for (size_t i = 0; i < n; ++i) { // Continue if next value is the same if (i < n - 1 && time[indices[i]] == time[indices[i + 1]]) { continue; } // Compute sum and scores for all non-unique values in a row for (size_t j = last_unique + 1; j <= i; ++j) { cumsum += status[indices[j]] / (n - i); } for (size_t j = last_unique + 1; j <= i; ++j) { scores[indices[j]] = status[indices[j]] - cumsum; } // Save last computed value last_unique = i; } return scores; } void maxstat(const std::vector& scores, const std::vector& x, const std::vector& indices, double& best_maxstat, double& best_split_value, double minprop, double maxprop) { size_t n = x.size(); double sum_all_scores = 0; for (size_t i = 0; i < n; ++i) { sum_all_scores += scores[indices[i]]; } // Compute sum of differences from mean for variance double mean_scores = sum_all_scores / n; double sum_mean_diff = 0; for (size_t i = 0; i < n; ++i) { sum_mean_diff += (scores[i] - mean_scores) * (scores[i] - mean_scores); } // Get smallest and largest split to consider, -1 for compatibility with R maxstat size_t minsplit = 0; if (n * minprop > 1) { minsplit = n * minprop - 1; } size_t maxsplit = n * maxprop - 1; // For all unique x-values best_maxstat = -1; best_split_value = -1; double sum_scores = 
0; size_t n_left = 0; for (size_t i = 0; i <= maxsplit; ++i) { sum_scores += scores[indices[i]]; n_left++; // Dont consider splits smaller than minsplit for splitting (but count) if (i < minsplit) { continue; } // Consider only unique values if (i < n - 1 && x[indices[i]] == x[indices[i + 1]]) { continue; } // If value is largest possible value, stop if (x[indices[i]] == x[indices[n - 1]]) { break; } double S = sum_scores; double E = (double) n_left / (double) n * sum_all_scores; double V = (double) n_left * (double) (n - n_left) / (double) (n * (n - 1)) * sum_mean_diff; double T = fabs((S - E) / sqrt(V)); if (T > best_maxstat) { best_maxstat = T; // Use mid-point split if possible if (i < n - 1) { best_split_value = (x[indices[i]] + x[indices[i + 1]]) / 2; } else { best_split_value = x[indices[i]]; } } } } std::vector numSamplesLeftOfCutpoint(std::vector& x, const std::vector& indices) { std::vector num_samples_left; num_samples_left.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { if (i == 0) { num_samples_left.push_back(1); } else if (x[indices[i]] == x[indices[i - 1]]) { ++num_samples_left[num_samples_left.size() - 1]; } else { num_samples_left.push_back(num_samples_left[num_samples_left.size() - 1] + 1); } } return num_samples_left; } // #nocov start std::stringstream& readFromStream(std::stringstream& in, double& token) { if (!(in >> token) && (std::fpclassify(token) == FP_SUBNORMAL)) { in.clear(); } return in; } // #nocov end double betaLogLik(double y, double mean, double phi) { // Avoid 0 and 1 if (y < std::numeric_limits::epsilon()) { y = std::numeric_limits::epsilon(); } else if (y >= 1) { y = 1 - std::numeric_limits::epsilon(); } if (mean < std::numeric_limits::epsilon()) { mean = std::numeric_limits::epsilon(); } else if (mean >= 1) { mean = 1 - std::numeric_limits::epsilon(); } if (phi < std::numeric_limits::epsilon()) { phi = std::numeric_limits::epsilon(); } else if (mean >= 1) { phi = 1 - std::numeric_limits::epsilon(); } return (lgamma(phi) - lgamma(mean * phi) - lgamma((1 - mean) * phi) + (mean * phi - 1) * log(y) + ((1 - mean) * phi - 1) * log(1 - y)); } } // namespace ranger ranger/src/Forest.cpp0000755000176200001440000010747514043725515014314 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #include #include #include #include #include #include #ifndef OLD_WIN_R_BUILD #include #include #endif #include "utility.h" #include "Forest.h" #include "DataChar.h" #include "DataDouble.h" #include "DataFloat.h" namespace ranger { Forest::Forest() : verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples( 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error( NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) { } // #nocov start void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, std::string split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { this->memory_mode = memory_mode; this->verbose_out = verbose_out; if (!dependent_variable_name.empty()) { if (status_variable_name.empty()) { this->dependent_variable_names = {dependent_variable_name}; } else { this->dependent_variable_names = {dependent_variable_name, status_variable_name}; } } // Set prediction mode bool prediction_mode = false; if (!load_forest_filename.empty()) { prediction_mode = true; } // Sample fraction default and convert to vector if (sample_fraction == 0) { if (sample_with_replacement) { sample_fraction = DEFAULT_SAMPLE_FRACTION_REPLACE; } else { sample_fraction = DEFAULT_SAMPLE_FRACTION_NOREPLACE; } } std::vector sample_fraction_vector = { sample_fraction }; if (prediction_mode) { loadDependentVariableNamesFromFile(load_forest_filename); } // Call other init function init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth); if (prediction_mode) { loadFromFile(load_forest_filename); } // Set variables to be always considered for splitting if (!always_split_variable_names.empty()) { setAlwaysSplitVariables(always_split_variable_names); } // TODO: Read 2d weights for tree-wise split select weights // Load split select weights from file if (!split_select_weights_file.empty()) { std::vector> split_select_weights; split_select_weights.resize(1); loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); if 
(split_select_weights[0].size() != num_independent_variables) { throw std::runtime_error("Number of split select weights is not equal to number of independent variables."); } setSplitWeightVector(split_select_weights); } // Load case weights from file if (!case_weights_file.empty()) { loadDoubleVectorFromFile(case_weights, case_weights_file); if (case_weights.size() != num_samples) { throw std::runtime_error("Number of case weights is not equal to number of samples."); } } // Sample from non-zero weights in holdout mode if (holdout && !case_weights.empty()) { size_t nonzero_weights = 0; for (auto& weight : case_weights) { if (weight > 0) { ++nonzero_weights; } } this->sample_fraction[0] = this->sample_fraction[0] * ((double) nonzero_weights / (double) num_samples); } // Check if all catvars are coded in integers starting at 1 if (!unordered_variable_names.empty()) { std::string error_message = checkUnorderedVariables(*data, unordered_variable_names); if (!error_message.empty()) { throw std::runtime_error(error_message); } } } // #nocov end void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { this->verbose_out = verbose_out; // Call other init function init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); // Set variables to be always considered for splitting if (!always_split_variable_names.empty()) { setAlwaysSplitVariables(always_split_variable_names); } // Set split select weights if (!split_select_weights.empty()) { setSplitWeightVector(split_select_weights); } // Set case weights if (!case_weights.empty()) { if (case_weights.size() != num_samples) { throw std::runtime_error("Number of case weights not equal to number of samples."); } this->case_weights = case_weights; } // Set manual inbag if (!manual_inbag.empty()) { this->manual_inbag = manual_inbag; } // Keep inbag counts this->keep_inbag = keep_inbag; } void Forest::init(std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { // Initialize data with memmode this->data = std::move(input_data); // Initialize random number generator and set seed if 
(seed == 0) { std::random_device random_device; random_number_generator.seed(random_device()); } else { random_number_generator.seed(seed); } // Set number of threads if (num_threads == DEFAULT_NUM_THREADS) { #ifdef OLD_WIN_R_BUILD this->num_threads = 1; #else this->num_threads = std::thread::hardware_concurrency(); #endif } else { this->num_threads = num_threads; } // Set member variables this->num_trees = num_trees; this->mtry = mtry; this->seed = seed; this->output_prefix = output_prefix; this->importance_mode = importance_mode; this->min_node_size = min_node_size; this->prediction_mode = prediction_mode; this->sample_with_replacement = sample_with_replacement; this->memory_saving_splitting = memory_saving_splitting; this->splitrule = splitrule; this->predict_all = predict_all; this->sample_fraction = sample_fraction; this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; this->prediction_type = prediction_type; this->num_random_splits = num_random_splits; this->max_depth = max_depth; this->regularization_factor = regularization_factor; this->regularization_usedepth = regularization_usedepth; // Set number of samples and variables num_samples = data->getNumRows(); num_independent_variables = data->getNumCols(); // Set unordered factor variables if (!prediction_mode) { data->setIsOrderedVariable(unordered_variable_names); } initInternal(); // Init split select weights split_select_weights.push_back(std::vector()); // Init manual inbag manual_inbag.push_back(std::vector()); // Check if mtry is in valid range if (this->mtry > num_independent_variables) { throw std::runtime_error("mtry can not be larger than number of variables in data."); } // Check if any observations samples if ((size_t) num_samples * sample_fraction[0] < 1) { throw std::runtime_error("sample_fraction too small, no observations sampled."); } // Permute samples for corrected Gini importance if (importance_mode == IMP_GINI_CORRECTED) { data->permuteSampleIDs(random_number_generator); } // Order SNP levels if in "order" splitting if (!prediction_mode && order_snps) { data->orderSnpLevels((importance_mode == IMP_GINI_CORRECTED)); } // Regularization if (regularization_factor.size() > 0) { if (regularization_factor.size() == 1 && num_independent_variables > 1) { double single_regularization_factor = regularization_factor[0]; this->regularization_factor.resize(num_independent_variables, single_regularization_factor); } else if (regularization_factor.size() != num_independent_variables) { throw std::runtime_error("Use 1 or p (the number of predictor variables) regularization factors."); } // Set all variables to not used split_varIDs_used.resize(num_independent_variables, false); } } void Forest::run(bool verbose, bool compute_oob_error) { if (prediction_mode) { if (verbose && verbose_out) { *verbose_out << "Predicting .." << std::endl; } predict(); } else { if (verbose && verbose_out) { *verbose_out << "Growing trees .." << std::endl; } grow(); if (verbose && verbose_out) { *verbose_out << "Computing prediction error .." << std::endl; } if (compute_oob_error) { computePredictionError(); } if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW || importance_mode == IMP_PERM_RAW || importance_mode == IMP_PERM_CASEWISE) { if (verbose && verbose_out) { *verbose_out << "Computing permutation variable importance .." 
<< std::endl;
      }
      computePermutationImportance();
    }
  }
}

// #nocov start
void Forest::writeOutput() {
  if (verbose_out)
    *verbose_out << std::endl;
  writeOutputInternal();

  if (verbose_out) {
    if (dependent_variable_names.size() >= 1) {
      *verbose_out << "Dependent variable name: " << dependent_variable_names[0] << std::endl;
    }
    *verbose_out << "Number of trees: " << num_trees << std::endl;
    *verbose_out << "Sample size: " << num_samples << std::endl;
    *verbose_out << "Number of independent variables: " << num_independent_variables << std::endl;
    *verbose_out << "Mtry: " << mtry << std::endl;
    *verbose_out << "Target node size: " << min_node_size << std::endl;
    *verbose_out << "Variable importance mode: " << importance_mode << std::endl;
    *verbose_out << "Memory mode: " << memory_mode << std::endl;
    *verbose_out << "Seed: " << seed << std::endl;
    *verbose_out << "Number of threads: " << num_threads << std::endl;
    *verbose_out << std::endl;
  }

  if (prediction_mode) {
    writePredictionFile();
  } else {
    if (verbose_out) {
      *verbose_out << "Overall OOB prediction error: " << overall_prediction_error << std::endl;
      *verbose_out << std::endl;
    }

    if (!split_select_weights.empty() && !split_select_weights[0].empty()) {
      if (verbose_out) {
        *verbose_out
            << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights."
            << std::endl;
      }
    }

    if (importance_mode != IMP_NONE) {
      writeImportanceFile();
    }

    writeConfusionFile();
  }
}

void Forest::writeImportanceFile() {
  // Open importance file for writing
  std::string filename = output_prefix + ".importance";
  std::ofstream importance_file;
  importance_file.open(filename, std::ios::out);
  if (!importance_file.good()) {
    throw std::runtime_error("Could not write to importance file: " + filename + ".");
  }

  if (importance_mode == IMP_PERM_CASEWISE) {
    // Write variable names
    for (auto& variable_name : data->getVariableNames()) {
      importance_file << variable_name << " ";
    }
    importance_file << std::endl;

    // Write importance values
    for (size_t i = 0; i < num_samples; ++i) {
      for (size_t j = 0; j < num_independent_variables; ++j) {
        if (variable_importance_casewise.size() <= (j * num_samples + i)) {
          throw std::runtime_error("Memory error in local variable importance.");
        }
        importance_file << variable_importance_casewise[j * num_samples + i] << " ";
      }
      importance_file << std::endl;
    }
  } else {
    // Write importance to file
    for (size_t i = 0; i < variable_importance.size(); ++i) {
      std::string variable_name = data->getVariableNames()[i];
      importance_file << variable_name << ": " << variable_importance[i] << std::endl;
    }
  }

  importance_file.close();
  if (verbose_out)
    *verbose_out << "Saved variable importance to file " << filename << "."
<< std::endl; } void Forest::saveToFile() { // Open file for writing std::string filename = output_prefix + ".forest"; std::ofstream outfile; outfile.open(filename, std::ios::binary); if (!outfile.good()) { throw std::runtime_error("Could not write to output file: " + filename + "."); } // Write dependent variable names uint num_dependent_variables = dependent_variable_names.size(); if (num_dependent_variables >= 1) { outfile.write((char*) &num_dependent_variables, sizeof(num_dependent_variables)); for (auto& var_name : dependent_variable_names) { size_t length = var_name.size(); outfile.write((char*) &length, sizeof(length)); outfile.write((char*) var_name.c_str(), length * sizeof(char)); } } else { throw std::runtime_error("Missing dependent variable name."); } // Write num_trees outfile.write((char*) &num_trees, sizeof(num_trees)); // Write is_ordered_variable saveVector1D(data->getIsOrderedVariable(), outfile); saveToFileInternal(outfile); // Write tree data for each tree for (auto& tree : trees) { tree->appendToFile(outfile); } // Close file outfile.close(); if (verbose_out) *verbose_out << "Saved forest to file " << filename << "." << std::endl; } // #nocov end void Forest::grow() { // Create thread ranges equalSplit(thread_ranges, 0, num_trees - 1, num_threads); // Call special grow functions of subclasses. There trees must be created. growInternal(); // Init trees, create a seed for each tree, based on main seed std::uniform_int_distribution udist; for (size_t i = 0; i < num_trees; ++i) { uint tree_seed; if (seed == 0) { tree_seed = udist(random_number_generator); } else { tree_seed = (i + 1) * seed; } // Get split select weights for tree std::vector* tree_split_select_weights; if (split_select_weights.size() > 1) { tree_split_select_weights = &split_select_weights[i]; } else { tree_split_select_weights = &split_select_weights[0]; } // Get inbag counts for tree std::vector* tree_manual_inbag; if (manual_inbag.size() > 1) { tree_manual_inbag = &manual_inbag[i]; } else { tree_manual_inbag = &manual_inbag[0]; } trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, ®ularization_factor, regularization_usedepth, &split_varIDs_used); } // Init variable importance variable_importance.resize(num_independent_variables, 0); // Grow trees in multiple threads #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); for (size_t i = 0; i < num_trees; ++i) { trees[i]->grow(&variable_importance); progress++; showProgress("Growing trees..", start_time, lap_time); } // #nocov end #else progress = 0; #ifdef R_BUILD aborted = false; aborted_threads = 0; #endif std::vector threads; threads.reserve(num_threads); // Initialize importance per thread std::vector> variable_importance_threads(num_threads); for (uint i = 0; i < num_threads; ++i) { if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { variable_importance_threads[i].resize(num_independent_variables, 0); } threads.emplace_back(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i])); } showProgress("Growing trees..", num_trees); for (auto &thread : threads) { thread.join(); } #ifdef R_BUILD if (aborted_threads > 0) { throw std::runtime_error("User interrupt."); } #endif // Sum thread importances 
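// Each worker thread accumulated Gini importance into its own buffer
// (variable_importance_threads[i]) so tree growth needs no lock on the shared
// importance vector; the reduction below merges the buffers element-wise,
//   variable_importance[i] = sum over threads j of variable_importance_threads[j][i],
// and the division by num_trees further down turns that sum into a per-tree average.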
if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { variable_importance.resize(num_independent_variables, 0); for (size_t i = 0; i < num_independent_variables; ++i) { for (uint j = 0; j < num_threads; ++j) { variable_importance[i] += variable_importance_threads[j][i]; } } variable_importance_threads.clear(); } #endif // Divide importance by number of trees if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { for (auto& v : variable_importance) { v /= num_trees; } } } void Forest::predict() { // Predict trees in multiple threads and join the threads with the main thread #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); for (size_t i = 0; i < num_trees; ++i) { trees[i]->predict(data.get(), false); progress++; showProgress("Predicting..", start_time, lap_time); } // For all samples get tree predictions allocatePredictMemory(); for (size_t sample_idx = 0; sample_idx < data->getNumRows(); ++sample_idx) { predictInternal(sample_idx); } // #nocov end #else progress = 0; #ifdef R_BUILD aborted = false; aborted_threads = 0; #endif // Predict std::vector threads; threads.reserve(num_threads); for (uint i = 0; i < num_threads; ++i) { threads.emplace_back(&Forest::predictTreesInThread, this, i, data.get(), false); } showProgress("Predicting..", num_trees); for (auto &thread : threads) { thread.join(); } // Aggregate predictions allocatePredictMemory(); threads.clear(); threads.reserve(num_threads); progress = 0; for (uint i = 0; i < num_threads; ++i) { threads.emplace_back(&Forest::predictInternalInThread, this, i); } showProgress("Aggregating predictions..", num_samples); for (auto &thread : threads) { thread.join(); } #ifdef R_BUILD if (aborted_threads > 0) { throw std::runtime_error("User interrupt."); } #endif #endif } void Forest::computePredictionError() { // Predict trees in multiple threads #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); for (size_t i = 0; i < num_trees; ++i) { trees[i]->predict(data.get(), true); progress++; showProgress("Predicting..", start_time, lap_time); } // #nocov end #else std::vector threads; threads.reserve(num_threads); progress = 0; for (uint i = 0; i < num_threads; ++i) { threads.emplace_back(&Forest::predictTreesInThread, this, i, data.get(), true); } showProgress("Computing prediction error..", num_trees); for (auto &thread : threads) { thread.join(); } #ifdef R_BUILD if (aborted_threads > 0) { throw std::runtime_error("User interrupt."); } #endif #endif // Call special function for subclasses computePredictionErrorInternal(); } void Forest::computePermutationImportance() { // Compute tree permutation importance in multiple threads #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); // Initialize importance and variance variable_importance.resize(num_independent_variables, 0); std::vector variance; if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { variance.resize(num_independent_variables, 0); } if (importance_mode == IMP_PERM_CASEWISE) { variable_importance_casewise.resize(num_independent_variables * num_samples, 0); } // Compute importance for (size_t i = 0; i < num_trees; ++i) { trees[i]->computePermutationImportance(variable_importance, variance, variable_importance_casewise); progress++; showProgress("Computing permutation importance..", start_time, lap_time); } #else progress = 0; #ifdef R_BUILD 
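// Interrupt protocol for R builds: the main thread polls checkInterrupt() in
// showProgress() and sets 'aborted'; each worker re-checks the flag after every
// tree, increments 'aborted_threads' under the mutex and returns, so the main
// thread knows when all workers have wound down before it rethrows the
// interrupt as a runtime_error (see the thread functions below).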
aborted = false; aborted_threads = 0; #endif std::vector threads; threads.reserve(num_threads); // Initialize importance and variance std::vector> variable_importance_threads(num_threads); std::vector> variance_threads(num_threads); std::vector> variable_importance_casewise_threads(num_threads); // Compute importance for (uint i = 0; i < num_threads; ++i) { variable_importance_threads[i].resize(num_independent_variables, 0); if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { variance_threads[i].resize(num_independent_variables, 0); } if (importance_mode == IMP_PERM_CASEWISE) { variable_importance_casewise_threads[i].resize(num_independent_variables * num_samples, 0); } threads.emplace_back(&Forest::computeTreePermutationImportanceInThread, this, i, std::ref(variable_importance_threads[i]), std::ref(variance_threads[i]), std::ref(variable_importance_casewise_threads[i])); } showProgress("Computing permutation importance..", num_trees); for (auto &thread : threads) { thread.join(); } #ifdef R_BUILD if (aborted_threads > 0) { throw std::runtime_error("User interrupt."); } #endif // Sum thread importances variable_importance.resize(num_independent_variables, 0); for (size_t i = 0; i < num_independent_variables; ++i) { for (uint j = 0; j < num_threads; ++j) { variable_importance[i] += variable_importance_threads[j][i]; } } variable_importance_threads.clear(); // Sum thread variances std::vector variance(num_independent_variables, 0); if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { for (size_t i = 0; i < num_independent_variables; ++i) { for (uint j = 0; j < num_threads; ++j) { variance[i] += variance_threads[j][i]; } } variance_threads.clear(); } // Sum thread casewise importances if (importance_mode == IMP_PERM_CASEWISE) { variable_importance_casewise.resize(num_independent_variables * num_samples, 0); for (size_t i = 0; i < variable_importance_casewise.size(); ++i) { for (uint j = 0; j < num_threads; ++j) { variable_importance_casewise[i] += variable_importance_casewise_threads[j][i]; } } variable_importance_casewise_threads.clear(); } #endif for (size_t i = 0; i < variable_importance.size(); ++i) { variable_importance[i] /= num_trees; // Normalize by variance for scaled permutation importance if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { if (variance[i] != 0) { variance[i] = variance[i] / num_trees - variable_importance[i] * variable_importance[i]; variable_importance[i] /= sqrt(variance[i] / num_trees); } } } if (importance_mode == IMP_PERM_CASEWISE) { for (size_t i = 0; i < variable_importance_casewise.size(); ++i) { variable_importance_casewise[i] /= num_trees; } } } #ifndef OLD_WIN_R_BUILD void Forest::growTreesInThread(uint thread_idx, std::vector* variable_importance) { if (thread_ranges.size() > thread_idx + 1) { for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { trees[i]->grow(variable_importance); // Check for user interrupt #ifdef R_BUILD if (aborted) { std::unique_lock lock(mutex); ++aborted_threads; condition_variable.notify_one(); return; } #endif // Increase progress by 1 tree std::unique_lock lock(mutex); ++progress; condition_variable.notify_one(); } } } void Forest::predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction) { if (thread_ranges.size() > thread_idx + 1) { for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { trees[i]->predict(prediction_data, oob_prediction); // Check for user 
interrupt #ifdef R_BUILD if (aborted) { std::unique_lock lock(mutex); ++aborted_threads; condition_variable.notify_one(); return; } #endif // Increase progress by 1 tree std::unique_lock lock(mutex); ++progress; condition_variable.notify_one(); } } } void Forest::predictInternalInThread(uint thread_idx) { // Create thread ranges std::vector predict_ranges; equalSplit(predict_ranges, 0, num_samples - 1, num_threads); if (predict_ranges.size() > thread_idx + 1) { for (size_t i = predict_ranges[thread_idx]; i < predict_ranges[thread_idx + 1]; ++i) { predictInternal(i); // Check for user interrupt #ifdef R_BUILD if (aborted) { std::unique_lock lock(mutex); ++aborted_threads; condition_variable.notify_one(); return; } #endif // Increase progress by 1 tree std::unique_lock lock(mutex); ++progress; condition_variable.notify_one(); } } } void Forest::computeTreePermutationImportanceInThread(uint thread_idx, std::vector& importance, std::vector& variance, std::vector& importance_casewise) { if (thread_ranges.size() > thread_idx + 1) { for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { trees[i]->computePermutationImportance(importance, variance, importance_casewise); // Check for user interrupt #ifdef R_BUILD if (aborted) { std::unique_lock lock(mutex); ++aborted_threads; condition_variable.notify_one(); return; } #endif // Increase progress by 1 tree std::unique_lock lock(mutex); ++progress; condition_variable.notify_one(); } } } #endif // #nocov start void Forest::loadFromFile(std::string filename) { if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." << std::endl; // Open file for reading std::ifstream infile; infile.open(filename, std::ios::binary); if (!infile.good()) { throw std::runtime_error("Could not read from input file: " + filename + "."); } // Skip dependent variable names (already read) uint num_dependent_variables; infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables)); for (size_t i = 0; i < num_dependent_variables; ++i) { size_t length; infile.read((char*) &length, sizeof(size_t)); infile.ignore(length); } // Read num_trees infile.read((char*) &num_trees, sizeof(num_trees)); // Read is_ordered_variable readVector1D(data->getIsOrderedVariable(), infile); // Read tree data. 
This is different for tree types -> virtual function
  loadFromFileInternal(infile);

  infile.close();

  // Create thread ranges
  equalSplit(thread_ranges, 0, num_trees - 1, num_threads);
}

void Forest::loadDependentVariableNamesFromFile(std::string filename) {

  // Open file for reading
  std::ifstream infile;
  infile.open(filename, std::ios::binary);
  if (!infile.good()) {
    throw std::runtime_error("Could not read from input file: " + filename + ".");
  }

  // Read dependent variable names
  dependent_variable_names.clear();
  uint num_dependent_variables = 0;
  infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables));
  for (size_t i = 0; i < num_dependent_variables; ++i) {
    size_t length;
    infile.read((char*) &length, sizeof(size_t));
    char* temp = new char[length + 1];
    infile.read((char*) temp, length * sizeof(char));
    temp[length] = '\0';
    dependent_variable_names.push_back(temp);
    delete[] temp;
  }

  infile.close();
}

std::unique_ptr<Data> Forest::loadDataFromFile(const std::string& data_path) {
  std::unique_ptr<Data> result { };
  switch (memory_mode) {
  case MEM_DOUBLE:
    result = make_unique<DataDouble>();
    break;
  case MEM_FLOAT:
    result = make_unique<DataFloat>();
    break;
  case MEM_CHAR:
    result = make_unique<DataChar>();
    break;
  }

  if (verbose_out)
    *verbose_out << "Loading input file: " << data_path << "." << std::endl;
  bool found_rounding_error = result->loadFromFile(data_path, dependent_variable_names);
  if (found_rounding_error && verbose_out) {
    *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this."
        << std::endl;
  }
  return result;
}
// #nocov end

void Forest::setSplitWeightVector(std::vector<std::vector<double>>& split_select_weights) {

  // Size should be 1 x num_independent_variables or num_trees x num_independent_variables
  if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) {
    throw std::runtime_error("Size of split select weights not equal to 1 or number of trees.");
  }

  // Reserve space
  size_t num_weights = num_independent_variables;
  if (importance_mode == IMP_GINI_CORRECTED) {
    num_weights = 2 * num_independent_variables;
  }
  if (split_select_weights.size() == 1) {
    this->split_select_weights[0].resize(num_weights);
  } else {
    this->split_select_weights.clear();
    this->split_select_weights.resize(num_trees, std::vector<double>(num_weights));
  }

  // Split up in deterministic and weighted variables, ignore zero weights
  for (size_t i = 0; i < split_select_weights.size(); ++i) {
    size_t num_zero_weights = 0;

    // Size should be 1 x num_independent_variables or num_trees x num_independent_variables
    if (split_select_weights[i].size() != num_independent_variables) {
      throw std::runtime_error("Number of split select weights not equal to number of independent variables.");
    }

    for (size_t j = 0; j < split_select_weights[i].size(); ++j) {
      double weight = split_select_weights[i][j];

      if (weight == 0) {
        ++num_zero_weights;
      } else if (weight < 0 || weight > 1) {
        throw std::runtime_error("One or more split select weights not in range [0,1].");
      } else {
        this->split_select_weights[i][j] = weight;
      }
    }

    // Copy weights for corrected impurity importance
    if (importance_mode == IMP_GINI_CORRECTED) {
      std::vector<double>* sw = &(this->split_select_weights[i]);
      std::copy_n(sw->begin(), num_independent_variables, sw->begin() + num_independent_variables);
    }

    if (num_weights - num_zero_weights < mtry) {
      throw std::runtime_error("Too many zeros in split select weights. Need at least mtry variables to split at.");
    }
  }
}

void Forest::setAlwaysSplitVariables(const std::vector<std::string>& always_split_variable_names) {

  deterministic_varIDs.reserve(num_independent_variables);

  for (auto& variable_name : always_split_variable_names) {
    size_t varID = data->getVariableID(variable_name);
    deterministic_varIDs.push_back(varID);
  }

  if (deterministic_varIDs.size() + this->mtry > num_independent_variables) {
    throw std::runtime_error(
        "Number of variables to be always considered for splitting plus mtry cannot be larger than number of independent variables.");
  }

  // Also add variables for corrected impurity importance
  if (importance_mode == IMP_GINI_CORRECTED) {
    size_t num_deterministic_varIDs = deterministic_varIDs.size();
    for (size_t k = 0; k < num_deterministic_varIDs; ++k) {
      deterministic_varIDs.push_back(k + num_independent_variables);
    }
  }
}

#ifdef OLD_WIN_R_BUILD
// #nocov start
void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) {

  // Check for user interrupt
  if (checkInterrupt()) {
    throw std::runtime_error("User interrupt.");
  }

  double elapsed_time = (clock() - lap_time) / CLOCKS_PER_SEC;
  if (elapsed_time > STATUS_INTERVAL) {
    double relative_progress = (double) progress / (double) num_trees;
    double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC;
    uint remaining_time = (1 / relative_progress - 1) * time_from_start;
    if (verbose_out) {
      *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: "
          << beautifyTime(remaining_time) << "." << std::endl;
    }
    lap_time = clock();
  }
}
// #nocov end
#else
void Forest::showProgress(std::string operation, size_t max_progress) {
  using std::chrono::steady_clock;
  using std::chrono::duration_cast;
  using std::chrono::seconds;

  steady_clock::time_point start_time = steady_clock::now();
  steady_clock::time_point last_time = steady_clock::now();
  std::unique_lock<std::mutex> lock(mutex);

  // Wait for message from threads and show output if enough time elapsed
  while (progress < max_progress) {
    condition_variable.wait(lock);
    seconds elapsed_time = duration_cast<seconds>(steady_clock::now() - last_time);

    // Check for user interrupt
#ifdef R_BUILD
    if (!aborted && checkInterrupt()) {
      aborted = true;
    }
    if (aborted && aborted_threads >= num_threads) {
      return;
    }
#endif

    if (progress > 0 && elapsed_time.count() > STATUS_INTERVAL) {
      double relative_progress = (double) progress / (double) max_progress;
      seconds time_from_start = duration_cast<seconds>(steady_clock::now() - start_time);
      uint remaining_time = (1 / relative_progress - 1) * time_from_start.count();
      if (verbose_out) {
        *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: "
            << beautifyTime(remaining_time) << "." << std::endl;
      }
      last_time = steady_clock::now();
    }
  }
}
#endif

} // namespace ranger
ranger/src/TreeProbability.h0000755000176200001440000001207214027301517015575 0ustar liggesusers/*-------------------------------------------------------------------------------
This file is part of ranger.

Copyright (c) [2014-2018] [Marvin N. Wright]

This software may be modified and distributed under the terms of the MIT license.

Please note that the C++ core of ranger is distributed under MIT license and the
R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/

#ifndef TREEPROBABILITY_H_
#define TREEPROBABILITY_H_

#include <map>
#include <vector>

#include "globals.h"
#include "Tree.h"

namespace ranger {

class TreeProbability: public Tree {
public:
  TreeProbability(std::vector<double>* class_values, std::vector<uint>* response_classIDs,
      std::vector<std::vector<size_t>>* sampleIDs_per_class, std::vector<double>* class_weights);

  // Create from loaded forest
  TreeProbability(std::vector<std::vector<size_t>>& child_nodeIDs, std::vector<size_t>& split_varIDs,
      std::vector<double>& split_values, std::vector<double>* class_values, std::vector<uint>* response_classIDs,
      std::vector<std::vector<double>>& terminal_class_counts);

  TreeProbability(const TreeProbability&) = delete;
  TreeProbability& operator=(const TreeProbability&) = delete;

  virtual ~TreeProbability() override = default;

  void allocateMemory() override;

  void addToTerminalNodes(size_t nodeID);

  void computePermutationImportanceInternal(std::vector<std::vector<size_t>>* permutations);

  void appendToFileInternal(std::ofstream& file) override;

  const std::vector<double>& getPrediction(size_t sampleID) const {
    size_t terminal_nodeID = prediction_terminal_nodeIDs[sampleID];
    return terminal_class_counts[terminal_nodeID];
  }

  size_t getPredictionTerminalNodeID(size_t sampleID) const {
    return prediction_terminal_nodeIDs[sampleID];
  }

  const std::vector<std::vector<double>>& getTerminalClassCounts() const {
    return terminal_class_counts;
  }

private:
  bool splitNodeInternal(size_t nodeID, std::vector<size_t>& possible_split_varIDs) override;
  void createEmptyNodeInternal() override;

  double computePredictionAccuracyInternal(std::vector<double>* prediction_error_casewise) override;

  // Called by splitNodeInternal(). Sets split_varIDs and split_values.
  bool findBestSplit(size_t nodeID, std::vector<size_t>& possible_split_varIDs);
  void findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease);
  void findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease, const std::vector<double>& possible_split_values,
      std::vector<size_t>& counter_per_class, std::vector<size_t>& counter);
  void findBestSplitValueLargeQ(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease);
  void findBestSplitValueUnordered(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease);

  bool findBestSplitExtraTrees(size_t nodeID, std::vector<size_t>& possible_split_varIDs);
  void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease);
  void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease, const std::vector<double>& possible_split_values,
      std::vector<size_t>& class_counts_right, std::vector<size_t>& n_right);
  void findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, size_t num_classes,
      const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
      double& best_decrease);

  void addImpurityImportance(size_t nodeID, size_t varID, double decrease);

  void bootstrapClassWise() override;
  void
bootstrapWithoutReplacementClassWise() override; void cleanUpInternal() override { counter.clear(); counter.shrink_to_fit(); counter_per_class.clear(); counter_per_class.shrink_to_fit(); } // Classes of the dependent variable and classIDs for responses const std::vector* class_values; const std::vector* response_classIDs; const std::vector>* sampleIDs_per_class; // Class counts in terminal nodes. Empty for non-terminal nodes. std::vector> terminal_class_counts; // Splitting weights const std::vector* class_weights; std::vector counter; std::vector counter_per_class; }; } // namespace ranger #endif /* TREEPROBABILITY_H_ */ ranger/src/DataChar.h0000755000176200001440000000346114027301517014146 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ // Ignore in coverage report (not used in R package) // #nocov start #ifndef DATACHAR_H_ #define DATACHAR_H_ #include #include #include "globals.h" #include "utility.h" #include "Data.h" namespace ranger { class DataChar: public Data { public: DataChar() = default; DataChar(const DataChar&) = delete; DataChar& operator=(const DataChar&) = delete; virtual ~DataChar() override = default; double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } if (col < num_cols_no_snp) { return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } double get_y(size_t row, size_t col) const override { return y[col * num_rows + row]; } void reserveMemory(size_t y_cols) override { x.resize(num_cols * num_rows); y.resize(y_cols * num_rows); } void set_x(size_t col, size_t row, double value, bool& error) override { x[col * num_rows + row] = value; } void set_y(size_t col, size_t row, double value, bool& error) override { y[col * num_rows + row] = value; } private: std::vector x; std::vector y; }; } // namespace ranger #endif /* DATACHAR_H_ */ // #nocov end ranger/src/ForestSurvival.cpp0000755000176200001440000002652114027301517016032 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/

#include <algorithm>
#include <cmath>
#include <iostream>
#include <set>
#include <vector>

#include "utility.h"
#include "ForestSurvival.h"
#include "Data.h"

namespace ranger {

void ForestSurvival::loadForest(size_t num_trees,
    std::vector<std::vector<std::vector<size_t>>>& forest_child_nodeIDs,
    std::vector<std::vector<size_t>>& forest_split_varIDs, std::vector<std::vector<double>>& forest_split_values,
    std::vector<std::vector<std::vector<double>>>& forest_chf, std::vector<double>& unique_timepoints,
    std::vector<bool>& is_ordered_variable) {

  this->num_trees = num_trees;
  this->unique_timepoints = unique_timepoints;
  data->setIsOrderedVariable(is_ordered_variable);

  // Create trees
  trees.reserve(num_trees);
  for (size_t i = 0; i < num_trees; ++i) {
    trees.push_back(
        make_unique<TreeSurvival>(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i],
            forest_chf[i], &this->unique_timepoints, &response_timepointIDs));
  }

  // Create thread ranges
  equalSplit(thread_ranges, 0, num_trees - 1, num_threads);
}

std::vector<std::vector<std::vector<double>>> ForestSurvival::getChf() const {
  std::vector<std::vector<std::vector<double>>> result;
  result.reserve(num_trees);
  for (const auto& tree : trees) {
    const auto& temp = dynamic_cast<const TreeSurvival&>(*tree);
    result.push_back(temp.getChf());
  }
  return result;
}

void ForestSurvival::initInternal() {

  // If mtry not set, use square root of number of independent variables (rounded up)
  if (mtry == 0) {
    unsigned long temp = ceil(sqrt((double) num_independent_variables));
    mtry = std::max((unsigned long) 1, temp);
  }

  // Set minimal node size
  if (min_node_size == 0) {
    min_node_size = DEFAULT_MIN_NODE_SIZE_SURVIVAL;
  }

  // Create unique timepoints
  if (!prediction_mode) {
    std::set<double> unique_timepoint_set;
    for (size_t i = 0; i < num_samples; ++i) {
      unique_timepoint_set.insert(data->get_y(i, 0));
    }
    unique_timepoints.reserve(unique_timepoint_set.size());
    for (auto& t : unique_timepoint_set) {
      unique_timepoints.push_back(t);
    }

    // Create response_timepointIDs
    for (size_t i = 0; i < num_samples; ++i) {
      double value = data->get_y(i, 0);

      // If timepoint is already in unique_timepoints, use ID. Else create a new one.
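      // Aside (illustrative sketch, not part of ranger): the find() used below
      // is linear in the number of unique time points t, so building all IDs is
      // O(n * t) overall. For data with many distinct survival times, a one-off
      // index gives the same IDs in O(n) expected time (needs <unordered_map>):
      //
      //   std::unordered_map<double, size_t> timepoint_index;
      //   for (size_t j = 0; j < unique_timepoints.size(); ++j) {
      //     timepoint_index[unique_timepoints[j]] = j;
      //   }
      //   uint timepointID = timepoint_index[data->get_y(i, 0)];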
uint timepointID = find(unique_timepoints.begin(), unique_timepoints.end(), value) - unique_timepoints.begin(); response_timepointIDs.push_back(timepointID); } } // Sort data if extratrees and not memory saving mode if (splitrule == EXTRATREES && !memory_saving_splitting) { data->sort(); } } void ForestSurvival::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back(make_unique(&unique_timepoints, &response_timepointIDs)); } } void ForestSurvival::allocatePredictMemory() { size_t num_prediction_samples = data->getNumRows(); size_t num_timepoints = unique_timepoints.size(); if (predict_all) { predictions = std::vector>>(num_prediction_samples, std::vector>(num_timepoints, std::vector(num_trees, 0))); } else if (prediction_type == TERMINALNODES) { predictions = std::vector>>(1, std::vector>(num_prediction_samples, std::vector(num_trees, 0))); } else { predictions = std::vector>>(1, std::vector>(num_prediction_samples, std::vector(num_timepoints, 0))); } } void ForestSurvival::predictInternal(size_t sample_idx) { // For each timepoint sum over trees if (predict_all) { for (size_t j = 0; j < unique_timepoints.size(); ++j) { for (size_t k = 0; k < num_trees; ++k) { predictions[sample_idx][j][k] = getTreePrediction(k, sample_idx)[j]; } } } else if (prediction_type == TERMINALNODES) { for (size_t k = 0; k < num_trees; ++k) { predictions[0][sample_idx][k] = getTreePredictionTerminalNodeID(k, sample_idx); } } else { for (size_t j = 0; j < unique_timepoints.size(); ++j) { double sample_time_prediction = 0; for (size_t k = 0; k < num_trees; ++k) { sample_time_prediction += getTreePrediction(k, sample_idx)[j]; } predictions[0][sample_idx][j] = sample_time_prediction / num_trees; } } } void ForestSurvival::computePredictionErrorInternal() { size_t num_timepoints = unique_timepoints.size(); // For each sample sum over trees where sample is OOB std::vector samples_oob_count; samples_oob_count.resize(num_samples, 0); predictions = std::vector>>(1, std::vector>(num_samples, std::vector(num_timepoints, 0))); for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { for (size_t sample_idx = 0; sample_idx < trees[tree_idx]->getNumSamplesOob(); ++sample_idx) { size_t sampleID = trees[tree_idx]->getOobSampleIDs()[sample_idx]; std::vector tree_sample_chf = getTreePrediction(tree_idx, sample_idx); for (size_t time_idx = 0; time_idx < tree_sample_chf.size(); ++time_idx) { predictions[0][sampleID][time_idx] += tree_sample_chf[time_idx]; } ++samples_oob_count[sampleID]; } } // Divide sample predictions by number of trees where sample is oob and compute summed chf for samples std::vector sum_chf; sum_chf.reserve(predictions[0].size()); std::vector oob_sampleIDs; oob_sampleIDs.reserve(predictions[0].size()); for (size_t i = 0; i < predictions[0].size(); ++i) { if (samples_oob_count[i] > 0) { double sum = 0; for (size_t j = 0; j < predictions[0][i].size(); ++j) { predictions[0][i][j] /= samples_oob_count[i]; sum += predictions[0][i][j]; } sum_chf.push_back(sum); oob_sampleIDs.push_back(i); } } // Use all samples which are OOB at least once overall_prediction_error = 1 - computeConcordanceIndex(*data, sum_chf, oob_sampleIDs, NULL); } // #nocov start void ForestSurvival::writeOutputInternal() { if (verbose_out) { *verbose_out << "Tree type: " << "Survival" << std::endl; if (dependent_variable_names.size() >= 2) { *verbose_out << "Status variable name: " << dependent_variable_names[1] << std::endl; } } } void ForestSurvival::writeConfusionFile() { // Open confusion file for 
writing std::string filename = output_prefix + ".confusion"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to confusion file: " + filename + "."); } // Write confusion to file outfile << "Overall OOB prediction error (1 - C): " << overall_prediction_error << std::endl; outfile.close(); if (verbose_out) *verbose_out << "Saved prediction error to file " << filename << "." << std::endl; } void ForestSurvival::writePredictionFile() { // Open prediction file for writing std::string filename = output_prefix + ".prediction"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to prediction file: " + filename + "."); } // Write outfile << "Unique timepoints: " << std::endl; for (auto& timepoint : unique_timepoints) { outfile << timepoint << " "; } outfile << std::endl << std::endl; outfile << "Cumulative hazard function, one row per sample: " << std::endl; if (predict_all) { for (size_t k = 0; k < num_trees; ++k) { outfile << "Tree " << k << ":" << std::endl; for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { outfile << predictions[i][j][k] << " "; } outfile << std::endl; } outfile << std::endl; } } else { for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { for (size_t k = 0; k < predictions[i][j].size(); ++k) { outfile << predictions[i][j][k] << " "; } outfile << std::endl; } } } if (verbose_out) *verbose_out << "Saved predictions to file " << filename << "." << std::endl; } void ForestSurvival::saveToFileInternal(std::ofstream& outfile) { // Write num_variables outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_SURVIVAL; outfile.write((char*) &treetype, sizeof(treetype)); // Write unique timepoints saveVector1D(unique_timepoints, outfile); } void ForestSurvival::loadFromFileInternal(std::ifstream& infile) { // Read number of variables size_t num_variables_saved; infile.read((char*) &num_variables_saved, sizeof(num_variables_saved)); // Read treetype TreeType treetype; infile.read((char*) &treetype, sizeof(treetype)); if (treetype != TREE_SURVIVAL) { throw std::runtime_error("Wrong treetype. 
Loaded file is not a survival forest."); } // Read unique timepoints unique_timepoints.clear(); readVector1D(unique_timepoints, infile); for (size_t i = 0; i < num_trees; ++i) { // Read data std::vector> child_nodeIDs; readVector2D(child_nodeIDs, infile); std::vector split_varIDs; readVector1D(split_varIDs, infile); std::vector split_values; readVector1D(split_values, infile); // Read chf std::vector terminal_nodes; readVector1D(terminal_nodes, infile); std::vector> chf_vector; readVector2D(chf_vector, infile); // Convert chf to vector with empty elements for non-terminal nodes std::vector> chf; chf.resize(child_nodeIDs[0].size(), std::vector()); // for (size_t i = 0; i < child_nodeIDs.size(); ++i) { // chf.push_back(std::vector()); // } for (size_t j = 0; j < terminal_nodes.size(); ++j) { chf[terminal_nodes[j]] = chf_vector[j]; } // If dependent variable not in test data, throw error if (num_variables_saved != num_independent_variables) { throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree trees.push_back( make_unique(child_nodeIDs, split_varIDs, split_values, chf, &unique_timepoints, &response_timepointIDs)); } } const std::vector& ForestSurvival::getTreePrediction(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPrediction(sample_idx); } size_t ForestSurvival::getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPredictionTerminalNodeID(sample_idx); } // #nocov end }// namespace ranger ranger/src/utilityRcpp.cpp0000755000176200001440000000563414027301517015366 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of Ranger. Ranger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Ranger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Ranger. If not, see . Written by: Marvin N. Wright Institut für Medizinische Biometrie und Statistik Universität zu Lübeck Ratzeburger Allee 160 23562 Lübeck Germany http://www.imbs-luebeck.de #-------------------------------------------------------------------------------*/ #include // Count number of elements in reference smaller than values //[[Rcpp::export]] Rcpp::IntegerVector numSmaller(Rcpp::NumericVector values, Rcpp::NumericVector reference) { std::sort(reference.begin(), reference.end()); Rcpp::IntegerVector result(values.size()); for (int i = 0; i < values.size(); ++i) result[i] = std::lower_bound(reference.begin(), reference.end(), values[i]) - reference.begin(); return result; } // Get random other obs. 
in same terminal node //[[Rcpp::export]] Rcpp::NumericMatrix randomObsNode(Rcpp::IntegerMatrix groups, Rcpp::NumericVector y, Rcpp::IntegerMatrix inbag_counts) { Rcpp::NumericMatrix result(groups.nrow(), groups.ncol()); // Loop through trees for (size_t i = 0; i < groups.ncol(); ++i) { // Init result with NA for (size_t j = 0; j < groups.nrow(); ++j) { result(j, i) = NA_REAL; } // Order by terminal node ID std::vector idx(groups.nrow()); std::iota(idx.begin(), idx.end(), 0); std::sort(std::begin(idx), std::end(idx), [&](size_t j1, size_t j2) {return groups(j1, i) < groups(j2, i);}); // Loop through change points (next node) size_t j = 0; while(j < idx.size()) { // Find next change point size_t k = j; while (k < idx.size() && groups(idx[j], i) == groups(idx[k], i)) { ++k; } // If other observation in same node if (k - j >= 2) { // Loop through observations between change points for (size_t l = j; l < k; ++l) { // Only OOB observations if (inbag_counts(idx[l], i) > 0) { continue; } // Select random observation in same terminal node, retry if same obs. selected size_t rnd = l; while (rnd == l) { rnd = j - 1 + Rcpp::sample(k - j, 1, false)[0]; } result(idx[l], i) = y(idx[rnd]); } } // Next change point j = k; } } return result; } ranger/src/TreeProbability.cpp0000755000176200001440000007320314027301517016133 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #include "TreeProbability.h" #include "utility.h" #include "Data.h" namespace ranger { TreeProbability::TreeProbability(std::vector* class_values, std::vector* response_classIDs, std::vector>* sampleIDs_per_class, std::vector* class_weights) : class_values(class_values), response_classIDs(response_classIDs), sampleIDs_per_class(sampleIDs_per_class), class_weights( class_weights), counter(0), counter_per_class(0) { } TreeProbability::TreeProbability(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector* class_values, std::vector* response_classIDs, std::vector>& terminal_class_counts) : Tree(child_nodeIDs, split_varIDs, split_values), class_values(class_values), response_classIDs(response_classIDs), sampleIDs_per_class( 0), terminal_class_counts(terminal_class_counts), class_weights(0), counter(0), counter_per_class(0) { } void TreeProbability::allocateMemory() { // Init counters if not in memory efficient mode if (!memory_saving_splitting) { size_t num_classes = class_values->size(); size_t max_num_splits = data->getMaxNumUniqueValues(); // Use number of random splits for extratrees if (splitrule == EXTRATREES && num_random_splits > max_num_splits) { max_num_splits = num_random_splits; } counter.resize(max_num_splits); counter_per_class.resize(num_classes * max_num_splits); } } void TreeProbability::addToTerminalNodes(size_t nodeID) { size_t num_samples_in_node = end_pos[nodeID] - start_pos[nodeID]; terminal_class_counts[nodeID].resize(class_values->size(), 0); // Compute counts for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t classID = (*response_classIDs)[sampleID]; ++terminal_class_counts[nodeID][classID]; } // Compute fractions for 
(size_t i = 0; i < terminal_class_counts[nodeID].size(); ++i) { terminal_class_counts[nodeID][i] /= num_samples_in_node; } } void TreeProbability::appendToFileInternal(std::ofstream& file) { // #nocov start // Add Terminal node class counts // Convert to vector without empty elements and save std::vector terminal_nodes; std::vector> terminal_class_counts_vector; for (size_t i = 0; i < terminal_class_counts.size(); ++i) { if (!terminal_class_counts[i].empty()) { terminal_nodes.push_back(i); terminal_class_counts_vector.push_back(terminal_class_counts[i]); } } saveVector1D(terminal_nodes, file); saveVector2D(terminal_class_counts_vector, file); } // #nocov end bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { // Stop if maximum node size or depth reached size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { addToTerminalNodes(nodeID); return true; } // Check if node is pure and set split_value to estimate and stop if pure bool pure = true; double pure_value = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; double value = data->get_y(sampleID, 0); if (pos != start_pos[nodeID] && value != pure_value) { pure = false; break; } pure_value = value; } if (pure) { addToTerminalNodes(nodeID); return true; } // Find best split, stop if no decrease of impurity bool stop; if (splitrule == EXTRATREES) { stop = findBestSplitExtraTrees(nodeID, possible_split_varIDs); } else { stop = findBestSplit(nodeID, possible_split_varIDs); } if (stop) { addToTerminalNodes(nodeID); return true; } return false; } void TreeProbability::createEmptyNodeInternal() { terminal_class_counts.push_back(std::vector()); } double TreeProbability::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { size_t num_predictions = prediction_terminal_nodeIDs.size(); double sum_of_squares = 0; for (size_t i = 0; i < num_predictions; ++i) { size_t sampleID = oob_sampleIDs[i]; size_t real_classID = (*response_classIDs)[sampleID]; size_t terminal_nodeID = prediction_terminal_nodeIDs[i]; double predicted_value = terminal_class_counts[terminal_nodeID][real_classID]; double err = (1 - predicted_value) * (1 - predicted_value); if (prediction_error_casewise) { (*prediction_error_casewise)[i] = err; } sum_of_squares += err; } return (1.0 - sum_of_squares / (double) num_predictions); } bool TreeProbability::findBestSplit(size_t nodeID, std::vector& possible_split_varIDs) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; size_t num_classes = class_values->size(); double best_decrease = -1; size_t best_varID = 0; double best_value = 0; std::vector class_counts(num_classes); // Compute overall class counts for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; ++class_counts[sample_classID]; } // For all possible split variables for (auto& varID : possible_split_varIDs) { // Find best split value, if ordered consider all values as split values, else all 2-partitions if (data->isOrderedVariable(varID)) { // Use memory saving method if option set if (memory_saving_splitting) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { // Use faster method for both cases double q = (double) num_samples_node / (double) 
data->getNumUniqueDataValues(varID); if (q < Q_THRESHOLD) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } } } else { findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } } // Stop if no good split found if (best_decrease < 0) { return true; } // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { addImpurityImportance(nodeID, best_varID, best_decrease); } // Regularization saveSplitVarID(best_varID); return false; } void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Create possible split values std::vector possible_split_values; data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (possible_split_values.size() < 2) { return; } const size_t num_splits = possible_split_values.size(); if (memory_saving_splitting) { std::vector class_counts_right(num_splits * num_classes), n_right(num_splits); findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease, possible_split_values, class_counts_right, n_right); } else { std::fill_n(counter_per_class.begin(), num_splits * num_classes, 0); std::fill_n(counter.begin(), num_splits, 0); findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease, possible_split_values, counter_per_class, counter); } } void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, const std::vector& possible_split_values, std::vector& counter_per_class, std::vector& counter) { for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; size_t idx = std::lower_bound(possible_split_values.begin(), possible_split_values.end(), data->get_x(sampleID, varID)) - possible_split_values.begin(); ++counter_per_class[idx * num_classes + sample_classID]; ++counter[idx]; } size_t n_left = 0; std::vector class_counts_left(num_classes); // Compute decrease of impurity for each split for (size_t i = 0; i < possible_split_values.size() - 1; ++i) { // Stop if nothing here if (counter[i] == 0) { continue; } n_left += counter[i]; // Stop if right child empty size_t n_right = num_samples_node - n_left; if (n_right == 0) { break; } double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; } // TPR is number of outcome 1s in one node / total number of 1s // FPR is number of outcome 0s in one node / total number of 0s double tpr = (double) (class_counts[1] - class_counts_left[1]) / (double) class_counts[1]; double fpr = (double) (class_counts[0] - class_counts_left[0]) / (double) class_counts[0]; 
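      // The expression below is proportional to the Hellinger distance between
      // the Bernoulli distributions (tpr, 1 - tpr) and (fpr, 1 - fpr); the usual
      // 1/sqrt(2) normalization is dropped, which does not change which split
      // maximizes it. Worked example: tpr = 0.9, fpr = 0.1 gives
      //   a1 = sqrt(0.9) - sqrt(0.1) ~ 0.6325
      //   a2 = sqrt(0.1) - sqrt(0.9) ~ -0.6325
      //   decrease = sqrt(a1 * a1 + a2 * a2) ~ 0.894,
      // while an uninformative split (tpr == fpr) gives exactly 0.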
// Decrease of impurity double a1 = sqrt(tpr) - sqrt(fpr); double a2 = sqrt(1 - tpr) - sqrt(1 - fpr); decrease = sqrt(a1 * a1 + a2 * a2); } else { // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; size_t class_count_right = class_counts[j] - class_counts_left[j]; sum_left += (*class_weights)[j] * class_counts_left[j] * class_counts_left[j]; sum_right += (*class_weights)[j] * class_count_right * class_count_right; } // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { // Use mid-point split best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; best_varID = varID; best_decrease = decrease; // Use smaller value if average is numerically the same as the larger value if (best_value == possible_split_values[i + 1]) { best_value = possible_split_values[i]; } } } } void TreeProbability::findBestSplitValueLargeQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Set counters to 0 size_t num_unique = data->getNumUniqueDataValues(varID); std::fill_n(counter_per_class.begin(), num_unique * num_classes, 0); std::fill_n(counter.begin(), num_unique, 0); // Count values for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t index = data->getIndex(sampleID, varID); size_t classID = (*response_classIDs)[sampleID]; ++counter[index]; ++counter_per_class[index * num_classes + classID]; } size_t n_left = 0; std::vector class_counts_left(num_classes); // Compute decrease of impurity for each split for (size_t i = 0; i < num_unique - 1; ++i) { // Stop if nothing here if (counter[i] == 0) { continue; } n_left += counter[i]; // Stop if right child empty size_t n_right = num_samples_node - n_left; if (n_right == 0) { break; } double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; } // TPR is number of outcome 1s in one node / total number of 1s // FPR is number of outcome 0s in one node / total number of 0s double tpr = (double) (class_counts[1] - class_counts_left[1]) / (double) class_counts[1]; double fpr = (double) (class_counts[0] - class_counts_left[0]) / (double) class_counts[0]; // Decrease of impurity double a1 = sqrt(tpr) - sqrt(fpr); double a2 = sqrt(1 - tpr) - sqrt(1 - fpr); decrease = sqrt(a1 * a1 + a2 * a2); } else { // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; size_t class_count_right = class_counts[j] - class_counts_left[j]; sum_left += (*class_weights)[j] * class_counts_left[j] * class_counts_left[j]; sum_right += (*class_weights)[j] * class_count_right * class_count_right; } // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { // Find next value in this node size_t j = i + 1; while (j < num_unique && counter[j] == 0) { ++j; } // Use mid-point split best_value = (data->getUniqueDataValue(varID, i) + data->getUniqueDataValue(varID, j)) / 2; best_varID = varID; 
best_decrease = decrease; // Use smaller value if average is numerically the same as the larger value if (best_value == data->getUniqueDataValue(varID, j)) { best_value = data->getUniqueDataValue(varID, i); } } } } void TreeProbability::findBestSplitValueUnordered(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Create possible split values std::vector factor_levels; data->getAllValues(factor_levels, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (factor_levels.size() < 2) { return; } // Number of possible splits is 2^num_levels size_t num_splits = (1ULL << factor_levels.size()); // Compute decrease of impurity for each possible split // Split where all left (0) or all right (1) are excluded // The second half of numbers is just left/right switched the first half -> Exclude second half for (size_t local_splitID = 1; local_splitID < num_splits / 2; ++local_splitID) { // Compute overall splitID by shifting local factorIDs to global positions size_t splitID = 0; for (size_t j = 0; j < factor_levels.size(); ++j) { if ((local_splitID & (1ULL << j))) { double level = factor_levels[j]; size_t factorID = floor(level) - 1; splitID = splitID | (1ULL << factorID); } } // Initialize std::vector class_counts_right(num_classes); size_t n_right = 0; // Count classes in left and right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++n_right; ++class_counts_right[sample_classID]; } } size_t n_left = num_samples_node - n_right; double decrease; if (splitrule == HELLINGER) { // TPR is number of outcome 1s in one node / total number of 1s // FPR is number of outcome 0s in one node / total number of 0s double tpr = (double) class_counts_right[1] / (double) class_counts[1]; double fpr = (double) class_counts_right[0] / (double) class_counts[0]; // Decrease of impurity double a1 = sqrt(tpr) - sqrt(fpr); double a2 = sqrt(1 - tpr) - sqrt(1 - fpr); decrease = sqrt(a1 * a1 + a2 * a2); } else { // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { size_t class_count_right = class_counts_right[j]; size_t class_count_left = class_counts[j] - class_count_right; sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } // Decrease of impurity decrease = sum_left / (double) n_left + sum_right / (double) n_right; } // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { best_value = splitID; best_varID = varID; best_decrease = decrease; } } } bool TreeProbability::findBestSplitExtraTrees(size_t nodeID, std::vector& possible_split_varIDs) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; size_t num_classes = class_values->size(); double best_decrease = -1; size_t best_varID = 0; double best_value = 0; std::vector class_counts(num_classes); // Compute overall class counts for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = 
(*response_classIDs)[sampleID];
    ++class_counts[sample_classID];
  }

  // For all possible split variables
  for (auto& varID : possible_split_varIDs) {
    // Find best split value, if ordered consider all values as split values, else all 2-partitions
    if (data->isOrderedVariable(varID)) {
      findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
          best_decrease);
    } else {
      findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value,
          best_varID, best_decrease);
    }
  }

  // Stop if no good split found
  if (best_decrease < 0) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute decrease of impurity for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addImpurityImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease) {

  // Get min/max values of covariate in node
  double min;
  double max;
  data->getMinMaxValues(min, max, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (min == max) {
    return;
  }

  // Create possible split values: Draw randomly between min and max
  std::vector<double> possible_split_values;
  std::uniform_real_distribution<double> udist(min, max);
  possible_split_values.reserve(num_random_splits);
  for (size_t i = 0; i < num_random_splits; ++i) {
    possible_split_values.push_back(udist(random_number_generator));
  }
  if (num_random_splits > 1) {
    std::sort(possible_split_values.begin(), possible_split_values.end());
  }

  const size_t num_splits = possible_split_values.size();
  if (memory_saving_splitting) {
    std::vector<size_t> class_counts_right(num_splits * num_classes), n_right(num_splits);
    findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
        best_decrease, possible_split_values, class_counts_right, n_right);
  } else {
    std::fill_n(counter_per_class.begin(), num_splits * num_classes, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
        best_decrease, possible_split_values, counter_per_class, counter);
  }
}

void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease, const std::vector<double>& possible_split_values, std::vector<size_t>& class_counts_right,
    std::vector<size_t>& n_right) {
  const size_t num_splits = possible_split_values.size();

  // Count samples in right child per class and possible split
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    double value = data->get_x(sampleID, varID);
    uint sample_classID = (*response_classIDs)[sampleID];

    // Count samples until split_value reached
    for (size_t i = 0; i < num_splits; ++i) {
      if (value > possible_split_values[i]) {
        ++n_right[i];
        ++class_counts_right[i * num_classes + sample_classID];
      } else {
        break;
      }
    }
  }

  // Compute decrease of impurity for each possible split
  for (size_t i = 0; i < num_splits; ++i) {
    // Stop if one child empty
    size_t n_left = num_samples_node - n_right[i];
if (n_left == 0 || n_right[i] == 0) { continue; } // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { size_t class_count_right = class_counts_right[i * num_classes + j]; size_t class_count_left = class_counts[j] - class_count_right; sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right[i]; // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { best_value = possible_split_values[i]; best_varID = varID; best_decrease = decrease; } } } void TreeProbability::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { size_t num_unique_values = data->getNumUniqueDataValues(varID); // Get all factor indices in node std::vector factor_in_node(num_unique_values, false); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t index = data->getIndex(sampleID, varID); factor_in_node[index] = true; } // Vector of indices in and out of node std::vector indices_in_node; std::vector indices_out_node; indices_in_node.reserve(num_unique_values); indices_out_node.reserve(num_unique_values); for (size_t i = 0; i < num_unique_values; ++i) { if (factor_in_node[i]) { indices_in_node.push_back(i); } else { indices_out_node.push_back(i); } } // Generate num_random_splits splits for (size_t i = 0; i < num_random_splits; ++i) { std::vector split_subset; split_subset.reserve(num_unique_values); // Draw random subsets, sample all partitions with equal probability if (indices_in_node.size() > 1) { size_t num_partitions = (2ULL << (indices_in_node.size() - 1ULL)) - 2ULL; // 2^n-2 (don't allow full or empty) std::uniform_int_distribution udist(1, num_partitions); size_t splitID_in_node = udist(random_number_generator); for (size_t j = 0; j < indices_in_node.size(); ++j) { if ((splitID_in_node & (1ULL << j)) > 0) { split_subset.push_back(indices_in_node[j]); } } } if (indices_out_node.size() > 1) { size_t num_partitions = (2ULL << (indices_out_node.size() - 1ULL)) - 1ULL; // 2^n-1 (allow full or empty) std::uniform_int_distribution udist(0, num_partitions); size_t splitID_out_node = udist(random_number_generator); for (size_t j = 0; j < indices_out_node.size(); ++j) { if ((splitID_out_node & (1ULL << j)) > 0) { split_subset.push_back(indices_out_node[j]); } } } // Assign union of the two subsets to right child size_t splitID = 0; for (auto& idx : split_subset) { splitID |= 1ULL << idx; } // Initialize std::vector class_counts_right(num_classes); size_t n_right = 0; // Count classes in left and right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++n_right; ++class_counts_right[sample_classID]; } } size_t n_left = num_samples_node - n_right; // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { size_t class_count_right = class_counts_right[j]; size_t 
class_count_left = class_counts[j] - class_count_right;
      sum_right += (*class_weights)[j] * class_count_right * class_count_right;
      sum_left += (*class_weights)[j] * class_count_left * class_count_left;
    }

    // Decrease of impurity
    double decrease = sum_left / (double) n_left + sum_right / (double) n_right;

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = splitID;
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}

void TreeProbability::addImpurityImportance(size_t nodeID, size_t varID, double decrease) {
  double best_decrease = decrease;
  if (splitrule != HELLINGER) {
    size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
    std::vector<size_t> class_counts;
    class_counts.resize(class_values->size(), 0);

    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      uint sample_classID = (*response_classIDs)[sampleID];
      class_counts[sample_classID]++;
    }
    double sum_node = 0;
    for (size_t i = 0; i < class_counts.size(); ++i) {
      sum_node += (*class_weights)[i] * class_counts[i] * class_counts[i];
    }

    best_decrease = decrease - sum_node / (double) num_samples_node;
  }

  // No variable importance for no split variables
  size_t tempvarID = data->getUnpermutedVarID(varID);

  // Subtract if corrected importance and permuted variable, else add
  if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) {
    (*variable_importance)[tempvarID] -= best_decrease;
  } else {
    (*variable_importance)[tempvarID] += best_decrease;
  }
}

void TreeProbability::bootstrapClassWise() {
  // Number of samples is sum of sample fraction * number of samples
  size_t num_samples_inbag = 0;
  double sum_sample_fraction = 0;
  for (auto& s : *sample_fraction) {
    num_samples_inbag += (size_t) num_samples * s;
    sum_sample_fraction += s;
  }

  // Reserve space, reserve a little more to be safe
  sampleIDs.reserve(num_samples_inbag);
  oob_sampleIDs.reserve(num_samples * (exp(-sum_sample_fraction) + 0.1));

  // Start with all samples OOB
  inbag_counts.resize(num_samples, 0);

  // Draw samples for each class
  for (size_t i = 0; i < sample_fraction->size(); ++i) {
    // Draw samples of class with replacement as inbag and mark as not OOB
    size_t num_samples_class = (*sampleIDs_per_class)[i].size();
    size_t num_samples_inbag_class = round(num_samples * (*sample_fraction)[i]);
    std::uniform_int_distribution<size_t> unif_dist(0, num_samples_class - 1);
    for (size_t s = 0; s < num_samples_inbag_class; ++s) {
      size_t draw = (*sampleIDs_per_class)[i][unif_dist(random_number_generator)];
      sampleIDs.push_back(draw);
      ++inbag_counts[draw];
    }
  }

  // Save OOB samples
  for (size_t s = 0; s < inbag_counts.size(); ++s) {
    if (inbag_counts[s] == 0) {
      oob_sampleIDs.push_back(s);
    }
  }
  num_samples_oob = oob_sampleIDs.size();

  if (!keep_inbag) {
    inbag_counts.clear();
    inbag_counts.shrink_to_fit();
  }
}

void TreeProbability::bootstrapWithoutReplacementClassWise() {
  // Draw samples for each class
  for (size_t i = 0; i < sample_fraction->size(); ++i) {
    size_t num_samples_class = (*sampleIDs_per_class)[i].size();
    size_t num_samples_inbag_class = round(num_samples * (*sample_fraction)[i]);
    shuffleAndSplitAppend(sampleIDs, oob_sampleIDs, num_samples_class, num_samples_inbag_class,
        (*sampleIDs_per_class)[i], random_number_generator);
  }
  num_samples_oob = oob_sampleIDs.size();

  if (keep_inbag) {
    // All observations are in-bag 0 or 1 times
    inbag_counts.resize(num_samples, 1);
    for (size_t i = 0; i < oob_sampleIDs.size(); i++) {
      inbag_counts[oob_sampleIDs[i]] = 0;
    }
  }
}

} // namespace ranger
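/*-------------------------------------------------------------------------------
 For a binary outcome, the HELLINGER branch above boils down to a few lines.
 The following standalone sketch is illustrative only and not part of the
 package sources; the function name and arguments are hypothetical. It assumes
 the class counts of the node and of the right child are already tallied.
-------------------------------------------------------------------------------*/
#include <cmath>
#include <cstddef>

// Split quality as used above: sqrt(2) times the Hellinger distance between
// the per-class probabilities of being sent to the right child.
double hellingerDecrease(std::size_t n0, std::size_t n1,  // class 0/1 counts in node
    std::size_t n0_right, std::size_t n1_right) {         // class 0/1 counts in right child
  double tpr = (double) n1_right / (double) n1;           // share of 1s sent right
  double fpr = (double) n0_right / (double) n0;           // share of 0s sent right
  double a1 = std::sqrt(tpr) - std::sqrt(fpr);
  double a2 = std::sqrt(1 - tpr) - std::sqrt(1 - fpr);
  return std::sqrt(a1 * a1 + a2 * a2);                    // larger is better
}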
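/*-------------------------------------------------------------------------------
 findBestSplitValueUnordered() above encodes a 2-partition of the factor levels
 as a bitmask: bit k of splitID set means level k+1 goes to the right child.
 Iterating local_splitID over 1 .. 2^n/2 - 1 visits every partition exactly
 once, because the bit complement of a mask is the same partition with the
 children swapped. A standalone sketch (illustrative only, not package code):
-------------------------------------------------------------------------------*/
#include <cstdio>

int main() {
  const unsigned num_levels = 3;
  const unsigned long long num_splits = 1ULL << num_levels;  // 2^n masks in total
  // Skip 0 (everything left) and the upper half (complements of the lower half)
  for (unsigned long long splitID = 1; splitID < num_splits / 2; ++splitID) {
    std::printf("splitID %llu sends levels {", splitID);
    for (unsigned k = 0; k < num_levels; ++k) {
      if (splitID & (1ULL << k)) {
        std::printf(" %u", k + 1);  // bit k set -> level k+1 goes right
      }
    }
    std::printf(" } to the right child\n");
  }
  return 0;  // prints the 3 distinct 2-partitions of levels {1, 2, 3}
}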
ranger/src/TreeRegression.h0000755000176200001440000001053014027301517015432 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef TREEREGRESSION_H_ #define TREEREGRESSION_H_ #include #include "globals.h" #include "Tree.h" namespace ranger { class TreeRegression: public Tree { public: TreeRegression() = default; // Create from loaded forest TreeRegression(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values); TreeRegression(const TreeRegression&) = delete; TreeRegression& operator=(const TreeRegression&) = delete; virtual ~TreeRegression() override = default; void allocateMemory() override; double estimate(size_t nodeID); void computePermutationImportanceInternal(std::vector>* permutations); void appendToFileInternal(std::ofstream& file) override; double getPrediction(size_t sampleID) const { size_t terminal_nodeID = prediction_terminal_nodeIDs[sampleID]; return (split_values[terminal_nodeID]); } size_t getPredictionTerminalNodeID(size_t sampleID) const { return prediction_terminal_nodeIDs[sampleID]; } private: bool splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) override; void createEmptyNodeInternal() override; double computePredictionAccuracyInternal(std::vector* prediction_error_casewise) override; // Called by splitNodeInternal(). Sets split_varIDs and split_values. bool findBestSplit(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, std::vector& sums, std::vector& counter); void findBestSplitValueLargeQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueUnordered(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); bool findBestSplitMaxstat(size_t nodeID, std::vector& possible_split_varIDs); bool findBestSplitExtraTrees(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, std::vector& sums_right, std::vector& n_right); void findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); bool findBestSplitBeta(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& 
best_decrease); void findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, std::vector& sums_right, std::vector& n_right); void addImpurityImportance(size_t nodeID, size_t varID, double decrease); double computePredictionMSE(); void cleanUpInternal() override { counter.clear(); counter.shrink_to_fit(); sums.clear(); sums.shrink_to_fit(); } std::vector counter; std::vector sums; }; } // namespace ranger #endif /* TREEREGRESSION_H_ */ ranger/src/ForestSurvival.h0000755000176200001440000000421214027301517015470 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef FORESTSURVIVAL_H_ #define FORESTSURVIVAL_H_ #include #include #include "globals.h" #include "Forest.h" #include "TreeSurvival.h" namespace ranger { class ForestSurvival: public Forest { public: ForestSurvival() = default; ForestSurvival(const ForestSurvival&) = delete; ForestSurvival& operator=(const ForestSurvival&) = delete; virtual ~ForestSurvival() override = default; void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector> >& forest_chf, std::vector& unique_timepoints, std::vector& is_ordered_variable); std::vector>> getChf() const; const std::vector& getUniqueTimepoints() const { return unique_timepoints; } private: void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; void computePredictionErrorInternal() override; void writeOutputInternal() override; void writeConfusionFile() override; void writePredictionFile() override; void saveToFileInternal(std::ofstream& outfile) override; void loadFromFileInternal(std::ifstream& infile) override; std::vector unique_timepoints; std::vector response_timepointIDs; private: const std::vector& getTreePrediction(size_t tree_idx, size_t sample_idx) const; size_t getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const; }; } // namespace ranger #endif /* FORESTSURVIVAL_H_ */ ranger/src/ForestClassification.h0000755000176200001440000000466714027301517016626 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #ifndef FORESTCLASSIFICATION_H_ #define FORESTCLASSIFICATION_H_ #include #include #include #include #include "globals.h" #include "Forest.h" namespace ranger { class ForestClassification: public Forest { public: ForestClassification() = default; ForestClassification(const ForestClassification&) = delete; ForestClassification& operator=(const ForestClassification&) = delete; virtual ~ForestClassification() override = default; void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector& is_ordered_variable); const std::vector& getClassValues() const { return class_values; } void setClassWeights(std::vector& class_weights) { this->class_weights = class_weights; } protected: void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; void computePredictionErrorInternal() override; void writeOutputInternal() override; void writeConfusionFile() override; void writePredictionFile() override; void saveToFileInternal(std::ofstream& outfile) override; void loadFromFileInternal(std::ifstream& infile) override; // Classes of the dependent variable and classIDs for responses std::vector class_values; std::vector response_classIDs; std::vector> sampleIDs_per_class; // Splitting weights std::vector class_weights; // Table with classifications and true classes std::map, size_t> classification_table; private: double getTreePrediction(size_t tree_idx, size_t sample_idx) const; size_t getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const; }; } // namespace ranger #endif /* FORESTCLASSIFICATION_H_ */ ranger/src/DataSparse.cpp0000755000176200001440000000256414027301517015064 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of Ranger. Ranger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Ranger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Ranger. If not, see . Written by: Marvin N. Wright Institut für Medizinische Biometrie und Statistik Universität zu Lübeck Ratzeburger Allee 160 23562 Lübeck Germany http://www.imbs-luebeck.de #-------------------------------------------------------------------------------*/ #include "DataSparse.h" namespace ranger { DataSparse::DataSparse(Eigen::SparseMatrix& x, Rcpp::NumericMatrix& y, std::vector variable_names, size_t num_rows, size_t num_cols) : x { }{ this->x.swap(x); this->y = y; this->variable_names = variable_names; this->num_rows = num_rows; this->num_cols = num_cols; this->num_cols_no_snp = num_cols; } } // namespace ranger ranger/src/TreeClassification.cpp0000755000176200001440000007271314027301517016613 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. 
Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #include #include #include #include #include #include "TreeClassification.h" #include "utility.h" #include "Data.h" namespace ranger { TreeClassification::TreeClassification(std::vector* class_values, std::vector* response_classIDs, std::vector>* sampleIDs_per_class, std::vector* class_weights) : class_values(class_values), response_classIDs(response_classIDs), sampleIDs_per_class(sampleIDs_per_class), class_weights( class_weights), counter(0), counter_per_class(0) { } TreeClassification::TreeClassification(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector* class_values, std::vector* response_classIDs) : Tree(child_nodeIDs, split_varIDs, split_values), class_values(class_values), response_classIDs(response_classIDs), sampleIDs_per_class( 0), class_weights(0), counter { }, counter_per_class { } { } void TreeClassification::allocateMemory() { // Init counters if not in memory efficient mode if (!memory_saving_splitting) { size_t num_classes = class_values->size(); size_t max_num_splits = data->getMaxNumUniqueValues(); // Use number of random splits for extratrees if (splitrule == EXTRATREES && num_random_splits > max_num_splits) { max_num_splits = num_random_splits; } counter.resize(max_num_splits); counter_per_class.resize(num_classes * max_num_splits); } } double TreeClassification::estimate(size_t nodeID) { // Count classes over samples in node and return class with maximum count std::vector class_count = std::vector(class_values->size(), 0.0); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t value = (*response_classIDs)[sampleID]; class_count[value] += (*class_weights)[value]; } if (end_pos[nodeID] > start_pos[nodeID]) { size_t result_classID = mostFrequentClass(class_count, random_number_generator); return ((*class_values)[result_classID]); } else { throw std::runtime_error("Error: Empty node."); } } void TreeClassification::appendToFileInternal(std::ofstream& file) { // #nocov start // Empty on purpose } // #nocov end bool TreeClassification::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { // Stop if maximum node size or depth reached size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { split_values[nodeID] = estimate(nodeID); return true; } // Check if node is pure and set split_value to estimate and stop if pure bool pure = true; double pure_value = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; double value = data->get_y(sampleID, 0); if (pos != start_pos[nodeID] && value != pure_value) { pure = false; break; } pure_value = value; } if (pure) { split_values[nodeID] = pure_value; return true; } // Find best split, stop if no decrease of impurity bool stop; if (splitrule == EXTRATREES) { stop = findBestSplitExtraTrees(nodeID, possible_split_varIDs); } else { stop = findBestSplit(nodeID, possible_split_varIDs); } if (stop) { split_values[nodeID] = estimate(nodeID); return true; } return false; } void TreeClassification::createEmptyNodeInternal() { // Empty on purpose } double TreeClassification::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { size_t 
num_predictions = prediction_terminal_nodeIDs.size();
  size_t num_missclassifications = 0;
  for (size_t i = 0; i < num_predictions; ++i) {
    size_t terminal_nodeID = prediction_terminal_nodeIDs[i];
    double predicted_value = split_values[terminal_nodeID];
    double real_value = data->get_y(oob_sampleIDs[i], 0);
    if (predicted_value != real_value) {
      ++num_missclassifications;
      if (prediction_error_casewise) {
        (*prediction_error_casewise)[i] = 1;
      }
    } else {
      if (prediction_error_casewise) {
        (*prediction_error_casewise)[i] = 0;
      }
    }
  }
  return (1.0 - (double) num_missclassifications / (double) num_predictions);
}

bool TreeClassification::findBestSplit(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {

  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  size_t num_classes = class_values->size();
  double best_decrease = -1;
  size_t best_varID = 0;
  double best_value = 0;

  std::vector<size_t> class_counts(num_classes);
  // Compute overall class counts
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    uint sample_classID = (*response_classIDs)[sampleID];
    ++class_counts[sample_classID];
  }

  // For all possible split variables
  for (auto& varID : possible_split_varIDs) {
    // Find best split value, if ordered consider all values as split values, else all 2-partitions
    if (data->isOrderedVariable(varID)) {
      // Use memory saving method if option set
      if (memory_saving_splitting) {
        findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
            best_decrease);
      } else {
        // Use faster method for both cases
        double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID);
        if (q < Q_THRESHOLD) {
          findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
              best_decrease);
        } else {
          findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
              best_decrease);
        }
      }
    } else {
      findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
          best_decrease);
    }
  }

  // Stop if no good split found
  if (best_decrease < 0) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute gini index for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addGiniImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease) {

  // Create possible split values
  std::vector<double> possible_split_values;
  data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (possible_split_values.size() < 2) {
    return;
  }

  const size_t num_splits = possible_split_values.size();
  if (memory_saving_splitting) {
    std::vector<size_t> class_counts_right(num_splits * num_classes), n_right(num_splits);
    findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
        best_decrease, possible_split_values, class_counts_right, n_right);
  } else {
    std::fill_n(counter_per_class.begin(), num_splits * num_classes, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts,
num_samples_node, best_value, best_varID, best_decrease, possible_split_values, counter_per_class, counter); } } void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, const std::vector& possible_split_values, std::vector& counter_per_class, std::vector& counter) { for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; size_t idx = std::lower_bound(possible_split_values.begin(), possible_split_values.end(), data->get_x(sampleID, varID)) - possible_split_values.begin(); ++counter_per_class[idx * num_classes + sample_classID]; ++counter[idx]; } size_t n_left = 0; std::vector class_counts_left(num_classes); // Compute decrease of impurity for each split for (size_t i = 0; i < possible_split_values.size() - 1; ++i) { // Stop if nothing here if (counter[i] == 0) { continue; } n_left += counter[i]; // Stop if right child empty size_t n_right = num_samples_node - n_left; if (n_right == 0) { break; } double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; } // TPR is number of outcome 1s in one node / total number of 1s // FPR is number of outcome 0s in one node / total number of 0s double tpr = (double) (class_counts[1] - class_counts_left[1]) / (double) class_counts[1]; double fpr = (double) (class_counts[0] - class_counts_left[0]) / (double) class_counts[0]; // Decrease of impurity double a1 = sqrt(tpr) - sqrt(fpr); double a2 = sqrt(1 - tpr) - sqrt(1 - fpr); decrease = sqrt(a1 * a1 + a2 * a2); } else { // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; size_t class_count_right = class_counts[j] - class_counts_left[j]; sum_left += (*class_weights)[j] * class_counts_left[j] * class_counts_left[j]; sum_right += (*class_weights)[j] * class_count_right * class_count_right; } // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { // Use mid-point split best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; best_varID = varID; best_decrease = decrease; // Use smaller value if average is numerically the same as the larger value if (best_value == possible_split_values[i + 1]) { best_value = possible_split_values[i]; } } } } void TreeClassification::findBestSplitValueLargeQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Set counters to 0 size_t num_unique = data->getNumUniqueDataValues(varID); std::fill_n(counter_per_class.begin(), num_unique * num_classes, 0); std::fill_n(counter.begin(), num_unique, 0); // Count values for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t index = data->getIndex(sampleID, varID); size_t classID = (*response_classIDs)[sampleID]; ++counter[index]; ++counter_per_class[index * num_classes + classID]; } size_t n_left = 0; std::vector class_counts_left(num_classes); // Compute decrease of impurity for each split for (size_t i = 0; i < num_unique - 1; 
++i) { // Stop if nothing here if (counter[i] == 0) { continue; } n_left += counter[i]; // Stop if right child empty size_t n_right = num_samples_node - n_left; if (n_right == 0) { break; } double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; } // TPR is number of outcome 1s in one node / total number of 1s // FPR is number of outcome 0s in one node / total number of 0s double tpr = (double) (class_counts[1] - class_counts_left[1]) / (double) class_counts[1]; double fpr = (double) (class_counts[0] - class_counts_left[0]) / (double) class_counts[0]; // Decrease of impurity double a1 = sqrt(tpr) - sqrt(fpr); double a2 = sqrt(1 - tpr) - sqrt(1 - fpr); decrease = sqrt(a1 * a1 + a2 * a2); } else { // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { class_counts_left[j] += counter_per_class[i * num_classes + j]; size_t class_count_right = class_counts[j] - class_counts_left[j]; sum_left += (*class_weights)[j] * class_counts_left[j] * class_counts_left[j]; sum_right += (*class_weights)[j] * class_count_right * class_count_right; } // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { // Find next value in this node size_t j = i + 1; while (j < num_unique && counter[j] == 0) { ++j; } // Use mid-point split best_value = (data->getUniqueDataValue(varID, i) + data->getUniqueDataValue(varID, j)) / 2; best_varID = varID; best_decrease = decrease; // Use smaller value if average is numerically the same as the larger value if (best_value == data->getUniqueDataValue(varID, j)) { best_value = data->getUniqueDataValue(varID, i); } } } } void TreeClassification::findBestSplitValueUnordered(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Create possible split values std::vector factor_levels; data->getAllValues(factor_levels, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (factor_levels.size() < 2) { return; } // Number of possible splits is 2^num_levels size_t num_splits = (1ULL << factor_levels.size()); // Compute decrease of impurity for each possible split // Split where all left (0) or all right (1) are excluded // The second half of numbers is just left/right switched the first half -> Exclude second half for (size_t local_splitID = 1; local_splitID < num_splits / 2; ++local_splitID) { // Compute overall splitID by shifting local factorIDs to global positions size_t splitID = 0; for (size_t j = 0; j < factor_levels.size(); ++j) { if ((local_splitID & (1ULL << j))) { double level = factor_levels[j]; size_t factorID = floor(level) - 1; splitID = splitID | (1ULL << factorID); } } // Initialize std::vector class_counts_right(num_classes); size_t n_right = 0; // Count classes in left and right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++n_right; ++class_counts_right[sample_classID]; } } size_t n_left = 
num_samples_node - n_right;

    double decrease;
    if (splitrule == HELLINGER) {
      // TPR is number of outcome 1s in one node / total number of 1s
      // FPR is number of outcome 0s in one node / total number of 0s
      double tpr = (double) class_counts_right[1] / (double) class_counts[1];
      double fpr = (double) class_counts_right[0] / (double) class_counts[0];

      // Decrease of impurity
      double a1 = sqrt(tpr) - sqrt(fpr);
      double a2 = sqrt(1 - tpr) - sqrt(1 - fpr);
      decrease = sqrt(a1 * a1 + a2 * a2);
    } else {
      // Sum of squares
      double sum_left = 0;
      double sum_right = 0;
      for (size_t j = 0; j < num_classes; ++j) {
        size_t class_count_right = class_counts_right[j];
        size_t class_count_left = class_counts[j] - class_count_right;
        sum_right += (*class_weights)[j] * class_count_right * class_count_right;
        sum_left += (*class_weights)[j] * class_count_left * class_count_left;
      }

      // Decrease of impurity
      decrease = sum_left / (double) n_left + sum_right / (double) n_right;
    }

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = splitID;
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}

bool TreeClassification::findBestSplitExtraTrees(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {

  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  size_t num_classes = class_values->size();
  double best_decrease = -1;
  size_t best_varID = 0;
  double best_value = 0;

  std::vector<size_t> class_counts(num_classes);
  // Compute overall class counts
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    uint sample_classID = (*response_classIDs)[sampleID];
    ++class_counts[sample_classID];
  }

  // For all possible split variables
  for (auto& varID : possible_split_varIDs) {
    // Find best split value, if ordered consider all values as split values, else all 2-partitions
    if (data->isOrderedVariable(varID)) {
      findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
          best_decrease);
    } else {
      findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value,
          best_varID, best_decrease);
    }
  }

  // Stop if no good split found
  if (best_decrease < 0) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute gini index for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addGiniImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease) {

  // Get min/max values of covariate in node
  double min;
  double max;
  data->getMinMaxValues(min, max, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (min == max) {
    return;
  }

  // Create possible split values: Draw randomly between min and max
  std::vector<double> possible_split_values;
  std::uniform_real_distribution<double> udist(min, max);
  possible_split_values.reserve(num_random_splits);
  for (size_t i = 0; i < num_random_splits; ++i) {
    possible_split_values.push_back(udist(random_number_generator));
  }
  if (num_random_splits > 1) {
    std::sort(possible_split_values.begin(), possible_split_values.end());
  }

  const size_t num_splits = possible_split_values.size();
  if
(memory_saving_splitting) {
    std::vector<size_t> class_counts_right(num_splits * num_classes), n_right(num_splits);
    findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
        best_decrease, possible_split_values, class_counts_right, n_right);
  } else {
    std::fill_n(counter_per_class.begin(), num_splits * num_classes, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID,
        best_decrease, possible_split_values, counter_per_class, counter);
  }
}

void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease, const std::vector<double>& possible_split_values, std::vector<size_t>& class_counts_right,
    std::vector<size_t>& n_right) {
  const size_t num_splits = possible_split_values.size();

  // Count samples in right child per class and possible split
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    double value = data->get_x(sampleID, varID);
    uint sample_classID = (*response_classIDs)[sampleID];

    // Count samples until split_value reached
    for (size_t i = 0; i < num_splits; ++i) {
      if (value > possible_split_values[i]) {
        ++n_right[i];
        ++class_counts_right[i * num_classes + sample_classID];
      } else {
        break;
      }
    }
  }

  // Compute decrease of impurity for each possible split
  for (size_t i = 0; i < num_splits; ++i) {
    // Stop if one child empty
    size_t n_left = num_samples_node - n_right[i];
    if (n_left == 0 || n_right[i] == 0) {
      continue;
    }

    // Sum of squares
    double sum_left = 0;
    double sum_right = 0;
    for (size_t j = 0; j < num_classes; ++j) {
      size_t class_count_right = class_counts_right[i * num_classes + j];
      size_t class_count_left = class_counts[j] - class_count_right;
      sum_right += (*class_weights)[j] * class_count_right * class_count_right;
      sum_left += (*class_weights)[j] * class_count_left * class_count_left;
    }

    // Decrease of impurity
    double decrease = sum_left / (double) n_left + sum_right / (double) n_right[i];

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = possible_split_values[i];
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}

void TreeClassification::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, size_t num_classes,
    const std::vector<size_t>& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID,
    double& best_decrease) {

  size_t num_unique_values = data->getNumUniqueDataValues(varID);

  // Get all factor indices in node
  std::vector<bool> factor_in_node(num_unique_values, false);
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    size_t index = data->getIndex(sampleID, varID);
    factor_in_node[index] = true;
  }

  // Vector of indices in and out of node
  std::vector<size_t> indices_in_node;
  std::vector<size_t> indices_out_node;
  indices_in_node.reserve(num_unique_values);
  indices_out_node.reserve(num_unique_values);
  for (size_t i = 0; i < num_unique_values; ++i) {
    if (factor_in_node[i]) {
      indices_in_node.push_back(i);
    } else {
      indices_out_node.push_back(i);
    }
  }

  // Generate num_random_splits splits
  for (size_t i = 0; i < num_random_splits; ++i) {
    std::vector<size_t> split_subset;
    split_subset.reserve(num_unique_values);

    // Draw random subsets, sample all partitions with equal probability
    if (indices_in_node.size() > 1) {
      size_t num_partitions =
(2ULL << (indices_in_node.size() - 1ULL)) - 2ULL; // 2^n-2 (don't allow full or empty) std::uniform_int_distribution udist(1, num_partitions); size_t splitID_in_node = udist(random_number_generator); for (size_t j = 0; j < indices_in_node.size(); ++j) { if ((splitID_in_node & (1ULL << j)) > 0) { split_subset.push_back(indices_in_node[j]); } } } if (indices_out_node.size() > 1) { size_t num_partitions = (2ULL << (indices_out_node.size() - 1ULL)) - 1ULL; // 2^n-1 (allow full or empty) std::uniform_int_distribution udist(0, num_partitions); size_t splitID_out_node = udist(random_number_generator); for (size_t j = 0; j < indices_out_node.size(); ++j) { if ((splitID_out_node & (1ULL << j)) > 0) { split_subset.push_back(indices_out_node[j]); } } } // Assign union of the two subsets to right child size_t splitID = 0; for (auto& idx : split_subset) { splitID |= 1ULL << idx; } // Initialize std::vector class_counts_right(num_classes); size_t n_right = 0; // Count classes in left and right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++n_right; ++class_counts_right[sample_classID]; } } size_t n_left = num_samples_node - n_right; // Sum of squares double sum_left = 0; double sum_right = 0; for (size_t j = 0; j < num_classes; ++j) { size_t class_count_right = class_counts_right[j]; size_t class_count_left = class_counts[j] - class_count_right; sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right; // Regularization regularize(decrease, varID); // If better than before, use this if (decrease > best_decrease) { best_value = splitID; best_varID = varID; best_decrease = decrease; } } } void TreeClassification::addGiniImportance(size_t nodeID, size_t varID, double decrease) { double best_decrease = decrease; if (splitrule != HELLINGER) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; std::vector class_counts; class_counts.resize(class_values->size(), 0); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; class_counts[sample_classID]++; } double sum_node = 0; for (size_t i = 0; i < class_counts.size(); ++i) { sum_node += (*class_weights)[i] * class_counts[i] * class_counts[i]; } double impurity_node = (sum_node / (double) num_samples_node); // Account for the regularization regularize(impurity_node, varID); best_decrease = decrease - impurity_node; } // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { (*variable_importance)[tempvarID] -= best_decrease; } else { (*variable_importance)[tempvarID] += best_decrease; } } void TreeClassification::bootstrapClassWise() { // Number of samples is sum of sample fraction * number of samples size_t num_samples_inbag = 0; double sum_sample_fraction = 0; for (auto& s : *sample_fraction) { num_samples_inbag += (size_t) num_samples * s; 
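// Note that the (size_t) cast truncates, so num_samples_inbag can be slightly
// smaller than the number of samples actually drawn below (which uses round());
// this is harmless because the value is only used as a reserve() hint.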
sum_sample_fraction += s;
  }

  // Reserve space, reserve a little more to be safe
  sampleIDs.reserve(num_samples_inbag);
  oob_sampleIDs.reserve(num_samples * (exp(-sum_sample_fraction) + 0.1));

  // Start with all samples OOB
  inbag_counts.resize(num_samples, 0);

  // Draw samples for each class
  for (size_t i = 0; i < sample_fraction->size(); ++i) {
    // Draw samples of class with replacement as inbag and mark as not OOB
    size_t num_samples_class = (*sampleIDs_per_class)[i].size();
    size_t num_samples_inbag_class = round(num_samples * (*sample_fraction)[i]);
    std::uniform_int_distribution<size_t> unif_dist(0, num_samples_class - 1);
    for (size_t s = 0; s < num_samples_inbag_class; ++s) {
      size_t draw = (*sampleIDs_per_class)[i][unif_dist(random_number_generator)];
      sampleIDs.push_back(draw);
      ++inbag_counts[draw];
    }
  }

  // Save OOB samples
  for (size_t s = 0; s < inbag_counts.size(); ++s) {
    if (inbag_counts[s] == 0) {
      oob_sampleIDs.push_back(s);
    }
  }
  num_samples_oob = oob_sampleIDs.size();

  if (!keep_inbag) {
    inbag_counts.clear();
    inbag_counts.shrink_to_fit();
  }
}

void TreeClassification::bootstrapWithoutReplacementClassWise() {
  // Draw samples for each class
  for (size_t i = 0; i < sample_fraction->size(); ++i) {
    size_t num_samples_class = (*sampleIDs_per_class)[i].size();
    size_t num_samples_inbag_class = round(num_samples * (*sample_fraction)[i]);
    shuffleAndSplitAppend(sampleIDs, oob_sampleIDs, num_samples_class, num_samples_inbag_class,
        (*sampleIDs_per_class)[i], random_number_generator);
  }
  num_samples_oob = oob_sampleIDs.size();

  if (keep_inbag) {
    // All observations are in-bag 0 or 1 times
    inbag_counts.resize(num_samples, 1);
    for (size_t i = 0; i < oob_sampleIDs.size(); i++) {
      inbag_counts[oob_sampleIDs[i]] = 0;
    }
  }
}

} // namespace ranger

ranger/src/TreeSurvival.h0000755000176200001440000001104514027301517015127 0ustar liggesusers/*-------------------------------------------------------------------------------
 This file is part of ranger.

 Copyright (c) [2014-2018] [Marvin N. Wright]

 This software may be modified and distributed under the terms of the MIT license.

 Please note that the C++ core of ranger is distributed under MIT license and the
 R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/ #ifndef TREESURVIVAL_H_ #define TREESURVIVAL_H_ #include #include "globals.h" #include "Tree.h" namespace ranger { class TreeSurvival: public Tree { public: TreeSurvival(std::vector* unique_timepoints, std::vector* response_timepointIDs); // Create from loaded forest TreeSurvival(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector> chf, std::vector* unique_timepoints, std::vector* response_timepointIDs); TreeSurvival(const TreeSurvival&) = delete; TreeSurvival& operator=(const TreeSurvival&) = delete; virtual ~TreeSurvival() override = default; void allocateMemory() override; void appendToFileInternal(std::ofstream& file) override; void computePermutationImportanceInternal(std::vector>* permutations); const std::vector>& getChf() const { return chf; } const std::vector& getPrediction(size_t sampleID) const { size_t terminal_nodeID = prediction_terminal_nodeIDs[sampleID]; return chf[terminal_nodeID]; } size_t getPredictionTerminalNodeID(size_t sampleID) const { return prediction_terminal_nodeIDs[sampleID]; } private: void createEmptyNodeInternal() override; void computeSurvival(size_t nodeID); double computePredictionAccuracyInternal(std::vector* prediction_error_casewise) override; bool splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) override; bool findBestSplit(size_t nodeID, std::vector& possible_split_varIDs); bool findBestSplitMaxstat(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueLogRank(size_t nodeID, size_t varID, std::vector& possible_split_values, double& best_value, size_t& best_varID, double& best_logrank); void findBestSplitValueLogRankUnordered(size_t nodeID, size_t varID, std::vector& factor_levels, double& best_value, size_t& best_varID, double& best_logrank); void findBestSplitValueAUC(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_auc); void computeDeathCounts(size_t nodeID); void computeChildDeathCounts(size_t nodeID, size_t varID, std::vector& possible_split_values, std::vector& num_samples_right_child, std::vector& num_samples_at_risk_right_child, std::vector& num_deaths_right_child, size_t num_splits); void computeAucSplit(double time_k, double time_l, double status_k, double status_l, double value_k, double value_l, size_t num_splits, std::vector& possible_split_values, std::vector& num_count, std::vector& num_total); void findBestSplitValueLogRank(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank); void findBestSplitValueLogRankUnordered(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank); bool findBestSplitExtraTrees(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank); void findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank); void addImpurityImportance(size_t nodeID, size_t varID, double decrease); void cleanUpInternal() override { num_deaths.clear(); num_deaths.shrink_to_fit(); num_samples_at_risk.clear(); num_samples_at_risk.shrink_to_fit(); } // Unique time points for all individuals (not only this bootstrap), sorted const std::vector* unique_timepoints; size_t num_timepoints; const std::vector* response_timepointIDs; // For all terminal nodes CHF for all unique timepoints. 
For other nodes empty vector. std::vector> chf; // Fields to save to while tree growing std::vector num_deaths; std::vector num_samples_at_risk; }; } // namespace ranger #endif /* TREESURVIVAL_H_ */ ranger/src/ForestClassification.cpp0000755000176200001440000002573114027301517017154 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #include #include #include #include #include #include #include #include "utility.h" #include "ForestClassification.h" #include "TreeClassification.h" #include "Data.h" namespace ranger { void ForestClassification::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector& is_ordered_variable) { this->num_trees = num_trees; this->class_values = class_values; data->setIsOrderedVariable(is_ordered_variable); // Create trees trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], &this->class_values, &response_classIDs)); } // Create thread ranges equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } void ForestClassification::initInternal() { // If mtry not set, use floored square root of number of independent variables. if (mtry == 0) { unsigned long temp = sqrt((double) num_independent_variables); mtry = std::max((unsigned long) 1, temp); } // Set minimal node size if (min_node_size == 0) { min_node_size = DEFAULT_MIN_NODE_SIZE_CLASSIFICATION; } // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { double value = data->get_y(i, 0); // If classID is already in class_values, use ID. Else create a new one. 
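// Worked sketch of the find-or-append idiom below (hypothetical responses): for
// y = {2, 7, 2, 5} the loop builds class_values = {2, 7, 5} and
// response_classIDs = {0, 1, 0, 2}; std::find returns class_values.end() for an
// unseen value, so classID == class_values.size() triggers the push_back.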
uint classID = find(class_values.begin(), class_values.end(), value) - class_values.begin(); if (classID == class_values.size()) { class_values.push_back(value); } response_classIDs.push_back(classID); } if (splitrule == HELLINGER && class_values.size() != 2) { throw std::runtime_error("Hellinger splitrule only implemented for binary classification."); } } // Create sampleIDs_per_class if required if (sample_fraction.size() > 1) { sampleIDs_per_class.resize(sample_fraction.size()); for (auto& v : sampleIDs_per_class) { v.reserve(num_samples); } for (size_t i = 0; i < num_samples; ++i) { size_t classID = response_classIDs[i]; sampleIDs_per_class[classID].push_back(i); } } // Set class weights all to 1 class_weights = std::vector(class_values.size(), 1.0); // Sort data if memory saving mode if (!memory_saving_splitting) { data->sort(); } } void ForestClassification::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); } } void ForestClassification::allocatePredictMemory() { size_t num_prediction_samples = data->getNumRows(); if (predict_all || prediction_type == TERMINALNODES) { predictions = std::vector>>(1, std::vector>(num_prediction_samples, std::vector(num_trees))); } else { predictions = std::vector>>(1, std::vector>(1, std::vector(num_prediction_samples))); } } void ForestClassification::predictInternal(size_t sample_idx) { if (predict_all || prediction_type == TERMINALNODES) { // Get all tree predictions for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { if (prediction_type == TERMINALNODES) { predictions[0][sample_idx][tree_idx] = getTreePredictionTerminalNodeID(tree_idx, sample_idx); } else { predictions[0][sample_idx][tree_idx] = getTreePrediction(tree_idx, sample_idx); } } } else { // Count classes over trees and save class with maximum count std::unordered_map class_count; for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { ++class_count[getTreePrediction(tree_idx, sample_idx)]; } predictions[0][0][sample_idx] = mostFrequentValue(class_count, random_number_generator); } } void ForestClassification::computePredictionErrorInternal() { // Class counts for samples std::vector> class_counts; class_counts.reserve(num_samples); for (size_t i = 0; i < num_samples; ++i) { class_counts.push_back(std::unordered_map()); } // For each tree loop over OOB samples and count classes for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { for (size_t sample_idx = 0; sample_idx < trees[tree_idx]->getNumSamplesOob(); ++sample_idx) { size_t sampleID = trees[tree_idx]->getOobSampleIDs()[sample_idx]; ++class_counts[sampleID][getTreePrediction(tree_idx, sample_idx)]; } } // Compute majority vote for each sample predictions = std::vector>>(1, std::vector>(1, std::vector(num_samples))); for (size_t i = 0; i < num_samples; ++i) { if (!class_counts[i].empty()) { predictions[0][0][i] = mostFrequentValue(class_counts[i], random_number_generator); } else { predictions[0][0][i] = NAN; } } // Compare predictions with true data size_t num_missclassifications = 0; size_t num_predictions = 0; for (size_t i = 0; i < predictions[0][0].size(); ++i) { double predicted_value = predictions[0][0][i]; if (!std::isnan(predicted_value)) { ++num_predictions; double real_value = data->get_y(i, 0); if (predicted_value != real_value) { ++num_missclassifications; } ++classification_table[std::make_pair(real_value, predicted_value)]; } } overall_prediction_error = 
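// Hedged numeric illustration of the ratio assigned below: with 150 samples that
// all received at least one OOB vote (num_predictions == 150) and 6 majority
// votes disagreeing with data->get_y(), the OOB error is 6 / 150 = 0.04; samples
// without any OOB vote were set to NAN above and enter neither count.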
(double) num_missclassifications / (double) num_predictions; } // #nocov start void ForestClassification::writeOutputInternal() { if (verbose_out) { *verbose_out << "Tree type: " << "Classification" << std::endl; } } void ForestClassification::writeConfusionFile() { // Open confusion file for writing std::string filename = output_prefix + ".confusion"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to confusion file: " + filename + "."); } // Write confusion to file outfile << "Overall OOB prediction error (Fraction missclassified): " << overall_prediction_error << std::endl; outfile << std::endl; outfile << "Class specific prediction errors:" << std::endl; outfile << " "; for (auto& class_value : class_values) { outfile << " " << class_value; } outfile << std::endl; for (auto& predicted_value : class_values) { outfile << "predicted " << predicted_value << " "; for (auto& real_value : class_values) { size_t value = classification_table[std::make_pair(real_value, predicted_value)]; outfile << value; if (value < 10) { outfile << " "; } else if (value < 100) { outfile << " "; } else if (value < 1000) { outfile << " "; } else if (value < 10000) { outfile << " "; } else if (value < 100000) { outfile << " "; } } outfile << std::endl; } outfile.close(); if (verbose_out) *verbose_out << "Saved confusion matrix to file " << filename << "." << std::endl; } void ForestClassification::writePredictionFile() { // Open prediction file for writing std::string filename = output_prefix + ".prediction"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to prediction file: " + filename + "."); } // Write outfile << "Predictions: " << std::endl; if (predict_all) { for (size_t k = 0; k < num_trees; ++k) { outfile << "Tree " << k << ":" << std::endl; for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { outfile << predictions[i][j][k] << std::endl; } } outfile << std::endl; } } else { for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { for (size_t k = 0; k < predictions[i][j].size(); ++k) { outfile << predictions[i][j][k] << std::endl; } } } } if (verbose_out) *verbose_out << "Saved predictions to file " << filename << "." << std::endl; } void ForestClassification::saveToFileInternal(std::ofstream& outfile) { // Write num_variables outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_CLASSIFICATION; outfile.write((char*) &treetype, sizeof(treetype)); // Write class_values saveVector1D(class_values, outfile); } void ForestClassification::loadFromFileInternal(std::ifstream& infile) { // Read number of variables size_t num_variables_saved; infile.read((char*) &num_variables_saved, sizeof(num_variables_saved)); // Read treetype TreeType treetype; infile.read((char*) &treetype, sizeof(treetype)); if (treetype != TREE_CLASSIFICATION) { throw std::runtime_error("Wrong treetype. 
Loaded file is not a classification forest."); } // Read class_values readVector1D(class_values, infile); for (size_t i = 0; i < num_trees; ++i) { // Read data std::vector> child_nodeIDs; readVector2D(child_nodeIDs, infile); std::vector split_varIDs; readVector1D(split_varIDs, infile); std::vector split_values; readVector1D(split_values, infile); // If dependent variable not in test data, throw error if (num_variables_saved != num_independent_variables) { throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree trees.push_back( make_unique(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs)); } } double ForestClassification::getTreePrediction(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPrediction(sample_idx); } size_t ForestClassification::getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPredictionTerminalNodeID(sample_idx); } // #nocov end }// namespace ranger ranger/src/TreeClassification.h0000755000176200001440000001136214027301517016251 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef TREECLASSIFICATION_H_ #define TREECLASSIFICATION_H_ #include #include "globals.h" #include "Tree.h" namespace ranger { class TreeClassification: public Tree { public: TreeClassification(std::vector* class_values, std::vector* response_classIDs, std::vector>* sampleIDs_per_class, std::vector* class_weights); // Create from loaded forest TreeClassification(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector* class_values, std::vector* response_classIDs); TreeClassification(const TreeClassification&) = delete; TreeClassification& operator=(const TreeClassification&) = delete; virtual ~TreeClassification() override = default; void allocateMemory() override; double estimate(size_t nodeID); void computePermutationImportanceInternal(std::vector>* permutations); void appendToFileInternal(std::ofstream& file) override; double getPrediction(size_t sampleID) const { size_t terminal_nodeID = prediction_terminal_nodeIDs[sampleID]; return split_values[terminal_nodeID]; } size_t getPredictionTerminalNodeID(size_t sampleID) const { return prediction_terminal_nodeIDs[sampleID]; } private: bool splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) override; void createEmptyNodeInternal() override; double computePredictionAccuracyInternal(std::vector* prediction_error_casewise) override; // Called by splitNodeInternal(). Sets split_varIDs and split_values. 
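// Note on the overloads declared below: the findBestSplitValue* variants with
// extra buffer arguments (possible_split_values, counter, counter_per_class)
// reuse pre-allocated working memory instead of allocating per node; the
// SmallQ/LargeQ pair is presumably selected by the number of unique values of
// the candidate variable (the dispatch lives in TreeClassification.cpp, not
// shown here), so the value-indexed counting buffers stay small in the common case.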
bool findBestSplit(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueSmallQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, const std::vector& possible_split_values, std::vector& counter_per_class, std::vector& counter); void findBestSplitValueLargeQ(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueUnordered(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); bool findBestSplitExtraTrees(size_t nodeID, std::vector& possible_split_varIDs); void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void findBestSplitValueExtraTrees(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, const std::vector& possible_split_values, std::vector& class_counts_right, std::vector& n_right); void findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, size_t num_classes, const std::vector& class_counts, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease); void addGiniImportance(size_t nodeID, size_t varID, double decrease); void bootstrapClassWise() override; void bootstrapWithoutReplacementClassWise() override; void cleanUpInternal() override { counter.clear(); counter.shrink_to_fit(); counter_per_class.clear(); counter_per_class.shrink_to_fit(); } // Classes of the dependent variable and classIDs for responses const std::vector* class_values; const std::vector* response_classIDs; const std::vector>* sampleIDs_per_class; // Splitting weights const std::vector* class_weights; std::vector counter; std::vector counter_per_class; }; } // namespace ranger #endif /* TREECLASSIFICATION_H_ */ ranger/src/Tree.h0000755000176200001440000001556514027301517013406 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #ifndef TREE_H_ #define TREE_H_ #include #include #include #include #include "globals.h" #include "Data.h" namespace ranger { class Tree { public: Tree(); // Create from loaded forest Tree(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values); virtual ~Tree() = default; Tree(const Tree&) = delete; Tree& operator=(const Tree&) = delete; void init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used); virtual void allocateMemory() = 0; void grow(std::vector* variable_importance); void predict(const Data* prediction_data, bool oob_prediction); void computePermutationImportance(std::vector& forest_importance, std::vector& forest_variance, std::vector& forest_importance_casewise); void appendToFile(std::ofstream& file); virtual void appendToFileInternal(std::ofstream& file) = 0; const std::vector>& getChildNodeIDs() const { return child_nodeIDs; } const std::vector& getSplitValues() const { return split_values; } const std::vector& getSplitVarIDs() const { return split_varIDs; } const std::vector& getOobSampleIDs() const { return oob_sampleIDs; } size_t getNumSamplesOob() const { return num_samples_oob; } const std::vector& getInbagCounts() const { return inbag_counts; } protected: void createPossibleSplitVarSubset(std::vector& result); bool splitNode(size_t nodeID); virtual bool splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) = 0; void createEmptyNode(); virtual void createEmptyNodeInternal() = 0; size_t dropDownSamplePermuted(size_t permuted_varID, size_t sampleID, size_t permuted_sampleID); void permuteAndPredictOobSamples(size_t permuted_varID, std::vector& permutations); virtual double computePredictionAccuracyInternal(std::vector* prediction_error_casewise) = 0; void bootstrap(); void bootstrapWithoutReplacement(); void bootstrapWeighted(); void bootstrapWithoutReplacementWeighted(); virtual void bootstrapClassWise(); virtual void bootstrapWithoutReplacementClassWise(); void setManualInbag(); virtual void cleanUpInternal() = 0; void regularize(double& decrease, size_t varID) { if (regularization) { if (importance_mode == IMP_GINI_CORRECTED) { varID = data->getUnpermutedVarID(varID); } if ((*regularization_factor)[varID] != 1) { if (!(*split_varIDs_used)[varID]) { if (regularization_usedepth) { decrease *= std::pow((*regularization_factor)[varID], depth + 1); } else { decrease *= (*regularization_factor)[varID]; } } } } } void regularizeNegative(double& decrease, size_t varID) { if (regularization) { if (importance_mode == IMP_GINI_CORRECTED) { varID = data->getUnpermutedVarID(varID); } if ((*regularization_factor)[varID] != 1) { if (!(*split_varIDs_used)[varID]) { if (regularization_usedepth) { decrease /= std::pow((*regularization_factor)[varID], depth + 1); } else { decrease /= (*regularization_factor)[varID]; } } } } } void saveSplitVarID(size_t varID) { if (regularization) { if (importance_mode == IMP_GINI_CORRECTED) { (*split_varIDs_used)[data->getUnpermutedVarID(varID)] 
= true; } else { (*split_varIDs_used)[varID] = true; } } } uint mtry; // Number of samples (all samples, not only inbag for this tree) size_t num_samples; // Number of OOB samples size_t num_samples_oob; // Minimum node size to split, like in original RF nodes of smaller size can be produced uint min_node_size; // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable // Deterministic variables are always selected const std::vector* deterministic_varIDs; const std::vector* split_select_weights; // Bootstrap weights const std::vector* case_weights; // Pre-selected bootstrap samples const std::vector* manual_inbag; // Splitting variable for each node std::vector split_varIDs; // Value to split at for each node, for now only binary split // For terminal nodes the prediction value is saved here std::vector split_values; // Vector of left and right child node IDs, 0 for no child std::vector> child_nodeIDs; // All sampleIDs in the tree, will be re-ordered while splitting std::vector sampleIDs; // For each node a vector with start and end positions std::vector start_pos; std::vector end_pos; // IDs of OOB individuals, sorted std::vector oob_sampleIDs; // Holdout mode bool holdout; // Inbag counts bool keep_inbag; std::vector inbag_counts; // Random number generator std::mt19937_64 random_number_generator; // Pointer to original data const Data* data; // Regularization bool regularization; std::vector* regularization_factor; bool regularization_usedepth; std::vector* split_varIDs_used; // Variable importance for all variables std::vector* variable_importance; ImportanceMode importance_mode; // When growing here the OOB set is used // Terminal nodeIDs for prediction samples std::vector prediction_terminal_nodeIDs; bool sample_with_replacement; const std::vector* sample_fraction; bool memory_saving_splitting; SplitRule splitrule; double alpha; double minprop; uint num_random_splits; uint max_depth; uint depth; size_t last_left_nodeID; }; } // namespace ranger #endif /* TREE_H_ */ ranger/src/Tree.cpp0000755000176200001440000004651314027301517013736 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #include #include "Tree.h" #include "utility.h" namespace ranger { Tree::Tree() : mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( 0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth( false), split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( 0) { } Tree::Tree(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values) : mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( 0), manual_inbag(0), split_varIDs(split_varIDs), split_values(split_values), child_nodeIDs(child_nodeIDs), oob_sampleIDs( 0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( 0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction( 0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( 0) { } void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used) { this->data = data; this->mtry = mtry; this->num_samples = num_samples; this->memory_saving_splitting = memory_saving_splitting; // Create root node, assign bootstrap sample and oob samples child_nodeIDs.push_back(std::vector()); child_nodeIDs.push_back(std::vector()); createEmptyNode(); // Initialize random number generator and set seed random_number_generator.seed(seed); this->deterministic_varIDs = deterministic_varIDs; this->split_select_weights = split_select_weights; this->importance_mode = importance_mode; this->min_node_size = min_node_size; this->sample_with_replacement = sample_with_replacement; this->splitrule = splitrule; this->case_weights = case_weights; this->manual_inbag = manual_inbag; this->keep_inbag = keep_inbag; this->sample_fraction = sample_fraction; this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; this->num_random_splits = num_random_splits; this->max_depth = max_depth; this->regularization_factor = regularization_factor; this->regularization_usedepth = regularization_usedepth; this->split_varIDs_used = split_varIDs_used; // Regularization if (regularization_factor->size() > 0) { regularization = true; } else { regularization = false; } } void Tree::grow(std::vector* variable_importance) { // Allocate memory for tree growing allocateMemory(); this->variable_importance = variable_importance; // Bootstrap, dependent 
on whether weighted or not and with or without replacement if (!case_weights->empty()) { if (sample_with_replacement) { bootstrapWeighted(); } else { bootstrapWithoutReplacementWeighted(); } } else if (sample_fraction->size() > 1) { if (sample_with_replacement) { bootstrapClassWise(); } else { bootstrapWithoutReplacementClassWise(); } } else if (!manual_inbag->empty()) { setManualInbag(); } else { if (sample_with_replacement) { bootstrap(); } else { bootstrapWithoutReplacement(); } } // Init start and end positions start_pos[0] = 0; end_pos[0] = sampleIDs.size(); // While not all nodes terminal, split next node size_t num_open_nodes = 1; size_t i = 0; depth = 0; while (num_open_nodes > 0) { // Split node bool is_terminal_node = splitNode(i); if (is_terminal_node) { --num_open_nodes; } else { ++num_open_nodes; if (i >= last_left_nodeID) { // If new level, increase depth // (left_node saves left-most node in current level, new level reached if that node is split) last_left_nodeID = split_varIDs.size() - 2; ++depth; } } ++i; } // Delete sampleID vector to save memory sampleIDs.clear(); sampleIDs.shrink_to_fit(); cleanUpInternal(); } void Tree::predict(const Data* prediction_data, bool oob_prediction) { size_t num_samples_predict; if (oob_prediction) { num_samples_predict = num_samples_oob; } else { num_samples_predict = prediction_data->getNumRows(); } prediction_terminal_nodeIDs.resize(num_samples_predict, 0); // For each sample start in root, drop down the tree and return final value for (size_t i = 0; i < num_samples_predict; ++i) { size_t sample_idx; if (oob_prediction) { sample_idx = oob_sampleIDs[i]; } else { sample_idx = i; } size_t nodeID = 0; while (1) { // Break if terminal node if (child_nodeIDs[0][nodeID] == 0 && child_nodeIDs[1][nodeID] == 0) { break; } // Move to child size_t split_varID = split_varIDs[nodeID]; double value = prediction_data->get_x(sample_idx, split_varID); if (prediction_data->isOrderedVariable(split_varID)) { if (value <= split_values[nodeID]) { // Move to left child nodeID = child_nodeIDs[0][nodeID]; } else { // Move to right child nodeID = child_nodeIDs[1][nodeID]; } } else { size_t factorID = floor(value) - 1; size_t splitID = floor(split_values[nodeID]); // Left if 0 found at position factorID if (!(splitID & (1ULL << factorID))) { // Move to left child nodeID = child_nodeIDs[0][nodeID]; } else { // Move to right child nodeID = child_nodeIDs[1][nodeID]; } } } prediction_terminal_nodeIDs[i] = nodeID; } } void Tree::computePermutationImportance(std::vector<double>& forest_importance, std::vector<double>& forest_variance, std::vector<double>& forest_importance_casewise) { size_t num_independent_variables = data->getNumCols(); // Compute normal prediction accuracy for each tree. Predictions already computed.
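// Sketch of the accumulation below, with hypothetical numbers: if this tree
// scores accuracy_normal = 0.90 on its OOB cases and accuracy_permuted = 0.84
// after shuffling variable i among them, it adds 0.06 to forest_importance[i];
// for IMP_PERM_BREIMAN the squared difference 0.0036 also feeds
// forest_variance[i]. Averaging over trees is expected to happen later in the
// forest-level code (not shown here).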
double accuracy_normal; std::vector prederr_normal_casewise; std::vector prederr_shuf_casewise; if (importance_mode == IMP_PERM_CASEWISE) { prederr_normal_casewise.resize(num_samples_oob, 0); prederr_shuf_casewise.resize(num_samples_oob, 0); accuracy_normal = computePredictionAccuracyInternal(&prederr_normal_casewise); } else { accuracy_normal = computePredictionAccuracyInternal(NULL); } prediction_terminal_nodeIDs.clear(); prediction_terminal_nodeIDs.resize(num_samples_oob, 0); // Reserve space for permutations, initialize with oob_sampleIDs std::vector permutations(oob_sampleIDs); // Randomly permute for all independent variables for (size_t i = 0; i < num_independent_variables; ++i) { // Permute and compute prediction accuracy again for this permutation and save difference permuteAndPredictOobSamples(i, permutations); double accuracy_permuted; if (importance_mode == IMP_PERM_CASEWISE) { accuracy_permuted = computePredictionAccuracyInternal(&prederr_shuf_casewise); for (size_t j = 0; j < num_samples_oob; ++j) { size_t pos = i * num_samples + oob_sampleIDs[j]; forest_importance_casewise[pos] += prederr_shuf_casewise[j] - prederr_normal_casewise[j]; } } else { accuracy_permuted = computePredictionAccuracyInternal(NULL); } double accuracy_difference = accuracy_normal - accuracy_permuted; forest_importance[i] += accuracy_difference; // Compute variance if (importance_mode == IMP_PERM_BREIMAN) { forest_variance[i] += accuracy_difference * accuracy_difference; } else if (importance_mode == IMP_PERM_LIAW) { forest_variance[i] += accuracy_difference * accuracy_difference * num_samples_oob; } } } // #nocov start void Tree::appendToFile(std::ofstream& file) { // Save general fields saveVector2D(child_nodeIDs, file); saveVector1D(split_varIDs, file); saveVector1D(split_values, file); // Call special functions for subclasses to save special fields. 
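// Resulting per-tree file layout: child_nodeIDs as a length-prefixed 2D vector
// (left and right child per node), then split_varIDs and split_values as
// length-prefixed 1D vectors (see saveVector1D/saveVector2D in utility.h),
// followed by the subclass payload (e.g. terminal-node CHFs for survival trees)
// written by the call below.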
appendToFileInternal(file); } // #nocov end void Tree::createPossibleSplitVarSubset(std::vector& result) { size_t num_vars = data->getNumCols(); // For corrected Gini importance add dummy variables if (importance_mode == IMP_GINI_CORRECTED) { num_vars += data->getNumCols(); } // Randomly add non-deterministic variables (according to weights if needed) if (split_select_weights->empty()) { if (deterministic_varIDs->empty()) { drawWithoutReplacement(result, random_number_generator, num_vars, mtry); } else { drawWithoutReplacementSkip(result, random_number_generator, num_vars, (*deterministic_varIDs), mtry); } } else { drawWithoutReplacementWeighted(result, random_number_generator, num_vars, mtry, *split_select_weights); } // Always use deterministic variables std::copy(deterministic_varIDs->begin(), deterministic_varIDs->end(), std::inserter(result, result.end())); } bool Tree::splitNode(size_t nodeID) { // Select random subset of variables to possibly split at std::vector possible_split_varIDs; createPossibleSplitVarSubset(possible_split_varIDs); // Call subclass method, sets split_varIDs and split_values bool stop = splitNodeInternal(nodeID, possible_split_varIDs); if (stop) { // Terminal node return true; } size_t split_varID = split_varIDs[nodeID]; double split_value = split_values[nodeID]; // Save non-permuted variable for prediction split_varIDs[nodeID] = data->getUnpermutedVarID(split_varID); // Create child nodes size_t left_child_nodeID = split_varIDs.size(); child_nodeIDs[0][nodeID] = left_child_nodeID; createEmptyNode(); start_pos[left_child_nodeID] = start_pos[nodeID]; size_t right_child_nodeID = split_varIDs.size(); child_nodeIDs[1][nodeID] = right_child_nodeID; createEmptyNode(); start_pos[right_child_nodeID] = end_pos[nodeID]; // For each sample in node, assign to left or right child if (data->isOrderedVariable(split_varID)) { // Ordered: left is <= splitval and right is > splitval size_t pos = start_pos[nodeID]; while (pos < start_pos[right_child_nodeID]) { size_t sampleID = sampleIDs[pos]; if (data->get_x(sampleID, split_varID) <= split_value) { // If going to left, do nothing ++pos; } else { // If going to right, move to right end --start_pos[right_child_nodeID]; std::swap(sampleIDs[pos], sampleIDs[start_pos[right_child_nodeID]]); } } } else { // Unordered: If bit at position is 1 -> right, 0 -> left size_t pos = start_pos[nodeID]; while (pos < start_pos[right_child_nodeID]) { size_t sampleID = sampleIDs[pos]; double level = data->get_x(sampleID, split_varID); size_t factorID = floor(level) - 1; size_t splitID = floor(split_value); // Left if 0 found at position factorID if (!(splitID & (1ULL << factorID))) { // If going to left, do nothing ++pos; } else { // If going to right, move to right end --start_pos[right_child_nodeID]; std::swap(sampleIDs[pos], sampleIDs[start_pos[right_child_nodeID]]); } } } // End position of left child is start position of right child end_pos[left_child_nodeID] = start_pos[right_child_nodeID]; end_pos[right_child_nodeID] = end_pos[nodeID]; // No terminal node return false; } void Tree::createEmptyNode() { split_varIDs.push_back(0); split_values.push_back(0); child_nodeIDs[0].push_back(0); child_nodeIDs[1].push_back(0); start_pos.push_back(0); end_pos.push_back(0); createEmptyNodeInternal(); } size_t Tree::dropDownSamplePermuted(size_t permuted_varID, size_t sampleID, size_t permuted_sampleID) { // Start in root and drop down size_t nodeID = 0; while (child_nodeIDs[0][nodeID] != 0 || child_nodeIDs[1][nodeID] != 0) { // Permute if variable is 
permutation variable size_t split_varID = split_varIDs[nodeID]; size_t sampleID_final = sampleID; if (split_varID == permuted_varID) { sampleID_final = permuted_sampleID; } // Move to child double value = data->get_x(sampleID_final, split_varID); if (data->isOrderedVariable(split_varID)) { if (value <= split_values[nodeID]) { // Move to left child nodeID = child_nodeIDs[0][nodeID]; } else { // Move to right child nodeID = child_nodeIDs[1][nodeID]; } } else { size_t factorID = floor(value) - 1; size_t splitID = floor(split_values[nodeID]); // Left if 0 found at position factorID if (!(splitID & (1ULL << factorID))) { // Move to left child nodeID = child_nodeIDs[0][nodeID]; } else { // Move to right child nodeID = child_nodeIDs[1][nodeID]; } } } return nodeID; } void Tree::permuteAndPredictOobSamples(size_t permuted_varID, std::vector<size_t>& permutations) { // Permute OOB sample //std::vector<size_t> permutations(oob_sampleIDs); std::shuffle(permutations.begin(), permutations.end(), random_number_generator); // For each sample, drop down the tree and add prediction for (size_t i = 0; i < num_samples_oob; ++i) { size_t nodeID = dropDownSamplePermuted(permuted_varID, oob_sampleIDs[i], permutations[i]); prediction_terminal_nodeIDs[i] = nodeID; } } void Tree::bootstrap() { // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; // Reserve space, reserve a little more to be safe sampleIDs.reserve(num_samples_inbag); oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1)); std::uniform_int_distribution<size_t> unif_dist(0, num_samples - 1); // Start with all samples OOB inbag_counts.resize(num_samples, 0); // Draw num_samples_inbag samples with replacement (num_samples_inbag out of n) as inbag and mark as not OOB for (size_t s = 0; s < num_samples_inbag; ++s) { size_t draw = unif_dist(random_number_generator); sampleIDs.push_back(draw); ++inbag_counts[draw]; } // Save OOB samples for (size_t s = 0; s < inbag_counts.size(); ++s) { if (inbag_counts[s] == 0) { oob_sampleIDs.push_back(s); } } num_samples_oob = oob_sampleIDs.size(); if (!keep_inbag) { inbag_counts.clear(); inbag_counts.shrink_to_fit(); } } void Tree::bootstrapWeighted() { // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; // Reserve space, reserve a little more to be safe sampleIDs.reserve(num_samples_inbag); oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1)); std::discrete_distribution<> weighted_dist(case_weights->begin(), case_weights->end()); // Start with all samples OOB inbag_counts.resize(num_samples, 0); // Draw num_samples_inbag weighted samples with replacement (num_samples_inbag out of n) as inbag and mark as not OOB for (size_t s = 0; s < num_samples_inbag; ++s) { size_t draw = weighted_dist(random_number_generator); sampleIDs.push_back(draw); ++inbag_counts[draw]; } // Save OOB samples. In holdout mode these are the cases with 0 weight.
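// Rationale for the holdout branch below: cases with case weight 0 can never be
// drawn by weighted_dist above, so in holdout mode exactly those cases form the
// OOB set; the inbag_counts == 0 branch would additionally include
// nonzero-weight cases that simply were not drawn.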
if (holdout) { for (size_t s = 0; s < (*case_weights).size(); ++s) { if ((*case_weights)[s] == 0) { oob_sampleIDs.push_back(s); } } } else { for (size_t s = 0; s < inbag_counts.size(); ++s) { if (inbag_counts[s] == 0) { oob_sampleIDs.push_back(s); } } } num_samples_oob = oob_sampleIDs.size(); if (!keep_inbag) { inbag_counts.clear(); inbag_counts.shrink_to_fit(); } } void Tree::bootstrapWithoutReplacement() { // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; shuffleAndSplit(sampleIDs, oob_sampleIDs, num_samples, num_samples_inbag, random_number_generator); num_samples_oob = oob_sampleIDs.size(); if (keep_inbag) { // All observations are 0 or 1 times inbag inbag_counts.resize(num_samples, 1); for (size_t i = 0; i < oob_sampleIDs.size(); i++) { inbag_counts[oob_sampleIDs[i]] = 0; } } } void Tree::bootstrapWithoutReplacementWeighted() { // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; drawWithoutReplacementWeighted(sampleIDs, random_number_generator, num_samples - 1, num_samples_inbag, *case_weights); // All observations are 0 or 1 times inbag inbag_counts.resize(num_samples, 0); for (auto& sampleID : sampleIDs) { inbag_counts[sampleID] = 1; } // Save OOB samples. In holdout mode these are the cases with 0 weight. if (holdout) { for (size_t s = 0; s < (*case_weights).size(); ++s) { if ((*case_weights)[s] == 0) { oob_sampleIDs.push_back(s); } } } else { for (size_t s = 0; s < inbag_counts.size(); ++s) { if (inbag_counts[s] == 0) { oob_sampleIDs.push_back(s); } } } num_samples_oob = oob_sampleIDs.size(); if (!keep_inbag) { inbag_counts.clear(); inbag_counts.shrink_to_fit(); } } void Tree::bootstrapClassWise() { // Empty on purpose (virtual function only implemented in classification and probability) } void Tree::bootstrapWithoutReplacementClassWise() { // Empty on purpose (virtual function only implemented in classification and probability) } void Tree::setManualInbag() { // Select observations as specified in manual_inbag vector sampleIDs.reserve(manual_inbag->size()); inbag_counts.resize(num_samples, 0); for (size_t i = 0; i < manual_inbag->size(); ++i) { size_t inbag_count = (*manual_inbag)[i]; if ((*manual_inbag)[i] > 0) { for (size_t j = 0; j < inbag_count; ++j) { sampleIDs.push_back(i); } inbag_counts[i] = inbag_count; } else { oob_sampleIDs.push_back(i); } } num_samples_oob = oob_sampleIDs.size(); // Shuffle samples std::shuffle(sampleIDs.begin(), sampleIDs.end(), random_number_generator); if (!keep_inbag) { inbag_counts.clear(); inbag_counts.shrink_to_fit(); } } } // namespace ranger ranger/src/DataFloat.h0000755000176200001440000000347614027301517014344 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/ // Ignore in coverage report (not used in R package) // #nocov start #ifndef DATAFLOAT_H_ #define DATAFLOAT_H_ #include #include #include "globals.h" #include "utility.h" #include "Data.h" namespace ranger { class DataFloat: public Data { public: DataFloat() = default; DataFloat(const DataFloat&) = delete; DataFloat& operator=(const DataFloat&) = delete; virtual ~DataFloat() override = default; double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } if (col < num_cols_no_snp) { return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } double get_y(size_t row, size_t col) const override { return y[col * num_rows + row]; } void reserveMemory(size_t y_cols) override { x.resize(num_cols * num_rows); y.resize(y_cols * num_rows); } void set_x(size_t col, size_t row, double value, bool& error) override { x[col * num_rows + row] = value; } void set_y(size_t col, size_t row, double value, bool& error) override { y[col * num_rows + row] = value; } private: std::vector x; std::vector y; }; } // namespace ranger #endif /* DATAFLOAT_H_ */ // #nocov end ranger/src/Makevars0000755000176200001440000000006714027301517014021 0ustar liggesusers## Use c++11 CXX_STD = CXX11 PKG_CPPFLAGS = -DR_BUILD ranger/src/Data.h0000755000176200001440000001462114027301517013350 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #ifndef DATA_H_ #define DATA_H_ #include #include #include #include #include #include "globals.h" namespace ranger { class Data { public: Data(); Data(const Data&) = delete; Data& operator=(const Data&) = delete; virtual ~Data() = default; virtual double get_x(size_t row, size_t col) const = 0; virtual double get_y(size_t row, size_t col) const = 0; size_t getVariableID(const std::string& variable_name) const; virtual void reserveMemory(size_t y_cols) = 0; virtual void set_x(size_t col, size_t row, double value, bool& error) = 0; virtual void set_y(size_t col, size_t row, double value, bool& error) = 0; void addSnpData(unsigned char* snp_data, size_t num_cols_snp); bool loadFromFile(std::string filename, std::vector& dependent_variable_names); bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line, std::vector& dependent_variable_names); bool loadFromFileOther(std::ifstream& input_file, std::string header_line, std::vector& dependent_variable_names, char seperator); void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const; void getMinMaxValues(double& min, double&max, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const; size_t getIndex(size_t row, size_t col) const { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } if (col < num_cols_no_snp) { return index_data[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } // #nocov start (cannot be tested anymore because GenABEL not on CRAN) size_t getSnp(size_t row, size_t col, size_t col_permuted) const { // Get data out of snp storage. -1 because of GenABEL coding. size_t idx = (col - num_cols_no_snp) * num_rows_rounded + row; size_t result = ((snp_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1; // TODO: Better way to treat missing values? 
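// Each byte of snp_data packs four 2-bit genotypes: idx / 4 selects the byte and
// idx % 4 the bit pair extracted via the mask/offset tables (presumably defined
// in globals.h, not shown here). GenABEL codes genotypes as 1..3 with 0 for
// missing, so the -1 maps them to 0..2; a missing 0 underflows the unsigned
// subtraction to a huge value, which is what the result > 2 reset below catches
// (missing is then treated as genotype 0).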
if (result > 2) { result = 0; } // Order SNPs if (order_snps) { if (col_permuted >= num_cols) { result = snp_order[col_permuted - 2 * num_cols_no_snp][result]; } else { result = snp_order[col - num_cols_no_snp][result]; } } return result; } // #nocov end double getUniqueDataValue(size_t varID, size_t index) const { // Use permuted data for corrected impurity importance if (varID >= num_cols) { varID = getUnpermutedVarID(varID); } if (varID < num_cols_no_snp) { return unique_data_values[varID][index]; } else { // For GWAS data the index is the value return (index); } } size_t getNumUniqueDataValues(size_t varID) const { // Use permuted data for corrected impurity importance if (varID >= num_cols) { varID = getUnpermutedVarID(varID); } if (varID < num_cols_no_snp) { return unique_data_values[varID].size(); } else { // For GWAS data 0,1,2 return (3); } } void sort(); void orderSnpLevels(bool corrected_importance); const std::vector& getVariableNames() const { return variable_names; } size_t getNumCols() const { return num_cols; } size_t getNumRows() const { return num_rows; } size_t getMaxNumUniqueValues() const { if (snp_data == 0 || max_num_unique_values > 3) { // If no snp data or one variable with more than 3 unique values, return that value return max_num_unique_values; } else { // If snp data and no variable with more than 3 unique values, return 3 return 3; } } std::vector& getIsOrderedVariable() noexcept { return is_ordered_variable; } void setIsOrderedVariable(const std::vector& unordered_variable_names) { is_ordered_variable.resize(num_cols, true); for (auto& variable_name : unordered_variable_names) { size_t varID = getVariableID(variable_name); is_ordered_variable[varID] = false; } } void setIsOrderedVariable(std::vector& is_ordered_variable) { this->is_ordered_variable = is_ordered_variable; } bool isOrderedVariable(size_t varID) const { // Use permuted data for corrected impurity importance if (varID >= num_cols) { varID = getUnpermutedVarID(varID); } return is_ordered_variable[varID]; } void permuteSampleIDs(std::mt19937_64 random_number_generator) { permuted_sampleIDs.resize(num_rows); std::iota(permuted_sampleIDs.begin(), permuted_sampleIDs.end(), 0); std::shuffle(permuted_sampleIDs.begin(), permuted_sampleIDs.end(), random_number_generator); } size_t getPermutedSampleID(size_t sampleID) const { return permuted_sampleIDs[sampleID]; } size_t getUnpermutedVarID(size_t varID) const { if (varID >= num_cols) { varID -= num_cols; } return varID; } // #nocov start (cannot be tested anymore because GenABEL not on CRAN) const std::vector>& getSnpOrder() const { return snp_order; } void setSnpOrder(std::vector>& snp_order) { this->snp_order = snp_order; order_snps = true; } // #nocov end protected: std::vector variable_names; size_t num_rows; size_t num_rows_rounded; size_t num_cols; unsigned char* snp_data; size_t num_cols_no_snp; bool externalData; std::vector index_data; std::vector> unique_data_values; size_t max_num_unique_values; // For each varID true if ordered std::vector is_ordered_variable; // Permuted samples for corrected impurity importance std::vector permuted_sampleIDs; // Order of 0/1/2 for ordered splitting std::vector> snp_order; bool order_snps; }; } // namespace ranger #endif /* DATA_H_ */ ranger/src/Makevars.win0000755000176200001440000000010514027301517014606 0ustar liggesusers## Use c++11 CXX_STD = CXX11 PKG_CPPFLAGS = -DR_BUILD -DWIN_R_BUILD ranger/src/DataRcpp.h0000755000176200001440000000475614027301517014205 0ustar 
liggesusers/*------------------------------------------------------------------------------- This file is part of Ranger. Ranger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Ranger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Ranger. If not, see . Written by: Marvin N. Wright Institut für Medizinische Biometrie und Statistik Universität zu Lübeck Ratzeburger Allee 160 23562 Lübeck http://www.imbs-luebeck.de #-------------------------------------------------------------------------------*/ #ifndef DATARCPP_H_ #define DATARCPP_H_ #include #include "globals.h" #include "utility.h" #include "Data.h" namespace ranger { class DataRcpp: public Data { public: DataRcpp() = default; DataRcpp(Rcpp::NumericMatrix& x, Rcpp::NumericMatrix& y, std::vector variable_names, size_t num_rows, size_t num_cols) { this->x = x; this->y = y; this->variable_names = variable_names; this->num_rows = num_rows; this->num_cols = num_cols; this->num_cols_no_snp = num_cols; } DataRcpp(const DataRcpp&) = delete; DataRcpp& operator=(const DataRcpp&) = delete; virtual ~DataRcpp() override = default; double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } if (col < num_cols_no_snp) { return x(row, col); } else { return getSnp(row, col, col_permuted); } } double get_y(size_t row, size_t col) const override { return y(row, col); } // #nocov start void reserveMemory(size_t y_cols) override { // Not needed } void set_x(size_t col, size_t row, double value, bool& error) override { x(row, col) = value; } void set_y(size_t col, size_t row, double value, bool& error) override { y(row, col) = value; } // #nocov end private: Rcpp::NumericMatrix x; Rcpp::NumericMatrix y; }; } // namespace ranger #endif /* DATARCPP_H_ */ ranger/src/ForestProbability.cpp0000755000176200001440000002727314027301517016504 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #include #include "utility.h" #include "ForestProbability.h" #include "TreeProbability.h" #include "Data.h" namespace ranger { void ForestProbability::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector>>& forest_terminal_class_counts, std::vector& is_ordered_variable) { this->num_trees = num_trees; this->class_values = class_values; data->setIsOrderedVariable(is_ordered_variable); // Create trees trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], &this->class_values, &response_classIDs, forest_terminal_class_counts[i])); } // Create thread ranges equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } std::vector>> ForestProbability::getTerminalClassCounts() const { std::vector>> result; result.reserve(num_trees); for (const auto& tree : trees) { const auto& temp = dynamic_cast(*tree); result.push_back(temp.getTerminalClassCounts()); } return result; } void ForestProbability::initInternal() { // If mtry not set, use floored square root of number of independent variables. if (mtry == 0) { unsigned long temp = sqrt((double) num_independent_variables); mtry = std::max((unsigned long) 1, temp); } // Set minimal node size if (min_node_size == 0) { min_node_size = DEFAULT_MIN_NODE_SIZE_PROBABILITY; } // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { double value = data->get_y(i, 0); // If classID is already in class_values, use ID. Else create a new one. uint classID = find(class_values.begin(), class_values.end(), value) - class_values.begin(); if (classID == class_values.size()) { class_values.push_back(value); } response_classIDs.push_back(classID); } if (splitrule == HELLINGER && class_values.size() != 2) { throw std::runtime_error("Hellinger splitrule only implemented for binary classification."); } } // Create sampleIDs_per_class if required if (sample_fraction.size() > 1) { sampleIDs_per_class.resize(sample_fraction.size()); for (auto& v : sampleIDs_per_class) { v.reserve(num_samples); } for (size_t i = 0; i < num_samples; ++i) { size_t classID = response_classIDs[i]; sampleIDs_per_class[classID].push_back(i); } } // Set class weights all to 1 class_weights = std::vector(class_values.size(), 1.0); // Sort data if memory saving mode if (!memory_saving_splitting) { data->sort(); } } void ForestProbability::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); } } void ForestProbability::allocatePredictMemory() { size_t num_prediction_samples = data->getNumRows(); if (predict_all) { predictions = std::vector>>(num_prediction_samples, std::vector>(class_values.size(), std::vector(num_trees, 0))); } else if (prediction_type == TERMINALNODES) { predictions = std::vector>>(1, std::vector>(num_prediction_samples, std::vector(num_trees, 0))); } else { predictions = std::vector>>(1, std::vector>(num_prediction_samples, std::vector(class_values.size(), 0))); } } void ForestProbability::predictInternal(size_t sample_idx) { // For each sample compute proportions in each tree for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { if (predict_all) { std::vector counts = 
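// counts receives this tree's terminal-node class frequencies. With predict_all
// the per-tree values are stored (sample x class x tree); otherwise they are
// summed per class here and divided by num_trees further down, so the reported
// probability is the average of terminal-node class frequencies over the trees.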
getTreePrediction(tree_idx, sample_idx); for (size_t class_idx = 0; class_idx < counts.size(); ++class_idx) { predictions[sample_idx][class_idx][tree_idx] += counts[class_idx]; } } else if (prediction_type == TERMINALNODES) { predictions[0][sample_idx][tree_idx] = getTreePredictionTerminalNodeID(tree_idx, sample_idx); } else { std::vector counts = getTreePrediction(tree_idx, sample_idx); for (size_t class_idx = 0; class_idx < counts.size(); ++class_idx) { predictions[0][sample_idx][class_idx] += counts[class_idx]; } } } // Average over trees if (!predict_all && prediction_type != TERMINALNODES) { for (size_t class_idx = 0; class_idx < predictions[0][sample_idx].size(); ++class_idx) { predictions[0][sample_idx][class_idx] /= num_trees; } } } void ForestProbability::computePredictionErrorInternal() { // For each sample sum over trees where sample is OOB std::vector samples_oob_count; samples_oob_count.resize(num_samples, 0); predictions = std::vector>>(1, std::vector>(num_samples, std::vector(class_values.size(), 0))); for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) { for (size_t sample_idx = 0; sample_idx < trees[tree_idx]->getNumSamplesOob(); ++sample_idx) { size_t sampleID = trees[tree_idx]->getOobSampleIDs()[sample_idx]; std::vector counts = getTreePrediction(tree_idx, sample_idx); for (size_t class_idx = 0; class_idx < counts.size(); ++class_idx) { predictions[0][sampleID][class_idx] += counts[class_idx]; } ++samples_oob_count[sampleID]; } } // MSE with predicted probability and true data size_t num_predictions = 0; overall_prediction_error = 0; for (size_t i = 0; i < predictions[0].size(); ++i) { if (samples_oob_count[i] > 0) { ++num_predictions; for (size_t j = 0; j < predictions[0][i].size(); ++j) { predictions[0][i][j] /= (double) samples_oob_count[i]; } size_t real_classID = response_classIDs[i]; double predicted_value = predictions[0][i][real_classID]; overall_prediction_error += (1 - predicted_value) * (1 - predicted_value); } else { for (size_t j = 0; j < predictions[0][i].size(); ++j) { predictions[0][i][j] = NAN; } } } overall_prediction_error /= (double) num_predictions; } // #nocov start void ForestProbability::writeOutputInternal() { if (verbose_out) { *verbose_out << "Tree type: " << "Probability estimation" << std::endl; } } void ForestProbability::writeConfusionFile() { // Open confusion file for writing std::string filename = output_prefix + ".confusion"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to confusion file: " + filename + "."); } // Write confusion to file outfile << "Overall OOB prediction error (MSE): " << overall_prediction_error << std::endl; outfile.close(); if (verbose_out) *verbose_out << "Saved prediction error to file " << filename << "." << std::endl; } void ForestProbability::writePredictionFile() { // Open prediction file for writing std::string filename = output_prefix + ".prediction"; std::ofstream outfile; outfile.open(filename, std::ios::out); if (!outfile.good()) { throw std::runtime_error("Could not write to prediction file: " + filename + "."); } // Write outfile << "Class predictions, one sample per row." 
<< std::endl; for (auto& class_value : class_values) { outfile << class_value << " "; } outfile << std::endl << std::endl; if (predict_all) { for (size_t k = 0; k < num_trees; ++k) { outfile << "Tree " << k << ":" << std::endl; for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { outfile << predictions[i][j][k] << " "; } outfile << std::endl; } outfile << std::endl; } } else { for (size_t i = 0; i < predictions.size(); ++i) { for (size_t j = 0; j < predictions[i].size(); ++j) { for (size_t k = 0; k < predictions[i][j].size(); ++k) { outfile << predictions[i][j][k] << " "; } outfile << std::endl; } } } if (verbose_out) *verbose_out << "Saved predictions to file " << filename << "." << std::endl; } void ForestProbability::saveToFileInternal(std::ofstream& outfile) { // Write num_variables outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_PROBABILITY; outfile.write((char*) &treetype, sizeof(treetype)); // Write class_values saveVector1D(class_values, outfile); } void ForestProbability::loadFromFileInternal(std::ifstream& infile) { // Read number of variables size_t num_variables_saved; infile.read((char*) &num_variables_saved, sizeof(num_variables_saved)); // Read treetype TreeType treetype; infile.read((char*) &treetype, sizeof(treetype)); if (treetype != TREE_PROBABILITY) { throw std::runtime_error("Wrong treetype. Loaded file is not a probability estimation forest."); } // Read class_values readVector1D(class_values, infile); for (size_t i = 0; i < num_trees; ++i) { // Read data std::vector<std::vector<size_t>> child_nodeIDs; readVector2D(child_nodeIDs, infile); std::vector<size_t> split_varIDs; readVector1D(split_varIDs, infile); std::vector<double> split_values; readVector1D(split_values, infile); // Read terminal node class counts std::vector<size_t> terminal_nodes; readVector1D(terminal_nodes, infile); std::vector<std::vector<double>> terminal_class_counts_vector; readVector2D(terminal_class_counts_vector, infile); // Convert terminal node class counts to vector with empty elements for non-terminal nodes std::vector<std::vector<double>> terminal_class_counts; terminal_class_counts.resize(child_nodeIDs[0].size(), std::vector<double>()); for (size_t j = 0; j < terminal_nodes.size(); ++j) { terminal_class_counts[terminal_nodes[j]] = terminal_class_counts_vector[j]; } // If dependent variable not in test data, throw error if (num_variables_saved != num_independent_variables) { throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree trees.push_back( make_unique<TreeProbability>(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs, terminal_class_counts)); } } const std::vector<double>& ForestProbability::getTreePrediction(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast<const TreeProbability&>(*trees[tree_idx]); return tree.getPrediction(sample_idx); } size_t ForestProbability::getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast<const TreeProbability&>(*trees[tree_idx]); return tree.getPredictionTerminalNodeID(sample_idx); } // #nocov end }// namespace ranger ranger/src/utility.h0000755000176200001440000004652014027301517014205 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license.
Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ #ifndef UTILITY_H_ #define UTILITY_H_ #include #include #include #include #include #include #include #include #include #include #include #ifdef R_BUILD #include #endif #include "globals.h" #include "Data.h" namespace ranger { /** * Split sequence start..end in num_parts parts with sizes as equal as possible. * @param result Result vector of size num_parts+1. Ranges for the parts are then result[0]..result[1]-1, result[1]..result[2]-1, .. * @param start minimum value * @param end maximum value * @param num_parts number of parts */ void equalSplit(std::vector& result, uint start, uint end, uint num_parts); // #nocov start /** * Write a 1d vector to filestream. First the size is written as size_t, then all vector elements. * @param vector Vector with elements of type T to write to file. * @param file ofstream object to write to. */ /** * Write a 1d vector to filestream. First the size is written, then all vector elements. * @param vector Vector of type T to save * @param file ofstream object to write to. */ template inline void saveVector1D(const std::vector& vector, std::ofstream& file) { // Save length size_t length = vector.size(); file.write((char*) &length, sizeof(length)); file.write((char*) vector.data(), length * sizeof(T)); } template<> inline void saveVector1D(const std::vector& vector, std::ofstream& file) { // Save length size_t length = vector.size(); file.write((char*) &length, sizeof(length)); // Save vector for (size_t i = 0; i < vector.size(); ++i) { bool v = vector[i]; file.write((char*) &v, sizeof(v)); } } /** * Read a 1d vector written by saveVector1D() from filestream. * @param result Result vector with elements of type T. * @param file ifstream object to read from. */ template inline void readVector1D(std::vector& result, std::ifstream& file) { // Read length size_t length; file.read((char*) &length, sizeof(length)); result.resize(length); file.read((char*) result.data(), length * sizeof(T)); } template<> inline void readVector1D(std::vector& result, std::ifstream& file) { // Read length size_t length; file.read((char*) &length, sizeof(length)); // Read vector. for (size_t i = 0; i < length; ++i) { bool temp; file.read((char*) &temp, sizeof(temp)); result.push_back(temp); } } /** * Write a 2d vector to filestream. First the size of the first dim is written as size_t, then for all inner vectors the size and elements. * @param vector Vector of vectors of type T to write to file. * @param file ofstream object to write to. */ template inline void saveVector2D(const std::vector>& vector, std::ofstream& file) { // Save length of first dim size_t length = vector.size(); file.write((char*) &length, sizeof(length)); // Save outer vector for (auto& inner_vector : vector) { // Save inner vector saveVector1D(inner_vector, file); } } /** * Read a 2d vector written by saveVector2D() from filestream. * @param result Result vector of vectors with elements of type T. * @param file ifstream object to read from. */ template inline void readVector2D(std::vector>& result, std::ifstream& file) { // Read length of first dim size_t length; file.read((char*) &length, sizeof(length)); result.resize(length); // Read outer vector for (size_t i = 0; i < length; ++i) { // Read inner vector readVector1D(result[i], file); } } /** * Read a double vector from text file. Reads only the first line. 
 * @param result Result vector of doubles with contents
 * @param filename filename of input file
 */
void loadDoubleVectorFromFile(std::vector<double>& result, std::string filename);
// #nocov end

/**
 * Draw random numbers in a range without replacement.
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param range_length Length of range. Interval to draw from: 0..range_length-1
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacement(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t range_length, size_t num_samples);

/**
 * Draw random numbers in a range without replacement and skip values.
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param range_length Length of range. Interval to draw from: 0..range_length-1
 * @param skip Values to skip
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacementSkip(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t range_length, const std::vector<size_t>& skip, size_t num_samples);

/**
 * Simple algorithm for sampling without replacement, faster for smaller num_samples
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param max Length of range. Interval to draw from: 0..max-1
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacementSimple(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t max, size_t num_samples);

/**
 * Simple algorithm for sampling without replacement (skip values), faster for smaller num_samples
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param max Length of range. Interval to draw from: 0..max-1
 * @param skip Values to skip
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacementSimple(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t max, const std::vector<size_t>& skip, size_t num_samples);

/**
 * Fisher-Yates algorithm for sampling without replacement.
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param max Length of range. Interval to draw from: 0..max-1
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacementFisherYates(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t max, size_t num_samples);

/**
 * Fisher-Yates algorithm for sampling without replacement (skip values).
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param max Length of range. Interval to draw from: 0..max-1
 * @param skip Values to skip
 * @param num_samples Number of samples to draw
 */
void drawWithoutReplacementFisherYates(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t max, const std::vector<size_t>& skip, size_t num_samples);
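// Illustrative sketch (not part of the original header; it reuses the headers
// already included above and all names are hypothetical): the Fisher-Yates
// variants declared above need only a partial shuffle, because after
// num_samples swap steps the first num_samples positions already hold a
// uniform draw without replacement. Assumes num_samples <= max.
inline std::vector<size_t> fisherYatesDrawSketch(size_t max, size_t num_samples,
    std::mt19937_64& random_number_generator) {
  std::vector<size_t> indices(max);
  for (size_t i = 0; i < max; ++i) {
    indices[i] = i;
  }
  // Partial Fisher-Yates shuffle: position i receives a uniform pick from i..max-1
  for (size_t i = 0; i < num_samples; ++i) {
    std::uniform_int_distribution<size_t> dist(i, max - 1);
    std::swap(indices[i], indices[dist(random_number_generator)]);
  }
  indices.resize(num_samples);
  return indices;
}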
/**
 * Draw random numbers without replacement and with weighted probabilities from 0..n-1.
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param random_number_generator Random number generator
 * @param max_index Maximum index to draw
 * @param num_samples Number of samples to draw
 * @param weights A weight for each element of indices
 */
void drawWithoutReplacementWeighted(std::vector<size_t>& result, std::mt19937_64& random_number_generator,
    size_t max_index, size_t num_samples, const std::vector<double>& weights);

/**
 * Draw random numbers of a vector without replacement.
 * @param result Vector to add results to. Will not be cleaned before filling.
 * @param input Vector to draw values from.
 * @param random_number_generator Random number generator
 * @param num_samples Number of samples to draw
 */
template<typename T>
void drawWithoutReplacementFromVector(std::vector<T>& result, const std::vector<T>& input,
    std::mt19937_64& random_number_generator, size_t num_samples) {
  // Draw random indices
  std::vector<size_t> result_idx;
  result_idx.reserve(num_samples);
  std::vector<size_t> skip; // Empty vector (no skip)
  drawWithoutReplacementSkip(result_idx, random_number_generator, input.size(), skip, num_samples);

  // Add vector values to result
  for (auto& idx : result_idx) {
    result.push_back(input[idx]);
  }
}

/**
 * Returns the most frequent class index of a vector with counts for the classes. Returns a random class if counts are equal.
 * @param class_count Vector with class counts
 * @param random_number_generator Random number generator
 * @return Most frequent class index. Out of range index if all 0.
 */
template<typename T>
size_t mostFrequentClass(const std::vector<T>& class_count, std::mt19937_64 random_number_generator) {
  std::vector<size_t> major_classes;

  // Find maximum count
  T max_count = 0;
  for (size_t i = 0; i < class_count.size(); ++i) {
    T count = class_count[i];
    if (count > max_count) {
      max_count = count;
      major_classes.clear();
      major_classes.push_back(i);
    } else if (count == max_count) {
      major_classes.push_back(i);
    }
  }

  if (max_count == 0) {
    return class_count.size();
  } else if (major_classes.size() == 1) {
    return major_classes[0];
  } else {
    // Choose randomly
    std::uniform_int_distribution<size_t> unif_dist(0, major_classes.size() - 1);
    return major_classes[unif_dist(random_number_generator)];
  }
}

/**
 * Returns the most frequent value of a map with counts for the values. Returns a random class if counts are equal.
 * @param class_count Map with classes and counts
 * @param random_number_generator Random number generator
 * @return Most frequent value
 */
double mostFrequentValue(const std::unordered_map<double, size_t>& class_count,
    std::mt19937_64 random_number_generator);

/**
 * Compute concordance index for given data and summed cumulative hazard function/estimate
 * @param data Reference to Data object
 * @param sum_chf Summed chf over timepoints for each sample
 * @param sample_IDs IDs of samples, for example OOB samples
 * @param prediction_error_casewise An optional output vector with casewise prediction errors.
 *   If pointer is NULL, casewise prediction errors should not be computed.
 * @return concordance index
 */
double computeConcordanceIndex(const Data& data, const std::vector<double>& sum_chf,
    const std::vector<size_t>& sample_IDs, std::vector<double>* prediction_error_casewise);

/**
 * Convert an unsigned integer to string
 * @param number Number to convert
 * @return Converted number as string
 */
std::string uintToString(uint number);

/**
 * Beautify output of time.
 * @param seconds Time in seconds
 * @return Time in days, hours, minutes and seconds as string
 */
std::string beautifyTime(uint seconds);

/**
 * Round up to next multiple of a number.
 * @param value Value to be rounded up.
 * @param multiple Number to multiply.
 * @return Rounded number
 */
size_t roundToNextMultiple(size_t value, uint multiple);
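// Illustrative usage sketch (not part of the original header; it relies on the
// declarations above and the name is hypothetical): mostFrequentClass() breaks
// ties at random rather than deterministically, so with the tied counts below
// either index 1 or index 2 can be returned, depending on the generator state.
// Note that the generator is taken by value above, so the caller's stream of
// random numbers is not advanced.
inline size_t mostFrequentClassDemoSketch() {
  std::mt19937_64 random_number_generator(42);
  std::vector<size_t> counts = {3, 5, 5}; // classes 1 and 2 are tied
  return mostFrequentClass(counts, random_number_generator); // returns 1 or 2
}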
/**
 * Split string in string parts separated by character.
 * @param result Split string parts
 * @param input String to be split
 * @param split_char Char to separate parts
 */
void splitString(std::vector<std::string>& result, const std::string& input, char split_char);

/**
 * Split string in double parts separated by character.
 * @param result Split double parts
 * @param input String to be split
 * @param split_char Char to separate parts
 */
void splitString(std::vector<double>& result, const std::string& input, char split_char);

/**
 * Create numbers from 0 to n_all-1, shuffle and split in two parts.
 * @param first_part First part
 * @param second_part Second part
 * @param n_all Number of elements
 * @param n_first Number of elements of first part
 * @param random_number_generator Random number generator
 */
void shuffleAndSplit(std::vector<size_t>& first_part, std::vector<size_t>& second_part, size_t n_all,
    size_t n_first, std::mt19937_64 random_number_generator);

/**
 * Create numbers from 0 to n_all-1, shuffle and split in two parts. Append to existing data.
 * @param first_part First part
 * @param second_part Second part
 * @param n_all Number of elements
 * @param n_first Number of elements of first part
 * @param mapping Values to use instead of 0...n-1
 * @param random_number_generator Random number generator
 */
void shuffleAndSplitAppend(std::vector<size_t>& first_part, std::vector<size_t>& second_part, size_t n_all,
    size_t n_first, const std::vector<size_t>& mapping, std::mt19937_64 random_number_generator);

/**
 * Check if not too many factor levels and all values in unordered categorical variables are positive integers.
 * @param data Reference to data object
 * @param unordered_variable_names Names of unordered variables
 * @return Error message, empty if no problem occurred
 */
std::string checkUnorderedVariables(const Data& data, const std::vector<std::string>& unordered_variable_names);

/**
 * Check if all values in double vector are positive integers.
 * @param all_values Double vector to check
 * @return True if all values are positive integers
 */
bool checkPositiveIntegers(const std::vector<double>& all_values);

/**
 * Compute p-value for maximally selected rank statistics using Lau92 approximation.
 * See Lausen, B. & Schumacher, M. (1992). Biometrics 48, 73-85.
 * @param b Quantile
 * @param minprop Minimal proportion of observations left of cutpoint
 * @param maxprop Maximal proportion of observations left of cutpoint
 * @return p-value for quantile b
 */
double maxstatPValueLau92(double b, double minprop, double maxprop);

/**
 * Compute p-value for maximally selected rank statistics using Lau94 approximation.
 * See Lausen, B., Sauerbrei, W. & Schumacher, M. (1994). Computational Statistics. 483-496.
* @param b Quantile * @param minprop Minimal proportion of observations left of cutpoint * @param maxprop Maximal proportion of observations left of cutpoint * @param N Number of observations * @param m Vector with number of observations smaller or equal than cutpoint, sorted, only for unique cutpoints * @return p-value for quantile b */ double maxstatPValueLau94(double b, double minprop, double maxprop, size_t N, const std::vector& m); /** * Compute unadjusted p-value for rank statistics * @param b Quantile * @return p-value for quantile b */ double maxstatPValueUnadjusted(double b); /** * Standard normal density * @param x Quantile * @return Standard normal density at quantile x */ double dstdnorm(double x); /** * Standard normal distribution * @param x Quantile * @return Standard normal distribution at quantile x */ double pstdnorm(double x); /** * Adjust p-values with Benjamini/Hochberg * @param unadjusted_pvalues Unadjusted p-values (input) * @param adjusted_pvalues Adjusted p-values (result) */ std::vector adjustPvalues(std::vector& unadjusted_pvalues); /** * Get indices of sorted values * @param values Values to sort * @param decreasing Order decreasing * @return Indices of sorted values */ template std::vector order(const std::vector& values, bool decreasing) { // Create index vector std::vector indices(values.size()); std::iota(indices.begin(), indices.end(), 0); // Sort index vector based on value vector if (decreasing) { std::sort(std::begin(indices), std::end(indices), [&](size_t i1, size_t i2) {return values[i1] > values[i2];}); } else { std::sort(std::begin(indices), std::end(indices), [&](size_t i1, size_t i2) {return values[i1] < values[i2];}); } return indices; } /** * Sample ranks starting from 1. Ties are given the average rank. * @param values Values to rank * @return Ranks of input values */ template std::vector rank(const std::vector& values) { size_t num_values = values.size(); // Order std::vector indices = order(values, false); // Compute ranks, start at 1 std::vector ranks(num_values); size_t reps = 1; for (size_t i = 0; i < num_values; i += reps) { // Find number of replications reps = 1; while (i + reps < num_values && values[indices[i]] == values[indices[i + reps]]) { ++reps; } // Assign rank to all replications for (size_t j = 0; j < reps; ++j) ranks[indices[i + j]] = (2 * (double) i + (double) reps - 1) / 2 + 1; } return ranks; } /** * Compute Logrank scores for survival times * @param time Survival time * @param status Censoring indicator * @return Logrank scores */ std::vector logrankScores(const std::vector& time, const std::vector& status); /** * Compute maximally selected rank statistics * @param scores Scores for dependent variable (y) * @param x Independent variable * @param indices Ordering of x values * @param best_maxstat Maximally selected statistic (output) * @param best_split_value Split value for maximally selected statistic (output) * @param minprop Minimal proportion of observations left of cutpoint * @param maxprop Maximal proportion of observations left of cutpoint */ void maxstat(const std::vector& scores, const std::vector& x, const std::vector& indices, double& best_maxstat, double& best_split_value, double minprop, double maxprop); /** * Compute number of samples smaller or equal than each unique value in x * @param x Value vector * @param indices Ordering of x * @return Vector of number of samples smaller or equal than each unique value in x */ std::vector numSamplesLeftOfCutpoint(std::vector& x, const std::vector& indices); /** * Read 
from stringstream and ignore failbit for subnormal numbers * See: https://bugs.llvm.org/show_bug.cgi?id=39012 * @param in Input string stream * @param token Output token * @return Input string stream with removed failbit if subnormal number */ std::stringstream& readFromStream(std::stringstream& in, double& token); /** * Compute log-likelihood of beta distribution * @param y Response * @param mean Mean * @param phi Phi * @return Log-likelihood */ double betaLogLik(double y, double mean, double phi); // User interrupt from R #ifdef R_BUILD static void chkIntFn(void *dummy) { R_CheckUserInterrupt(); } inline bool checkInterrupt() { return (R_ToplevelExec(chkIntFn, NULL) == FALSE); } #endif // Provide make_unique (not available in C++11) namespace detail { template struct _Unique_if { typedef std::unique_ptr _Single_object; }; template struct _Unique_if { typedef std::unique_ptr _Unknown_bound; }; template struct _Unique_if { typedef void _Known_bound; }; } // namespace detail template typename detail::_Unique_if::_Single_object make_unique(Args&&... args) { return std::unique_ptr(new T(std::forward(args)...)); } template typename detail::_Unique_if::_Unknown_bound make_unique(size_t n) { typedef typename std::remove_extent::type U; return std::unique_ptr(new U[n]()); } template typename detail::_Unique_if::_Known_bound make_unique(Args&&...) = delete; } // namespace ranger #endif /* UTILITY_H_ */ ranger/src/TreeSurvival.cpp0000755000176200001440000010022314027301517015457 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of ranger. Copyright (c) [2014-2018] [Marvin N. Wright] This software may be modified and distributed under the terms of the MIT license. Please note that the C++ core of ranger is distributed under MIT license and the R package "ranger" under GPL3 license. 
#-------------------------------------------------------------------------------*/ #include #include #include #include #include #include "utility.h" #include "TreeSurvival.h" #include "Data.h" namespace ranger { TreeSurvival::TreeSurvival(std::vector* unique_timepoints, std::vector* response_timepointIDs) : unique_timepoints(unique_timepoints), response_timepointIDs(response_timepointIDs), num_deaths(0), num_samples_at_risk( 0) { this->num_timepoints = unique_timepoints->size(); } TreeSurvival::TreeSurvival(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector> chf, std::vector* unique_timepoints, std::vector* response_timepointIDs) : Tree(child_nodeIDs, split_varIDs, split_values), unique_timepoints(unique_timepoints), response_timepointIDs( response_timepointIDs), chf(chf), num_deaths(0), num_samples_at_risk(0) { this->num_timepoints = unique_timepoints->size(); } void TreeSurvival::allocateMemory() { // Number of deaths and samples at risk for each timepoint num_deaths.resize(num_timepoints); num_samples_at_risk.resize(num_timepoints); } void TreeSurvival::appendToFileInternal(std::ofstream& file) { // #nocov start // Convert to vector without empty elements and save std::vector terminal_nodes; std::vector> chf_vector; for (size_t i = 0; i < chf.size(); ++i) { if (!chf[i].empty()) { terminal_nodes.push_back(i); chf_vector.push_back(chf[i]); } } saveVector1D(terminal_nodes, file); saveVector2D(chf_vector, file); } // #nocov end void TreeSurvival::createEmptyNodeInternal() { chf.push_back(std::vector()); } void TreeSurvival::computeSurvival(size_t nodeID) { std::vector chf_temp; chf_temp.reserve(num_timepoints); double chf_value = 0; for (size_t i = 0; i < num_timepoints; ++i) { if (num_samples_at_risk[i] != 0) { chf_value += (double) num_deaths[i] / (double) num_samples_at_risk[i]; } chf_temp.push_back(chf_value); } chf[nodeID] = chf_temp; } double TreeSurvival::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { // Compute summed chf for samples std::vector sum_chf; for (size_t i = 0; i < prediction_terminal_nodeIDs.size(); ++i) { size_t terminal_nodeID = prediction_terminal_nodeIDs[i]; sum_chf.push_back(std::accumulate(chf[terminal_nodeID].begin(), chf[terminal_nodeID].end(), 0.0)); } // Return concordance index return computeConcordanceIndex(*data, sum_chf, oob_sampleIDs, prediction_error_casewise); } bool TreeSurvival::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { // Stop if node is pure bool pure = true; double pure_time = 0; double pure_status = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; double time = data->get_y(sampleID, 0); double status = data->get_y(sampleID, 1); if (pos != start_pos[nodeID] && (time != pure_time || status != pure_status)) { pure = false; break; } pure_time = time; pure_status = status; } if (pure) { computeDeathCounts(nodeID); computeSurvival(nodeID); return true; } if (splitrule == MAXSTAT) { return findBestSplitMaxstat(nodeID, possible_split_varIDs); } else if (splitrule == EXTRATREES) { return findBestSplitExtraTrees(nodeID, possible_split_varIDs); } else { return findBestSplit(nodeID, possible_split_varIDs); } } bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_split_varIDs) { double best_decrease = -1; size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; size_t best_varID = 0; double best_value = 0; computeDeathCounts(nodeID); // Stop if maximum node size or depth reached 
(will check again for each child node) if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { computeSurvival(nodeID); return true; } // Stop early if no split posssible if (num_samples_node >= 2 * min_node_size) { // For all possible split variables for (auto& varID : possible_split_varIDs) { // Find best split value, if ordered consider all values as split values, else all 2-partitions if (data->isOrderedVariable(varID)) { if (splitrule == LOGRANK) { findBestSplitValueLogRank(nodeID, varID, best_value, best_varID, best_decrease); } else if (splitrule == AUC || splitrule == AUC_IGNORE_TIES) { findBestSplitValueAUC(nodeID, varID, best_value, best_varID, best_decrease); } } else { findBestSplitValueLogRankUnordered(nodeID, varID, best_value, best_varID, best_decrease); } } } // Stop and save CHF if no good split found (this is terminal node). if (best_decrease < 0) { computeSurvival(nodeID); return true; } else { // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { addImpurityImportance(nodeID, best_varID, best_decrease); } // Regularization saveSplitVarID(best_varID); return false; } } bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& possible_split_varIDs) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { computeDeathCounts(nodeID); computeSurvival(nodeID); return true; } // Compute scores std::vector time; time.reserve(num_samples_node); std::vector status; status.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; time.push_back(data->get_y(sampleID, 0)); status.push_back(data->get_y(sampleID, 1)); } std::vector scores = logrankScores(time, status); // Save split stats std::vector pvalues; pvalues.reserve(possible_split_varIDs.size()); std::vector values; values.reserve(possible_split_varIDs.size()); std::vector candidate_varIDs; candidate_varIDs.reserve(possible_split_varIDs.size()); std::vector test_statistics; test_statistics.reserve(possible_split_varIDs.size()); // Compute p-values for (auto& varID : possible_split_varIDs) { // Get all observations std::vector x; x.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; x.push_back(data->get_x(sampleID, varID)); } // Order by x std::vector indices = order(x, false); //std::vector indices = orderInData(data, sampleIDs[nodeID], varID, false); // Compute maximally selected rank statistics double best_maxstat; double best_split_value; maxstat(scores, x, indices, best_maxstat, best_split_value, minprop, 1 - minprop); //maxstatInData(scores, data, sampleIDs[nodeID], varID, indices, best_maxstat, best_split_value, minprop, 1 - minprop); if (best_maxstat > -1) { // Compute number of samples left of cutpoints std::vector num_samples_left = numSamplesLeftOfCutpoint(x, indices); //std::vector num_samples_left = numSamplesLeftOfCutpointInData(data, sampleIDs[nodeID], varID, indices); // Remove largest cutpoint (all observations left) num_samples_left.pop_back(); // Use unadjusted p-value if only 1 split point double pvalue; if 
(num_samples_left.size() == 1) { pvalue = maxstatPValueUnadjusted(best_maxstat); } else { // Compute p-values double pvalue_lau92 = maxstatPValueLau92(best_maxstat, minprop, 1 - minprop); double pvalue_lau94 = maxstatPValueLau94(best_maxstat, minprop, 1 - minprop, num_samples_node, num_samples_left); // Use minimum of Lau92 and Lau94 pvalue = std::min(pvalue_lau92, pvalue_lau94); } // Save split stats pvalues.push_back(pvalue); values.push_back(best_split_value); candidate_varIDs.push_back(varID); test_statistics.push_back(best_maxstat); } } double adjusted_best_pvalue = std::numeric_limits::max(); size_t best_varID = 0; double best_value = 0; double best_maxstat = 0; if (pvalues.size() > 0) { // Adjust p-values with Benjamini/Hochberg std::vector adjusted_pvalues = adjustPvalues(pvalues); double min_pvalue = std::numeric_limits::max(); for (size_t i = 0; i < pvalues.size(); ++i) { if (pvalues[i] < min_pvalue) { min_pvalue = pvalues[i]; best_varID = candidate_varIDs[i]; best_value = values[i]; adjusted_best_pvalue = adjusted_pvalues[i]; best_maxstat = test_statistics[i]; } } } // Stop and save CHF if no good split found (this is terminal node). if (adjusted_best_pvalue > alpha) { computeDeathCounts(nodeID); computeSurvival(nodeID); return true; } else { // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { addImpurityImportance(nodeID, best_varID, best_maxstat); } return false; } } void TreeSurvival::computeDeathCounts(size_t nodeID) { // Initialize for (size_t i = 0; i < num_timepoints; ++i) { num_deaths[i] = 0; num_samples_at_risk[i] = 0; } for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; double survival_time = data->get_y(sampleID, 0); size_t t = 0; while (t < num_timepoints && (*unique_timepoints)[t] < survival_time) { ++num_samples_at_risk[t]; ++t; } // Now t is the survival time, add to at risk and to death if death if (t < num_timepoints) { ++num_samples_at_risk[t]; if (data->get_y(sampleID, 1) == 1) { ++num_deaths[t]; } } } } void TreeSurvival::computeChildDeathCounts(size_t nodeID, size_t varID, std::vector& possible_split_values, std::vector& num_samples_right_child, std::vector& delta_samples_at_risk_right_child, std::vector& num_deaths_right_child, size_t num_splits) { // Count deaths in right child per timepoint and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; double value = data->get_x(sampleID, varID); size_t survival_timeID = (*response_timepointIDs)[sampleID]; // Count deaths until split_value reached for (size_t i = 0; i < num_splits; ++i) { if (value > possible_split_values[i]) { ++num_samples_right_child[i]; ++delta_samples_at_risk_right_child[i * num_timepoints + survival_timeID]; if (data->get_y(sampleID, 1) == 1) { ++num_deaths_right_child[i * num_timepoints + survival_timeID]; } } else { break; } } } } void TreeSurvival::findBestSplitValueLogRank(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; // Create possible split values std::vector possible_split_values; data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if 
(possible_split_values.size() < 2) { return; } // -1 because no split possible at largest value size_t num_splits = possible_split_values.size() - 1; // Initialize std::vector num_deaths_right_child(num_splits * num_timepoints); std::vector delta_samples_at_risk_right_child(num_splits * num_timepoints); std::vector num_samples_right_child(num_splits); computeChildDeathCounts(nodeID, varID, possible_split_values, num_samples_right_child, delta_samples_at_risk_right_child, num_deaths_right_child, num_splits); // Compute logrank test for all splits and use best for (size_t i = 0; i < num_splits; ++i) { double numerator = 0; double denominator_squared = 0; // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; if (num_samples_right_child[i] < min_node_size || num_samples_left_child < min_node_size) { continue; } // Compute logrank test statistic for this split size_t num_samples_at_risk_right_child = num_samples_right_child[i]; for (size_t t = 0; t < num_timepoints; ++t) { if (num_samples_at_risk[t] < 2 || num_samples_at_risk_right_child < 1) { break; } if (num_deaths[t] > 0) { // Numerator and demoninator for log-rank test, notation from Ishwaran et al. double di = (double) num_deaths[t]; double di1 = (double) num_deaths_right_child[i * num_timepoints + t]; double Yi = (double) num_samples_at_risk[t]; double Yi1 = (double) num_samples_at_risk_right_child; numerator += di1 - Yi1 * (di / Yi); denominator_squared += (Yi1 / Yi) * (1.0 - Yi1 / Yi) * ((Yi - di) / (Yi - 1)) * di; } // Reduce number of samples at risk for next timepoint num_samples_at_risk_right_child -= delta_samples_at_risk_right_child[i * num_timepoints + t]; } double logrank = -1; if (denominator_squared != 0) { logrank = fabs(numerator / sqrt(denominator_squared)); } // Regularization regularize(logrank, varID); if (logrank > best_logrank) { best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; best_varID = varID; best_logrank = logrank; // Use smaller value if average is numerically the same as the larger value if (best_value == possible_split_values[i + 1]) { best_value = possible_split_values[i]; } } } } void TreeSurvival::findBestSplitValueLogRankUnordered(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; // Create possible split values std::vector factor_levels; data->getAllValues(factor_levels, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (factor_levels.size() < 2) { return; } // Number of possible splits is 2^num_levels size_t num_splits = (1ULL << factor_levels.size()); // Compute logrank test statistic for each possible split // Split where all left (0) or all right (1) are excluded // The second half of numbers is just left/right switched the first half -> Exclude second half for (size_t local_splitID = 1; local_splitID < num_splits / 2; ++local_splitID) { // Compute overall splitID by shifting local factorIDs to global positions size_t splitID = 0; for (size_t j = 0; j < factor_levels.size(); ++j) { if ((local_splitID & (1ULL << j))) { double level = factor_levels[j]; size_t factorID = floor(level) - 1; splitID = splitID | (1ULL << factorID); } } // Initialize std::vector num_deaths_right_child(num_timepoints); std::vector delta_samples_at_risk_right_child(num_timepoints); size_t num_samples_right_child = 0; double numerator = 0; double denominator_squared = 0; // Count deaths 
in right child per timepoint for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t survival_timeID = (*response_timepointIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++num_samples_right_child; ++delta_samples_at_risk_right_child[survival_timeID]; if (data->get_y(sampleID, 1) == 1) { ++num_deaths_right_child[survival_timeID]; } } } // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; if (num_samples_right_child < min_node_size || num_samples_left_child < min_node_size) { continue; } // Compute logrank test statistic for this split size_t num_samples_at_risk_right_child = num_samples_right_child; for (size_t t = 0; t < num_timepoints; ++t) { if (num_samples_at_risk[t] < 2 || num_samples_at_risk_right_child < 1) { break; } if (num_deaths[t] > 0) { // Numerator and demoninator for log-rank test, notation from Ishwaran et al. double di = (double) num_deaths[t]; double di1 = (double) num_deaths_right_child[t]; double Yi = (double) num_samples_at_risk[t]; double Yi1 = (double) num_samples_at_risk_right_child; numerator += di1 - Yi1 * (di / Yi); denominator_squared += (Yi1 / Yi) * (1.0 - Yi1 / Yi) * ((Yi - di) / (Yi - 1)) * di; } // Reduce number of samples at risk for next timepoint num_samples_at_risk_right_child -= delta_samples_at_risk_right_child[t]; } double logrank = -1; if (denominator_squared != 0) { logrank = fabs(numerator / sqrt(denominator_squared)); } // Regularization regularize(logrank, varID); if (logrank > best_logrank) { best_value = splitID; best_varID = varID; best_logrank = logrank; } } } void TreeSurvival::findBestSplitValueAUC(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_auc) { // Create possible split values std::vector possible_split_values; data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (possible_split_values.size() < 2) { return; } size_t num_node_samples = end_pos[nodeID] - start_pos[nodeID]; size_t num_splits = possible_split_values.size() - 1; size_t num_possible_pairs = num_node_samples * (num_node_samples - 1) / 2; // Initialize std::vector num_count(num_splits, num_possible_pairs); std::vector num_total(num_splits, num_possible_pairs); std::vector num_samples_left_child(num_splits); // For all pairs for (size_t k = start_pos[nodeID]; k < end_pos[nodeID]; ++k) { size_t sample_k = sampleIDs[k]; double time_k = data->get_y(sample_k, 0); double status_k = data->get_y(sample_k, 1); double value_k = data->get_x(sample_k, varID); // Count samples in left node for (size_t i = 0; i < num_splits; ++i) { double split_value = possible_split_values[i]; if (value_k <= split_value) { ++num_samples_left_child[i]; } } for (size_t l = k + 1; l < end_pos[nodeID]; ++l) { size_t sample_l = sampleIDs[l]; double time_l = data->get_y(sample_l, 0); double status_l = data->get_y(sample_l, 1); double value_l = data->get_x(sample_l, varID); // Compute split computeAucSplit(time_k, time_l, status_k, status_l, value_k, value_l, num_splits, possible_split_values, num_count, num_total); } } for (size_t i = 0; i < num_splits; ++i) { // Do not consider this split point if fewer than min_node_size samples in one node size_t num_samples_right_child = num_node_samples - 
num_samples_left_child[i]; if (num_samples_left_child[i] < min_node_size || num_samples_right_child < min_node_size) { continue; } else { double auc = fabs((num_count[i] / 2) / num_total[i] - 0.5); // Regularization regularize(auc, varID); if (auc > best_auc) { best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; best_varID = varID; best_auc = auc; // Use smaller value if average is numerically the same as the larger value if (best_value == possible_split_values[i + 1]) { best_value = possible_split_values[i]; } } } } } void TreeSurvival::computeAucSplit(double time_k, double time_l, double status_k, double status_l, double value_k, double value_l, size_t num_splits, std::vector& possible_split_values, std::vector& num_count, std::vector& num_total) { bool ignore_pair = false; bool do_nothing = false; double value_smaller = 0; double value_larger = 0; double status_smaller = 0; if (time_k < time_l) { value_smaller = value_k; value_larger = value_l; status_smaller = status_k; } else if (time_l < time_k) { value_smaller = value_l; value_larger = value_k; status_smaller = status_l; } else { // Tie in survival time if (status_k == 0 || status_l == 0) { ignore_pair = true; } else { if (splitrule == AUC_IGNORE_TIES) { ignore_pair = true; } else { if (value_k == value_l) { // Tie in survival time and in covariate ignore_pair = true; } else { // Tie in survival time in covariate do_nothing = true; } } } } // Do not count if smaller time censored if (status_smaller == 0) { ignore_pair = true; } if (ignore_pair) { for (size_t i = 0; i < num_splits; ++i) { --num_count[i]; --num_total[i]; } } else if (do_nothing) { // Do nothing } else { for (size_t i = 0; i < num_splits; ++i) { double split_value = possible_split_values[i]; if (value_smaller <= split_value && value_larger > split_value) { ++num_count[i]; } else if (value_smaller > split_value && value_larger <= split_value) { --num_count[i]; } else if (value_smaller <= split_value && value_larger <= split_value) { break; } } } } bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& possible_split_varIDs) { double best_decrease = -1; size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; size_t best_varID = 0; double best_value = 0; computeDeathCounts(nodeID); // Stop if maximum node size or depth reached (will check again for each child node) if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { computeSurvival(nodeID); return true; } // Stop early if no split posssible if (num_samples_node >= 2 * min_node_size) { // For all possible split variables for (auto& varID : possible_split_varIDs) { // Find best split value, if ordered consider all values as split values, else all 2-partitions if (data->isOrderedVariable(varID)) { findBestSplitValueExtraTrees(nodeID, varID, best_value, best_varID, best_decrease); } else { findBestSplitValueExtraTreesUnordered(nodeID, varID, best_value, best_varID, best_decrease); } } } // Stop and save CHF if no good split found (this is terminal node). 
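// Note: best_decrease starts at -1 and is only raised when a candidate split
// passes the min_node_size check and improves the (possibly regularized)
// logrank statistic, so a negative value here means no admissible split was
// found; the node then becomes terminal and stores its cumulative hazard.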
if (best_decrease < 0) { computeSurvival(nodeID); return true; } else { // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { addImpurityImportance(nodeID, best_varID, best_decrease); } // Regularization saveSplitVarID(best_varID); return false; } } void TreeSurvival::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; // Get min/max values of covariate in node double min; double max; data->getMinMaxValues(min, max, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); // Try next variable if all equal for this if (min == max) { return; } // Create possible split values: Draw randomly between min and max std::vector possible_split_values; std::uniform_real_distribution udist(min, max); possible_split_values.reserve(num_random_splits); for (size_t i = 0; i < num_random_splits; ++i) { possible_split_values.push_back(udist(random_number_generator)); } if (num_random_splits > 1) { std::sort(possible_split_values.begin(), possible_split_values.end()); } size_t num_splits = possible_split_values.size(); // Initialize std::vector num_deaths_right_child(num_splits * num_timepoints); std::vector delta_samples_at_risk_right_child(num_splits * num_timepoints); std::vector num_samples_right_child(num_splits); computeChildDeathCounts(nodeID, varID, possible_split_values, num_samples_right_child, delta_samples_at_risk_right_child, num_deaths_right_child, num_splits); // Compute logrank test for all splits and use best for (size_t i = 0; i < num_splits; ++i) { double numerator = 0; double denominator_squared = 0; // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; if (num_samples_right_child[i] < min_node_size || num_samples_left_child < min_node_size) { continue; } // Compute logrank test statistic for this split size_t num_samples_at_risk_right_child = num_samples_right_child[i]; for (size_t t = 0; t < num_timepoints; ++t) { if (num_samples_at_risk[t] < 2 || num_samples_at_risk_right_child < 1) { break; } if (num_deaths[t] > 0) { // Numerator and demoninator for log-rank test, notation from Ishwaran et al. 
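// At timepoint t there are di deaths and Yi samples at risk overall, di1
// deaths and Yi1 samples at risk in the right child. Conditional on these
// margins, the deaths in the right child are hypergeometric under the null,
// so the statistic accumulates the observed-minus-expected term
// di1 - Yi1 * di / Yi and the variance term
// (Yi1 / Yi) * (1 - Yi1 / Yi) * ((Yi - di) / (Yi - 1)) * di;
// the standardized value |numerator / sqrt(denominator_squared)| is then
// maximized over all candidate splits.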
double di = (double) num_deaths[t]; double di1 = (double) num_deaths_right_child[i * num_timepoints + t]; double Yi = (double) num_samples_at_risk[t]; double Yi1 = (double) num_samples_at_risk_right_child; numerator += di1 - Yi1 * (di / Yi); denominator_squared += (Yi1 / Yi) * (1.0 - Yi1 / Yi) * ((Yi - di) / (Yi - 1)) * di; } // Reduce number of samples at risk for next timepoint num_samples_at_risk_right_child -= delta_samples_at_risk_right_child[i * num_timepoints + t]; } double logrank = -1; if (denominator_squared != 0) { logrank = fabs(numerator / sqrt(denominator_squared)); } // Regularization regularize(logrank, varID); if (logrank > best_logrank) { best_value = possible_split_values[i]; best_varID = varID; best_logrank = logrank; } } } void TreeSurvival::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, double& best_value, size_t& best_varID, double& best_logrank) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; size_t num_unique_values = data->getNumUniqueDataValues(varID); // Get all factor indices in node std::vector factor_in_node(num_unique_values, false); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t index = data->getIndex(sampleID, varID); factor_in_node[index] = true; } // Vector of indices in and out of node std::vector indices_in_node; std::vector indices_out_node; indices_in_node.reserve(num_unique_values); indices_out_node.reserve(num_unique_values); for (size_t i = 0; i < num_unique_values; ++i) { if (factor_in_node[i]) { indices_in_node.push_back(i); } else { indices_out_node.push_back(i); } } // Generate num_random_splits splits for (size_t i = 0; i < num_random_splits; ++i) { std::vector split_subset; split_subset.reserve(num_unique_values); // Draw random subsets, sample all partitions with equal probability if (indices_in_node.size() > 1) { size_t num_partitions = (2ULL << (indices_in_node.size() - 1ULL)) - 2ULL; // 2^n-2 (don't allow full or empty) std::uniform_int_distribution udist(1, num_partitions); size_t splitID_in_node = udist(random_number_generator); for (size_t j = 0; j < indices_in_node.size(); ++j) { if ((splitID_in_node & (1ULL << j)) > 0) { split_subset.push_back(indices_in_node[j]); } } } if (indices_out_node.size() > 1) { size_t num_partitions = (2ULL << (indices_out_node.size() - 1ULL)) - 1ULL; // 2^n-1 (allow full or empty) std::uniform_int_distribution udist(0, num_partitions); size_t splitID_out_node = udist(random_number_generator); for (size_t j = 0; j < indices_out_node.size(); ++j) { if ((splitID_out_node & (1ULL << j)) > 0) { split_subset.push_back(indices_out_node[j]); } } } // Assign union of the two subsets to right child size_t splitID = 0; for (auto& idx : split_subset) { splitID |= 1ULL << idx; } // Initialize std::vector num_deaths_right_child(num_timepoints); std::vector delta_samples_at_risk_right_child(num_timepoints); size_t num_samples_right_child = 0; double numerator = 0; double denominator_squared = 0; // Count deaths in right child per timepoint for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t survival_timeID = (*response_timepointIDs)[sampleID]; double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count // In right child, if bitwise splitID at position factorID is 1 if ((splitID & (1ULL << factorID))) { ++num_samples_right_child; ++delta_samples_at_risk_right_child[survival_timeID]; if (data->get_y(sampleID, 1) == 1) 
{ ++num_deaths_right_child[survival_timeID]; } } } // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; if (num_samples_right_child < min_node_size || num_samples_left_child < min_node_size) { continue; } // Compute logrank test statistic for this split size_t num_samples_at_risk_right_child = num_samples_right_child; for (size_t t = 0; t < num_timepoints; ++t) { if (num_samples_at_risk[t] < 2 || num_samples_at_risk_right_child < 1) { break; } if (num_deaths[t] > 0) { // Numerator and demoninator for log-rank test, notation from Ishwaran et al. double di = (double) num_deaths[t]; double di1 = (double) num_deaths_right_child[t]; double Yi = (double) num_samples_at_risk[t]; double Yi1 = (double) num_samples_at_risk_right_child; numerator += di1 - Yi1 * (di / Yi); denominator_squared += (Yi1 / Yi) * (1.0 - Yi1 / Yi) * ((Yi - di) / (Yi - 1)) * di; } // Reduce number of samples at risk for next timepoint num_samples_at_risk_right_child -= delta_samples_at_risk_right_child[t]; } double logrank = -1; if (denominator_squared != 0) { logrank = fabs(numerator / sqrt(denominator_squared)); } // Regularization regularize(logrank, varID); if (logrank > best_logrank) { best_value = splitID; best_varID = varID; best_logrank = logrank; } } } void TreeSurvival::addImpurityImportance(size_t nodeID, size_t varID, double decrease) { // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { (*variable_importance)[tempvarID] -= decrease; } else { (*variable_importance)[tempvarID] += decrease; } } } // namespace ranger ranger/src/rangerCpp.cpp0000755000176200001440000002612314027301517014753 0ustar liggesusers/*------------------------------------------------------------------------------- This file is part of Ranger. Ranger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Ranger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Ranger. If not, see . Written by: Marvin N. 
Wright Institut für Medizinische Biometrie und Statistik Universität zu Lübeck Ratzeburger Allee 160 23562 Lübeck http://www.imbs-luebeck.de #-------------------------------------------------------------------------------*/ #include #include #include #include #include #include "globals.h" #include "Forest.h" #include "ForestClassification.h" #include "ForestRegression.h" #include "ForestSurvival.h" #include "ForestProbability.h" #include "Data.h" #include "DataChar.h" #include "DataRcpp.h" #include "DataFloat.h" #include "DataSparse.h" #include "utility.h" using namespace ranger; // [[Rcpp::depends(RcppEigen)]] // [[Rcpp::export]] Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth) { Rcpp::List result; try { std::unique_ptr forest { }; std::unique_ptr data { }; // Empty split select weights and always split variables if not used if (!use_split_select_weights) { split_select_weights.clear(); } if (!use_always_split_variable_names) { always_split_variable_names.clear(); } if (!use_unordered_variable_names) { unordered_variable_names.clear(); } if (!use_case_weights) { case_weights.clear(); } if (!use_inbag) { inbag.clear(); } if (!use_regularization_factor) { regularization_factor.clear(); } std::ostream* verbose_out; if (verbose) { verbose_out = &Rcpp::Rcout; } else { verbose_out = new std::stringstream; } size_t num_rows; size_t num_cols; if (use_sparse_data) { num_rows = sparse_x.rows(); num_cols = sparse_x.cols(); } else { num_rows = input_x.nrow(); num_cols = input_x.ncol(); } // Initialize data if (use_sparse_data) { data = make_unique(sparse_x, input_y, variable_names, num_rows, num_cols); } else { data = make_unique(input_x, input_y, variable_names, num_rows, num_cols); } // If there is snp data, add it if (snp_data.nrow() > 1) { data->addSnpData(snp_data.begin(), snp_data.ncol()); // Load SNP order if available if (prediction_mode && loaded_forest.containsElementNamed("snp.order")) { std::vector> snp_order = loaded_forest["snp.order"]; data->setSnpOrder(snp_order); } } switch (treetype) { case TREE_CLASSIFICATION: if (probability) { forest = make_unique(); } else { forest = make_unique(); } break; case TREE_REGRESSION: forest = make_unique(); break; case TREE_SURVIVAL: forest = make_unique(); break; case TREE_PROBABILITY: forest = make_unique(); break; } ImportanceMode importance_mode = (ImportanceMode) importance_mode_r; SplitRule splitrule = (SplitRule) splitrule_r; PredictionType prediction_type = (PredictionType) prediction_type_r; // Init 
Ranger forest->initR(std::move(data), mtry, num_trees, verbose_out, seed, num_threads, importance_mode, min_node_size, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); // Load forest object if in prediction mode if (prediction_mode) { std::vector> > child_nodeIDs = loaded_forest["child.nodeIDs"]; std::vector> split_varIDs = loaded_forest["split.varIDs"]; std::vector> split_values = loaded_forest["split.values"]; std::vector is_ordered = loaded_forest["is.ordered"]; if (treetype == TREE_CLASSIFICATION) { std::vector class_values = loaded_forest["class.values"]; auto& temp = dynamic_cast(*forest); temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, class_values, is_ordered); } else if (treetype == TREE_REGRESSION) { auto& temp = dynamic_cast(*forest); temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, is_ordered); } else if (treetype == TREE_SURVIVAL) { std::vector> > chf = loaded_forest["chf"]; std::vector unique_timepoints = loaded_forest["unique.death.times"]; auto& temp = dynamic_cast(*forest); temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, chf, unique_timepoints, is_ordered); } else if (treetype == TREE_PROBABILITY) { std::vector class_values = loaded_forest["class.values"]; std::vector>> terminal_class_counts = loaded_forest["terminal.class.counts"]; auto& temp = dynamic_cast(*forest); temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, class_values, terminal_class_counts, is_ordered); } } else { // Set class weights if (treetype == TREE_CLASSIFICATION && !class_weights.empty()) { auto& temp = dynamic_cast(*forest); temp.setClassWeights(class_weights); } else if (treetype == TREE_PROBABILITY && !class_weights.empty()) { auto& temp = dynamic_cast(*forest); temp.setClassWeights(class_weights); } } // Run Ranger forest->run(false, oob_error); if (use_split_select_weights && importance_mode != IMP_NONE) { if (verbose_out) { *verbose_out << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." 
            << std::endl;
      }
    }

    // Use first non-empty dimension of predictions
    const std::vector<std::vector<std::vector<double>>>& predictions = forest->getPredictions();
    if (predictions.size() == 1) {
      if (predictions[0].size() == 1) {
        result.push_back(forest->getPredictions()[0][0], "predictions");
      } else {
        result.push_back(forest->getPredictions()[0], "predictions");
      }
    } else {
      result.push_back(forest->getPredictions(), "predictions");
    }

    // Return output
    result.push_back(forest->getNumTrees(), "num.trees");
    result.push_back(forest->getNumIndependentVariables(), "num.independent.variables");
    if (treetype == TREE_SURVIVAL) {
      auto& temp = dynamic_cast<ForestSurvival&>(*forest);
      result.push_back(temp.getUniqueTimepoints(), "unique.death.times");
    }
    if (!prediction_mode) {
      result.push_back(forest->getMtry(), "mtry");
      result.push_back(forest->getMinNodeSize(), "min.node.size");
      if (importance_mode != IMP_NONE) {
        result.push_back(forest->getVariableImportance(), "variable.importance");
        if (importance_mode == IMP_PERM_CASEWISE) {
          result.push_back(forest->getVariableImportanceCasewise(), "variable.importance.local");
        }
      }
      result.push_back(forest->getOverallPredictionError(), "prediction.error");
    }

    if (keep_inbag) {
      result.push_back(forest->getInbagCounts(), "inbag.counts");
    }

    // Save forest if needed
    if (write_forest) {
      Rcpp::List forest_object;
      forest_object.push_back(forest->getNumTrees(), "num.trees");
      forest_object.push_back(forest->getChildNodeIDs(), "child.nodeIDs");
      forest_object.push_back(forest->getSplitVarIDs(), "split.varIDs");
      forest_object.push_back(forest->getSplitValues(), "split.values");
      forest_object.push_back(forest->getIsOrderedVariable(), "is.ordered");

      if (snp_data.nrow() > 1 && order_snps) {
        // Exclude permuted SNPs (if any)
        std::vector<std::vector<size_t>> snp_order = forest->getSnpOrder();
        forest_object.push_back(
            std::vector<std::vector<size_t>>(snp_order.begin(), snp_order.begin() + snp_data.ncol()), "snp.order");
      }

      if (treetype == TREE_CLASSIFICATION) {
        auto& temp = dynamic_cast<ForestClassification&>(*forest);
        forest_object.push_back(temp.getClassValues(), "class.values");
      } else if (treetype == TREE_PROBABILITY) {
        auto& temp = dynamic_cast<ForestProbability&>(*forest);
        forest_object.push_back(temp.getClassValues(), "class.values");
        forest_object.push_back(temp.getTerminalClassCounts(), "terminal.class.counts");
      } else if (treetype == TREE_SURVIVAL) {
        auto& temp = dynamic_cast<ForestSurvival&>(*forest);
        forest_object.push_back(temp.getChf(), "chf");
        forest_object.push_back(temp.getUniqueTimepoints(), "unique.death.times");
      }
      result.push_back(forest_object, "forest");
    }

    if (!verbose) {
      delete verbose_out;
    }
  } catch (std::exception& e) {
    if (strcmp(e.what(), "User interrupt.") != 0) {
      Rcpp::Rcerr << "Error: " << e.what() << " Ranger will EXIT now." << std::endl;
    }
    return result;
  }

  return result;
}
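// ---------------------------------------------------------------------------
// Illustrative sketch (not from the ranger sources; names are hypothetical):
// the "first non-empty dimension" squeeze above means the R side receives a
// plain vector for a 1 x 1 x n prediction cube, the first slice for a
// 1 x m x n cube, and the full three-level list otherwise. The same named
// elements written to forest_object under write_forest are what the
// prediction branch earlier reads back from loaded_forest.
#include <vector>

using PredictionCubeSketch = std::vector<std::vector<std::vector<double>>>;

// Returns 2 if both outer dimensions are singletons, 1 if only the first is,
// 0 otherwise; the caller then indexes away that many dimensions.
inline int squeezableDimsSketch(const PredictionCubeSketch& predictions) {
  if (predictions.size() == 1) {
    return predictions[0].size() == 1 ? 2 : 1;
  }
  return 0;
}
// ---------------------------------------------------------------------------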
#-------------------------------------------------------------------------------*/

// Ignore in coverage report (not used in R package)
// #nocov start

#ifndef DATADOUBLE_H_
#define DATADOUBLE_H_

#include <vector>
#include <iostream>

#include "globals.h"
#include "utility.h"
#include "Data.h"

namespace ranger {

class DataDouble: public Data {
public:
  DataDouble() = default;

  DataDouble(const DataDouble&) = delete;
  DataDouble& operator=(const DataDouble&) = delete;

  virtual ~DataDouble() override = default;

  double get_x(size_t row, size_t col) const override {
    // Use permuted data for corrected impurity importance
    size_t col_permuted = col;
    if (col >= num_cols) {
      col = getUnpermutedVarID(col);
      row = getPermutedSampleID(row);
    }

    if (col < num_cols_no_snp) {
      return x[col * num_rows + row];
    } else {
      return getSnp(row, col, col_permuted);
    }
  }

  double get_y(size_t row, size_t col) const override {
    return y[col * num_rows + row];
  }

  void reserveMemory(size_t y_cols) override {
    x.resize(num_cols * num_rows);
    y.resize(y_cols * num_rows);
  }

  void set_x(size_t col, size_t row, double value, bool& error) override {
    x[col * num_rows + row] = value;
  }

  void set_y(size_t col, size_t row, double value, bool& error) override {
    y[col * num_rows + row] = value;
  }

private:
  std::vector<double> x;
  std::vector<double> y;
};

} // namespace ranger

#endif /* DATADOUBLE_H_ */
// #nocov end
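// Editorial note (added): x and y are stored column-major in flat vectors, so
// element (row, col) lives at x[col * num_rows + row]. A minimal worked example,
// assuming 3 rows and 2 columns: column 0 occupies indices 0..2 and column 1
// occupies indices 3..5, so get_x(1, 1) reads x[1 * 3 + 1] = x[4].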
ranger/src/TreeRegression.cpp0000755000176200001440000007507114027301517016000 0ustar liggesusers/*-------------------------------------------------------------------------------
 This file is part of ranger.

 Copyright (c) [2014-2018] [Marvin N. Wright]

 This software may be modified and distributed under the terms of the MIT license.

 Please note that the C++ core of ranger is distributed under MIT license and the
 R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>

#include "utility.h"
#include "TreeRegression.h"
#include "Data.h"

namespace ranger {

TreeRegression::TreeRegression(std::vector<std::vector<size_t>>& child_nodeIDs, std::vector<size_t>& split_varIDs,
    std::vector<double>& split_values) :
    Tree(child_nodeIDs, split_varIDs, split_values), counter(0), sums(0) {
}

void TreeRegression::allocateMemory() {
  // Init counters if not in memory efficient mode
  if (!memory_saving_splitting) {
    size_t max_num_splits = data->getMaxNumUniqueValues();

    // Use number of random splits for extratrees
    if (splitrule == EXTRATREES && num_random_splits > max_num_splits) {
      max_num_splits = num_random_splits;
    }

    counter.resize(max_num_splits);
    sums.resize(max_num_splits);
  }
}

double TreeRegression::estimate(size_t nodeID) {
  // Mean of responses of samples in node
  double sum_responses_in_node = 0;
  size_t num_samples_in_node = end_pos[nodeID] - start_pos[nodeID];
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    sum_responses_in_node += data->get_y(sampleID, 0);
  }
  return (sum_responses_in_node / (double) num_samples_in_node);
}

void TreeRegression::appendToFileInternal(std::ofstream& file) { // #nocov start
  // Empty on purpose
} // #nocov end

bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];

  // Stop if maximum node size or depth reached
  if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) {
    split_values[nodeID] = estimate(nodeID);
    return true;
  }

  // Check if node is pure and set split_value to estimate and stop if pure
  bool pure = true;
  double pure_value = 0;
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    double value = data->get_y(sampleID, 0);
    if (pos != start_pos[nodeID] && value != pure_value) {
      pure = false;
      break;
    }
    pure_value = value;
  }
  if (pure) {
    split_values[nodeID] = pure_value;
    return true;
  }

  // Find best split, stop if no decrease of impurity
  bool stop;
  if (splitrule == MAXSTAT) {
    stop = findBestSplitMaxstat(nodeID, possible_split_varIDs);
  } else if (splitrule == EXTRATREES) {
    stop = findBestSplitExtraTrees(nodeID, possible_split_varIDs);
  } else if (splitrule == BETA) {
    stop = findBestSplitBeta(nodeID, possible_split_varIDs);
  } else {
    stop = findBestSplit(nodeID, possible_split_varIDs);
  }

  if (stop) {
    split_values[nodeID] = estimate(nodeID);
    return true;
  }
  return false;
}

void TreeRegression::createEmptyNodeInternal() {
  // Empty on purpose
}

double TreeRegression::computePredictionAccuracyInternal(std::vector<double>* prediction_error_casewise) {
  size_t num_predictions = prediction_terminal_nodeIDs.size();
  double sum_of_squares = 0;
  for (size_t i = 0; i < num_predictions; ++i) {
    size_t terminal_nodeID = prediction_terminal_nodeIDs[i];
    double predicted_value = split_values[terminal_nodeID];
    double real_value = data->get_y(oob_sampleIDs[i], 0);
    if (predicted_value != real_value) {
      double diff = (predicted_value - real_value) * (predicted_value - real_value);
      if (prediction_error_casewise) {
        (*prediction_error_casewise)[i] = diff;
      }
      sum_of_squares += diff;
    }
  }
  return (1.0 - sum_of_squares / (double) num_predictions);
}

bool TreeRegression::findBestSplit(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  double best_decrease = -1;
  size_t best_varID = 0;
  double best_value = 0;

  // Compute sum of responses in node
  double sum_node = 0;
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    sum_node += data->get_y(sampleID, 0);
  }

  // For all possible split variables
  for (auto& varID : possible_split_varIDs) {
    // Find best split value, if ordered consider all values as split values, else all 2-partitions
    if (data->isOrderedVariable(varID)) {
      // Use memory saving method if option set
      if (memory_saving_splitting) {
        findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
      } else {
        // Use faster method for both cases
        double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID);
        if (q < Q_THRESHOLD) {
          findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
        } else {
          findBestSplitValueLargeQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
        }
      }
    } else {
      findBestSplitValueUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
    }
  }

  // Stop if no good split found
  if (best_decrease < 0) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute decrease of impurity for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addImpurityImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeRegression::findBestSplitValueSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node,
    double& best_value, size_t& best_varID, double& best_decrease) {
  // Create possible split values
  std::vector<double> possible_split_values;
  data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (possible_split_values.size() < 2) {
    return;
  }

  const size_t num_splits = possible_split_values.size();
  if (memory_saving_splitting) {
    std::vector<double> sums_right(num_splits);
    std::vector<size_t> n_right(num_splits);
    findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums_right, n_right);
  } else {
    std::fill_n(sums.begin(), num_splits, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums, counter);
  }
}

void TreeRegression::findBestSplitValueSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node,
    double& best_value, size_t& best_varID, double& best_decrease, std::vector<double> possible_split_values,
    std::vector<double>& sums, std::vector<size_t>& counter) {

  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    size_t idx = std::lower_bound(possible_split_values.begin(), possible_split_values.end(),
        data->get_x(sampleID, varID)) - possible_split_values.begin();

    sums[idx] += data->get_y(sampleID, 0);
    ++counter[idx];
  }

  size_t n_left = 0;
  double sum_left = 0;

  // Compute decrease of impurity for each split
  for (size_t i = 0; i < possible_split_values.size() - 1; ++i) {

    // Stop if nothing here
    if (counter[i] == 0) {
      continue;
    }

    n_left += counter[i];
    sum_left += sums[i];

    // Stop if right child empty
    size_t n_right = num_samples_node - n_left;
    if (n_right == 0) {
      break;
    }

    double sum_right = sum_node - sum_left;
    double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right;

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      // Use mid-point split
      best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2;
      best_varID = varID;
      best_decrease = decrease;

      // Use smaller value if average is numerically the same as the larger value
      if (best_value == possible_split_values[i + 1]) {
        best_value = possible_split_values[i];
      }
    }
  }
}
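// Editorial note (added): the split score sum_left^2/n_left + sum_right^2/n_right
// used above is equivalent to minimizing the children's sum of squared errors,
// since for a child with n samples and response sum s the SSE is sum(y^2) - s^2/n
// and sum(y^2) is constant across all splits of the node. A minimal worked check,
// assuming responses {1, 2, 10}: splitting as {1, 2} | {10} scores
// 3^2/2 + 10^2/1 = 104.5, while {1} | {2, 10} scores 1 + 144/2 = 73, so the
// first split (child SSE 0.5 versus 32) correctly wins.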
void TreeRegression::findBestSplitValueLargeQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node,
    double& best_value, size_t& best_varID, double& best_decrease) {
  // Set counters to 0
  size_t num_unique = data->getNumUniqueDataValues(varID);
  std::fill_n(counter.begin(), num_unique, 0);
  std::fill_n(sums.begin(), num_unique, 0);

  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    size_t index = data->getIndex(sampleID, varID);

    sums[index] += data->get_y(sampleID, 0);
    ++counter[index];
  }

  size_t n_left = 0;
  double sum_left = 0;

  // Compute decrease of impurity for each split
  for (size_t i = 0; i < num_unique - 1; ++i) {

    // Stop if nothing here
    if (counter[i] == 0) {
      continue;
    }

    n_left += counter[i];
    sum_left += sums[i];

    // Stop if right child empty
    size_t n_right = num_samples_node - n_left;
    if (n_right == 0) {
      break;
    }

    double sum_right = sum_node - sum_left;
    double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right;

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      // Find next value in this node
      size_t j = i + 1;
      while (j < num_unique && counter[j] == 0) {
        ++j;
      }

      // Use mid-point split
      best_value = (data->getUniqueDataValue(varID, i) + data->getUniqueDataValue(varID, j)) / 2;
      best_varID = varID;
      best_decrease = decrease;

      // Use smaller value if average is numerically the same as the larger value
      if (best_value == data->getUniqueDataValue(varID, j)) {
        best_value = data->getUniqueDataValue(varID, i);
      }
    }
  }
}

void TreeRegression::findBestSplitValueUnordered(size_t nodeID, size_t varID, double sum_node,
    size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) {
  // Create possible split values
  std::vector<double> factor_levels;
  data->getAllValues(factor_levels, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (factor_levels.size() < 2) {
    return;
  }

  // Number of possible splits is 2^num_levels
  size_t num_splits = (1ULL << factor_levels.size());

  // Compute decrease of impurity for each possible split
  // Split where all left (0) or all right (1) are excluded
  // The second half of numbers is just left/right switched the first half -> Exclude second half
  for (size_t local_splitID = 1; local_splitID < num_splits / 2; ++local_splitID) {

    // Compute overall splitID by shifting local factorIDs to global positions
    size_t splitID = 0;
    for (size_t j = 0; j < factor_levels.size(); ++j) {
      if ((local_splitID & (1ULL << j))) {
        double level = factor_levels[j];
        size_t factorID = floor(level) - 1;
        splitID = splitID | (1ULL << factorID);
      }
    }

    // Initialize
    double sum_right = 0;
    size_t n_right = 0;

    // Sum in right child
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      double response = data->get_y(sampleID, 0);
      double value = data->get_x(sampleID, varID);
      size_t factorID = floor(value) - 1;

      // If in right child, count
      // In right child, if bitwise splitID at position factorID is 1
      if ((splitID & (1ULL << factorID))) {
        ++n_right;
        sum_right += response;
      }
    }
    size_t n_left = num_samples_node - n_right;

    // Sum of squares
    double sum_left = sum_node - sum_right;
    double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right;

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = splitID;
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}
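// Editorial note (added): a minimal worked example of the bitmask encoding used
// above, assuming a factor with levels 1..4 of which only levels 1, 3 and 4
// occur in this node (factor_levels = {1, 3, 4}). The bits of local_splitID
// index into factor_levels, so local_splitID = 0b101 selects levels 1 and 4 and
// is translated to the global mask splitID = (1 << 0) | (1 << 3) = 0b1001;
// samples whose level bit is set in splitID go to the right child. Note that
// splitID itself is stored in best_value, i.e. the split value of an unordered
// split encodes the set of right-going levels as a bit pattern.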
bool TreeRegression::findBestSplitMaxstat(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];

  // Compute ranks
  std::vector<double> response;
  response.reserve(num_samples_node);
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    response.push_back(data->get_y(sampleID, 0));
  }
  std::vector<double> ranks = rank(response);

  // Save split stats
  std::vector<double> pvalues;
  pvalues.reserve(possible_split_varIDs.size());
  std::vector<double> values;
  values.reserve(possible_split_varIDs.size());
  std::vector<size_t> candidate_varIDs;
  candidate_varIDs.reserve(possible_split_varIDs.size());
  std::vector<double> test_statistics;
  test_statistics.reserve(possible_split_varIDs.size());

  // Compute p-values
  for (auto& varID : possible_split_varIDs) {
    // Get all observations
    std::vector<double> x;
    x.reserve(num_samples_node);
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      x.push_back(data->get_x(sampleID, varID));
    }

    // Order by x
    std::vector<size_t> indices = order(x, false);
    //std::vector<size_t> indices = orderInData(data, sampleIDs[nodeID], varID, false);

    // Compute maximally selected rank statistics
    double best_maxstat;
    double best_split_value;
    maxstat(ranks, x, indices, best_maxstat, best_split_value, minprop, 1 - minprop);
    //maxstatInData(scores, data, sampleIDs[nodeID], varID, indices, best_maxstat, best_split_value, minprop, 1 - minprop);

    if (best_maxstat > -1) {
      // Compute number of samples left of cutpoints
      std::vector<size_t> num_samples_left = numSamplesLeftOfCutpoint(x, indices);
      //std::vector<size_t> num_samples_left = numSamplesLeftOfCutpointInData(data, sampleIDs[nodeID], varID, indices);

      // Compute p-values
      double pvalue_lau92 = maxstatPValueLau92(best_maxstat, minprop, 1 - minprop);
      double pvalue_lau94 = maxstatPValueLau94(best_maxstat, minprop, 1 - minprop, num_samples_node,
          num_samples_left);

      // Use minimum of Lau92 and Lau94
      double pvalue = std::min(pvalue_lau92, pvalue_lau94);

      // Save split stats
      pvalues.push_back(pvalue);
      values.push_back(best_split_value);
      candidate_varIDs.push_back(varID);
      test_statistics.push_back(best_maxstat);
    }
  }

  double adjusted_best_pvalue = std::numeric_limits<double>::max();
  size_t best_varID = 0;
  double best_value = 0;
  double best_maxstat = 0;

  if (pvalues.size() > 0) {
    // Adjust p-values with Benjamini/Hochberg
    std::vector<double> adjusted_pvalues = adjustPvalues(pvalues);

    // Use smallest p-value
    double min_pvalue = std::numeric_limits<double>::max();
    for (size_t i = 0; i < pvalues.size(); ++i) {
      if (pvalues[i] < min_pvalue) {
        min_pvalue = pvalues[i];
        best_varID = candidate_varIDs[i];
        best_value = values[i];
        adjusted_best_pvalue = adjusted_pvalues[i];
        best_maxstat = test_statistics[i];
      }
    }
  }

  // Stop if no good split found (this is terminal node).
  if (adjusted_best_pvalue > alpha) {
    return true;
  } else {
    // If not terminal node save best values
    split_varIDs[nodeID] = best_varID;
    split_values[nodeID] = best_value;

    // Compute decrease of impurity for this node and add to variable importance if needed
    if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
      addImpurityImportance(nodeID, best_varID, best_maxstat);
    }

    return false;
  }
}

bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  double best_decrease = -1;
  size_t best_varID = 0;
  double best_value = 0;

  // Compute sum of responses in node
  double sum_node = 0;
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    sum_node += data->get_y(sampleID, 0);
  }

  // For all possible split variables
  for (auto& varID : possible_split_varIDs) {
    // Find best split value, if ordered consider all values as split values, else all 2-partitions
    if (data->isOrderedVariable(varID)) {
      findBestSplitValueExtraTrees(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
    } else {
      findBestSplitValueExtraTreesUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID,
          best_decrease);
    }
  }

  // Stop if no good split found
  if (best_decrease < 0) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute decrease of impurity for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addImpurityImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeRegression::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double sum_node,
    size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) {
  // Get min/max values of covariate in node
  double min;
  double max;
  data->getMinMaxValues(min, max, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (min == max) {
    return;
  }

  // Create possible split values: Draw randomly between min and max
  std::vector<double> possible_split_values;
  std::uniform_real_distribution<double> udist(min, max);
  possible_split_values.reserve(num_random_splits);
  for (size_t i = 0; i < num_random_splits; ++i) {
    possible_split_values.push_back(udist(random_number_generator));
  }
  if (num_random_splits > 1) {
    std::sort(possible_split_values.begin(), possible_split_values.end());
  }

  const size_t num_splits = possible_split_values.size();
  if (memory_saving_splitting) {
    std::vector<double> sums_right(num_splits);
    std::vector<size_t> n_right(num_splits);
    findBestSplitValueExtraTrees(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums_right, n_right);
  } else {
    std::fill_n(sums.begin(), num_splits, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueExtraTrees(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums, counter);
  }
}

void TreeRegression::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, double sum_node,
    size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease,
    std::vector<double> possible_split_values, std::vector<double>& sums_right, std::vector<size_t>& n_right) {
  const size_t num_splits = possible_split_values.size();

  // Sum in right child and possible split
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    double value = data->get_x(sampleID, varID);
    double response = data->get_y(sampleID, 0);

    // Count samples until split_value reached
    for (size_t i = 0; i < num_splits; ++i) {
      if (value > possible_split_values[i]) {
        ++n_right[i];
        sums_right[i] += response;
      } else {
        break;
      }
    }
  }

  // Compute decrease of impurity for each possible split
  for (size_t i = 0; i < num_splits; ++i) {

    // Stop if one child empty
    size_t n_left = num_samples_node - n_right[i];
    if (n_left == 0 || n_right[i] == 0) {
      continue;
    }

    double sum_right = sums_right[i];
    double sum_left = sum_node - sum_right;
    double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right[i];

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = possible_split_values[i];
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}

void TreeRegression::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t varID, double sum_node,
    size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) {
  size_t num_unique_values = data->getNumUniqueDataValues(varID);

  // Get all factor indices in node
  std::vector<bool> factor_in_node(num_unique_values, false);
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    size_t index = data->getIndex(sampleID, varID);
    factor_in_node[index] = true;
  }

  // Vector of indices in and out of node
  std::vector<size_t> indices_in_node;
  std::vector<size_t> indices_out_node;
  indices_in_node.reserve(num_unique_values);
  indices_out_node.reserve(num_unique_values);
  for (size_t i = 0; i < num_unique_values; ++i) {
    if (factor_in_node[i]) {
      indices_in_node.push_back(i);
    } else {
      indices_out_node.push_back(i);
    }
  }

  // Generate num_random_splits splits
  for (size_t i = 0; i < num_random_splits; ++i) {
    std::vector<size_t> split_subset;
    split_subset.reserve(num_unique_values);

    // Draw random subsets, sample all partitions with equal probability
    if (indices_in_node.size() > 1) {
      size_t num_partitions = (2ULL << (indices_in_node.size() - 1ULL)) - 2ULL; // 2^n-2 (don't allow full or empty)
      std::uniform_int_distribution<size_t> udist(1, num_partitions);
      size_t splitID_in_node = udist(random_number_generator);
      for (size_t j = 0; j < indices_in_node.size(); ++j) {
        if ((splitID_in_node & (1ULL << j)) > 0) {
          split_subset.push_back(indices_in_node[j]);
        }
      }
    }
    if (indices_out_node.size() > 1) {
      size_t num_partitions = (2ULL << (indices_out_node.size() - 1ULL)) - 1ULL; // 2^n-1 (allow full or empty)
      std::uniform_int_distribution<size_t> udist(0, num_partitions);
      size_t splitID_out_node = udist(random_number_generator);
      for (size_t j = 0; j < indices_out_node.size(); ++j) {
        if ((splitID_out_node & (1ULL << j)) > 0) {
          split_subset.push_back(indices_out_node[j]);
        }
      }
    }

    // Assign union of the two subsets to right child
    size_t splitID = 0;
    for (auto& idx : split_subset) {
      splitID |= 1ULL << idx;
    }

    // Initialize
    double sum_right = 0;
    size_t n_right = 0;

    // Sum in right child
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      double response = data->get_y(sampleID, 0);
      double value = data->get_x(sampleID, varID);
      size_t factorID = floor(value) - 1;

      // If in right child, count
      // In right child, if bitwise splitID at position factorID is 1
      if ((splitID & (1ULL << factorID))) {
        ++n_right;
        sum_right += response;
      }
    }
    size_t n_left = num_samples_node - n_right;

    // Sum of squares
    double sum_left = sum_node - sum_right;
    double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right;

    // Regularization
    regularize(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = splitID;
      best_varID = varID;
      best_decrease = decrease;
    }
  }
}
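// Editorial note (added): the expression (2ULL << (n - 1ULL)) computes 2^n, so
// with indices_in_node.size() = 3 the in-node draw above ranges over
// (2 << 2) - 2 = 8 - 2 = 6 admissible subsets, i.e. 2^3 minus the empty and the
// full subset, either of which would leave one child empty.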
bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector<size_t>& possible_split_varIDs) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  double best_decrease = -std::numeric_limits<double>::infinity();
  size_t best_varID = 0;
  double best_value = 0;

  // Compute sum of responses in node
  double sum_node = 0;
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    sum_node += data->get_y(sampleID, 0);
  }

  // For all possible split variables find best split value
  for (auto& varID : possible_split_varIDs) {
    findBestSplitValueBeta(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease);
  }

  // Stop if no good split found
  if (std::isinf(-best_decrease)) {
    return true;
  }

  // Save best values
  split_varIDs[nodeID] = best_varID;
  split_values[nodeID] = best_value;

  // Compute decrease of impurity for this node and add to variable importance if needed
  if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) {
    addImpurityImportance(nodeID, best_varID, best_decrease);
  }

  // Regularization
  saveSplitVarID(best_varID);

  return false;
}

void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node,
    double& best_value, size_t& best_varID, double& best_decrease) {
  // Create possible split values
  std::vector<double> possible_split_values;
  data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]);

  // Try next variable if all equal for this
  if (possible_split_values.size() < 2) {
    return;
  }

  // -1 because no split possible at largest value
  size_t num_splits = possible_split_values.size() - 1;
  if (memory_saving_splitting) {
    std::vector<double> sums_right(num_splits);
    std::vector<size_t> n_right(num_splits);
    findBestSplitValueBeta(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums_right, n_right);
  } else {
    std::fill_n(sums.begin(), num_splits, 0);
    std::fill_n(counter.begin(), num_splits, 0);
    findBestSplitValueBeta(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease,
        possible_split_values, sums, counter);
  }
}

void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node,
    double& best_value, size_t& best_varID, double& best_decrease, std::vector<double> possible_split_values,
    std::vector<double>& sums_right, std::vector<size_t>& n_right) {
  // -1 because no split possible at largest value
  const size_t num_splits = possible_split_values.size() - 1;

  // Sum in right child and possible split
  for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    size_t sampleID = sampleIDs[pos];
    double value = data->get_x(sampleID, varID);
    double response = data->get_y(sampleID, 0);

    // Count samples until split_value reached
    for (size_t i = 0; i < num_splits; ++i) {
      if (value > possible_split_values[i]) {
        ++n_right[i];
        sums_right[i] += response;
      } else {
        break;
      }
    }
  }

  // Compute LogLik of beta distribution for each possible split
  for (size_t i = 0; i < num_splits; ++i) {

    // Stop if one child too small
    size_t n_left = num_samples_node - n_right[i];
    if (n_left < 2 || n_right[i] < 2) {
      continue;
    }

    // Compute mean
    double sum_right = sums_right[i];
    double mean_right = sum_right / (double) n_right[i];
    double sum_left = sum_node - sum_right;
    double mean_left = sum_left / (double) n_left;

    // Compute variance
    double var_right = 0;
    double var_left = 0;
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      double value = data->get_x(sampleID, varID);
      double response = data->get_y(sampleID, 0);
      if (value > possible_split_values[i]) {
        var_right += (response - mean_right) * (response - mean_right);
      } else {
        var_left += (response - mean_left) * (response - mean_left);
      }
    }
    var_right /= (double) n_right[i] - 1;
    var_left /= (double) n_left - 1;

    // Stop if zero variance
    if (var_right < std::numeric_limits<double>::epsilon() || var_left < std::numeric_limits<double>::epsilon()) {
      continue;
    }

    // Compute phi for beta distribution
    double phi_right = mean_right * (1 - mean_right) / var_right - 1;
    double phi_left = mean_left * (1 - mean_left) / var_left - 1;

    // Compute LogLik of beta distribution
    double beta_loglik_right = 0;
    double beta_loglik_left = 0;
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      double value = data->get_x(sampleID, varID);
      double response = data->get_y(sampleID, 0);
      if (value > possible_split_values[i]) {
        beta_loglik_right += betaLogLik(response, mean_right, phi_right);
      } else {
        beta_loglik_left += betaLogLik(response, mean_left, phi_left);
      }
    }

    // Split statistic is sum of both log-likelihoods
    double decrease = beta_loglik_right + beta_loglik_left;

    // Stop if no result
    if (std::isnan(decrease)) {
      continue;
    }

    // Regularization (negative values)
    regularizeNegative(decrease, varID);

    // If better than before, use this
    if (decrease > best_decrease) {
      best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2;
      best_varID = varID;
      best_decrease = decrease;

      // Use smaller value if average is numerically the same as the larger value
      if (best_value == possible_split_values[i + 1]) {
        best_value = possible_split_values[i];
      }
    }
  }
}

void TreeRegression::addImpurityImportance(size_t nodeID, size_t varID, double decrease) {
  size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID];
  double best_decrease = decrease;

  if (splitrule != MAXSTAT) {
    double sum_node = 0;
    for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
      size_t sampleID = sampleIDs[pos];
      sum_node += data->get_y(sampleID, 0);
    }
    double impurity_node = (sum_node * sum_node / (double) num_samples_node);

    // Account for the regularization
    regularize(impurity_node, varID);

    best_decrease = decrease - impurity_node;
  }

  // No variable importance for no split variables
  size_t tempvarID = data->getUnpermutedVarID(varID);

  // Subtract if corrected importance and permuted variable, else add
  if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) {
    (*variable_importance)[tempvarID] -= best_decrease;
  } else {
    (*variable_importance)[tempvarID] += best_decrease;
  }
}

} // namespace ranger
ranger/src/globals.h0000755000176200001440000000471614027301517014126 0ustar liggesusers/*-------------------------------------------------------------------------------
 This file is part of ranger.

 Copyright (c) [2014-2018] [Marvin N. Wright]

 This software may be modified and distributed under the terms of the MIT license.

 Please note that the C++ core of ranger is distributed under MIT license and the
 R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/

#ifndef GLOBALS_H_
#define GLOBALS_H_

namespace ranger {

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// Old/new Win build
#ifdef WIN_R_BUILD
  #if __cplusplus < 201103L
    #define OLD_WIN_R_BUILD
  #else
    #define NEW_WIN_R_BUILD
  #endif
#endif

typedef unsigned int uint;

// Tree types, probability is not selected by ID
enum TreeType {
  TREE_CLASSIFICATION = 1,
  TREE_REGRESSION = 3,
  TREE_SURVIVAL = 5,
  TREE_PROBABILITY = 9
};

// Memory modes
enum MemoryMode {
  MEM_DOUBLE = 0,
  MEM_FLOAT = 1,
  MEM_CHAR = 2
};
const uint MAX_MEM_MODE = 2;

// Mask and Offset to store 2 bit values in bytes
static const int mask[4] = {192, 48, 12, 3};
static const int offset[4] = {6, 4, 2, 0};

// Variable importance
enum ImportanceMode {
  IMP_NONE = 0,
  IMP_GINI = 1,
  IMP_PERM_BREIMAN = 2,
  IMP_PERM_LIAW = 4,
  IMP_PERM_RAW = 3,
  IMP_GINI_CORRECTED = 5,
  IMP_PERM_CASEWISE = 6
};
const uint MAX_IMP_MODE = 6;

// Split mode
enum SplitRule {
  LOGRANK = 1,
  AUC = 2,
  AUC_IGNORE_TIES = 3,
  MAXSTAT = 4,
  EXTRATREES = 5,
  BETA = 6,
  HELLINGER = 7
};

// Prediction type
enum PredictionType {
  RESPONSE = 1,
  TERMINALNODES = 2
};

// Default values
const uint DEFAULT_NUM_TREE = 500;
const uint DEFAULT_NUM_THREADS = 0;
const ImportanceMode DEFAULT_IMPORTANCE_MODE = IMP_NONE;

const uint DEFAULT_MIN_NODE_SIZE_CLASSIFICATION = 1;
const uint DEFAULT_MIN_NODE_SIZE_REGRESSION = 5;
const uint DEFAULT_MIN_NODE_SIZE_SURVIVAL = 3;
const uint DEFAULT_MIN_NODE_SIZE_PROBABILITY = 10;

const SplitRule DEFAULT_SPLITRULE = LOGRANK;
const double DEFAULT_ALPHA = 0.5;
const double DEFAULT_MINPROP = 0.1;

const uint DEFAULT_MAXDEPTH = 0;
const PredictionType DEFAULT_PREDICTIONTYPE = RESPONSE;
const uint DEFAULT_NUM_RANDOM_SPLITS = 1;

const double DEFAULT_SAMPLE_FRACTION_REPLACE = 1;
const double DEFAULT_SAMPLE_FRACTION_NOREPLACE = 0.632;

// Interval to print progress in seconds
const double STATUS_INTERVAL = 30.0;

// Threshold for q value split method switch
const double Q_THRESHOLD = 0.02;

} // namespace ranger

#endif /* GLOBALS_H_ */
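// Editorial note (added): the no-replacement default of 0.632 mirrors the
// expected fraction of distinct observations in a bootstrap sample of size n,
// 1 - (1 - 1/n)^n -> 1 - 1/e ~ 0.632, so subsampling without replacement sees
// about as many distinct observations per tree as bootstrapping with
// replacement does.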
ranger/src/Data.cpp0000755000176200001440000002374514027301517013710 0ustar liggesusers/*-------------------------------------------------------------------------------
 This file is part of ranger.

 Copyright (c) [2014-2018] [Marvin N. Wright]

 This software may be modified and distributed under the terms of the MIT license.

 Please note that the C++ core of ranger is distributed under MIT license and the
 R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/

#include <math.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>

#include "Data.h"
#include "utility.h"

namespace ranger {

Data::Data() :
    num_rows(0), num_rows_rounded(0), num_cols(0), snp_data(0), num_cols_no_snp(0), externalData(true),
    index_data(0), max_num_unique_values(0), order_snps(false) {
}

size_t Data::getVariableID(const std::string& variable_name) const {
  auto it = std::find(variable_names.cbegin(), variable_names.cend(), variable_name);
  if (it == variable_names.cend()) {
    throw std::runtime_error("Variable " + variable_name + " not found.");
  }
  return (std::distance(variable_names.cbegin(), it));
}

// #nocov start (cannot be tested anymore because GenABEL not on CRAN)
void Data::addSnpData(unsigned char* snp_data, size_t num_cols_snp) {
  num_cols = num_cols_no_snp + num_cols_snp;
  num_rows_rounded = roundToNextMultiple(num_rows, 4);
  this->snp_data = snp_data;
}
// #nocov end

// #nocov start
bool Data::loadFromFile(std::string filename, std::vector<std::string>& dependent_variable_names) {
  bool result;

  // Open input file
  std::ifstream input_file;
  input_file.open(filename);
  if (!input_file.good()) {
    throw std::runtime_error("Could not open input file.");
  }

  // Count number of rows
  size_t line_count = 0;
  std::string line;
  while (getline(input_file, line)) {
    ++line_count;
  }
  num_rows = line_count - 1;
  input_file.close();
  input_file.open(filename);

  // Check if comma, semicolon or whitespace separated
  std::string header_line;
  getline(input_file, header_line);

  // Find out if comma, semicolon or whitespace separated and call appropriate method
  if (header_line.find(",") != std::string::npos) {
    result = loadFromFileOther(input_file, header_line, dependent_variable_names, ',');
  } else if (header_line.find(";") != std::string::npos) {
    result = loadFromFileOther(input_file, header_line, dependent_variable_names, ';');
  } else {
    result = loadFromFileWhitespace(input_file, header_line, dependent_variable_names);
  }

  externalData = false;
  input_file.close();
  return result;
}

bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line,
    std::vector<std::string>& dependent_variable_names) {
  size_t num_dependent_variables = dependent_variable_names.size();
  std::vector<size_t> dependent_varIDs;
  dependent_varIDs.resize(num_dependent_variables);

  // Read header
  std::string header_token;
  std::stringstream header_line_stream(header_line);
  size_t col = 0;
  while (header_line_stream >> header_token) {
    bool is_dependent_var = false;
    for (size_t i = 0; i < dependent_variable_names.size(); ++i) {
      if (header_token == dependent_variable_names[i]) {
        dependent_varIDs[i] = col;
        is_dependent_var = true;
      }
    }
    if (!is_dependent_var) {
      variable_names.push_back(header_token);
    }
    ++col;
  }

  num_cols = variable_names.size();
  num_cols_no_snp = num_cols;

  // Read body
  reserveMemory(num_dependent_variables);
  bool error = false;
  std::string line;
  size_t row = 0;
  while (getline(input_file, line)) {
    double token;
    std::stringstream line_stream(line);
    size_t column = 0;
    while (readFromStream(line_stream, token)) {
      size_t column_x = column;
      bool is_dependent_var = false;
      for (size_t i = 0; i < dependent_varIDs.size(); ++i) {
        if (column == dependent_varIDs[i]) {
          set_y(i, row, token, error);
          is_dependent_var = true;
          break;
        } else if (column > dependent_varIDs[i]) {
          --column_x;
        }
      }
      if (!is_dependent_var) {
        set_x(column_x, row, token, error);
      }
      ++column;
    }
    if (column > (num_cols + num_dependent_variables)) {
      throw std::runtime_error(
          std::string("Could not open input file. Too many columns in row ") + std::to_string(row)
              + std::string("."));
    } else if (column < (num_cols + num_dependent_variables)) {
      throw std::runtime_error(
          std::string("Could not open input file. Too few columns in row ") + std::to_string(row)
              + std::string(". Are all values numeric?"));
    }
    ++row;
  }
  num_rows = row;
  return error;
}

bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line,
    std::vector<std::string>& dependent_variable_names, char separator) {
  size_t num_dependent_variables = dependent_variable_names.size();
  std::vector<size_t> dependent_varIDs;
  dependent_varIDs.resize(num_dependent_variables);

  // Read header
  std::string header_token;
  std::stringstream header_line_stream(header_line);
  size_t col = 0;
  while (getline(header_line_stream, header_token, separator)) {
    bool is_dependent_var = false;
    for (size_t i = 0; i < dependent_variable_names.size(); ++i) {
      if (header_token == dependent_variable_names[i]) {
        dependent_varIDs[i] = col;
        is_dependent_var = true;
      }
    }
    if (!is_dependent_var) {
      variable_names.push_back(header_token);
    }
    ++col;
  }

  num_cols = variable_names.size();
  num_cols_no_snp = num_cols;

  // Read body
  reserveMemory(num_dependent_variables);
  bool error = false;
  std::string line;
  size_t row = 0;
  while (getline(input_file, line)) {
    std::string token_string;
    double token;
    std::stringstream line_stream(line);
    size_t column = 0;
    while (getline(line_stream, token_string, separator)) {
      std::stringstream token_stream(token_string);
      readFromStream(token_stream, token);

      size_t column_x = column;
      bool is_dependent_var = false;
      for (size_t i = 0; i < dependent_varIDs.size(); ++i) {
        if (column == dependent_varIDs[i]) {
          set_y(i, row, token, error);
          is_dependent_var = true;
          break;
        } else if (column > dependent_varIDs[i]) {
          --column_x;
        }
      }
      if (!is_dependent_var) {
        set_x(column_x, row, token, error);
      }
      ++column;
    }
    ++row;
  }
  num_rows = row;
  return error;
}
// #nocov end

void Data::getAllValues(std::vector<double>& all_values, std::vector<size_t>& sampleIDs, size_t varID, size_t start,
    size_t end) const {
  // All values for varID (no duplicates) for given sampleIDs
  if (getUnpermutedVarID(varID) < num_cols_no_snp) {
    all_values.reserve(end - start);
    for (size_t pos = start; pos < end; ++pos) {
      all_values.push_back(get_x(sampleIDs[pos], varID));
    }
    std::sort(all_values.begin(), all_values.end());
    all_values.erase(std::unique(all_values.begin(), all_values.end()), all_values.end());
  } else {
    // If GWA data just use 0, 1, 2
    all_values = std::vector<double>( { 0, 1, 2 });
  }
}

void Data::getMinMaxValues(double& min, double& max, std::vector<size_t>& sampleIDs, size_t varID, size_t start,
    size_t end) const {
  if (sampleIDs.size() > 0) {
    min = get_x(sampleIDs[start], varID);
    max = min;
  }
  for (size_t pos = start; pos < end; ++pos) {
    double value = get_x(sampleIDs[pos], varID);
    if (value < min) {
      min = value;
    }
    if (value > max) {
      max = value;
    }
  }
}

void Data::sort() {
  // Reserve memory
  index_data.resize(num_cols_no_snp * num_rows);

  // For all columns, get unique values and save index for each observation
  for (size_t col = 0; col < num_cols_no_snp; ++col) {
    // Get all unique values
    std::vector<double> unique_values(num_rows);
    for (size_t row = 0; row < num_rows; ++row) {
      unique_values[row] = get_x(row, col);
    }
    std::sort(unique_values.begin(), unique_values.end());
    unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end());

    // Get index of unique value
    for (size_t row = 0; row < num_rows; ++row) {
      size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get_x(row, col))
          - unique_values.begin();
      index_data[col * num_rows + row] = idx;
    }

    // Save unique values
    unique_data_values.push_back(unique_values);
    if (unique_values.size() > max_num_unique_values) {
      max_num_unique_values = unique_values.size();
    }
  }
}

// TODO: Implement ordering for multiclass and survival
// #nocov start (cannot be tested anymore because GenABEL not on CRAN)
void Data::orderSnpLevels(bool corrected_importance) {
  // Stop if no SNP data
  if (snp_data == 0) {
    return;
  }

  size_t num_snps;
  if (corrected_importance) {
    num_snps = 2 * (num_cols - num_cols_no_snp);
  } else {
    num_snps = num_cols - num_cols_no_snp;
  }

  // Reserve space
  snp_order.resize(num_snps, std::vector<size_t>(3));

  // For each SNP
  for (size_t i = 0; i < num_snps; ++i) {
    size_t col = i;
    if (i >= (num_cols - num_cols_no_snp)) {
      // Get unpermuted SNP ID
      col = i - num_cols + num_cols_no_snp;
    }

    // Order by mean response
    std::vector<double> means(3, 0);
    std::vector<double> counts(3, 0);
    for (size_t row = 0; row < num_rows; ++row) {
      size_t row_permuted = row;
      if (i >= (num_cols - num_cols_no_snp)) {
        row_permuted = getPermutedSampleID(row);
      }
      size_t idx = col * num_rows_rounded + row_permuted;
      size_t value = (((snp_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1);

      // TODO: Better way to treat missing values?
      if (value > 2) {
        value = 0;
      }

      means[value] += get_y(row, 0);
      ++counts[value];
    }

    for (size_t value = 0; value < 3; ++value) {
      means[value] /= counts[value];
    }

    // Save order
    snp_order[i] = order(means, false);
  }

  order_snps = true;
}
// #nocov end

} // namespace ranger
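// Editorial note (added): a minimal worked example of the 2-bit SNP decoding in
// orderSnpLevels, using mask[] and offset[] from globals.h. Four genotypes are
// packed per byte; for idx = 5, snp_data[5 / 4] = snp_data[1] holds the byte and
// idx % 4 = 1 selects mask[1] = 48 (0b00110000) with offset[1] = 4, so the raw
// code is (byte & 0b00110000) >> 4. Subtracting 1 maps the stored codes 1/2/3 to
// genotypes 0/1/2; a stored 0 (missing) underflows the unsigned subtraction and
// is caught by the value > 2 check above.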
ranger/src/AAA_check_cpp11.cpp0000755000176200001440000000030314027301517015545 0ustar liggesusers#ifndef WIN_R_BUILD
#if __cplusplus < 201103L
#error Error: ranger requires a real C++11 compiler, e.g., gcc >= 4.7 or Clang >= 3.0. You probably have to update your C++ compiler.
#endif
#endif
ranger/src/RcppExports.cpp0000755000176200001440000002201214043720173015315 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include "../inst/include/ranger.h"
#include <RcppEigen.h>
#include <Rcpp.h>

using namespace Rcpp;

// rangerCpp
Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y,
    std::vector<std::string> variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads,
    bool write_forest, uint importance_mode_r, uint min_node_size,
    std::vector<std::vector<double>>& split_select_weights, bool use_split_select_weights,
    std::vector<std::string>& always_split_variable_names, bool use_always_split_variable_names,
    bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement,
    bool probability, std::vector<std::string>& unordered_variable_names, bool use_unordered_variable_names,
    bool save_memory, uint splitrule_r, std::vector<double>& case_weights, bool use_case_weights,
    std::vector<double>& class_weights, bool predict_all, bool keep_inbag, std::vector<double>& sample_fraction,
    double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits,
    Eigen::SparseMatrix<double>& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth,
    std::vector<std::vector<size_t>>& inbag, bool use_inbag, std::vector<double>& regularization_factor,
    bool use_regularization_factor, bool regularization_usedepth);
RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP,
    SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP,
    SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP,
    SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP,
    SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP,
    SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP,
    SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP,
    SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP,
    SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP,
    SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP,
    SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP,
    SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< uint >::type treetype(treetypeSEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type input_x(input_xSEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type input_y(input_ySEXP);
    Rcpp::traits::input_parameter< std::vector<std::string> >::type variable_names(variable_namesSEXP);
    Rcpp::traits::input_parameter< uint >::type mtry(mtrySEXP);
    Rcpp::traits::input_parameter< uint >::type num_trees(num_treesSEXP);
    Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
    Rcpp::traits::input_parameter< uint >::type seed(seedSEXP);
    Rcpp::traits::input_parameter< uint >::type num_threads(num_threadsSEXP);
    Rcpp::traits::input_parameter< bool >::type write_forest(write_forestSEXP);
    Rcpp::traits::input_parameter< uint >::type importance_mode_r(importance_mode_rSEXP);
    Rcpp::traits::input_parameter< uint >::type
    min_node_size(min_node_sizeSEXP);
    Rcpp::traits::input_parameter< std::vector<std::vector<double>>& >::type split_select_weights(split_select_weightsSEXP);
    Rcpp::traits::input_parameter< bool >::type use_split_select_weights(use_split_select_weightsSEXP);
    Rcpp::traits::input_parameter< std::vector<std::string>& >::type always_split_variable_names(always_split_variable_namesSEXP);
    Rcpp::traits::input_parameter< bool >::type use_always_split_variable_names(use_always_split_variable_namesSEXP);
    Rcpp::traits::input_parameter< bool >::type prediction_mode(prediction_modeSEXP);
    Rcpp::traits::input_parameter< Rcpp::List >::type loaded_forest(loaded_forestSEXP);
    Rcpp::traits::input_parameter< Rcpp::RawMatrix >::type snp_data(snp_dataSEXP);
    Rcpp::traits::input_parameter< bool >::type sample_with_replacement(sample_with_replacementSEXP);
    Rcpp::traits::input_parameter< bool >::type probability(probabilitySEXP);
    Rcpp::traits::input_parameter< std::vector<std::string>& >::type unordered_variable_names(unordered_variable_namesSEXP);
    Rcpp::traits::input_parameter< bool >::type use_unordered_variable_names(use_unordered_variable_namesSEXP);
    Rcpp::traits::input_parameter< bool >::type save_memory(save_memorySEXP);
    Rcpp::traits::input_parameter< uint >::type splitrule_r(splitrule_rSEXP);
    Rcpp::traits::input_parameter< std::vector<double>& >::type case_weights(case_weightsSEXP);
    Rcpp::traits::input_parameter< bool >::type use_case_weights(use_case_weightsSEXP);
    Rcpp::traits::input_parameter< std::vector<double>& >::type class_weights(class_weightsSEXP);
    Rcpp::traits::input_parameter< bool >::type predict_all(predict_allSEXP);
    Rcpp::traits::input_parameter< bool >::type keep_inbag(keep_inbagSEXP);
    Rcpp::traits::input_parameter< std::vector<double>& >::type sample_fraction(sample_fractionSEXP);
    Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP);
    Rcpp::traits::input_parameter< double >::type minprop(minpropSEXP);
    Rcpp::traits::input_parameter< bool >::type holdout(holdoutSEXP);
    Rcpp::traits::input_parameter< uint >::type prediction_type_r(prediction_type_rSEXP);
    Rcpp::traits::input_parameter< uint >::type num_random_splits(num_random_splitsSEXP);
    Rcpp::traits::input_parameter< Eigen::SparseMatrix<double>& >::type sparse_x(sparse_xSEXP);
    Rcpp::traits::input_parameter< bool >::type use_sparse_data(use_sparse_dataSEXP);
    Rcpp::traits::input_parameter< bool >::type order_snps(order_snpsSEXP);
    Rcpp::traits::input_parameter< bool >::type oob_error(oob_errorSEXP);
    Rcpp::traits::input_parameter< uint >::type max_depth(max_depthSEXP);
    Rcpp::traits::input_parameter< std::vector<std::vector<size_t>>& >::type inbag(inbagSEXP);
    Rcpp::traits::input_parameter< bool >::type use_inbag(use_inbagSEXP);
    Rcpp::traits::input_parameter< std::vector<double>& >::type regularization_factor(regularization_factorSEXP);
    Rcpp::traits::input_parameter< bool >::type use_regularization_factor(use_regularization_factorSEXP);
    Rcpp::traits::input_parameter< bool >::type regularization_usedepth(regularization_usedepthSEXP);
    rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose,
        seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights,
        use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode,
        loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names,
        use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights,
        predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits,
        sparse_x, use_sparse_data,
        order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor,
        regularization_usedepth));
    return rcpp_result_gen;
END_RCPP
}
// numSmaller
Rcpp::IntegerVector numSmaller(Rcpp::NumericVector values, Rcpp::NumericVector reference);
RcppExport SEXP _ranger_numSmaller(SEXP valuesSEXP, SEXP referenceSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type values(valuesSEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type reference(referenceSEXP);
    rcpp_result_gen = Rcpp::wrap(numSmaller(values, reference));
    return rcpp_result_gen;
END_RCPP
}
// randomObsNode
Rcpp::NumericMatrix randomObsNode(Rcpp::IntegerMatrix groups, Rcpp::NumericVector y, Rcpp::IntegerMatrix inbag_counts);
RcppExport SEXP _ranger_randomObsNode(SEXP groupsSEXP, SEXP ySEXP, SEXP inbag_countsSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type groups(groupsSEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type y(ySEXP);
    Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type inbag_counts(inbag_countsSEXP);
    rcpp_result_gen = Rcpp::wrap(randomObsNode(groups, y, inbag_counts));
    return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
    {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 46},
    {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2},
    {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3},
    {NULL, NULL, 0}
};

RcppExport void R_init_ranger(DllInfo *dll) {
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);
}
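// Editorial note (added): these registered routines are reached from R through
// the Rcpp-generated wrappers in R/RcppExports.R. A minimal hedged sketch of
// such a generated R-side stub (assuming the standard Rcpp::compileAttributes()
// output):
//
//   numSmaller <- function(values, reference) {
//     .Call(`_ranger_numSmaller`, values, reference)
//   }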
ranger/src/ForestRegression.cpp0000755000176200001440000002065414027301517016336 0ustar liggesusers/*-------------------------------------------------------------------------------
 This file is part of ranger.

 Copyright (c) [2014-2018] [Marvin N. Wright]

 This software may be modified and distributed under the terms of the MIT license.

 Please note that the C++ core of ranger is distributed under MIT license and the
 R package "ranger" under GPL3 license.
#-------------------------------------------------------------------------------*/

#include <cmath>
#include <stdexcept>
#include <string>

#include "utility.h"
#include "ForestRegression.h"
#include "TreeRegression.h"
#include "Data.h"

namespace ranger {

void ForestRegression::loadForest(size_t num_trees,
    std::vector<std::vector<std::vector<size_t>>>& forest_child_nodeIDs,
    std::vector<std::vector<size_t>>& forest_split_varIDs, std::vector<std::vector<double>>& forest_split_values,
    std::vector<bool>& is_ordered_variable) {
  this->num_trees = num_trees;
  data->setIsOrderedVariable(is_ordered_variable);

  // Create trees
  trees.reserve(num_trees);
  for (size_t i = 0; i < num_trees; ++i) {
    trees.push_back(
        make_unique<TreeRegression>(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i]));
  }

  // Create thread ranges
  equalSplit(thread_ranges, 0, num_trees - 1, num_threads);
}

void ForestRegression::initInternal() {
  // If mtry not set, use floored square root of number of independent variables
  if (mtry == 0) {
    unsigned long temp = sqrt((double) num_independent_variables);
    mtry = std::max((unsigned long) 1, temp);
  }

  // Set minimal node size
  if (min_node_size == 0) {
    min_node_size = DEFAULT_MIN_NODE_SIZE_REGRESSION;
  }

  // Error if beta splitrule used with data outside of [0,1]
  if (splitrule == BETA && !prediction_mode) {
    for (size_t i = 0; i < num_samples; ++i) {
      double y = data->get_y(i, 0);
      if (y < 0 || y > 1) {
        throw std::runtime_error("Beta splitrule applicable to regression data with outcome between 0 and 1 only.");
      }
    }
  }

  // Sort data unless in memory saving mode
  if (!memory_saving_splitting) {
    data->sort();
  }
}

void ForestRegression::growInternal() {
  trees.reserve(num_trees);
  for (size_t i = 0; i < num_trees; ++i) {
    trees.push_back(make_unique<TreeRegression>());
  }
}

void ForestRegression::allocatePredictMemory() {
  size_t num_prediction_samples = data->getNumRows();
  if (predict_all || prediction_type == TERMINALNODES) {
    predictions = std::vector<std::vector<std::vector<double>>>(1,
        std::vector<std::vector<double>>(num_prediction_samples, std::vector<double>(num_trees)));
  } else {
    predictions = std::vector<std::vector<std::vector<double>>>(1,
        std::vector<std::vector<double>>(1, std::vector<double>(num_prediction_samples)));
  }
}

void ForestRegression::predictInternal(size_t sample_idx) {
  if (predict_all || prediction_type == TERMINALNODES) {
    // Get all tree predictions
    for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
      if (prediction_type == TERMINALNODES) {
        predictions[0][sample_idx][tree_idx] = getTreePredictionTerminalNodeID(tree_idx, sample_idx);
      } else {
        predictions[0][sample_idx][tree_idx] = getTreePrediction(tree_idx, sample_idx);
      }
    }
  } else {
    // Mean over trees
    double prediction_sum = 0;
    for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
      prediction_sum += getTreePrediction(tree_idx, sample_idx);
    }
    predictions[0][0][sample_idx] = prediction_sum / num_trees;
  }
}

void ForestRegression::computePredictionErrorInternal() {
  // For each sample sum over trees where sample is OOB
  std::vector<size_t> samples_oob_count;
  predictions = std::vector<std::vector<std::vector<double>>>(1,
      std::vector<std::vector<double>>(1, std::vector<double>(num_samples, 0)));
  samples_oob_count.resize(num_samples, 0);
  for (size_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
    for (size_t sample_idx = 0; sample_idx < trees[tree_idx]->getNumSamplesOob(); ++sample_idx) {
      size_t sampleID = trees[tree_idx]->getOobSampleIDs()[sample_idx];
      double value = getTreePrediction(tree_idx, sample_idx);
      predictions[0][0][sampleID] += value;
      ++samples_oob_count[sampleID];
    }
  }

  // MSE with predictions and true data
  size_t num_predictions = 0;
  overall_prediction_error = 0;
  for (size_t i = 0; i < predictions[0][0].size(); ++i) {
    if (samples_oob_count[i] > 0) {
      ++num_predictions;
      predictions[0][0][i] /= (double) samples_oob_count[i];
      double predicted_value = predictions[0][0][i];
      double real_value = data->get_y(i, 0);
      overall_prediction_error += (predicted_value - real_value) * (predicted_value - real_value);
    } else {
      predictions[0][0][i] = NAN;
    }
  }

  overall_prediction_error /= (double) num_predictions;
}

// #nocov start
void ForestRegression::writeOutputInternal() {
  if (verbose_out) {
    *verbose_out << "Tree type: " << "Regression" << std::endl;
  }
}

void ForestRegression::writeConfusionFile() {
  // Open confusion file for writing
  std::string filename = output_prefix + ".confusion";
  std::ofstream outfile;
  outfile.open(filename, std::ios::out);
  if (!outfile.good()) {
    throw std::runtime_error("Could not write to confusion file: " + filename + ".");
  }

  // Write confusion to file
  outfile << "Overall OOB prediction error (MSE): " << overall_prediction_error << std::endl;

  outfile.close();
  if (verbose_out)
    *verbose_out << "Saved prediction error to file " << filename << "." << std::endl;
}

void ForestRegression::writePredictionFile() {
  // Open prediction file for writing
  std::string filename = output_prefix + ".prediction";
  std::ofstream outfile;
  outfile.open(filename, std::ios::out);
  if (!outfile.good()) {
    throw std::runtime_error("Could not write to prediction file: " + filename + ".");
  }

  // Write
  outfile << "Predictions: " << std::endl;
  if (predict_all) {
    for (size_t k = 0; k < num_trees; ++k) {
      outfile << "Tree " << k << ":" << std::endl;
      for (size_t i = 0; i < predictions.size(); ++i) {
        for (size_t j = 0; j < predictions[i].size(); ++j) {
          outfile << predictions[i][j][k] << std::endl;
        }
      }
      outfile << std::endl;
    }
  } else {
    for (size_t i = 0; i < predictions.size(); ++i) {
      for (size_t j = 0; j < predictions[i].size(); ++j) {
        for (size_t k = 0; k < predictions[i][j].size(); ++k) {
          outfile << predictions[i][j][k] << std::endl;
        }
      }
    }
  }

  if (verbose_out)
    *verbose_out << "Saved predictions to file " << filename << "." << std::endl;
}

void ForestRegression::saveToFileInternal(std::ofstream& outfile) {
  // Write num_variables
  outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables));

  // Write treetype
  TreeType treetype = TREE_REGRESSION;
  outfile.write((char*) &treetype, sizeof(treetype));
}

void ForestRegression::loadFromFileInternal(std::ifstream& infile) {
  // Read number of variables
  size_t num_variables_saved;
  infile.read((char*) &num_variables_saved, sizeof(num_variables_saved));

  // Read treetype
  TreeType treetype;
  infile.read((char*) &treetype, sizeof(treetype));
  if (treetype != TREE_REGRESSION) {
    throw std::runtime_error("Wrong treetype. Loaded file is not a regression forest.");
  }
Loaded file is not a regression forest."); } for (size_t i = 0; i < num_trees; ++i) { // Read data std::vector> child_nodeIDs; readVector2D(child_nodeIDs, infile); std::vector split_varIDs; readVector1D(split_varIDs, infile); std::vector split_values; readVector1D(split_values, infile); // If dependent variable not in test data, throw error if (num_variables_saved != num_independent_variables) { throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree trees.push_back(make_unique(child_nodeIDs, split_varIDs, split_values)); } } double ForestRegression::getTreePrediction(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPrediction(sample_idx); } size_t ForestRegression::getTreePredictionTerminalNodeID(size_t tree_idx, size_t sample_idx) const { const auto& tree = dynamic_cast(*trees[tree_idx]); return tree.getPredictionTerminalNodeID(sample_idx); } // #nocov end }// namespace ranger ranger/NEWS0000755000176200001440000001347514071754425012255 0ustar liggesusers ##### Version 0.13 * Faster quantile prediction * Add ... argument to ranger() * Bug fixes ##### Version 0.12.0 * Faster computation (in some cases) * Add local variable importance * Add "hellinger" splitrule for binary classification * Add "beta" splitrule for bounded outcomes * Accept user-specified function in quantile prediction * Add regularization * Add x/y interface * Internal changes (seed differences possible, prediction incompatible with older versions) * Bug fixes ##### Version 0.11.0 * Add max.depth parameter to limit tree depth * Add inbag argument for manual selection of observations in trees * Add support of splitting weights for corrected impurity importance * Internal changes (slightly improved computation speed) * Warning: Possible seed differences compared to older versions * Bug fixes ##### Version 0.10.0 * Change license of C++ core to MIT (R package is still GPL3) * Better 'order' mode for unordered factors for multiclass and survival * Add 'order' mode for unordered factors for GenABEL SNP data (binary classification and regression) * Add class-weighted Gini splitting * Add fixed proportion sampling * Add impurity importance for the maxstat splitting rule * Remove GenABEL from suggested packages (removed from CRAN). 
GenABEL data is still supported * Improve memory management (internal changes) * Bug fixes ##### Version 0.9.0 * Add bias-corrected impurity importance (actual impurity reduction, AIR) * Add quantile prediction as in quantile regression forests * Add treeInfo() function to extract human-readable tree structure * Add standard error estimation with the infinitesimal jackknife (now the default) * Add impurity importance for survival forests * Faster aggregation of predictions * Fix memory issues on Windows 7 * Bug fixes ##### Version 0.8.0 * Handle sparse data of class Matrix::dgCMatrix * Add prediction of standard errors to predict() * Allow devtools::install_github() without subdir and on Windows * Bug fixes ##### Version 0.7.0 * Add randomized splitting (extraTrees) * Better formula interface: Support interaction terms and faster computation * Split at mid-point between candidate values * Improvements in holdoutRF and importance p-value estimation * Drop unused factor levels in outcome before growing * Add predict.all for probability and survival prediction * Bug fixes ##### Version 0.6.0 * Set write.forest=TRUE by default * Add num.trees option to predict() * Faster version of getTerminalNodeIDs(), included in predict() * Handle new factor levels in 'order' mode * Use unadjusted p-value for 2 categories in maxstat splitting * Bug fixes ##### Version 0.5.0 * Add Windows multithreading support for new toolchain * Add splitting by maximally selected rank statistics for survival and regression forests * Faster method for unordered factor splitting * Add p-values for variable importance * Runtime improvement for regression forests on classification data * Bug fixes ##### Version 0.4.0 * Reduce memory usage of saved forest objects (changed child.nodeIDs interface) * Add keep.inbag option to track in-bag counts * Add option sample.fraction for fraction of sampled observations * Add tree-wise split.select.weights * Add predict.all option in predict() to get individual predictions for each tree for classification and regression * Add case-specific random forests * Add case weights (weighted bootstrapping or subsampling) * Remove tuning functions, please use mlr or caret * Catch error of outdated gcc not supporting C++11 completely * Bug fixes ##### Version 0.3.0 * Allow the user to interrupt computation from R * Transpose classification.table and rename to confusion.matrix * Respect R seed for prediction * Memory improvements for variable importance computation * Fix bug: Probability prediction for single observations * Fix bug: Results not identical when using alternative interface ##### Version 0.2.7 * Small fixes for Solaris compiler ##### Version 0.2.6 * Add C-index splitting * Fix NA SNP handling ##### Version 0.2.5 * Fix matrix and gwaa alternative survival interface * Version submitted to JSS ##### Version 0.2.4 * Small changes in documentation ##### Version 0.2.3 * Preallocate memory for splitting ##### Version 0.2.2 * Remove recursive splitting ##### Version 0.2.1 * Allow matrix as input data in R version ##### Version 0.2.0 * Fix prediction of classification forests in R ##### Version 0.1.9 * Speedup growing for continuous covariates * Add memory save option to save memory for very large datasets (but slower) * Remove memory mode option from R version since no performance gain ##### Version 0.1.8 * Fix problems when using Rcpp <0.11.4 ##### Version 0.1.7 * Add option to split on unordered categorical covariates ##### Version 0.1.6 * Optimize memory management for very large survival
forests ##### Version 0.1.5 * Set required Rcpp version to 0.11.2 * Fix large $call objects when using BatchJobs * Add details and example on GenABEL usage to documentation * Minor changes to documentation ##### Version 0.1.4 * Speedup for survival forests with continuous covariates * R version: Generate seed from R. It is no longer necessary to set the seed argument in ranger calls. ##### Version 0.1.3 * Windows support for R version (without multithreading) ##### Version 0.1.2 * Speedup growing of regression and probability prediction forests * Prediction forests are now handled like regression forests: MSE used for prediction error and permutation importance * Fixed name conflict with randomForest package for "importance" * Fixed a bug: prediction function is now working for probability prediction forests * Slot "predictions" for probability forests now contains class probabilities * importance function is now working even if randomForest package is loaded after ranger * Fixed a bug: Split selection weights are now working as expected * Small changes in documentation ranger/R/0000755000176200001440000000000014073533770011742 5ustar liggesusersranger/R/print.R0000755000176200001440000001032314027301517013212 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' Print contents of Ranger object. ##' ##' ##' @title Print Ranger ##' @param x Object of class 'ranger'. ##' @param ... Further arguments passed to or from other methods. ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @export print.ranger <- function(x, ...)
{ cat("Ranger result\n\n") cat("Call:\n", deparse(x$call), "\n\n") cat("Type: ", x$treetype, "\n") cat("Number of trees: ", x$num.trees, "\n") cat("Sample size: ", x$num.samples, "\n") cat("Number of independent variables: ", x$num.independent.variables, "\n") cat("Mtry: ", x$mtry, "\n") cat("Target node size: ", x$min.node.size, "\n") cat("Variable importance mode: ", x$importance.mode, "\n") cat("Splitrule: ", x$splitrule, "\n") if (x$treetype == "Survival") { cat("Number of unique death times: ", length(x$unique.death.times), "\n") } if (!is.null(x$splitrule) && x$splitrule == "extratrees" && !is.null(x$num.random.splits)) { cat("Number of random splits: ", x$num.random.splits, "\n") } if (x$treetype == "Classification") { cat("OOB prediction error: ", sprintf("%1.2f %%", 100*x$prediction.error), "\n") } else if (x$treetype == "Regression") { cat("OOB prediction error (MSE): ", x$prediction.error, "\n") } else if (x$treetype == "Survival") { cat("OOB prediction error (1-C): ", x$prediction.error, "\n") } else if (x$treetype == "Probability estimation") { cat("OOB prediction error (Brier s.): ", x$prediction.error, "\n") } else { cat("OOB prediction error: ", x$prediction.error, "\n") } if (x$treetype == "Regression") { cat("R squared (OOB): ", x$r.squared, "\n") } } ##' Print contents of Ranger forest object. ##' ##' ##' @title Print Ranger forest ##' @param x Object of class 'ranger.forest'. ##' @param ... further arguments passed to or from other methods. ##' @author Marvin N. Wright ##' @export print.ranger.forest <- function(x, ...) { cat("Ranger forest object\n\n") cat("Type: ", x$treetype, "\n") cat("Number of trees: ", x$num.trees, "\n") if (x$treetype == "Survival") { cat("Number of unique death times: ", length(x$unique.death.times), "\n") } } ##' Print contents of Ranger prediction object. ##' ##' ##' @title Print Ranger prediction ##' @param x Object of class 'ranger.prediction'. ##' @param ... further arguments passed to or from other methods. ##' @author Marvin N. Wright ##' @export print.ranger.prediction <- function(x, ...) { cat("Ranger prediction\n\n") cat("Type: ", x$treetype, "\n") cat("Sample size: ", x$num.samples, "\n") cat("Number of independent variables: ", x$num.independent.variables, "\n") if (x$treetype == "Survival") { cat("Number of unique death times: ", length(x$unique.death.times), "\n") } } str.ranger.forest <- function(object, max.level = 2, ...) { class(object) <- "list" str(object, max.level = max.level, ...) } str.ranger <- function(object, max.level = 2, ...) { class(object) <- "list" str(object, max.level = max.level, ...) } ranger/R/timepoints.R0000755000176200001440000000461114027301517014254 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see . # # Written by: # # Marvin N. 
Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' @export timepoints <- function(x, ...) UseMethod("timepoints") ##' Extract unique death times of Ranger Survival prediction object. ##' ##' ##' @title Ranger timepoints ##' @param x Ranger Survival prediction object. ##' @param ... Further arguments passed to or from other methods. ##' @return Unique death times ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @export timepoints.ranger.prediction <- function(x, ...) { if (!inherits(x, "ranger.prediction")) { stop("Object is not a ranger.prediction object.") } if (x$treetype != "Survival") { stop("No timepoints found. Object is not a Survival prediction object.") } if (is.null(x$unique.death.times)) { stop("No timepoints found.") } return(x$unique.death.times) } ##' Extract unique death times of Ranger Survival forest. ##' ##' ##' @title Ranger timepoints ##' @param x Ranger Survival forest object. ##' @param ... Further arguments passed to or from other methods. ##' @return Unique death times ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @aliases timepoints ##' @export timepoints.ranger <- function(x, ...) { if (!inherits(x, "ranger")) { stop("Object is not a ranger object.") } if (x$treetype != "Survival") { stop("No timepoints found. Object is not a Survival forest.") } if (is.null(x$unique.death.times)) { stop("No timepoints found.") } return(x$unique.death.times) } ranger/R/utility.R0000755000176200001440000000641514073532511013571 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- # Convert integer to factor integer.to.factor <- function(x, labels) { factor(x, levels = seq_along(labels), labels = labels) } # Safe version of sample() for length(x) == 1 # See help(sample) save.sample <- function(x, ...) { x[sample.int(length(x), ...)] } # Order factor levels with PCA approach # Reference: Coppersmith, D., Hong, S.J. & Hosking, J.R. (1999) Partitioning Nominal Attributes in Decision Trees. Data Min Knowl Discov 3:197. \doi{10.1023/A:1009869804967}. pca.order <- function(y, x) { x <- droplevels(x) if (nlevels(x) < 2) { return(as.character(levels(x))) } ## Create contingency table of the nominal outcome with the nominal covariate N <- table(x, droplevels(y)) ## PCA of weighted covariance matrix of class probabilities P <- N/rowSums(N) S <- cov.wt(P, wt = rowSums(N))$cov pc1 <- prcomp(S, rank.
= 1)$rotation score <- P %*% pc1 ## Return ordered factor levels as.character(levels(x)[order(score)]) } # Compute median survival if available or largest quantile available in all strata if median not available. largest.quantile <- function(formula) { ## Fit survival model fit <- survival::survfit(formula) smry <- summary(fit) ## Use median survival if available or largest quantile available in all strata if median not available max_quant <- max(aggregate(smry$surv ~ smry$strata, FUN = min)[, "smry$surv"]) quantiles <- quantile(fit, conf.int = FALSE, prob = min(0.5, 1 - max_quant))[, 1] names(quantiles) <- gsub(".+=", "", names(quantiles)) ## Return ordered levels names(sort(quantiles)) } # Convert ranger object from version <0.11.5 (without x/y interface) convert.pre.xy <- function(forest, trees = 1:forest$num.trees) { if (is.null(forest$status.varID)) { # Not survival for (i in 1:forest$num.trees) { idx <- forest$split.varIDs[[i]] > forest$dependent.varID forest$split.varIDs[[i]][idx] <- forest$split.varIDs[[i]][idx] - 1 } } else { # Survival for (i in 1:forest$num.trees) { idx1 <- forest$split.varIDs[[i]] > forest$dependent.varID idx2 <- forest$split.varIDs[[i]] > forest$status.varID forest$split.varIDs[[i]][idx1] <- forest$split.varIDs[[i]][idx1] - 1 forest$split.varIDs[[i]][idx2] <- forest$split.varIDs[[i]][idx2] - 1 } } return(forest) } ranger/R/predict.R0000755000176200001440000006127314073532223013523 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' Prediction with new data and a saved forest from Ranger. ##' ##' For \code{type = 'response'} (the default), the predicted classes (classification), predicted numeric values (regression), predicted probabilities (probability estimation) or survival probabilities (survival) are returned. ##' For \code{type = 'se'}, the standard errors of the predictions are returned (regression only). The jackknife-after-bootstrap or infinitesimal jackknife for bagging is used to estimate the standard errors based on out-of-bag predictions. See Wager et al. (2014) for details. ##' For \code{type = 'terminalNodes'}, the IDs of the terminal node in each tree for each observation in the given dataset are returned. ##' ##' If \code{type = 'se'} is selected, the method to estimate the variances can be chosen with \code{se.method}. Set \code{se.method = 'jack'} for jackknife-after-bootstrap and \code{se.method = 'infjack'} for the infinitesimal jackknife for bagging. ##' ##' For classification and \code{predict.all = TRUE}, the factor levels are returned as numerics.
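##' For example, a hypothetical \code{pred <- predict(rf$forest, dat, predict.all = TRUE)} call on a classification forest returns \code{pred$predictions} as a numeric sample x tree matrix of level codes (illustrative sketch; \code{dat} is a placeholder for new data).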
##' To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. ##' ##' @title Ranger prediction ##' @param object Ranger \code{ranger.forest} object. ##' @param data New test data of class \code{data.frame} or \code{gwaa.data} (GenABEL). ##' @param predict.all Return individual predictions for each tree instead of aggregated predictions for all trees. Return a matrix (sample x tree) for classification and regression, a 3d array for probability estimation (sample x class x tree) and survival (sample x time x tree). ##' @param num.trees Number of trees used for prediction. The first \code{num.trees} in the forest are used. ##' @param type Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details. ##' @param se.method Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if type = 'se'. See below for details. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode. ##' @param num.threads Number of threads. Default is number of CPUs available. ##' @param verbose Verbose output on or off. ##' @param inbag.counts Number of times the observations are in-bag in the trees. ##' @param ... further arguments passed to or from other methods. ##' @return Object of class \code{ranger.prediction} with elements ##' \tabular{ll}{ ##' \code{predictions} \tab Predicted classes/values (only for classification and regression) \cr ##' \code{unique.death.times} \tab Unique death times (only for survival). \cr ##' \code{chf} \tab Estimated cumulative hazard function for each sample (only for survival). \cr ##' \code{survival} \tab Estimated survival function for each sample (only for survival). \cr ##' \code{num.trees} \tab Number of trees. \cr ##' \code{num.independent.variables} \tab Number of independent variables. \cr ##' \code{treetype} \tab Type of forest/tree. Classification, regression or survival. \cr ##' \code{num.samples} \tab Number of samples. ##' } ##' @references ##' \itemize{ ##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. ##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. ##' } ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @importFrom Matrix Matrix ##' @export predict.ranger.forest <- function(object, data, predict.all = FALSE, num.trees = object$num.trees, type = "response", se.method = "infjack", seed = NULL, num.threads = NULL, verbose = TRUE, inbag.counts = NULL, ...) 
{ ## GenABEL GWA data if (inherits(data, "gwaa.data")) { snp.names <- snp.names(data) snp.data <- data@gtdata@gtps@.Data data <- data@phdata[, -1, drop = FALSE] gwa.mode <- TRUE } else { snp.data <- as.matrix(0) gwa.mode <- FALSE } ## Check forest argument if (!inherits(object, "ranger.forest")) { stop("Error: Invalid class of input object.") } else { forest <- object } if (is.null(forest$num.trees) || is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) || is.null(forest$split.values) || is.null(forest$independent.variable.names) || is.null(forest$treetype)) { stop("Error: Invalid forest object.") } if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) { stop("Error: Invalid forest object.") } ## Check for old ranger version if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) { stop("Error: Invalid forest object. Is the forest grown in ranger version <0.3.9? Try to predict with the same version the forest was grown.") } if (!is.null(forest$dependent.varID)) { warning("Forest grown in ranger version <0.11.5, converting ...") forest <- convert.pre.xy(forest) } ## Prediction type if (type == "response" || type == "se") { prediction.type <- 1 } else if (type == "terminalNodes") { prediction.type <- 2 } else if (type == "quantiles") { stop("Error: Apply predict() to the ranger object instead of the $forest object to predict quantiles.") } else { stop("Error: Invalid value for 'type'. Use 'response', 'se', 'terminalNodes', or 'quantiles'.") } ## Type "se" only for certain tree types if (type == "se" && se.method == "jack" && forest$treetype != "Regression") { stop("Error: Jackknife standard error prediction currently only available for regression.") } if (type == "se" && se.method == "infjack") { if (forest$treetype == "Survival") { stop("Error: Infinitesimal jackknife standard error prediction not yet available for survival.") } else if (forest$treetype == "Classification") { stop("Error: Not a probability forest. Set probability=TRUE to use the infinitesimal jackknife standard error prediction for classification.") } } ## Type "se" requires keep.inbag=TRUE if (type == "se" && is.null(inbag.counts)) { stop("Error: No saved inbag counts in ranger object. 
Please set keep.inbag=TRUE when calling ranger.") } ## Set predict.all if type is "se" if (type == "se") { predict.all <- TRUE } x <- data if (sum(!(forest$independent.variable.names %in% colnames(x))) > 0) { stop("Error: One or more independent variables not found in data.") } ## Subset to same column as in training if necessary if (length(colnames(x)) != length(forest$independent.variable.names) || any(colnames(x) != forest$independent.variable.names)) { x <- x[, forest$independent.variable.names, drop = FALSE] } ## Recode characters if (!is.matrix(x) && !inherits(x, "Matrix")) { char.columns <- sapply(x, is.character) if (length(char.columns) > 0) { x[char.columns] <- lapply(x[char.columns], factor) } } ## Recode factors if forest grown 'order' mode if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) { x <- mapply(function(xx, yy) { if(is.null(yy)) { xx } else { new.levels <- setdiff(levels(xx), yy) factor(xx, levels = c(yy, new.levels), exclude = NULL) } }, x, forest$covariate.levels, SIMPLIFY = !is.data.frame(x)) } if (is.list(x) && !is.data.frame(x)) { x <- as.data.frame(x) } ## Convert to data matrix if (!is.matrix(x) & !inherits(x, "Matrix")) { x <- data.matrix(x) } ## Check missing values if (any(is.na(x))) { offending_columns <- colnames(x)[colSums(is.na(x)) > 0] stop("Missing data in columns: ", paste0(offending_columns, collapse = ", "), ".", call. = FALSE) } ## Num threads ## Default 0 -> detect from system in C++. if (is.null(num.threads)) { num.threads = 0 } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads") } ## Seed if (is.null(seed)) { seed <- runif(1 , 0, .Machine$integer.max) } if (forest$treetype == "Classification") { treetype <- 1 } else if (forest$treetype == "Regression") { treetype <- 3 } else if (forest$treetype == "Survival") { treetype <- 5 } else if (forest$treetype == "Probability estimation") { treetype <- 9 } else { stop("Error: Unknown tree type.") } ## Defaults for variables not needed mtry <- 0 importance <- 0 min.node.size <- 0 split.select.weights <- list(c(0, 0)) use.split.select.weights <- FALSE always.split.variables <- c("0", "0") use.always.split.variables <- FALSE prediction.mode <- TRUE write.forest <- FALSE replace <- TRUE probability <- FALSE unordered.factor.variables <- c("0", "0") use.unordered.factor.variables <- FALSE save.memory <- FALSE splitrule <- 1 alpha <- 0 minprop <- 0 case.weights <- c(0, 0) use.case.weights <- FALSE class.weights <- c(0, 0) keep.inbag <- FALSE sample.fraction <- 1 holdout <- FALSE num.random.splits <- 1 order.snps <- FALSE oob.error <- FALSE max.depth <- 0 inbag <- list(c(0,0)) use.inbag <- FALSE y <- matrix(c(0, 0)) regularization.factor <- c(0, 0) use.regularization.factor <- FALSE regularization.usedepth <- FALSE ## Use sparse matrix if (inherits(x, "dgCMatrix")) { sparse.x <- x x <- matrix(c(0, 0)) use.sparse.data <- TRUE } else { sparse.x <- Matrix(matrix(c(0, 0))) use.sparse.data <- FALSE x <- data.matrix(x) } ## Call Ranger result <- rangerCpp(treetype, x, y, forest$independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance, min.node.size, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, prediction.mode, forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, case.weights, use.case.weights, class.weights, predict.all, keep.inbag, sample.fraction, alpha, minprop, 
holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor, use.regularization.factor, regularization.usedepth) if (length(result) == 0) { stop("User interrupt or internal error.") } ## Prepare results result$num.samples <- nrow(x) result$treetype <- forest$treetype if (predict.all) { if (forest$treetype %in% c("Classification", "Regression")) { if (is.list(result$predictions)) { result$predictions <- do.call(rbind, result$predictions) } else { result$predictions <- array(result$predictions, dim = c(1, length(result$predictions))) } } else { if (is.list(result$predictions) & length(result$predictions) >= 1 & is.numeric(result$predictions[[1]])) { # Fix for single test observation result$predictions <- list(result$predictions) } result$predictions <- aperm(array(unlist(result$predictions), dim = rev(c(length(result$predictions), length(result$predictions[[1]]), length(result$predictions[[1]][[1]]))))) } } else { if (is.list(result$predictions)) { result$predictions <- do.call(rbind, result$predictions) } } if (type == "response") { if (forest$treetype == "Classification" && !is.null(forest$levels)) { if (!predict.all) { result$predictions <- integer.to.factor(result$predictions, forest$levels) } } else if (forest$treetype == "Regression") { ## Empty } else if (forest$treetype == "Survival") { result$unique.death.times <- forest$unique.death.times result$chf <- result$predictions result$predictions <- NULL result$survival <- exp(-result$chf) } else if (forest$treetype == "Probability estimation") { if (predict.all) { ## Set colnames and sort by levels if (!is.null(forest$levels)) { colnames(result$predictions) <- forest$levels[forest$class.values] result$predictions <- result$predictions[, forest$levels[sort(forest$class.values)], , drop = FALSE] } } else { if (is.vector(result$predictions)) { result$predictions <- matrix(result$predictions, nrow = 1) } ## Set colnames and sort by levels if (!is.null(forest$levels)) { colnames(result$predictions) <- forest$levels[forest$class.values] result$predictions <- result$predictions[, forest$levels[sort(forest$class.values)], drop = FALSE] } } } } else if (type == "terminalNodes") { if (is.vector(result$predictions)) { result$predictions <- matrix(result$predictions, nrow = 1) } } ## Compute Jackknife if (type == "se") { ## Aggregated predictions if (length(dim(result$predictions)) > 2) { yhat <- apply(result$predictions, c(1, 2), mean) } else { yhat <- rowMeans(result$predictions) } ## Get inbag counts, keep only observations that are OOB at least once inbag.counts <- simplify2array(inbag.counts) if (is.vector(inbag.counts)) { inbag.counts <- t(as.matrix(inbag.counts)) } inbag.counts <- inbag.counts[rowSums(inbag.counts == 0) > 0, , drop = FALSE] n <- nrow(inbag.counts) oob <- inbag.counts == 0 if (num.trees != object$num.trees) { oob <- oob[, 1:num.trees] } if (all(!oob)) { stop("Error: No OOB observations found, consider increasing num.trees or reducing sample.fraction.") } if (se.method == "jack") { ## Compute Jackknife oob.count <- rowSums(oob) jack.n <- sweep(tcrossprod(result$predictions, oob), 2, oob.count, "/", check.margin = FALSE) if (is.vector(jack.n)) { jack.n <- t(as.matrix(jack.n)) } if (any(oob.count == 0)) { n <- sum(oob.count > 0) jack.n <- jack.n[, oob.count > 0] } jack <- (n - 1) / n * rowSums((jack.n - yhat)^2) bias <- (exp(1) - 1) * n / result$num.trees^2 * rowSums((result$predictions - yhat)^2) jab <- pmax(jack - bias, 0) result$se <- 
sqrt(jab) } else if (se.method == "infjack") { if (forest$treetype == "Regression") { infjack <- rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees = 1:num.trees) result$se <- sqrt(infjack$var.hat) } else if (forest$treetype == "Probability estimation") { infjack <- apply(result$predictions, 2, function(x) { rInfJack(x, inbag.counts)$var.hat }) result$se <- sqrt(infjack) } } else { stop("Error: Unknown standard error method (se.method).") } ## Response as predictions result$predictions <- yhat if (forest$treetype == "Probability estimation") { ## Set colnames and sort by levels colnames(result$predictions) <- forest$levels[forest$class.values] result$predictions <- result$predictions[, forest$levels, drop = FALSE] if (!is.matrix(result$se)) { result$se <- matrix(result$se, ncol = length(forest$levels)) } colnames(result$se) <- forest$levels[forest$class.values] result$se <- result$se[, forest$levels, drop = FALSE] } } class(result) <- "ranger.prediction" return(result) } ##' Prediction with new data and a saved forest from Ranger. ##' ##' For \code{type = 'response'} (the default), the predicted classes (classification), predicted numeric values (regression), predicted probabilities (probability estimation) or survival probabilities (survival) are returned. ##' For \code{type = 'se'}, the standard errors of the predictions are returned (regression only). The jackknife-after-bootstrap or infinitesimal jackknife for bagging is used to estimate the standard errors based on out-of-bag predictions. See Wager et al. (2014) for details. ##' For \code{type = 'terminalNodes'}, the IDs of the terminal node in each tree for each observation in the given dataset are returned. ##' For \code{type = 'quantiles'}, the selected quantiles for each observation are estimated. See Meinshausen (2006) for details. ##' ##' If \code{type = 'se'} is selected, the method to estimate the variances can be chosen with \code{se.method}. Set \code{se.method = 'jack'} for jackknife-after-bootstrap and \code{se.method = 'infjack'} for the infinitesimal jackknife for bagging. ##' ##' For classification and \code{predict.all = TRUE}, the factor levels are returned as numerics. ##' To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. ##' ##' @title Ranger prediction ##' @param object Ranger \code{ranger} object. ##' @param data New test data of class \code{data.frame} or \code{gwaa.data} (GenABEL). ##' @param predict.all Return individual predictions for each tree instead of aggregated predictions for all trees. Return a matrix (sample x tree) for classification and regression, a 3d array for probability estimation (sample x class x tree) and survival (sample x time x tree). ##' @param num.trees Number of trees used for prediction. The first \code{num.trees} in the forest are used. ##' @param type Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details. ##' @param se.method Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if type = 'se'. See below for details. ##' @param quantiles Vector of quantiles for quantile prediction. Set \code{type = 'quantiles'} to use. ##' @param what User specified function for quantile prediction used instead of \code{quantile}. Must return numeric vector, see examples. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.
The seed is used in case of ties in classification mode. ##' @param num.threads Number of threads. Default is number of CPUs available. ##' @param verbose Verbose output on or off. ##' @param ... further arguments passed to or from other methods. ##' @return Object of class \code{ranger.prediction} with elements ##' \tabular{ll}{ ##' \code{predictions} \tab Predicted classes/values (only for classification and regression) \cr ##' \code{unique.death.times} \tab Unique death times (only for survival). \cr ##' \code{chf} \tab Estimated cumulative hazard function for each sample (only for survival). \cr ##' \code{survival} \tab Estimated survival function for each sample (only for survival). \cr ##' \code{num.trees} \tab Number of trees. \cr ##' \code{num.independent.variables} \tab Number of independent variables. \cr ##' \code{treetype} \tab Type of forest/tree. Classification, regression or survival. \cr ##' \code{num.samples} \tab Number of samples. ##' } ##' @examples ##' ## Classification forest ##' ranger(Species ~ ., data = iris) ##' train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) ##' iris.train <- iris[train.idx, ] ##' iris.test <- iris[-train.idx, ] ##' rg.iris <- ranger(Species ~ ., data = iris.train) ##' pred.iris <- predict(rg.iris, data = iris.test) ##' table(iris.test$Species, pred.iris$predictions) ##' ##' ## Quantile regression forest ##' rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE) ##' pred <- predict(rf, mtcars[27:32, ], type = "quantiles", quantiles = c(0.1, 0.5, 0.9)) ##' pred$predictions ##' ##' ## Quantile regression forest with user-specified function ##' rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE) ##' pred <- predict(rf, mtcars[27:32, ], type = "quantiles", ##' what = function(x) sample(x, 10, replace = TRUE)) ##' pred$predictions ##' ##' @references ##' \itemize{ ##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. ##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. ##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. ##' } ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @export predict.ranger <- function(object, data = NULL, predict.all = FALSE, num.trees = object$num.trees, type = "response", se.method = "infjack", quantiles = c(0.1, 0.5, 0.9), what = NULL, seed = NULL, num.threads = NULL, verbose = TRUE, ...) { forest <- object$forest if (is.null(forest)) { stop("Error: No saved forest in ranger object. Please set write.forest to TRUE when calling ranger.") } if (object$importance.mode %in% c("impurity_corrected", "impurity_unbiased")) { warning("Forest was grown with 'impurity_corrected' variable importance. For prediction it is advised to grow another forest without this importance setting.") } if (type == "quantiles") { ## Quantile prediction if (object$treetype != "Regression") { stop("Error: Quantile prediction implemented only for regression outcomes.") } if (is.null(object$random.node.values)) { stop("Error: Set quantreg=TRUE in ranger(...) for quantile prediction.") } if (is.null(data)) { ## OOB prediction if (is.null(object$random.node.values.oob)) { stop("Error: Set keep.inbag=TRUE in ranger(...) 
for out-of-bag quantile prediction or provide new data in predict(...).") } node.values <- object$random.node.values.oob } else { ## New data prediction terminal.nodes <- predict(object, data, type = "terminalNodes")$predictions + 1 node.values <- 0 * terminal.nodes for (tree in 1:num.trees) { node.values[, tree] <- object$random.node.values[terminal.nodes[, tree], tree] } } ## Prepare results result <- list(num.samples = nrow(node.values), treetype = object$treetype, num.independent.variables = object$num.independent.variables, num.trees = num.trees) class(result) <- "ranger.prediction" if (is.null(what)) { ## Compute quantiles of distribution result$predictions <- t(apply(node.values, 1, quantile, quantiles, na.rm = TRUE)) if (nrow(result$predictions) != result$num.samples) { ## Fix result for single quantile result$predictions <- t(result$predictions) } colnames(result$predictions) <- paste("quantile=", quantiles) } else { ## User function if (!is.function(what)) { stop("Error: Argument 'what' is not a function.") } result$predictions <- t(apply(node.values, 1, what)) } result } else { ## Non-quantile prediction if (is.null(data)) { stop("Error: Argument 'data' is required for non-quantile prediction.") } predict(forest, data, predict.all, num.trees, type, se.method, seed, num.threads, verbose, object$inbag.counts, ...) } } ranger/R/holdoutRF.R0000755000176200001440000000610714073532076014000 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' Grow two random forests on two cross-validation folds. ##' Instead of out-of-bag data, the other fold is used to compute permutation importance. ##' Related to the novel permutation variable importance by Janitza et al. (2015). ##' ##' @title Hold-out random forests ##' @param ... All arguments are passed to \code{\link{ranger}()} (except \code{importance}, \code{case.weights}, \code{replace}, and \code{holdout}). ##' @return Hold-out random forests with variable importance. ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @references ##' Janitza, S., Celik, E. & Boulesteix, A.-L. (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr ##' @export holdoutRF <- function(...) { ## Get data from arguments args <- list(...)
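  # If 'data' is not passed by name, it is assumed below to be the second positional argument, matching the argument order of ranger(formula, data, ...).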
if ("data" %in% names(args)) { data <- args$data } else { data <- args[[2]] } ## Split data if (inherits(data, "gwaa.data")) { n <- nrow(data@phdata) } else { n <- nrow(data) } weights <- rbinom(n, 1, 0.5) ## Check args if ("case.weights" %in% names(args)) { stop("Error: Argument 'case.weights' not supported in holdoutRF.") } if ("holdout" %in% names(args)) { stop("Error: Argument 'holdout' not supported in holdoutRF.") } if ("importance" %in% names(args)) { stop("Error: Argument 'importance' not supported in holdoutRF. Always set to 'permutation'.") } if ("replace" %in% names(args)) { stop("Error: Argument 'replace' not supported in holdoutRF.") } ## Grow RFs res <- list( rf1 = ranger(..., importance = "permutation", case.weights = weights, replace = FALSE, holdout = TRUE), rf2 = ranger(..., importance = "permutation", case.weights = 1-weights, replace = FALSE, holdout = TRUE) ) ## Compute importance res$variable.importance <- (res$rf1$variable.importance + res$rf2$variable.importance)/2 res$treetype <- res$rf1$treetype res$importance.mode <- res$rf1$importance.mode class(res) <- "holdoutRF" res } ranger/R/predictions.R0000755000176200001440000000607414027301517014411 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see . # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' @export predictions <- function(x, ...) UseMethod("predictions") ##' Extract predictions of Ranger prediction object. ##' ##' ##' @title Ranger predictions ##' @param x Ranger prediction object. ##' @param ... Further arguments passed to or from other methods. ##' @return Predictions: Classes for Classification forests, Numerical values for Regressions forests and the estimated survival functions for all individuals for Survival forests. ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @aliases predictions ##' @export predictions.ranger.prediction <- function(x, ...) { if (!inherits(x, "ranger.prediction")) { stop("Object ist no ranger.prediction object.") } if (x$treetype == "Classification" || x$treetype == "Regression" || x$treetype == "Probability estimation") { if (is.null(x$predictions)) { stop("No predictions found.") } else { return(x$predictions) } } else if (x$treetype == "Survival") { if (is.null(x$survival)) { stop("No predictions found.") } else { return(x$survival) } } else { stop("Unknown tree type.") } } ##' Extract training data predictions of Ranger object. ##' ##' ##' @title Ranger predictions ##' @param x Ranger object. ##' @param ... Further arguments passed to or from other methods. 
##' @return Predictions: Classes for Classification forests, Numerical values for Regression forests and the estimated survival functions for all individuals for Survival forests. ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @export predictions.ranger <- function(x, ...) { if (!inherits(x, "ranger")) { stop("Object is not a ranger object.") } if (x$treetype == "Classification" || x$treetype == "Regression" || x$treetype == "Probability estimation") { if (is.null(x$predictions)) { stop("No predictions found.") } else { return(x$predictions) } } else if (x$treetype == "Survival") { if (is.null(x$survival)) { stop("No predictions found.") } else { return(x$survival) } } else { stop("Unknown tree type.") } } ranger/R/csrf.R0000755000176200001440000001073014073531654013025 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' Case-specific random forests. ##' ##' In case-specific random forests (CSRF), random forests are built specifically for the cases of interest. ##' Instead of using equal probabilities, the cases are weighted according to their difference to the case of interest. ##' ##' The algorithm consists of 3 steps: ##' \enumerate{ ##' \item Grow a random forest on the training data. ##' \item For each observation of interest (test data), the weights of all training observations are computed by counting the number of trees in which both observations are in the same terminal node. ##' \item For each test observation, grow a weighted random forest on the training data, using the weights obtained in step 2. Predict the outcome of the test observation as usual. ##' } ##' In total, n+1 random forests are grown, where n is the number of observations in the test dataset. ##' For details, see Xu et al. (2014). ##' ##' @param formula Object of class \code{formula} or \code{character} describing the model to fit. ##' @param training_data Training data of class \code{data.frame}. ##' @param test_data Test data of class \code{data.frame}. ##' @param params1 Parameters for the proximity random forest grown in the first step. ##' @param params2 Parameters for the prediction random forests grown in the third step. ##' @param verbose Logical indicating whether or not to print computation progress. ##' ##' @return Predictions for the test dataset.
##' @examples ##' ## Split in training and test data ##' train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) ##' iris.train <- iris[train.idx, ] ##' iris.test <- iris[-train.idx, ] ##' ##' ## Run case-specific RF ##' csrf(Species ~ ., training_data = iris.train, test_data = iris.test, ##' params1 = list(num.trees = 50, mtry = 4), ##' params2 = list(num.trees = 5)) ##' ##' @author Marvin N. Wright ##' @references ##' Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \doi{10.1080/10618600.2014.983641}. ##' @export csrf <- function(formula, training_data, test_data, params1 = list(), params2 = list(), verbose = FALSE) { ## Grow a random forest on the training data to obtain weights rf.proximity <- do.call(ranger, c(list(formula = formula, data = training_data, write.forest = TRUE), params1)) ## Get terminal nodes terminal.nodeIDs.train <- predict(rf.proximity, training_data, type = "terminalNodes")$predictions terminal.nodeIDs.test <- predict(rf.proximity, test_data, type = "terminalNodes")$predictions ## Grow weighted RFs for test observations, predict the outcome predictions <- sapply(1:nrow(test_data), function(i) { ## Print computation progress if (isTRUE(verbose)) { message("Computing case-specific prediction for test observation ", i, " of ", nrow(test_data), ". (", round(i / nrow(test_data) * 100, digits = 2), "% complete.)") } ## Compute weights from first RF: for each training observation, count the trees in which it shares a terminal node with test observation i (compare tree-wise, hence the transpose) num.same.node <- colSums(terminal.nodeIDs.test[i, ] == t(terminal.nodeIDs.train)) weights <- num.same.node / sum(num.same.node) ## Grow weighted RF rf.prediction <- do.call(ranger, c(list(formula = formula, data = training_data, write.forest = TRUE, case.weights = weights), params2)) ## Predict outcome predict(rf.prediction, test_data[i, ])$predictions }) ## Return predictions predictions } ranger/R/formula.R0000755000176200001440000000444714027301516013534 0ustar liggesusers#' Parse formula #' #' Parse formula and return dataset containing selected columns. #' Interactions are supported for numerical columns only. #' An interaction column is the product of all interacting columns. #' #' @param formula Object of class \code{formula} or \code{character} describing the model to fit. #' @param data Training data of class \code{data.frame}. #' @param env The environment in which the left hand side of \code{formula} is evaluated. #' #' @return Dataset including selected columns and interactions. parse.formula <- function(formula, data, env = parent.frame()) { f <- as.formula(formula) t <- terms(f, data = data) ## Get dependent var(s) if (is.matrix(data)) { response <- data.frame(eval(f[[2]], envir = data.frame(data[, all.vars(f[[2]]), drop = FALSE]), enclos = env)) } else { response <- data.frame(eval(f[[2]], envir = data, enclos = env)) } colnames(response) <- deparse(f[[2]]) ## Get independent vars independent_vars <- attr(t, "term.labels") interaction_idx <- grepl(":", independent_vars) ## Error if illegal column name if (!all(make.names(independent_vars[!interaction_idx]) == independent_vars[!interaction_idx])) { stop("Error: Illegal column names in formula interface.
Fix column names or use alternative interface in ranger.") } ## Shortcut if no interactions if (all(!interaction_idx)) { return(data.frame(response, data[, independent_vars, drop = FALSE], check.names = FALSE)) } ## Get interaction columns if (any(interaction_idx)) { interaction_vars <- independent_vars[interaction_idx] dat_interaction <- sapply(strsplit(interaction_vars, ":"), function(x) { if (any(!sapply(data[, x, drop = FALSE], is.numeric))) { stop("Error: Only numeric columns allowed in interaction terms.") } with(data, eval(parse(text = paste(x, collapse = "*")))) }) colnames(dat_interaction) <- interaction_vars } ## Get main effect columns if (any(!interaction_idx)) { main_vars <- independent_vars[!interaction_idx] dat_main <- data[, main_vars, drop = FALSE] } ## Return combined data frame if (any(!interaction_idx)) { data.frame(response, dat_main, dat_interaction, check.names = FALSE) } else { data.frame(response, dat_interaction, check.names = FALSE) } }ranger/R/RcppExports.R0000755000176200001440000000344514073533515014364 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) } numSmaller <- function(values, reference) { .Call(`_ranger_numSmaller`, values, reference) } randomObsNode <- function(groups, y, inbag_counts) { .Call(`_ranger_randomObsNode`, groups, y, inbag_counts) } ranger/R/ranger.R0000755000176200001440000013241114073532461013344 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' Ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high-dimensional data. ##' Classification, regression, and survival forests are supported. ##' Classification and regression forests are implemented as in the original Random Forest (Breiman 2001), survival forests as in Random Survival Forests (Ishwaran et al. 2008). ##' Includes implementations of extremely randomized trees (Geurts et al. 2006) and quantile regression forests (Meinshausen 2006). ##' ##' The tree type is determined by the type of the dependent variable. ##' For factors, classification trees are grown; for numeric values, regression trees; and for survival objects, survival trees. ##' The Gini index is used as the default splitting rule for classification. ##' For regression, the estimated response variances or maximally selected rank statistics (Wright et al. 2016) can be used. ##' For survival, the log-rank test, a C-index based splitting rule (Schmid et al. 2015) and maximally selected rank statistics (Wright et al. 2016) are available. ##' For all tree types, forests of extremely randomized trees (Geurts et al. 2006) can be grown. ##' ##' With the \code{probability} option and a factor dependent variable, a probability forest is grown. ##' Here, the node impurity is used for splitting, as in classification forests. ##' Predictions are class probabilities for each sample. ##' In contrast to other implementations, each tree returns a probability estimate and these estimates are averaged for the forest probability estimate. ##' For details see Malley et al. (2012). ##' ##' Note that for classification and regression, nodes with size smaller than \code{min.node.size} can occur, as in original Random Forests. ##' For survival, all nodes contain at least \code{min.node.size} samples. ##' Variables selected with \code{always.split.variables} are tried in addition to the \code{mtry} variables randomly selected. ##' In \code{split.select.weights}, weights do not need to sum up to 1; they will be normalized later. ##' The weights are assigned to the variables in the order they appear in the formula or in the data if no formula is used. ##' Names of the \code{split.select.weights} vector are ignored. ##' The usage of \code{split.select.weights} can increase the computation times for large forests. ##' ##' Unordered factor covariates can be handled in 3 different ways by using \code{respect.unordered.factors}: ##' For 'ignore', all factors are regarded as ordered; for 'partition', all possible 2-partitions are considered for splitting. ##' For 'order' and 2-class classification, the factor levels are ordered by their proportion falling in the second class, for regression by their mean response, as described in Hastie et al. (2009), chapter 9.2.4. ##' For multiclass classification, the factor levels are ordered by the first principal component of the weighted covariance matrix of the contingency table (Coppersmith et al. 1999), for survival by the median survival (or the largest available quantile if the median is not available).
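##' As an illustrative sketch, a call such as \code{ranger(y ~ ., dat, respect.unordered.factors = "order")} reorders the levels of each unordered factor covariate once before growing (\code{y} and \code{dat} are placeholders, not part of the package API).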
##' The use of 'order' is recommended, as it is computationally fast and can handle an unlimited number of factor levels. ##' Note that the factors are only reordered once and not again in each split. ##' ##' The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. ##' It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. ##' See Nembrini et al. (2018) for details. ##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. ##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor; see Deng & Runger (2012) for details. ##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. ##' If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model. ##' ##' For a large number of variables and data frames as input data, the formula interface can be slow or impossible to use. ##' Alternatively, \code{dependent.variable.name} (and \code{status.variable.name} for survival) or \code{x} and \code{y} can be used. ##' Use \code{x} and \code{y} with a matrix for \code{x} to avoid conversions and save memory. ##' Consider setting \code{save.memory = TRUE} if you encounter memory problems for very large datasets, but be aware that this option slows down the tree growing. ##' ##' For GWAS data, consider combining \code{ranger} with the \code{GenABEL} package. ##' See the Examples section below for a demonstration using \code{Plink} data. ##' All SNPs in the \code{GenABEL} object will be used for splitting. ##' To use only the SNPs without sex or other covariates from the phenotype file, use \code{0} on the right hand side of the formula. ##' Note that missing values are treated as an extra category while splitting. ##' ##' See \url{https://github.com/imbs-hl/ranger} for the development version. ##' ##' With recent R versions, multithreading on Windows platforms should just work. ##' If you compile yourself, the new RTools toolchain is required. ##' ##' @title Ranger ##' @param formula Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables. ##' @param data Training data of class \code{data.frame}, \code{matrix}, \code{dgCMatrix} (Matrix) or \code{gwaa.data} (GenABEL). ##' @param num.trees Number of trees. ##' @param mtry Number of variables to possibly split at in each node. Default is the (rounded down) square root of the number of variables. Alternatively, a single-argument function returning an integer, given the number of independent variables. ##' @param importance Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'. The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival. ##' @param write.forest Save \code{ranger.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended. ##' @param probability Grow a probability forest as in Malley et al. (2012). ##' @param min.node.size Minimal node size. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability.
##' @param max.depth Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree). ##' @param replace Sample with replacement. ##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values. ##' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees. ##' @param class.weights Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes. ##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". ##' @param num.random.splits For "extratrees" splitrule: Number of random splits to consider for each candidate splitting variable. ##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting. ##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. ##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability of selecting a variable for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree, can be used. ##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting. ##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the 'extratrees' splitrule the default is 'partition', for all other splitrules it is 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details. ##' @param scale.permutation.importance Scale permutation importance by standard error as in (Breiman 2001). Only applicable if the permutation variable importance mode is selected. ##' @param regularization.factor Regularization factor (gain penalization), either a vector of length p or one value for all variables. ##' @param regularization.usedepth Consider the depth in regularization. ##' @param local.importance Calculate and return local importance values as in (Breiman 2001). Only applicable if \code{importance} is set to 'permutation'. ##' @param keep.inbag Save how often observations are in-bag in each tree. ##' @param inbag Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling. ##' @param holdout Hold-out mode. Hold out all samples with case weight 0 and use these for variable importance and prediction error. ##' @param quantreg Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction. ##' @param oob.error Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests. ##' @param num.threads Number of threads. Default is number of CPUs available.
##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing; use it only if you encounter memory problems. ##' @param verbose Show computation status and estimated runtime. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. ##' @param dependent.variable.name Name of dependent variable, needed if no formula given. For survival forests this is the time variable. ##' @param status.variable.name Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring. ##' @param classification Set to \code{TRUE} to grow a classification forest. Only needed if the data is a matrix or the response numeric. ##' @param x Predictor data (independent variables), alternative interface to data with formula or dependent.variable.name. ##' @param y Response vector (dependent variable), alternative interface to data with formula or dependent.variable.name. For survival use a \code{Surv()} object or a matrix with time and status. ##' @param ... Further arguments passed to or from other methods (currently ignored). ##' @return Object of class \code{ranger} with elements ##' \item{\code{forest}}{Saved forest (if \code{write.forest} is set to \code{TRUE}). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.} ##' \item{\code{predictions}}{Predicted classes/values, based on out of bag samples (classification and regression only).} ##' \item{\code{variable.importance}}{Variable importance for each independent variable.} ##' \item{\code{variable.importance.local}}{Variable importance for each independent variable and each sample, if \code{local.importance} is set to TRUE and \code{importance} is set to 'permutation'.} ##' \item{\code{prediction.error}}{Overall out of bag prediction error. For classification this is the fraction of misclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.} ##' \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out of bag data.} ##' \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out of bag samples (classification only).} ##' \item{\code{unique.death.times}}{Unique death times (survival only).} ##' \item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).} ##' \item{\code{survival}}{Estimated survival function for each sample (survival only).} ##' \item{\code{call}}{Function call.} ##' \item{\code{num.trees}}{Number of trees.} ##' \item{\code{num.independent.variables}}{Number of independent variables.} ##' \item{\code{mtry}}{Value of mtry used.} ##' \item{\code{min.node.size}}{Value of minimal node size used.} ##' \item{\code{treetype}}{Type of forest/tree.
Classification, regression or survival.} ##' \item{\code{importance.mode}}{Importance mode used.} ##' \item{\code{num.samples}}{Number of samples.} ##' \item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.} ##' @examples ##' ## Classification forest with default settings ##' ranger(Species ~ ., data = iris) ##' ##' ## Prediction ##' train.idx <- sample(nrow(iris), 2/3 * nrow(iris)) ##' iris.train <- iris[train.idx, ] ##' iris.test <- iris[-train.idx, ] ##' rg.iris <- ranger(Species ~ ., data = iris.train) ##' pred.iris <- predict(rg.iris, data = iris.test) ##' table(iris.test$Species, pred.iris$predictions) ##' ##' ## Quantile regression forest ##' rf <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE) ##' pred <- predict(rf, mtcars[27:32, ], type = "quantiles") ##' pred$predictions ##' ##' ## Variable importance ##' rg.iris <- ranger(Species ~ ., data = iris, importance = "impurity") ##' rg.iris$variable.importance ##' ##' ## Survival forest ##' require(survival) ##' rg.veteran <- ranger(Surv(time, status) ~ ., data = veteran) ##' plot(rg.veteran$unique.death.times, rg.veteran$survival[1, ]) ##' ##' ## Alternative interfaces (same results) ##' ranger(dependent.variable.name = "Species", data = iris) ##' ranger(y = iris[, 5], x = iris[, -5]) ##' ##' \dontrun{ ##' ## Use GenABEL interface to read Plink data into R and grow a classification forest ##' ## The ped and map files are not included ##' library(GenABEL) ##' convert.snp.ped("data.ped", "data.map", "data.raw") ##' dat.gwaa <- load.gwaa.data("data.pheno", "data.raw") ##' phdata(dat.gwaa)$trait <- factor(phdata(dat.gwaa)$trait) ##' ranger(trait ~ ., data = dat.gwaa) ##' } ##' ##' @author Marvin N. Wright ##' @references ##' \itemize{ ##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. ##' \item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \doi{10.1016/j.eswa.2016.07.018}. ##' \item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \doi{10.1002/sim.7212}. ##' \item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini Importance? Bioinformatics. \doi{10.1093/bioinformatics/bty373}. ##' \item Breiman, L. (2001). Random forests. Mach Learn, 45:5-32. \doi{10.1023/A:1010933404324}. ##' \item Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \doi{10.1214/08-AOAS169}. ##' \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \doi{10.3414/ME00-01-0052}. ##' \item Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition. ##' \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \doi{10.1007/s10994-006-6226-1}. ##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. ##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees.
J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}. ##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}. ##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}. ##' } ##' @seealso \code{\link{predict.ranger}} ##' @useDynLib ranger, .registration = TRUE ##' @importFrom Rcpp evalCpp ##' @import stats ##' @import utils ##' @importFrom Matrix Matrix ##' @export ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, importance = "none", write.forest = TRUE, probability = FALSE, min.node.size = NULL, max.depth = NULL, replace = TRUE, sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL, class.weights = NULL, splitrule = NULL, num.random.splits = 1, alpha = 0.5, minprop = 0.1, split.select.weights = NULL, always.split.variables = NULL, respect.unordered.factors = NULL, scale.permutation.importance = FALSE, local.importance = FALSE, regularization.factor = 1, regularization.usedepth = FALSE, keep.inbag = FALSE, inbag = NULL, holdout = FALSE, quantreg = FALSE, oob.error = TRUE, num.threads = NULL, save.memory = FALSE, verbose = TRUE, seed = NULL, dependent.variable.name = NULL, status.variable.name = NULL, classification = NULL, x = NULL, y = NULL, ...) { ## Handle ... arguments if (length(list(...)) > 0) { warning(paste("Unused arguments:", paste(names(list(...)), collapse = ", "))) } ## By default not in GWAS mode snp.data <- as.matrix(0) gwa.mode <- FALSE if (is.null(data)) { ## x/y interface if (is.null(x) | is.null(y)) { stop("Error: Either data or x and y is required.") } } else { ## GenABEL GWA data if (inherits(data, "gwaa.data" )) { snp.names <- data@gtdata@snpnames snp.data <- data@gtdata@gtps@.Data data <- data@phdata if ("id" %in% names(data)) { data$"id" <- NULL } gwa.mode <- TRUE save.memory <- FALSE } ## Formula interface. Use whole data frame if no formula provided and depvarname given if (is.null(formula)) { if (is.null(dependent.variable.name)) { if (is.null(y) | is.null(x)) { stop("Error: Please give formula, dependent variable name or x/y.") } } else { if (is.null(status.variable.name)) { y <- data[, dependent.variable.name, drop = TRUE] x <- data[, !(colnames(data) %in% dependent.variable.name), drop = FALSE] } else { y <- survival::Surv(data[, dependent.variable.name], data[, status.variable.name]) x <- data[, !(colnames(data) %in% c(dependent.variable.name, status.variable.name)), drop = FALSE] } } } else { formula <- formula(formula) if (!inherits(formula, "formula")) { stop("Error: Invalid formula.") } data.selected <- parse.formula(formula, data, env = parent.frame()) y <- data.selected[, 1] x <- data.selected[, -1, drop = FALSE] } } ## Sparse matrix data if (inherits(x, "Matrix")) { if (!inherits(x, "dgCMatrix")) { stop("Error: Currently only sparse data of class 'dgCMatrix' supported.") } if (!is.null(formula)) { stop("Error: Sparse matrices only supported with alternative interface. Use dependent.variable.name or x/y instead of formula.") } } ## Check missing values if (any(is.na(x))) { offending_columns <- colnames(x)[colSums(is.na(x)) > 0] stop("Missing data in columns: ", paste0(offending_columns, collapse = ", "), ".", call. = FALSE) } if (any(is.na(y))) { stop("Missing data in dependent variable.", call. 
= FALSE) } ## Check response levels if (is.factor(y)) { if (nlevels(y) != nlevels(droplevels(y))) { dropped_levels <- setdiff(levels(y), levels(droplevels(y))) warning("Dropped unused factor level(s) in dependent variable: ", paste0(dropped_levels, collapse = ", "), ".", call. = FALSE) } } ## Treetype if (is.factor(y) || is.logical(y)) { if (probability) { treetype <- 9 } else { treetype <- 1 } } else if (is.numeric(y) && (is.null(ncol(y)) || ncol(y) == 1)) { if (!is.null(classification) && classification && !probability) { treetype <- 1 } else if (probability) { treetype <- 9 } else { treetype <- 3 } } else if (inherits(y, "Surv") || is.data.frame(y) || is.matrix(y)) { treetype <- 5 } else { stop("Error: Unsupported type of dependent variable.") } ## Quantile prediction only for regression if (quantreg && treetype != 3) { stop("Error: Quantile prediction implemented only for regression outcomes.") } independent.variable.names <- colnames(x) ## respect.unordered.factors if (is.null(respect.unordered.factors)) { if (!is.null(splitrule) && splitrule == "extratrees") { respect.unordered.factors <- "partition" } else { respect.unordered.factors <- "ignore" } } ## Old version of respect.unordered.factors if (respect.unordered.factors == TRUE) { respect.unordered.factors <- "order" } else if (respect.unordered.factors == FALSE) { respect.unordered.factors <- "ignore" } ## Recode characters as factors and recode factors if 'order' mode if (!is.matrix(x) && !inherits(x, "Matrix") && ncol(x) > 0) { character.idx <- sapply(x, is.character) if (respect.unordered.factors == "order") { ## Recode characters and unordered factors ordered.idx <- sapply(x, is.ordered) factor.idx <- sapply(x, is.factor) recode.idx <- character.idx | (factor.idx & !ordered.idx) if (any(recode.idx) & (importance == "impurity_corrected" || importance == "impurity_unbiased")) { warning("Corrected impurity importance may not be unbiased for re-ordered factor levels. 
Consider setting respect.unordered.factors to 'ignore' or 'partition' or manually compute corrected importance.") } ## Numeric response if (is.factor(y)) { num.y <- as.numeric(y) } else { num.y <- y } ## Save non-recoded x if quantile regression if (quantreg) { x_orig <- x } ## Recode each column x[recode.idx] <- lapply(x[recode.idx], function(xx) { if (!is.factor(xx)) { xx <- as.factor(xx) } if (length(levels(xx)) == 1) { ## Don't order if only one level levels.ordered <- levels(xx) } else if (inherits(y, "Surv")) { ## Use median survival if available or largest quantile available in all strata if median not available levels.ordered <- largest.quantile(y ~ xx) ## Get all levels not in node levels.missing <- setdiff(levels(xx), levels.ordered) levels.ordered <- c(levels.missing, levels.ordered) } else if (is.factor(y) && nlevels(y) > 2) { levels.ordered <- pca.order(y = y, x = xx) } else { ## Order factor levels by mean response means <- sapply(levels(xx), function(lev) { mean(num.y[xx == lev]) }) levels.ordered <- as.character(levels(xx)[order(means)]) } ## Return reordered factor factor(xx, levels = levels.ordered, ordered = TRUE, exclude = NULL) }) ## Save levels covariate.levels <- lapply(x, levels) } else { ## Recode characters only x[character.idx] <- lapply(x[character.idx], factor) } } ## If gwa mode, add snp variable names if (gwa.mode) { all.independent.variable.names <- c(independent.variable.names, snp.names) } else { all.independent.variable.names <- independent.variable.names } ## Error if no covariates if (length(all.independent.variable.names) < 1) { stop("Error: No covariates found.") } ## Number of trees if (!is.numeric(num.trees) || num.trees < 1) { stop("Error: Invalid value for num.trees.") } ## mtry as a function if (is.function(mtry)) { nv <- length(all.independent.variable.names) if (length(formals(mtry)) > 1) { stop("Error: Given mtry function requires a single argument (the number of independent variables in the model).") } # Evaluate function mtry <- try(mtry(nv), silent = TRUE) if (inherits(mtry, "try-error")) { message("The mtry function produced the error: ", mtry) stop("Error: mtry function evaluation resulted in an error.") } ## Check for a single numeric if (!is.numeric(mtry) || length(mtry) != 1) { stop("Error: Given mtry function should return a single integer or numeric.") } else { mtry <- as.integer(mtry) } ## Check for limits if (mtry < 1 || mtry > nv) { stop("Error: Given mtry function should evaluate to a value not less than 1 and not greater than the number of independent variables (= ", nv, ").") } } if (is.null(mtry)) { mtry <- 0 } else if (!is.numeric(mtry) || mtry < 0) { stop("Error: Invalid value for mtry.") } ## Seed if (is.null(seed)) { seed <- runif(1, 0, .Machine$integer.max) } ## Keep inbag if (!is.logical(keep.inbag)) { stop("Error: Invalid value for keep.inbag.") } ## Num threads ## Default 0 -> detect from system in C++. if (is.null(num.threads)) { num.threads <- 0 } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads.") } ## Minimum node size if (is.null(min.node.size)) { min.node.size <- 0 } else if (!is.numeric(min.node.size) || min.node.size < 0) { stop("Error: Invalid value for min.node.size.") } ## Tree depth if (is.null(max.depth)) { max.depth <- 0 } else if (!is.numeric(max.depth) || max.depth < 0) { stop("Error: Invalid value for max.depth. Please give a positive integer.") } ## Sample fraction if (!is.numeric(sample.fraction)) { stop("Error: Invalid value for sample.fraction.
Please give a value in (0,1] or a vector of values in [0,1].") } if (length(sample.fraction) > 1) { if (!(treetype %in% c(1, 9))) { stop("Error: Invalid value for sample.fraction. Vector values only valid for classification forests.") } if (any(sample.fraction < 0) || any(sample.fraction > 1)) { stop("Error: Invalid value for sample.fraction. Please give a value in (0,1] or a vector of values in [0,1].") } if (sum(sample.fraction) <= 0) { stop("Error: Invalid value for sample.fraction. Sum of values must be >0.") } if (length(sample.fraction) != nlevels(y)) { stop("Error: Invalid value for sample.fraction. Expecting ", nlevels(y), " values, provided ", length(sample.fraction), ".") } if (!replace && any(sample.fraction * length(y) > table(y))) { idx <- which(sample.fraction * length(y) > table(y))[1] stop("Error: Not enough samples in class ", names(idx), "; available: ", table(y)[idx], ", requested: ", (sample.fraction * length(y))[idx], ".") } if (!is.null(case.weights)) { stop("Error: Combination of case.weights and class-wise sampling not supported.") } # Fix order (C++ needs sample.fraction in order as classes appear in data) sample.fraction <- sample.fraction[as.numeric(unique(y))] } else { if (sample.fraction <= 0 || sample.fraction > 1) { stop("Error: Invalid value for sample.fraction. Please give a value in (0,1] or a vector of values in [0,1].") } } # Regularization if (all(regularization.factor == 1)) { regularization.factor <- c(0, 0) use.regularization.factor <- FALSE } else { # Deactivate parallelization (all trees need the shared list of used variables) if (num.threads != 1) { num.threads <- 1 warning("Parallelization deactivated (regularization used).") } use.regularization.factor <- TRUE } if (use.regularization.factor) { # A few checks on the regularization coefficients if (max(regularization.factor) > 1) { stop("The regularization coefficients cannot be greater than 1.") } if (max(regularization.factor) <= 0) { stop("The regularization coefficients cannot be smaller than 0.") } p <- length(all.independent.variable.names) if (length(regularization.factor) != 1 && length(regularization.factor) != p) { stop("You must use 1 or p (the number of predictor variables) regularization coefficients.") } if (length(regularization.factor) == 1) { regularization.factor <- rep(regularization.factor, p) } } ## Importance mode if (is.null(importance) || importance == "none") { importance.mode <- 0 } else if (importance == "impurity") { importance.mode <- 1 } else if (importance == "impurity_corrected" || importance == "impurity_unbiased") { importance.mode <- 5 } else if (importance == "permutation") { if (local.importance) { importance.mode <- 6 } else if (scale.permutation.importance) { importance.mode <- 2 } else { importance.mode <- 3 } } else { stop("Error: Unknown importance mode.") } ## Case weights: NULL for no weights or all weights equal if (is.null(case.weights) || length(unique(case.weights)) == 1) { case.weights <- c(0, 0) use.case.weights <- FALSE if (holdout) { stop("Error: Case weights required to use holdout mode.") } } else { use.case.weights <- TRUE ## Sample from non-zero weights in holdout mode if (holdout) { sample.fraction <- sample.fraction * mean(case.weights > 0) } if (!replace && sum(case.weights > 0) < sample.fraction * nrow(x)) { stop("Error: Fewer non-zero case weights than observations to sample.") } } ## Manual inbag selection if (is.null(inbag)) { inbag <- list(c(0, 0)) use.inbag <- FALSE } else if (is.list(inbag)) { use.inbag <- TRUE if (use.case.weights) { stop("Error: Combination of case.weights
and inbag not supported.") } if (length(sample.fraction) > 1) { stop("Error: Combination of class-wise sampling and inbag not supported.") } if (length(inbag) != num.trees) { stop("Error: Size of inbag list not equal to number of trees.") } } else { stop("Error: Invalid inbag, expects list of vectors of size num.trees.") } ## Class weights: NULL for no weights (all 1) if (is.null(class.weights)) { class.weights <- rep(1, nlevels(y)) } else { if (!(treetype %in% c(1, 9))) { stop("Error: Argument class.weights only valid for classification forests.") } if (!is.numeric(class.weights) || any(class.weights < 0)) { stop("Error: Invalid value for class.weights. Please give a vector of non-negative values.") } if (length(class.weights) != nlevels(y)) { stop("Error: Number of class weights not equal to number of classes.") } ## Reorder (C++ expects order as appearing in the data) class.weights <- class.weights[unique(as.numeric(y))] } ## Split select weights: NULL for no weights if (is.null(split.select.weights)) { split.select.weights <- list(c(0,0)) use.split.select.weights <- FALSE } else if (is.numeric(split.select.weights)) { if (length(split.select.weights) != length(all.independent.variable.names)) { stop("Error: Number of split select weights not equal to number of independent variables.") } split.select.weights <- list(split.select.weights) use.split.select.weights <- TRUE } else if (is.list(split.select.weights)) { if (length(split.select.weights) != num.trees) { stop("Error: Size of split select weights list not equal to number of trees.") } use.split.select.weights <- TRUE } else { stop("Error: Invalid split select weights.") } ## Always split variables: NULL for no variables if (is.null(always.split.variables)) { always.split.variables <- c("0", "0") use.always.split.variables <- FALSE } else { use.always.split.variables <- TRUE } if (use.split.select.weights && use.always.split.variables) { stop("Error: Please use only one option of split.select.weights and always.split.variables.") } ## Splitting rule if (is.null(splitrule)) { if (treetype == 5) { splitrule <- "logrank" } else if (treetype == 3) { splitrule <- "variance" } else if (treetype %in% c(1, 9)) { splitrule <- "gini" } splitrule.num <- 1 } else if (splitrule == "logrank") { if (treetype == 5) { splitrule.num <- 1 } else { stop("Error: logrank splitrule applicable to survival data only.") } } else if (splitrule == "gini") { if (treetype %in% c(1, 9)) { splitrule.num <- 1 } else { stop("Error: Gini splitrule applicable to classification data only.") } } else if (splitrule == "variance") { if (treetype == 3) { splitrule.num <- 1 } else { stop("Error: variance splitrule applicable to regression data only.") } } else if (splitrule == "auc" || splitrule == "C") { if (treetype == 5) { splitrule.num <- 2 } else { stop("Error: C index splitrule applicable to survival data only.") } } else if (splitrule == "auc_ignore_ties" || splitrule == "C_ignore_ties") { if (treetype == 5) { splitrule.num <- 3 } else { stop("Error: C index splitrule applicable to survival data only.") } } else if (splitrule == "maxstat") { if (treetype == 5 || treetype == 3) { splitrule.num <- 4 } else { stop("Error: maxstat splitrule applicable to regression or survival data only.") } } else if (splitrule == "extratrees") { splitrule.num <- 5 } else if (splitrule == "beta") { if (treetype == 3) { splitrule.num <- 6 } else { stop("Error: beta splitrule applicable to regression data only.") } ## Check for 0..1 outcome if (min(y) < 0 || max(y) > 1) { stop("Error: 
beta splitrule applicable to regression data with outcome between 0 and 1 only.") } } else if (splitrule == "hellinger") { if (treetype %in% c(1, 9)) { splitrule.num <- 7 } else { stop("Error: Hellinger splitrule only implemented for binary classification.") } if ((is.factor(y) && nlevels(y) > 2) || (length(unique(y)) > 2)) { stop("Error: Hellinger splitrule only implemented for binary classification.") } } else { stop("Error: Unknown splitrule.") } ## Maxstat splitting if (alpha < 0 || alpha > 1) { stop("Error: Invalid value for alpha, please give a value between 0 and 1.") } if (minprop < 0 || minprop > 0.5) { stop("Error: Invalid value for minprop, please give a value between 0 and 0.5.") } if (splitrule == "maxstat" & use.regularization.factor) { stop("Error: Regularization cannot be used with 'maxstat' splitrule.") } ## Extra trees if (!is.numeric(num.random.splits) || num.random.splits < 1) { stop("Error: Invalid value for num.random.splits, please give a positive integer.") } if (splitrule.num == 5 && save.memory && respect.unordered.factors == "partition") { stop("Error: save.memory option not possible in extraTrees mode with unordered predictors.") } if (num.random.splits > 1 && splitrule.num != 5) { warning("Argument 'num.random.splits' ignored if splitrule is not 'extratrees'.") } ## Unordered factors if (respect.unordered.factors == "partition") { ordered.idx <- sapply(x, is.ordered) factor.idx <- sapply(x, is.factor) unordered.factor.variables <- independent.variable.names[factor.idx & !ordered.idx] if (length(unordered.factor.variables) > 0) { use.unordered.factor.variables <- TRUE ## Check level count num.levels <- sapply(x[, factor.idx & !ordered.idx, drop = FALSE], nlevels) max.level.count <- .Machine$double.digits if (max(num.levels) > max.level.count) { stop(paste("Too many levels in unordered categorical variable ", unordered.factor.variables[which.max(num.levels)], ". Only ", max.level.count, " levels allowed on this system. Consider using the 'order' option.", sep = "")) } } else { unordered.factor.variables <- c("0", "0") use.unordered.factor.variables <- FALSE } } else if (respect.unordered.factors == "ignore" || respect.unordered.factors == "order") { ## Ordering for "order" is handled above unordered.factor.variables <- c("0", "0") use.unordered.factor.variables <- FALSE } else { stop("Error: Invalid value for respect.unordered.factors, please use 'order', 'partition' or 'ignore'.") } ## Unordered maxstat splitting not possible if (use.unordered.factor.variables && !is.null(splitrule)) { if (splitrule == "maxstat") { stop("Error: Unordered factor splitting not implemented for 'maxstat' splitting rule.") } else if (splitrule %in% c("C", "auc", "C_ignore_ties", "auc_ignore_ties")) { stop("Error: Unordered factor splitting not implemented for 'C' splitting rule.") } else if (splitrule == "beta") { stop("Error: Unordered factor splitting not implemented for 'beta' splitting rule.") } } ## Warning for experimental 'order' splitting if (respect.unordered.factors == "order") { if (treetype == 3 && splitrule == "maxstat") { warning("Warning: The 'order' mode for unordered factor handling with the 'maxstat' splitrule is experimental.") } if (gwa.mode & ((treetype %in% c(1,9) & nlevels(y) > 2) | treetype == 5)) { stop("Error: Ordering of SNPs currently only implemented for regression and binary outcomes.") } } ## Prediction mode always false. Use predict.ranger() method. 
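## (The C++ entry point rangerCpp() called below is shared between training and
## prediction; predict.ranger() invokes it with prediction.mode = TRUE and a
## loaded forest, while here the flags are fixed for training.)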
prediction.mode <- FALSE predict.all <- FALSE prediction.type <- 1 ## No loaded forest object loaded.forest <- list() ## Use sparse matrix if (inherits(x, "dgCMatrix")) { sparse.x <- x x <- matrix(c(0, 0)) use.sparse.data <- TRUE } else { sparse.x <- Matrix(matrix(c(0, 0))) use.sparse.data <- FALSE if (is.data.frame(x)) { x <- data.matrix(x) } } if (treetype == 5) { y.mat <- as.matrix(y) } else { y.mat <- as.matrix(as.numeric(y)) } if (respect.unordered.factors == "order"){ order.snps <- TRUE } else { order.snps <- FALSE } ## No competing risks check if (treetype == 5) { if (!all(y.mat[, 2] %in% 0:1)) { stop("Error: Competing risks not supported yet. Use status=1 for events and status=0 for censoring.") } } ## Call Ranger result <- rangerCpp(treetype, x, y.mat, independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance.mode, min.node.size, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule.num, case.weights, use.case.weights, class.weights, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor, use.regularization.factor, regularization.usedepth) if (length(result) == 0) { stop("User interrupt or internal error.") } ## Prepare results if (importance.mode != 0) { names(result$variable.importance) <- all.independent.variable.names if (importance.mode == 6) { # process casewise vimp result$variable.importance.local <- matrix( result$variable.importance.local, byrow = FALSE, ncol = length(all.independent.variable.names), dimnames = list( rownames(data), all.independent.variable.names ) ) } } ## Set predictions if (treetype == 1 && oob.error) { if (is.factor(y)) { result$predictions <- integer.to.factor(result$predictions, levels(y)) } result$confusion.matrix <- table(y, result$predictions, dnn = c("true", "predicted"), useNA = "ifany") } else if (treetype == 5 && oob.error) { if (is.list(result$predictions)) { result$predictions <- do.call(rbind, result$predictions) } if (is.vector(result$predictions)) { result$predictions <- matrix(result$predictions, nrow = 1) } result$chf <- result$predictions result$predictions <- NULL result$survival <- exp(-result$chf) } else if (treetype == 9 && oob.error) { if (is.list(result$predictions)) { result$predictions <- do.call(rbind, result$predictions) } if (is.vector(result$predictions)) { result$predictions <- matrix(result$predictions, nrow = 1) } ## Set colnames and sort by levels colnames(result$predictions) <- unique(y) if (is.factor(y)) { result$predictions <- result$predictions[, levels(droplevels(y)), drop = FALSE] } } ## Splitrule result$splitrule <- splitrule if (splitrule == "extratrees") { result$num.random.splits <- num.random.splits } ## Set treetype if (treetype == 1) { result$treetype <- "Classification" } else if (treetype == 3) { result$treetype <- "Regression" } else if (treetype == 5) { result$treetype <- "Survival" } else if (treetype == 9) { result$treetype <- "Probability estimation" } if (treetype == 3) { result$r.squared <- 1 - result$prediction.error / var(y) } result$call <- sys.call() result$importance.mode <- importance if (use.sparse.data) { result$num.samples <- nrow(sparse.x) } else { result$num.samples <- nrow(x) } result$replace <- replace ## 
Write forest object if (write.forest) { if (is.factor(y)) { result$forest$levels <- levels(y) } result$forest$independent.variable.names <- independent.variable.names result$forest$treetype <- result$treetype class(result$forest) <- "ranger.forest" ## In 'ordered' mode, save covariate levels if (respect.unordered.factors == "order" && ncol(x) > 0) { result$forest$covariate.levels <- covariate.levels } } class(result) <- "ranger" ## Prepare quantile prediction if (quantreg) { if (respect.unordered.factors == "order" && !is.null(x_orig)) { terminal.nodes <- predict(result, x_orig, type = "terminalNodes")$predictions + 1 } else { terminal.nodes <- predict(result, x, type = "terminalNodes")$predictions + 1 } n <- result$num.samples result$random.node.values <- matrix(nrow = max(terminal.nodes), ncol = num.trees) ## Select one random obs per node and tree for (tree in 1:num.trees) { idx <- sample(1:n, n) result$random.node.values[terminal.nodes[idx, tree], tree] <- y[idx] } ## Prepare out-of-bag quantile regression if (!is.null(result$inbag.counts)) { inbag.counts <- simplify2array(result$inbag.counts) random.node.values.oob <- randomObsNode(terminal.nodes, y, inbag.counts) ## Check num.trees minoob <- min(rowSums(inbag.counts == 0)) if (minoob < 10) { stop("Error: Too few trees for out-of-bag quantile regression.") } ## Use the same number of values for all obs, select randomly result$random.node.values.oob <- t(apply(random.node.values.oob, 1, function(x) { sample(x[!is.na(x)], minoob) })) } } return(result) } ranger/R/getTerminalNodeIDs.R0000755000176200001440000000327414027301516015545 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' This function is deprecated. ##' Please use predict() with \code{type = "terminalNodes"} instead. ##' This function calls predict() now. ##' ##' @title Get terminal node IDs (deprecated) ##' @param rf \code{ranger} object. ##' @param dat New dataset. Terminal node IDs for this dataset are obtained. ##' ##' @return Matrix with terminal nodeIDs for all observations in dataset and trees.
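##'
##' For instance (an illustrative sketch), both of the following return the same matrix, the first with a deprecation warning:
##' \preformatted{
##' rf <- ranger(Species ~ ., data = iris, num.trees = 5)
##' nodes.old <- getTerminalNodeIDs(rf, iris)
##' nodes.new <- predict(rf, iris, type = "terminalNodes")$predictions
##' all.equal(nodes.old, nodes.new)
##' }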
##' ##' @examples ##' rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE) ##' getTerminalNodeIDs(rf, iris) ##' @export getTerminalNodeIDs <- function(rf, dat) { warning("Function getTerminalNodeIDs() deprecated, calling predict().") predict(rf, dat, type = "terminalNodes")$predictions } ranger/R/infinitesimalJackknife.R0000755000176200001440000001663414027301516016531 0ustar liggesusers # Compute variance of estimate from a ranger model # # Computes variances for a prediction from a ranger model, using the infinitesimal jackknife procedure # # This function is a ranger adaptation of the package \pkg{randomForestCI} of Wager et al. (2014). Their original can be found on GitHub: \url{https://github.com/swager/randomForestCI/}. # # @param pred A nrow(newdata) by no. of trees matrix which contains numeric predictions # from a random forest trained with trees grown on bootstrap samples of the training data # @param inbag A number of obs. in the training data by no. of trees matrix giving the # number of times the ith observation in the training data appeared in the bootstrap sample for the jth tree. # @param calibrate whether to apply calibration to mitigate Monte Carlo noise # warning: if calibrate = FALSE, some variance estimates may be negative # due to Monte Carlo effects if the number of trees in rf is too small # @param used.trees set of trees to use for variance estimation; uses all trees if NULL # # @return A two-column matrix is returned, with predictions in the first column and estimates of prediction variance in the second. # @author Stefan Wager rInfJack = function(pred, inbag, calibrate = TRUE, used.trees = NULL) { # original: https://github.com/swager/randomForestCI/blob/master/R/infinitesimalJackknife.R if (is.null(used.trees)) { used.trees = 1:ncol(inbag) } pred = pred[, used.trees, drop = FALSE] # Check if sampling without replacement no.replacement = (max(inbag) == 1) # Extract tree-wise predictions and variable counts from random forest B = length(used.trees) n = nrow(inbag) s = sum(inbag) / ncol(inbag) y.hat = rowMeans(pred) pred.centered = pred - rowMeans(pred) N = Matrix::Matrix(inbag[, used.trees], sparse = TRUE) N.avg = Matrix::rowMeans(N) # Compute raw infinitesimal jackknife if (as.numeric(B)^2 > as.numeric(n) * as.numeric(nrow(pred))) { C = Matrix::tcrossprod(N, pred.centered) - Matrix::Matrix(N.avg, nrow(N), 1) %*% Matrix::Matrix(rowSums(pred.centered), 1, nrow(pred.centered)) raw.IJ = Matrix::colSums(C^2) / B^2 } else { # Faster implementation when n is large.
Uses the fact that # colSums((A - B)^2) = T1 - 2 * T2 + T3, # where T1 = diag(A'A), T2 = diag(B'A), and T3 = diag(B'B) NTN = Matrix::crossprod(N, N) NTNPT_T = Matrix::tcrossprod(pred.centered, NTN) T1 = Matrix::rowSums(pred.centered * NTNPT_T) RS = rowSums(pred.centered) NbarTN = Matrix::crossprod(N.avg, N) T2 = RS * Matrix::tcrossprod(NbarTN, pred.centered) T3 = sum(N.avg^2) * RS^2 raw.IJ = as.numeric(T1 - 2 * T2 + T3) / B^2 } # Apply Monte Carlo bias correction N.var = mean(Matrix::rowMeans(N^2) - Matrix::rowMeans(N)^2) boot.var = rowSums(pred.centered^2) / B bias.correction = n * N.var * boot.var / B vars = raw.IJ - bias.correction # Finite sample correction if (no.replacement) { variance.inflation = 1 / (1 - mean(inbag))^2 vars = variance.inflation * vars } results = data.frame(y.hat = y.hat, var.hat = vars) if (isTRUE(calibrate) && nrow(results) <= 20) { calibrate = FALSE warning("Sample size <=20, no calibration performed.") } # If appropriate, calibrate variance estimates; this step in particular # ensures that all variance estimates will be positive. if (calibrate) { # Compute variance estimates using half the trees calibration.ratio = 2 n.sample = ceiling(B / calibration.ratio) results.ss = rInfJack(pred, inbag, calibrate = FALSE, used.trees = sample(used.trees, n.sample)) # Use this second set of variance estimates to estimate the scale of the Monte Carlo noise sigma2.ss = mean((results.ss$var.hat - results$var.hat)^2) delta = n.sample / B sigma2 = (delta^2 + (1 - delta)^2) / (2 * (1 - delta)^2) * sigma2.ss # Use Monte Carlo noise scale estimate for empirical Bayes calibration results = tryCatch( expr = { vars.calibrated = calibrateEB(vars, sigma2) results$var.hat = vars.calibrated results }, error = function(e) { warning(sprintf("Calibration failed with error:\n%sFalling back to non-calibrated variance estimates.", e)) results = rInfJack(pred, inbag, calibrate = FALSE, used.trees = used.trees) return(results) } ) } return(results) } # Fit an empirical Bayes prior in the hierarchical model # mu ~ G, X ~ N(mu, sigma^2) # # @param X a vector of observations # @param sigma noise estimate # @param p tuning parameter -- number of parameters used to fit G # @param nbin tuning parameter -- number of bins used for discrete approximation # @param unif.fraction tuning parameter -- fraction of G modeled as "slab" # # @return posterior density estimate g # # @section References: # For more details about "g-estimation", see: B Efron. Two modeling strategies for # empirical Bayes estimation. Stat. Sci., 29: 285-301, 2014.
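# A quick numeric check (illustrative only, not package code) of the identity
# exploited in the fast branch above: colSums((A - B)^2) = T1 - 2 * T2 + T3
# with T1 = diag(A'A), T2 = diag(B'A) and T3 = diag(B'B).
A <- matrix(rnorm(12), 4, 3)
B <- matrix(rnorm(12), 4, 3)
T1 <- diag(crossprod(A, A))
T2 <- diag(crossprod(B, A))
T3 <- diag(crossprod(B, B))
stopifnot(isTRUE(all.equal(colSums((A - B)^2), T1 - 2 * T2 + T3)))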
# @author Stefan Wager gfit = function(X, sigma, p = 2, nbin = 1000, unif.fraction = 0.1) { xvals = seq(min(min(X) - 2 * sd(X), 0), max(max(X) + 2 * sd(X), sd(X)), length.out = nbin) binw = xvals[2] - xvals[1] zero.idx = max(which(xvals <= 0)) noise.kernel = dnorm(xvals / sigma) * binw / sigma if (zero.idx > 1) { noise.rotate = noise.kernel[c(zero.idx:length(xvals), 1:(zero.idx - 1))] } else { noise.rotate = noise.kernel } XX = sapply(1:p, function(j) xvals^j * as.numeric(xvals >= 0)) neg.loglik = function(eta) { g.eta.raw = exp(XX %*% eta) * as.numeric(xvals >= 0) if ((sum(g.eta.raw) == Inf) || (sum(g.eta.raw) <= 100 * .Machine$double.eps)) { return(1000 * (length(X) + sum(eta^2))) } g.eta.main = g.eta.raw / sum(g.eta.raw) g.eta = (1 - unif.fraction) * g.eta.main + unif.fraction * as.numeric(xvals >= 0) / sum(xvals >= 0) f.eta = convolve(g.eta, noise.rotate) sum(approx(xvals, -log(pmax(f.eta, 0.0000001)), X)$y) } eta.hat = nlm(neg.loglik, rep(-1, p))$estimate g.eta.raw = exp(XX %*% eta.hat) * as.numeric(xvals >= 0) g.eta.main = g.eta.raw / sum(g.eta.raw) g.eta = (1 - unif.fraction) * g.eta.main + unif.fraction * as.numeric(xvals >= 0) / sum(xvals >= 0) return(data.frame(x = xvals, g = g.eta)) } # Bayes posterior estimation with Gaussian noise # # @param x0 an observation # @param g.est a prior density, as returned by gfit # @param sigma noise estimate # # @return posterior estimate E[mu | x0] # @author Stefan Wager gbayes = function(x0, g.est, sigma) { Kx = dnorm((g.est$x - x0) / sigma) post = Kx * g.est$g post = post / sum(post) sum(post * g.est$x) } # Empirical Bayes calibration of noisy variance estimates # # @param vars list of variance estimates # @param sigma2 estimate of the Monte Carlo noise in vars # # @return calibrated variance estimates # @author Stefan Wager calibrateEB = function(vars, sigma2) { if (sigma2 <= 0 || min(vars) == max(vars)) { return(pmax(vars, 0)) } sigma = sqrt(sigma2) eb.prior = gfit(vars, sigma) if (length(vars) >= 200) { # If there are many test points, use interpolation to speed up computations calib.x = unique(quantile(vars, probs = seq(0, 1, by = 0.02))) calib.y = sapply(calib.x, function(xx) gbayes(xx, eb.prior, sigma)) calib.all = approx(x = calib.x, y = calib.y, xout = vars)$y } else { calib.all = sapply(vars, function(xx) gbayes(xx, eb.prior, sigma)) } return(calib.all) } ranger/R/importance.R0000755000176200001440000001727714073532130014224 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- ##' @export importance <- function(x, ...) UseMethod("importance") ##' Extract variable importance of ranger object.
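##' For example (a minimal sketch):
##' \preformatted{
##' rf <- ranger(Species ~ ., data = iris, importance = "permutation")
##' importance(rf)  # named vector of permutation importance values
##' }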
##' ##' ##' @title ranger variable importance ##' @param x ranger object. ##' @param ... Further arguments passed to or from other methods. ##' @return Variable importance measures. ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @aliases importance ##' @export importance.ranger <- function(x, ...) { if (!inherits(x, "ranger")) { stop("Object is not a ranger object.") } if (is.null(x$variable.importance) || length(x$variable.importance) < 1) { stop("No variable importance found. Please use 'importance' option when growing the forest.") } return(x$variable.importance) } ##' Compute variable importance with p-values. ##' For high dimensional data, the fast method of Janitza et al. (2016) can be used. ##' The permutation approach of Altmann et al. (2010) is computationally intensive but can be used with all kinds of data. ##' See below for details. ##' ##' The method of Janitza et al. (2016) uses a clever trick: ##' With an unbiased variable importance measure, the importance values of non-associated variables vary randomly around zero. ##' Thus, all non-positive importance values are assumed to correspond to these non-associated variables and they are used to construct a distribution of the importance under the null hypothesis of no association to the response. ##' Since only the non-positive values of this distribution can be observed, the positive values are created by mirroring the negative distribution. ##' See Janitza et al. (2016) for details. ##' ##' The method of Altmann et al. (2010) uses a simple permutation test: ##' The distribution of the importance under the null hypothesis of no association to the response is created by several replications of permuting the response, growing an RF and computing the variable importance. ##' The authors recommend 50-100 permutations. ##' However, much larger numbers have to be used to estimate more precise p-values. ##' We add 1 to the numerator and denominator to avoid zero p-values. ##' ##' @title ranger variable importance p-values ##' @param x \code{ranger} or \code{holdoutRF} object. ##' @param method Method to compute p-values. Use "janitza" for the method by Janitza et al. (2016) or "altmann" for the non-parametric method by Altmann et al. (2010). ##' @param num.permutations Number of permutations. Used in the "altmann" method only. ##' @param formula Object of class formula or character describing the model to fit. Used in the "altmann" method only. ##' @param data Training data of class data.frame or matrix. Used in the "altmann" method only. ##' @param ... Further arguments passed to \code{ranger()}. Used in the "altmann" method only. ##' @return Variable importance and p-value for each variable. ##' @examples ##' ## Janitza's p-values with corrected Gini importance ##' n <- 50 ##' p <- 400 ##' dat <- data.frame(y = factor(rbinom(n, 1, .5)), replicate(p, runif(n))) ##' rf.sim <- ranger(y ~ ., dat, importance = "impurity_corrected") ##' importance_pvalues(rf.sim, method = "janitza") ##' ##' ## Permutation p-values ##' \dontrun{ ##' rf.iris <- ranger(Species ~ ., data = iris, importance = 'permutation') ##' importance_pvalues(rf.iris, method = "altmann", formula = Species ~ ., data = iris) ##' } ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @references ##' Janitza, S., Celik, E. & Boulesteix, A.-L. (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr ##' Altmann, A., Tolosi, L., Sander, O.
& Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26:1340-1347. ##' @export importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutations = 100, formula = NULL, data = NULL, ...) { method <- match.arg(method) if (!inherits(x, c("ranger", "holdoutRF"))) { stop("Object is not a ranger or holdoutRF object.") } if (x$importance.mode == "none" || is.null(x$variable.importance) || length(x$variable.importance) < 1) { stop("No variable importance found. Please use 'importance' option when growing the forest.") } if (method == "janitza") { if (x$importance.mode == "impurity") { stop("Impurity variable importance found. Please use (hold-out) permutation importance or corrected impurity importance to use this method.") } if (!inherits(x, "holdoutRF") && x$importance.mode == "permutation") { warning("Permutation variable importance found, inaccurate p-values. Please use hold-out permutation importance or corrected impurity importance to use this method.") } if (x$treetype != "Classification") { warning("This method is tested for classification only, use with care.") } ## Mirrored VIMP m1 <- x$variable.importance[x$variable.importance < 0] m2 <- x$variable.importance[x$variable.importance == 0] vimp <- c(m1, -m1, m2) ## Compute p-value ## Note: ecdf is smaller or equal, problems with 0 importance values pval <- 1 - numSmaller(x$variable.importance, vimp) / length(vimp) ## TODO: 100 ok? increase? if (length(m1) == 0) { stop("No negative importance values found. Consider the 'altmann' approach.") } if (length(m1) < 100) { warning("Only a few negative importance values found, inaccurate p-values. Consider the 'altmann' approach.") } } else if (method == "altmann") { if (!inherits(x, "ranger")) { stop("Altmann method not available for holdoutRF objects.") } if (is.null(formula) || is.null(data)) { stop("Formula and data required for the 'altmann' method.") } if (is.character(formula)) { formula <- formula(formula) } ## Permute and compute importance again if (x$treetype == "Survival") { dependent.variable.name <- all.vars(formula)[1:2] } else { dependent.variable.name <- all.vars(formula)[1] } vimp <- sapply(1:num.permutations, function(i) { dat <- data dat[, dependent.variable.name] <- dat[sample(nrow(dat)), dependent.variable.name] ranger(formula, dat, num.trees = x$num.trees, mtry = x$mtry, min.node.size = x$min.node.size, importance = x$importance.mode, replace = x$replace, ...)$variable.importance }) ## Compute p-value pval <- sapply(1:nrow(vimp), function(i) { (sum(vimp[i, ] >= x$variable.importance[i]) + 1) / (ncol(vimp) + 1) }) } else { stop("Unknown p-value method. Available methods are: 'janitza' and 'altmann'.") } ## Return VIMP and p-values res <- cbind(x$variable.importance, pval) colnames(res) <- c("importance", "pvalue") return(res) } ranger/R/treeInfo.R0000755000176200001440000001541114027301517013634 0ustar liggesusers# ------------------------------------------------------------------------------- # This file is part of Ranger. # # Ranger is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ranger is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details.
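# An illustrative sketch (not package code) of the "janitza" null distribution
# constructed in importance_pvalues() above: negative importance values are
# mirrored around zero and, together with the exact zeros, form the null
# distribution against which observed values are compared.
imp <- c(-0.30, -0.05, 0, 0.02, 0.40, 1.10)  # toy importance values
null.dist <- c(imp[imp < 0], -imp[imp < 0], imp[imp == 0])
pval <- sapply(imp, function(v) mean(null.dist >= v))
round(pval, 2)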
# # You should have received a copy of the GNU General Public License # along with Ranger. If not, see <http://www.gnu.org/licenses/>. # # Written by: # # Marvin N. Wright # Institut fuer Medizinische Biometrie und Statistik # Universitaet zu Luebeck # Ratzeburger Allee 160 # 23562 Luebeck # Germany # # http://www.imbs-luebeck.de # ------------------------------------------------------------------------------- #' Tree information in human-readable format #' #' Extract tree information of a \code{ranger} object. #' #' Node and variable IDs are 0-indexed, i.e., node 0 is the root node. #' If the formula interface is used in the \code{ranger} call, the variable IDs usually differ from those in the original data used to grow the tree. #' Refer to the variable name instead to be sure. #' #' Splitting at unordered factors (nominal variables) depends on the option \code{respect.unordered.factors} in the \code{ranger} call. #' For the "ignore" and "order" approaches, all values smaller than or equal to the \code{splitval} value go to the left and all larger values go to the right, as usual. #' However, with "order" the values correspond to the order in \code{object$forest$covariate.levels} instead of the original order (usually alphabetical). #' In the "partition" mode, the \code{splitval} values for unordered factors are comma-separated lists of values, representing the factor levels (in the original order) going to the right. #' #' @param object \code{ranger} object. #' @param tree Number of the tree of interest. #' @return A data.frame with the columns #' \tabular{ll}{ #' \code{nodeID} \tab The nodeID, 0-indexed. \cr #' \code{leftChild} \tab ID of the left child node, 0-indexed. \cr #' \code{rightChild} \tab ID of the right child node, 0-indexed. \cr #' \code{splitvarID} \tab ID of the splitting variable, 0-indexed. Caution: the variable order changes if the formula interface is used. \cr #' \code{splitvarName} \tab Name of the splitting variable. \cr #' \code{splitval} \tab The splitting value. For numeric or ordinal variables, all values smaller than or equal to it go to the left, larger values to the right. For unordered factor variables see above. \cr #' \code{terminal} \tab Logical, TRUE for terminal nodes. \cr #' \code{prediction} \tab One column with the predicted class (factor) for classification and the predicted numerical value for regression. One probability per class for probability estimation in several columns. Nothing for survival, refer to \code{object$forest$chf} for the CHF node predictions. \cr #' } #' @examples #' rf <- ranger(Species ~ ., data = iris) #' treeInfo(rf, 1) #' @seealso \code{\link{ranger}} #' @author Marvin N. Wright #' @export treeInfo <- function(object, tree = 1) { if (!inherits(object, "ranger")) { stop("Error: Invalid class of input object.") } forest <- object$forest if (is.null(forest)) { stop("Error: No saved forest in ranger object. Please set write.forest to TRUE when calling ranger.") } if (is.null(forest$num.trees) || is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) || is.null(forest$split.values) || is.null(forest$independent.variable.names) || is.null(forest$treetype)) { stop("Error: Invalid forest object.") } if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) { stop("Error: Invalid forest object.") } if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) { stop("Error: Invalid forest object. Is the forest grown in ranger version <0.3.9?
Try with the same version the forest was grown.") } if (!is.null(forest$dependent.varID)) { forest <- convert.pre.xy(forest, trees = tree) } if (tree > forest$num.trees) { stop("Error: Requesting tree ", tree, ", but forest has only ", forest$num.trees, " trees.") } result <- data.frame(nodeID = 0:(length(forest$split.values[[tree]]) - 1), leftChild = forest$child.nodeIDs[[tree]][[1]], rightChild = forest$child.nodeIDs[[tree]][[2]], splitvarID = forest$split.varIDs[[tree]], splitvarName = "X", splitval = forest$split.values[[tree]], terminal = FALSE) result$leftChild[result$leftChild == 0] <- NA result$rightChild[result$rightChild == 0] <- NA result$terminal[is.na(result$leftChild)] <- TRUE result$splitvarID[result$terminal] <- NA result$splitvarName[result$terminal] <- NA result$splitval[result$terminal] <- NA result$splitvarName <- forest$independent.variable.names[result$splitvarID + 1] ## Unordered splitting idx.unordered <- !result$terminal & !forest$is.ordered[result$splitvarID + 1] if (any(idx.unordered)) { if (any(result$splitval[idx.unordered] > (2^31 - 1))) { warning("Unordered splitting levels can only be shown for up to 31 levels.") result$splitval[idx.unordered] <- NA } else { result$splitval[idx.unordered] <- sapply(result$splitval[idx.unordered], function(x) { paste(which(as.logical(intToBits(x))), collapse = ",") }) } } ## Prediction if (forest$treetype == "Classification") { result$prediction <- forest$split.values[[tree]] result$prediction[!result$terminal] <- NA if (!is.null(forest$levels)) { result$prediction <- factor(result$prediction, levels = forest$class.values, labels = forest$levels) } } else if (forest$treetype == "Regression") { result$prediction <- forest$split.values[[tree]] result$prediction[!result$terminal] <- NA } else if (forest$treetype == "Probability estimation") { predictions <- matrix(nrow = nrow(result), ncol = length(forest$levels)) predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) colnames(predictions) <- forest$levels[forest$class.values] predictions <- predictions[, forest$levels, drop = FALSE] colnames(predictions) <- paste0("pred.", colnames(predictions)) result <- data.frame(result, predictions) } else if (forest$treetype == "Survival") { # No prediction for survival (CHF too large?) 
ranger/MD50000644000176200001440000001346614073544142012056 0ustar liggesusers
f0f6177149e03df3e6decee1055071ef *DESCRIPTION
8a28e9291c7e6ecf59661ab825c17cb4 *NAMESPACE
4efb57ba6a802937b6ac59aebc165741 *NEWS
31afcf7eb3735a1084a1a9759d59db4d *R/RcppExports.R
a58e3755b5483aedb3bb029a815f8b81 *R/csrf.R
2e05183930ca2636c1f3149ea239ed88 *R/formula.R
15fea947b85a1e765d11e4839d9fe5f5 *R/getTerminalNodeIDs.R
93c5fe5f19714c310eefa3f3d4392afd *R/holdoutRF.R
6763045d509f7c1e8017eaed2ec94688 *R/importance.R
dabac3b4f51dd2892441feeef26c6ac5 *R/infinitesimalJackknife.R
67261008e7efd1ac197fd0860f4b840d *R/predict.R
0ae0bc34d453f07eb5452c4b4911bd9b *R/predictions.R
50e2631469c0cc811cc9537b7aae44f9 *R/print.R
bb91f68aa2ef91c57d7a84a4775336fc *R/ranger.R
89b6e422397207ddab3e3d0546529737 *R/timepoints.R
7a945f80b65f0e6e11031c21bd4fcd14 *R/treeInfo.R
fae34578004769080a791f533ef10ea5 *R/utility.R
ae400548a646adfce4177013d92eec24 *build/partial.rdb
4d030d3a06b0ef58d61ea262fa224e38 *inst/CITATION
99092cb89ddf775ba5286ad439e376b8 *inst/include/ranger.h
681334a333b0b50eb4a69e61bb6eb24a *man/csrf.Rd
9e424738a938a0881edf31f82e7e7c2a *man/getTerminalNodeIDs.Rd
b1c2e27913991894b9db303efe19d3a9 *man/holdoutRF.Rd
b0ec2081a6562829eeea481075d57887 *man/importance.ranger.Rd
d8acd43680e44b6374856ecd573bc5f1 *man/importance_pvalues.Rd
0540439f10f473c415184e196adbdec4 *man/parse.formula.Rd
f103c2b3c33738e8c5b78e972b6372a4 *man/predict.ranger.Rd
88e0098af4eea3542814eb22d2f73dc3 *man/predict.ranger.forest.Rd
5e4e754d297928a997e9559f7cc0387e *man/predictions.ranger.Rd
4264dba65c1e3cfe86c7c928bf7ebeb9 *man/predictions.ranger.prediction.Rd
cfe96c2b79544199940eb1a7929990b3 *man/print.ranger.Rd
a8bd4bf79b2a0a5de8baa1d3e2086a32 *man/print.ranger.forest.Rd
59df6ff5c0ec4fed916c856074bec9ab *man/print.ranger.prediction.Rd
b9416675ff031c865ed9ef22e3f5f400 *man/ranger.Rd
37f914d88fe8a5a4b0e5b6b09407e5c7 *man/timepoints.ranger.Rd
ce6576881be1c4cdc985fe14aca596d2 *man/timepoints.ranger.prediction.Rd
876547de3d1e426bdd4d9e4f2879154a *man/treeInfo.Rd
1337f027b94f486bb3899ca32bed8e5a *src/AAA_check_cpp11.cpp
310e12c0e34147efd0ef6b7fb846d243 *src/Data.cpp
247dd15438cae8e6049c1cf08a3f78c8 *src/Data.h
0adf1b0bdca330ead4d86ed71c69f843 *src/DataChar.h
a0ec49351972b9a940c6844d40a5ff2a *src/DataDouble.h
21a5e81875a391485883a7b9547b65a1 *src/DataFloat.h
5597e414fd475cab8edf62b7231e342b *src/DataRcpp.h
5f057ca796310be682fd506e00a1b670 *src/DataSparse.cpp
989a908c719ab25e1a242544aa095b25 *src/DataSparse.h
7c24e8a628bf7a7ca7ee75d240669825 *src/Forest.cpp
de5a3d9b07699eb08bac93e02098f595 *src/Forest.h
a04bde475696bedadaa618b58fb873cf *src/ForestClassification.cpp
44d4448daba5d9bab5dc46b59c82a723 *src/ForestClassification.h
7312aa6fa64b1577e77055a476074156 *src/ForestProbability.cpp
e60d7c2003d09aa894c1beba5f1b54a2 *src/ForestProbability.h
b91ee05e0be478b3a5f6006615341bda *src/ForestRegression.cpp
798a91fe2c3f1382d00d6ead980ea503 *src/ForestRegression.h
5ff817d93f64354d8a83d7ce6056c491 *src/ForestSurvival.cpp
8d1affdaaa199bbdccf3d76523155b14 *src/ForestSurvival.h
9ec0f73a151cbf05029491a7d121c44e *src/Makevars
35b026405d74b33d230b336feb0ce0b2 *src/Makevars.win
a49fe07df13afda4fb56dec19f9d5d00 *src/RcppExports.cpp
30a9cbb9443d5cbe67d06d0519ddf47f *src/Tree.cpp
158b5e7122e5f3f03140fbc41f6b13da *src/Tree.h
1deefe79363276ea13b97a9e304804eb *src/TreeClassification.cpp
540e9cbf00a9e844295e748f7c1b5588 *src/TreeClassification.h
113697e98051b975aabf1ce5fb90fa98 *src/TreeProbability.cpp
c522b94f5e6950d2862fb981cdee1da4 *src/TreeProbability.h
1e23d65fb7164a9ec1ddfad9a25b0a2f *src/TreeRegression.cpp
d8b0ad678a9d3849061ba1d0218bad17 *src/TreeRegression.h
7616f11d9724e62761e10822522bc7f4 *src/TreeSurvival.cpp
ee8d78f44186f16cd0cb5ffc69b0efd1 *src/TreeSurvival.h
fb094e51601ff5cac83102edf38279a9 *src/globals.h
71988ff40719827955ade5beac04f78d *src/rangerCpp.cpp
c343523d852c5089f9d2ac81115fca1c *src/utility.cpp
7bae1f4d07590bcaa54ea3b434c098af *src/utility.h
f2e0cfe95b458e85a22b0cc81137cdaf *src/utilityRcpp.cpp
2cc700f6f82a48dd04a85892ae1c8198 *tests/test_gwaa.rds
942a867f12d7fa46a16054763464a8da *tests/testthat.R
8f2c86e2ab9313c0573cbd06ecf287b8 *tests/testthat/test_betasplit.R
5c4236c7a5f5b6631322d957d9835287 *tests/testthat/test_char.R
74060016460c6430091b888cf3b4a49f *tests/testthat/test_classification.R
6a7f2886320d44db79d82d99d8ca7db8 *tests/testthat/test_classweights.R
782e464426d0b278d19e1faf594cbd6e *tests/testthat/test_csrf.R
a8e2a0ac310cabac556e4f78bc8434ff *tests/testthat/test_extratrees.R
b7c5786c508da8fa701665c6b472a919 *tests/testthat/test_formula.R
8f5dc50ec2c9dc958e9d9e9d8368f64f *tests/testthat/test_genabel.R
8adef9413259adff22a26eebb01946d3 *tests/testthat/test_hellinger.R
e48d1c22344f6a3d69e38e94f6f9c02d *tests/testthat/test_importance.R
18347c9c9605731800b328e0d913337d *tests/testthat/test_importance_casewise.R
c065e478f45ebcef1e505ab5f8dc2545 *tests/testthat/test_importance_pvalues.R
4211c78c8274642f1fcf4835f2e67e9a *tests/testthat/test_inbag.R
c00ea1f9dedc18187c204c7af938bbcd *tests/testthat/test_interface.R
43406e749cd36a2c1ee9fd0c36cdb437 *tests/testthat/test_jackknife.R
209384e10e4623ab79eeee5e6228fcae *tests/testthat/test_maxstat.R
20c4a876be95f035013eb1f3d24bc959 *tests/testthat/test_predict.R
5b3404e559b629889a4417b6811969dc *tests/testthat/test_print.R
8601b0590f6d21b3588a01d279f9998a *tests/testthat/test_probability.R
f1d81645285d05ed5c453716046c689d *tests/testthat/test_quantreg.R
3b5819fa12e35346057dc60f410a1afa *tests/testthat/test_ranger.R
57ca4fb60e6024bb860825e031f59cc5 *tests/testthat/test_regression.R
676f6d13c7671dddd2c2fc7807362ff1 *tests/testthat/test_regularization.R
0a7875ac6b50828fe98af36ef064ee55 *tests/testthat/test_seed.R
8e8f0488876cfa631e085e7163564bcd *tests/testthat/test_sparse.R
3ad98b0489073a738c17d77b0d432813 *tests/testthat/test_splitweights.R
d7b16da13f25d2c23ee0dcd538f15846 *tests/testthat/test_survival.R
e8c7faf60bbd67e561ddbdd710ee10aa *tests/testthat/test_treeInfo.R
8886e05d38561bd23639c1e047184e54 *tests/testthat/test_unordered.R
ranger/inst/0000755000176200001440000000000014027301517012505 5ustar liggesusers
ranger/inst/include/0000755000176200001440000000000014027301517014130 5ustar liggesusers
ranger/inst/include/ranger.h0000755000176200001440000000006714027301517015565 0ustar liggesusers
#include "../../src/globals.h"

using namespace ranger;
ranger/inst/CITATION0000755000176200001440000000162114027301517013645 0ustar liggesusers
bibentry(bibtype = "Article",
         title = "{ranger}: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}",
         author = c(person(given = c("Marvin", "N."), family = "Wright"),
                    person(given = "Andreas", family = "Ziegler", email = "ziegler@imbs.uni-luebeck.de")),
         journal = "Journal of Statistical Software",
         year = "2017",
         volume = "77",
         number = "1",
         pages = "1--17",
         doi = "10.18637/jss.v077.i01",
         header = "To cite ranger in publications use:",
         textVersion = paste("Marvin N. Wright, Andreas Ziegler (2017).",
                             "ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R.",
                             "Journal of Statistical Software, 77(1), 1-17.",
                             "doi:10.18637/jss.v077.i01"))
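## Editor's sketch (not part of the package sources): the CITATION file above
## is an R script evaluated by base R's citation() function. Once the package
## is installed, the bibentry it defines is what R renders when you run:
## citation("ranger")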