partykit/NAMESPACE

useDynLib(partykit, .registration = TRUE)

import("stats")
import("graphics")
import("grid")
import("Formula")
import("libcoin")
import("inum")
import("mvtnorm")
importFrom("survival", "survfit")
importFrom("rpart", "prune")
importFrom("grDevices", "gray.colors")
importFrom("utils", "capture.output", "head", "tail",
  "setTxtProgressBar", "txtProgressBar")

export(
  ## core infrastructure
  "party", "partynode", "partysplit",
  ## internal tree growing infrastructure
  "extree_data", "extree_fit",
  ## new ctree implementation
  "ctree", "ctree_control", "sctest.constparty", "varimp.constparty",
  ## new mob implementation
  "mob", "mob_control", "refit.modelparty",
  ## mobsters
  "lmtree", "glmtree",
  ## new cforest implementation
  "cforest", "predict.cforest", "varimp", "gettree",
  "varimp.cforest", "gettree.cforest",
  ## as/is class generics
  "as.party", "as.partynode", "as.constparty", "as.simpleparty",
  "is.constparty", "is.partynode", "is.simpleparty",
  ## new generics
  "is.terminal", "nodeapply", "nodeids", "width", "nodeprune",
  ## exported methods (to facilitate re-use)
  "plot.party", "predict.party", "print.party",
  "plot.modelparty", "predict.modelparty", "print.modelparty",
  "sctest.modelparty", "prune.modelparty", "prune.lmtree", "nodeprune.party",
  ## workhorse infrastructure
  "breaks_split", "character_split", "formatinfo_node",
  "data_party", "data_party.default", "edge_simple", "fitted_node",
  "id_node", "index_split", "info_node", "info_split", "kidids_node",
  "kidids_split", "kids_node", "predict_party", "predict_party.default",
  "prob_split", "right_split", "split_node", "surrogates_node", "varid_split",
  ## visualization tools
  "node_barplot", "node_bivplot", "node_boxplot", "node_surv",
  "node_ecdf", "node_mvar", "node_inner", "node_party", "node_terminal",
  ## coercion methods for (non-imported) external classes
  "as.party.Weka_tree", "as.party.rpart", "as.party.XMLNode",
  "as.simpleparty.XMLNode",
  ## misc infrastructure
  "pmmlTreeModel", "get_paths", "model_frame_rpart"
)

## methods for class party
S3method("[", "party")
S3method("[[", "party")
S3method("as.simpleparty", "party")
S3method("depth", "party")
S3method("formula", "party")
S3method("getCall", "party")
S3method("getCall", "constparties")
S3method("length", "party")
S3method("model.frame", "party")
S3method("names", "party")
S3method("names<-", "party")
S3method("nodeapply", "party")
S3method("nodeids", "party")
S3method("predict", "party")
S3method("width", "party")
S3method("nodeprune", "party")
S3method("nodeprune", "partynode")
S3method("print", "party")
S3method("plot", "party")
S3method("data_party", "default")
S3method("predict_party", "default")
S3method("[[", "extree_data")
S3method("model.frame", "extree_data")

## methods for class partynode
S3method("[", "partynode")
S3method("[[", "partynode")
S3method("as.list", "partynode")
S3method("as.partynode", "partynode")
S3method("depth", "partynode")
S3method("is.terminal", "partynode")
S3method("length", "partynode")
S3method("nodeapply", "partynode")
S3method("nodeids", "partynode")
S3method("print", "partynode")
S3method("width", "partynode")

## methods for class constparty
S3method("as.simpleparty", "constparty")
S3method("plot", "constparty")
S3method("predict_party", "constparty")
S3method("print", "constparty")
S3method("varimp", "constparty")

## methods for class simpleparty
S3method("as.simpleparty", "simpleparty")
S3method("plot", "simpleparty") S3method("predict_party", "simpleparty") S3method("print", "simpleparty") ## methods for class modelparty S3method("coef", "modelparty") S3method("deviance", "modelparty") S3method("fitted", "modelparty") S3method("formula", "modelparty") S3method("getCall", "modelparty") S3method("logLik", "modelparty") S3method("model.frame", "modelparty") S3method("nobs", "modelparty") S3method("residuals", "modelparty") S3method("summary", "modelparty") S3method("weights", "modelparty") S3method("predict", "modelparty") S3method("print", "modelparty") S3method("plot", "modelparty") S3method("prune", "modelparty") ## methods for class lmtree S3method("plot", "lmtree") S3method("predict", "lmtree") S3method("print", "lmtree") S3method("prune", "lmtree") ## methods for class glmtree S3method("plot", "glmtree") S3method("predict", "glmtree") S3method("print", "glmtree") ## methods for class cforest S3method("predict", "cforest") S3method("varimp", "cforest") S3method("gettree", "cforest") S3method("model.frame", "cforest") ## misc methods S3method("as.partynode", "list") S3method("as.party", "Weka_tree") S3method("as.party", "XMLNode") S3method("as.simpleparty", "XMLNode") S3method("as.party", "rpart") ## conditional registration of strucchange methods if(getRversion() >= "3.6.0") { S3method(strucchange::sctest, "constparty") S3method(strucchange::sctest, "modelparty") } partykit/demo/0000755000176200001440000000000013023523436013046 5ustar liggesuserspartykit/demo/memory-speed.R0000644000176200001440000000261413023523436015602 0ustar liggesusers### packages and data library("rpart") library("RWeka") library("partykit") data("Shuttle", package = "mlbench") ### fit rpart and J48 trees rp <- rpart(Class ~ ., data = Shuttle) j48 <- J48(Class ~ ., data = Shuttle) ### convert to party system.time(party_rp <- as.party(rp)) system.time(party_j48 <- as.party(j48)) ### check depth/width depth(party_rp) width(party_rp) depth(party_j48) width(party_j48) ### compare object sizes osize <- function(x) print(object.size(x), units = "Kb") osize(rp) ## rpart representation osize(party_rp) ## full party (with terms, fitted values) osize(node_party(party_rp)) ## only the raw partynode osize(j48) ## J48 tree in external Java pointer osize(party_j48) ## full party (with terms, fitted values) osize(node_party(party_j48)) ## only the raw partynode osize(Shuttle) ## learning data (not stored in any tree) ### set-up large prediction sample set.seed(1) nd <- Shuttle[sample(1:nrow(Shuttle), 1e6, replace = TRUE), ] ### compare predictions (speed and accuracy) system.time(p_rp <- predict(rp, newdata = nd, type = "prob")) system.time(p_party_rp <- predict(party_rp, newdata = nd, type = "prob")) all.equal(p_rp, p_party_rp) system.time(p_j48 <- predict(j48, newdata = nd)) system.time(p_party_j48 <- predict(party_j48, newdata = nd)) all.equal(p_j48, p_party_j48, check.attributes = FALSE) partykit/demo/00Index0000644000176200001440000000011713023523436014177 0ustar liggesusersmemory-speed Some memory and speed comparisons for rpart, J48, and constparty partykit/data/0000755000176200001440000000000013023523436013033 5ustar liggesuserspartykit/data/HuntingSpiders.rda0000644000176200001440000000111513023523436016467 0ustar liggesusersVM0@VU{oCeaHE?-mjx55Yז̢.Ovhzq:TMfC Eo< S5my1S't!ХA]ꗡ\Ŏ3uKYE![ۥ+v x/)?.< +&%K䠎21ڻ䕻qI_쵳,<>ud軅>!aHߘP$h7^Zn ގ9fi!,yG4tw)SmI4bV?2&g c*J B%So,' /}N=ֱQ}Rثm Ξ)&_ʪpartykit/man/0000755000176200001440000000000013612566352012705 5ustar 
partykit/man/mob.Rd

\name{mob}
\alias{mob}
\alias{modelparty}
\alias{coef.modelparty}
\alias{deviance.modelparty}
\alias{fitted.modelparty}
\alias{formula.modelparty}
\alias{getCall.modelparty}
\alias{logLik.modelparty}
\alias{model.frame.modelparty}
\alias{nobs.modelparty}
\alias{plot.modelparty}
\alias{predict.modelparty}
\alias{print.modelparty}
\alias{residuals.modelparty}
\alias{summary.modelparty}
\alias{weights.modelparty}
\alias{refit.modelparty}
\alias{sctest.modelparty}

\title{Model-based Recursive Partitioning}

\description{
MOB is an algorithm for model-based recursive partitioning yielding a tree with fitted models associated with each terminal node.
}

\usage{
mob(formula, data, subset, na.action, weights, offset, cluster,
  fit, control = mob_control(), \dots)
}

\arguments{
\item{formula}{symbolic description of the model (of type \code{y ~ z1 + \dots + zl} or \code{y ~ x1 + \dots + xk | z1 + \dots + zl}; for details see below).}
\item{data, subset, na.action}{arguments controlling formula processing via \code{\link[stats]{model.frame}}.}
\item{weights}{optional numeric vector of weights. By default these are treated as case weights but the default can be changed in \code{\link{mob_control}}.}
\item{offset}{optional numeric vector with an a priori known component to be included in the model \code{y ~ x1 + \dots + xk} (i.e., only when \code{x} variables are specified).}
\item{cluster}{optional vector (typically numeric or factor) with a cluster ID to be passed on to the \code{fit} function and employed for clustered covariances in the parameter stability tests.}
\item{fit}{function. A function for fitting the model within each node. For details see below.}
\item{control}{A list with control parameters as returned by \code{\link{mob_control}}.}
\item{\dots}{Additional arguments passed to the \code{fit} function.}
}

\details{
Model-based partitioning fits a model tree using two groups of variables: (1) The model variables which can be just a (set of) response(s) \code{y} or additionally include regressors \code{x1}, \dots, \code{xk}. These are used for estimating the model parameters. (2) Partitioning variables \code{z1}, \dots, \code{zl}, which are used for recursively partitioning the data. The two groups of variables are either specified as \code{y ~ z1 + \dots + zl} (when there are no regressors) or \code{y ~ x1 + \dots + xk | z1 + \dots + zl} (when the model part contains regressors). Both sets of variables may in principle be overlapping.

To fit a tree model the following algorithm is used.
\enumerate{
\item \code{fit} a model to the \code{y} or \code{y} and \code{x} variables using the observations in the current node
\item Assess the stability of the model parameters with respect to each of the partitioning variables \code{z1}, \dots, \code{zl}. If there is some overall instability, choose the variable \code{z} associated with the smallest \eqn{p} value for partitioning, otherwise stop.
\item Search for the locally optimal split in \code{z} by minimizing the objective function of the model. Typically, this will be something like \code{\link{deviance}} or the negative \code{\link{logLik}}.
\item Refit the \code{model} in both kid subsamples and repeat from step 2.
}
More details on the conceptual design of the algorithm can be found in Zeileis, Hothorn, Hornik (2008) and some illustrations are provided in \code{vignette("MOB")}.
For specifying the \code{fit} function two approaches are possible:

(1) It can be a function \code{fit(y, x = NULL, start = NULL, weights = NULL, offset = NULL, \dots)}. The arguments \code{y}, \code{x}, \code{weights}, \code{offset} will be set to the corresponding elements in the current node of the tree. Additionally, starting values will sometimes be supplied via \code{start}. Of course, the \code{fit} function can choose to ignore any arguments that are not applicable, e.g., if there are no regressors \code{x} in the model or if starting values are not supported. The returned object needs to have a class that has associated \code{\link[stats]{coef}}, \code{\link[stats]{logLik}}, and \code{\link[sandwich]{estfun}} methods for extracting the estimated parameters, the maximized log-likelihood, and the empirical estimating function (i.e., score or gradient contributions), respectively.

(2) It can be a function \code{fit(y, x = NULL, start = NULL, weights = NULL, offset = NULL, \dots, estfun = FALSE, object = FALSE)}. The arguments have the same meaning as above but the returned object needs to have a different structure. It needs to be a list with elements \code{coefficients} (containing the estimated parameters), \code{objfun} (containing the minimized objective function), \code{estfun} (the empirical estimating functions), and \code{object} (the fitted model object). The elements \code{estfun} and \code{object} should be \code{NULL} if the corresponding argument is set to \code{FALSE}.

Internally, a function of type (2) is set up by \code{mob()} in case a function of type (1) is supplied. However, to save computation time, a function of type (2) may also be specified directly.
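As an illustration (a minimal sketch only, not part of the package; the name \code{lsfit2} is made up and, for simplicity, weights are ignored in the estimating function), a least-squares fitting function of type (2) could look like:

\preformatted{
lsfit2 <- function(y, x = NULL, start = NULL, weights = NULL,
  offset = NULL, ..., estfun = FALSE, object = FALSE) {
  ## intercept-only model if no regressors are supplied
  if(is.null(x)) x <- cbind("(Intercept)" = rep.int(1, NROW(y)))
  m <- lm(y ~ 0 + x, weights = weights, offset = offset)
  res <- residuals(m)
  list(
    coefficients = coef(m),
    objfun = sum(res^2),                    ## minimized objective function
    estfun = if(estfun) res * x else NULL,  ## score contributions
    object = if(object) m else NULL
  )
}
}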
For the fitted MOB tree, several standard methods are provided such as \code{print}, \code{predict}, \code{residuals}, \code{logLik}, \code{deviance}, \code{weights}, \code{coef} and \code{summary}. Some of these rely on reusing the corresponding methods for the individual model objects in the terminal nodes. Functions such as \code{coef}, \code{print}, \code{summary} also take a \code{node} argument that can specify the node IDs to be queried. Some examples are given below.

More details can be found in \code{vignette("mob", package = "partykit")}. An overview of the connections to other functions in the package is provided by Hothorn and Zeileis (2015).
}

\value{
An object of class \code{modelparty} inheriting from \code{\link{party}}. The \code{info} element of the overall \code{party} and the individual \code{node}s contain various information about the models.
}

\references{
Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909.

Zeileis A, Hothorn T, Hornik K (2008). Model-Based Recursive Partitioning. \emph{Journal of Computational and Graphical Statistics}, \bold{17}(2), 492--514.
}

\seealso{\code{\link{mob_control}}, \code{\link{lmtree}}, \code{\link{glmtree}}}

\examples{
if(require("mlbench")) {

## Pima Indians diabetes data
data("PimaIndiansDiabetes", package = "mlbench")

## a simple basic fitting function (of type 1) for a logistic regression
logit <- function(y, x, start = NULL, weights = NULL, offset = NULL, ...) {
  glm(y ~ 0 + x, family = binomial, start = start, ...)
}

## set up a logistic regression tree
pid_tree <- mob(diabetes ~ glucose | pregnant + pressure + triceps +
  insulin + mass + pedigree + age, data = PimaIndiansDiabetes, fit = logit)
## see lmtree() and glmtree() for interfaces with more efficient fitting functions

## print tree
print(pid_tree)

## print information about (some) nodes
print(pid_tree, node = 3:4)

## visualization
plot(pid_tree)

## coefficients and summary
coef(pid_tree)
coef(pid_tree, node = 1)
summary(pid_tree, node = 1)

## average deviance computed in different ways
mean(residuals(pid_tree)^2)
deviance(pid_tree)/sum(weights(pid_tree))
deviance(pid_tree)/nobs(pid_tree)

## log-likelihood and information criteria
logLik(pid_tree)
AIC(pid_tree)
BIC(pid_tree)

## predicted nodes
predict(pid_tree, newdata = head(PimaIndiansDiabetes, 6), type = "node")
## other types of predictions are possible using lmtree()/glmtree()
}
}
\keyword{tree}

partykit/man/partynode.Rd

\name{partynode}
\alias{partynode}
\alias{kidids_node}
\alias{fitted_node}
\alias{id_node}
\alias{split_node}
\alias{surrogates_node}
\alias{kids_node}
\alias{info_node}
\alias{formatinfo_node}

\title{Inner and Terminal Nodes}

\description{
A class for representing inner and terminal nodes in trees and functions for data partitioning.
}

\usage{
partynode(id, split = NULL, kids = NULL, surrogates = NULL, info = NULL)
kidids_node(node, data, vmatch = 1:ncol(data), obs = NULL, perm = NULL)
fitted_node(node, data, vmatch = 1:ncol(data), obs = 1:nrow(data), perm = NULL)
id_node(node)
split_node(node)
surrogates_node(node)
kids_node(node)
info_node(node)
formatinfo_node(node, FUN = NULL, default = "", prefix = NULL, \dots)
}

\arguments{
\item{id}{integer, a unique identifier for a node.}
\item{split}{an object of class \code{\link{partysplit}}.}
\item{kids}{a list of \code{partynode} objects.}
\item{surrogates}{a list of \code{partysplit} objects.}
\item{info}{additional information.}
\item{node}{an object of class \code{partynode}.}
\item{data}{a \code{\link{list}} or \code{\link{data.frame}}.}
\item{vmatch}{a permutation of the variable numbers in \code{data}.}
\item{obs}{a logical or integer vector indicating a subset of the observations in \code{data}.}
\item{perm}{a vector of integers specifying the variables to be permuted prior to splitting (i.e., for computing permutation variable importances). The default \code{NULL} doesn't alter the data.}
\item{FUN}{function for formatting the \code{info}, for default see below.}
\item{default}{a character used if the \code{info} in \code{node} is \code{NULL}.}
\item{prefix}{an optional prefix to be added to the returned character.}
\item{\dots}{further arguments passed to \code{\link[utils]{capture.output}}.}
}

\details{
A node represents both inner and terminal nodes in a tree structure. Each node has a unique identifier \code{id}. A node consisting only of such an identifier (and possibly additional information in \code{info}) is a terminal node.

Inner nodes consist of a primary split (an object of class \code{\link{partysplit}}) and at least two kids (daughter nodes). Kid nodes are objects of class \code{partynode} itself, so the tree structure is defined recursively. In addition, a list of \code{partysplit} objects offering surrogate splits can be supplied. Like \code{\link{partysplit}} objects, \code{partynode} objects aren't connected to the actual data.
Function \code{kidids_node()} determines how the observations in \code{data[obs,]} are partitioned into the kid nodes and returns the number of the list element in list \code{kids} each observation belongs to (and not its identifier). This is done by evaluating \code{split} (and possibly all surrogate splits) on \code{data} using \code{\link{kidids_split}}.

Function \code{fitted_node()} performs all splits recursively and returns the identifier \code{id} of the terminal node each observation in \code{data[obs,]} belongs to. Arguments \code{vmatch}, \code{obs} and \code{perm} are passed to \code{\link{kidids_split}}.

Function \code{formatinfo_node()} extracts the \code{info} from \code{node} and formats it to a \code{character} vector using the following strategy: If \code{is.null(info)}, the \code{default} is returned. Otherwise, \code{FUN} is applied for formatting. The default function uses \code{as.character} for atomic objects and applies \code{\link[utils]{capture.output}} to \code{print(info)} for other objects. Optionally, a \code{prefix} can be added to the computed character string.

All other functions are accessor functions for extracting information from objects of class \code{partynode}.
}

\value{
The constructor \code{partynode()} returns an object of class \code{partynode}:
\item{id}{a unique integer identifier for a node.}
\item{split}{an object of class \code{\link{partysplit}}.}
\item{kids}{a list of \code{partynode} objects.}
\item{surrogates}{a list of \code{\link{partysplit}} objects.}
\item{info}{additional information.}

\code{kidids_node()} returns an integer vector describing the partition of the observations into kid nodes by their position in list \code{kids}.

\code{fitted_node()} returns the node identifiers (\code{id}) of the terminal nodes each observation belongs to.
}

\references{
Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909.
}

\examples{
data("iris", package = "datasets")

## a stump defined by a binary split in Sepal.Length
stump <- partynode(id = 1L,
  split = partysplit(which(names(iris) == "Sepal.Length"), breaks = 5),
  kids = lapply(2:3, partynode))

## textual representation
print(stump, data = iris)

## list element number and node id of the two terminal nodes
table(kidids_node(stump, iris), fitted_node(stump, data = iris))

## assign terminal nodes with probability 0.5
## to observations with missing `Sepal.Length'
iris_NA <- iris
iris_NA[sample(1:nrow(iris), 50), "Sepal.Length"] <- NA
table(fitted_node(stump, data = iris_NA,
  obs = !complete.cases(iris_NA)))

## a stump defined by a primary split in `Sepal.Length'
## and a surrogate split in `Sepal.Width' which
## determines terminal nodes for observations with
## missing `Sepal.Length'
stump <- partynode(id = 1L,
  split = partysplit(which(names(iris) == "Sepal.Length"), breaks = 5),
  kids = lapply(2:3, partynode),
  surrogates = list(partysplit(
    which(names(iris) == "Sepal.Width"), breaks = 3)))
f <- fitted_node(stump, data = iris_NA, obs = !complete.cases(iris_NA))
tapply(iris_NA$Sepal.Width[!complete.cases(iris_NA)], f, range)
}
\keyword{tree}

partykit/man/WeatherPlay.Rd

\name{WeatherPlay}
\alias{WeatherPlay}

\title{Weather Conditions and Playing a Game}

\description{
Artificial data set concerning the conditions suitable for playing some unspecified game.
} \usage{data("WeatherPlay")} \format{ A data frame containing 14 observations on 5 variables. \describe{ \item{outlook}{factor.} \item{temperature}{numeric.} \item{humidity}{numeric.} \item{windy}{factor.} \item{play}{factor.} } } \source{ Table 1.3 in Witten and Frank (2011). } \references{ Witten IH, Frank E (2011). \emph{Data Mining: Practical Machine Learning Tools and Techniques}. 3rd Edition, Morgan Kaufmann, San Francisco. } \seealso{\code{\link{party}}, \code{\link{partynode}}, \code{\link{partysplit}}} \examples{ ## load weather data data("WeatherPlay", package = "partykit") WeatherPlay ## construct simple tree pn <- partynode(1L, split = partysplit(1L, index = 1:3), kids = list( partynode(2L, split = partysplit(3L, breaks = 75), kids = list( partynode(3L, info = "yes"), partynode(4L, info = "no"))), partynode(5L, info = "yes"), partynode(6L, split = partysplit(4L, index = 1:2), kids = list( partynode(7L, info = "yes"), partynode(8L, info = "no"))))) pn ## couple with data py <- party(pn, WeatherPlay) ## print/plot/predict print(py) plot(py) predict(py, newdata = WeatherPlay) ## customize printing print(py, terminal_panel = function(node) paste(": play=", info_node(node), sep = "")) } \keyword{datasets} partykit/man/mob_control.Rd0000644000176200001440000002042113077330461015503 0ustar liggesusers\name{mob_control} \alias{mob_control} \title{Control Parameters for Model-Based Partitioning} \description{ Various parameters that control aspects the fitting algorithm for recursively partitioned \code{\link{mob}} models. } \usage{ mob_control(alpha = 0.05, bonferroni = TRUE, minsize = NULL, maxdepth = Inf, mtry = Inf, trim = 0.1, breakties = FALSE, parm = NULL, dfsplit = TRUE, prune = NULL, restart = TRUE, verbose = FALSE, caseweights = TRUE, ytype = "vector", xtype = "matrix", terminal = "object", inner = terminal, model = TRUE, numsplit = "left", catsplit = "binary", vcov = "opg", ordinal = "chisq", nrep = 10000, minsplit = minsize, minbucket = minsize, applyfun = NULL, cores = NULL) } \arguments{ \item{alpha}{numeric significance level. A node is splitted when the (possibly Bonferroni-corrected) \eqn{p} value for any parameter stability test in that node falls below \code{alpha} (and the stopping criteria \code{minsize} and \code{maxdepth} are not fulfilled).} \item{bonferroni}{logical. Should \eqn{p} values be Bonferroni corrected?} \item{minsize, minsplit, minbucket}{integer. The minimum number of observations in a node. If \code{NULL}, the default is to use 10 times the number of parameters to be estimated (divided by the number of responses per observation if that is greater than 1). \code{minsize} is the recommended name and \code{minsplit}/\code{minbucket} are only included for backward compatibility with previous versions of \code{mob} and compatibility with \code{ctree}, respectively.} \item{maxdepth}{integer. The maximum depth of the tree.} \item{mtry}{integer. The number of partitioning variables randomly sampled as candidates in each node for forest-style algorithms. If \code{mtry} is greater than the number of partitioning variables, no random selection is performed. (Thus, by default all available partitioning variables are considered.)} \item{trim}{numeric. This specifies the trimming in the parameter instability test for the numerical variables. If smaller than 1, it is interpreted as the fraction relative to the current node size.} \item{breakties}{logical. 
Should ties in numeric variables be broken randomly for computing the associated parameter instability test?}
\item{parm}{numeric or character. Number or name of model parameters included in the parameter instability tests (by default all parameters are included).}
\item{dfsplit}{logical or numeric. \code{as.integer(dfsplit)} is the degrees of freedom per selected split employed when computing information criteria etc.}
\item{prune}{character, numeric, or function for specifying a post-pruning rule. If \code{prune} is \code{NULL} (the default), no post-pruning is performed. For likelihood-based \code{mob()} trees, \code{prune} can be set to \code{"AIC"} or \code{"BIC"} for post-pruning based on the corresponding information criteria. More general rules (also in scenarios that are not likelihood-based) can be specified by function arguments to \code{prune}, for details see below.}
\item{restart}{logical. When determining the optimal split point in a numerical variable: Should model estimation be restarted with \code{NULL} starting values for each split? The default is \code{TRUE}. If \code{FALSE}, then the parameter estimates from the previous split point are used as starting values for the next split point (because in practice the differences are often not huge). (Note that in that case a \code{for} loop is used instead of the \code{applyfun} for fitting models across sample splits.)}
\item{verbose}{logical. Should information about the fitting process of \code{\link{mob}} (such as test statistics, \eqn{p} values, selected splitting variables and split points) be printed to the screen?}
\item{caseweights}{logical. Should weights be interpreted as case weights? If \code{TRUE}, the number of observations is \code{sum(weights)}, otherwise it is \code{sum(weights > 0)}.}
\item{ytype, xtype}{character. Specification of how \code{mob} should preprocess \code{y} and \code{x} variables. Possible choices are: \code{"vector"} (for \code{y} only), i.e., only one variable; \code{"matrix"}, i.e., the model matrix of all variables; \code{"data.frame"}, i.e., a data frame of all variables.}
\item{terminal, inner}{character. Specification of which additional information (\code{"estfun"}, \code{"object"}, or both) should be stored in each node. If \code{NULL}, no additional information is stored.}
\item{model}{logical. Should the full model frame be stored in the resulting object?}
\item{numsplit}{character indicating how splits for numeric variables should be justified. Because any splitpoint in the interval between the last observation from the left child segment and the first observation from the right child segment leads to the same observed split, two options are available in \code{mob_control}: either the split is \code{"left"}-justified (the default for backward compatibility) or \code{"center"}-justified using the midpoint of the possible interval.}
\item{catsplit}{character indicating how (unordered) categorical variables should be split. By default the best \code{"binary"} split is searched (by minimizing the objective function). Alternatively, if set to \code{"multiway"}, the node is simply split into all levels of the categorical variable.}
\item{vcov}{character indicating which type of covariance matrix estimator should be employed in the parameter instability tests. The default is the outer product of gradients (\code{"opg"}).
Alternatively, \code{vcov = "info"} employs the information matrix and \code{vcov = "sandwich"} the sandwich matrix (both of which are only sensible for maximum likelihood estimation).} \item{ordinal}{character indicating which type of parameter instability test should be employed for ordinal partitioning variables (i.e., ordered factors). This can be \code{"chisq"}, \code{"max"}, or \code{"L2"}. If \code{"chisq"} then the variable is treated as unordered and a chi-squared test is performed. If \code{"L2"}, then a maxLM-type test as for numeric variables is carried out but correcting for ties. This requires simulation of p-values via \code{\link[strucchange]{catL2BB}} and requires some computation time. For \code{"max"} a weighted double maximum test is used that computes p-values via \code{\link[mvtnorm]{pmvnorm}}.} \item{nrep}{numeric. Number of replications in the simulation of p-values for the ordinal \code{"L2"} statistic (if used).} \item{applyfun}{an optional \code{\link[base]{lapply}}-style function with arguments \code{function(X, FUN, \dots)}. It is used for refitting the model across potential sample splits. The default is to use the basic \code{lapply} function unless the \code{cores} argument is specified (see below).} \item{cores}{numeric. If set to an integer the \code{applyfun} is set to \code{\link[parallel]{mclapply}} with the desired number of \code{cores}.} } \details{ See \code{\link{mob}} for more details and references. For post-pruning, \code{prune} can be set to a \code{function(objfun, df, nobs)} which either returns \code{TRUE} to signal that a current node can be pruned or \code{FALSE}. All supplied arguments are of length two: \code{objfun} is the sum of objective function values in the current node and its child nodes, respectively. \code{df} is the degrees of freedom in the current node and its child nodes, respectively. \code{nobs} is vector with the number of observations in the current node and the total number of observations in the dataset, respectively. If the objective function employed in the \code{mob()} call is the negative log-likelihood, then a suitable function is set up on the fly by comparing \code{(2 * objfun + penalty * df)} in the current and the daughter nodes. The penalty can then be set via a numeric or character value for \code{prune}: AIC is used if \code{prune = "AIC"} or \code{prune = 2} and BIC if \code{prune = "BIC"} or \code{prune = log(n)}. } \seealso{\code{\link{mob}}} \value{ A list of class \code{mob_control} containing the control parameters. } \keyword{misc} partykit/man/glmtree.Rd0000644000176200001440000000653113077330426014634 0ustar liggesusers\name{glmtree} \alias{glmtree} \alias{plot.glmtree} \alias{predict.glmtree} \alias{print.glmtree} \title{Generalized Linear Model Trees} \description{ Model-based recursive partitioning based on generalized linear models. } \usage{ glmtree(formula, data, subset, na.action, weights, offset, cluster, family = gaussian, epsilon = 1e-8, maxit = 25, \dots) } \arguments{ \item{formula}{symbolic description of the model (of type \code{y ~ z1 + \dots + zl} or \code{y ~ x1 + \dots + xk | z1 + \dots + zl}; for details see below).} \item{data, subset, na.action}{arguments controlling formula processing via \code{\link[stats]{model.frame}}.} \item{weights}{optional numeric vector of weights. 
}

\seealso{\code{\link{mob}}}

\value{
A list of class \code{mob_control} containing the control parameters.
}
\keyword{misc}

partykit/man/glmtree.Rd

\name{glmtree}
\alias{glmtree}
\alias{plot.glmtree}
\alias{predict.glmtree}
\alias{print.glmtree}

\title{Generalized Linear Model Trees}

\description{
Model-based recursive partitioning based on generalized linear models.
}

\usage{
glmtree(formula, data, subset, na.action, weights, offset, cluster,
  family = gaussian, epsilon = 1e-8, maxit = 25, \dots)
}

\arguments{
\item{formula}{symbolic description of the model (of type \code{y ~ z1 + \dots + zl} or \code{y ~ x1 + \dots + xk | z1 + \dots + zl}; for details see below).}
\item{data, subset, na.action}{arguments controlling formula processing via \code{\link[stats]{model.frame}}.}
\item{weights}{optional numeric vector of weights. By default these are treated as case weights but the default can be changed in \code{\link{mob_control}}.}
\item{offset}{optional numeric vector with an a priori known component to be included in the model \code{y ~ x1 + \dots + xk} (i.e., only when \code{x} variables are specified).}
\item{cluster}{optional vector (typically numeric or factor) with a cluster ID to be employed for clustered covariances in the parameter stability tests.}
\item{family}{specification of a family for \code{\link[stats]{glm}}.}
\item{epsilon, maxit}{control parameters passed to \code{\link[stats]{glm.control}}.}
\item{\dots}{optional control parameters passed to \code{\link{mob_control}}.}
}

\details{
Convenience interface for fitting MOBs (model-based recursive partitions) via the \code{\link{mob}} function. \code{glmtree} internally sets up a model \code{fit} function for \code{mob}, using \code{\link[stats]{glm.fit}}. Then \code{mob} is called using the negative log-likelihood as the objective function.

Compared to calling \code{mob} by hand, the implementation tries to avoid unnecessary computations while growing the tree. Also, it provides a more elaborate plotting function.
}

\value{
An object of class \code{glmtree} inheriting from \code{\link{modelparty}}. The \code{info} element of the overall \code{party} and the individual \code{node}s contain various information about the models.
}

\references{
Zeileis A, Hothorn T, Hornik K (2008). Model-Based Recursive Partitioning. \emph{Journal of Computational and Graphical Statistics}, \bold{17}(2), 492--514.
}

\seealso{\code{\link{mob}}, \code{\link{mob_control}}, \code{\link{lmtree}}}

\examples{
if(require("mlbench")) {

## Pima Indians diabetes data
data("PimaIndiansDiabetes", package = "mlbench")

## recursive partitioning of a logistic regression model
pid_tree2 <- glmtree(diabetes ~ glucose | pregnant + pressure + triceps +
  insulin + mass + pedigree + age, data = PimaIndiansDiabetes,
  family = binomial)

## printing whole tree or individual nodes
print(pid_tree2)
print(pid_tree2, node = 1)

## visualization
plot(pid_tree2)
plot(pid_tree2, tp_args = list(cdplot = TRUE))
plot(pid_tree2, terminal_panel = NULL)

## estimated parameters
coef(pid_tree2)
coef(pid_tree2, node = 5)
summary(pid_tree2, node = 5)

## deviance, log-likelihood and information criteria
deviance(pid_tree2)
logLik(pid_tree2)
AIC(pid_tree2)
BIC(pid_tree2)

## different types of predictions
pid <- head(PimaIndiansDiabetes)
predict(pid_tree2, newdata = pid, type = "node")
predict(pid_tree2, newdata = pid, type = "response")
predict(pid_tree2, newdata = pid, type = "link")
}
}
\keyword{tree}

partykit/man/party-predict.Rd

\name{party-predict}
\alias{party-predict}
\alias{predict.party}
\alias{predict_party}
\alias{predict_party.default}
\alias{predict_party.constparty}
\alias{predict_party.simpleparty}

\title{Tree Predictions}

\description{
Compute predictions from \code{party} objects.
}

\usage{
\method{predict}{party}(object, newdata = NULL, perm = NULL, \dots)
predict_party(party, id, newdata = NULL, \dots)
\method{predict_party}{default}(party, id, newdata = NULL, FUN = NULL, \dots)
\method{predict_party}{constparty}(party, id, newdata = NULL,
  type = c("response", "prob", "quantile", "density", "node"),
  at = if (type == "quantile") c(0.1, 0.5, 0.9),
  FUN = NULL, simplify = TRUE, \dots)
\method{predict_party}{simpleparty}(party, id, newdata = NULL,
  type = c("response", "prob", "node"), \dots)
}

\arguments{
\item{object}{objects of class \code{\link{party}}.}
\item{newdata}{an optional data frame in which to look for variables with which to predict, if omitted, the fitted values are used.}
\item{perm}{an optional character vector of variable names. Splits of nodes with a primary split in any of these variables will be permuted (after dealing with surrogates). Note that surrogate splits in the \code{perm} variables will not be permuted.}
\item{party}{objects of class \code{\link{party}}.}
\item{id}{a vector of terminal node identifiers.}
\item{type}{a character string denoting the type of predicted value returned, ignored when argument \code{FUN} is given. For \code{"response"}, the mean of a numeric response, the predicted class for a categorical response or the median survival time for a censored response is returned. For \code{"prob"} the matrix of conditional class probabilities (\code{simplify = TRUE}) or a list with the conditional class probabilities for each observation (\code{simplify = FALSE}) is returned for a categorical response. For numeric and censored responses, a list with the empirical cumulative distribution functions and empirical survivor functions (Kaplan-Meier estimate) is returned when \code{type = "prob"}. \code{"node"} returns an integer vector of terminal node identifiers.}
\item{FUN}{a function to extract (\code{default} method) or compute (\code{constparty} method) summary statistics. For the \code{default} method, this is a function of a terminal node only, for the \code{constparty} method, predictions for each node have to be computed based on arguments \code{(y, w)} where \code{y} is the response and \code{w} are case weights.}
\item{at}{if the return value is a function (as the empirical cumulative distribution function or the empirical quantile function), this function is evaluated at values \code{at} and these numeric values are returned. If \code{at} is \code{NULL}, the functions themselves are returned in a list.}
\item{simplify}{a logical indicating whether the resulting list of predictions should be converted to a suitable vector or matrix (if possible).}
\item{\dots}{additional arguments.}
}

\details{
The \code{\link{predict}} method for \code{\link{party}} objects computes the identifiers of the predicted terminal nodes, either for new data in \code{newdata} or for the learning samples (only possible for objects of class \code{constparty}). These identifiers are delegated to the corresponding \code{predict_party} method which computes (via \code{FUN} for class \code{constparty}) or extracts (class \code{simpleparty}) the actual predictions.
}

\value{
A list of predictions, possibly simplified to a numeric vector, numeric matrix or factor.
}

\examples{
## fit tree using rpart
library("rpart")
rp <- rpart(skips ~ Opening + Solder + Mask + PadType + Panel,
  data = solder, method = 'anova')

## coerce to `constparty'
pr <- as.party(rp)

## mean predictions
predict(pr, newdata = solder[c(3, 541, 640),])

## ecdf
predict(pr, newdata = solder[c(3, 541, 640),], type = "prob")

## terminal node identifiers
predict(pr, newdata = solder[c(3, 541, 640),], type = "node")

## median predictions
predict(pr, newdata = solder[c(3, 541, 640),],
  FUN = function(y, w = 1) median(y))
}
\keyword{tree}

partykit/man/party-plot.Rd

\name{party-plot}
\alias{party-plot}
\alias{plot.party}
\alias{plot.constparty}
\alias{plot.simpleparty}

\title{Visualization of Trees}

\description{
\code{plot} method for \code{party} objects with extended facilities for plugging in panel functions.
}

\usage{
\method{plot}{party}(x, main = NULL,
  terminal_panel = node_terminal, tp_args = list(),
  inner_panel = node_inner, ip_args = list(),
  edge_panel = edge_simple, ep_args = list(),
  drop_terminal = FALSE, tnex = 1,
  newpage = TRUE, pop = TRUE, gp = gpar(),
  margins = NULL, \dots)
\method{plot}{constparty}(x, main = NULL,
  terminal_panel = NULL, tp_args = list(),
  inner_panel = node_inner, ip_args = list(),
  edge_panel = edge_simple, ep_args = list(),
  type = c("extended", "simple"), drop_terminal = NULL,
  tnex = NULL, newpage = TRUE, pop = TRUE, gp = gpar(), \dots)
\method{plot}{simpleparty}(x, digits = getOption("digits") - 4,
  tp_args = NULL, \dots)
}

\arguments{
\item{x}{an object of class \code{party} or \code{constparty}.}
\item{main}{an optional title for the plot.}
\item{type}{a character specifying the complexity of the plot: \code{extended} tries to visualize the distribution of the response variable in each terminal node whereas \code{simple} only gives some summary information.}
\item{terminal_panel}{an optional panel function of the form \code{function(node)} plotting the terminal nodes. Alternatively, a panel generating function of class \code{"grapcon_generator"} that is called with arguments \code{x} and \code{tp_args} to set up a panel function. By default, an appropriate panel function is chosen depending on the scale of the dependent variable.}
\item{tp_args}{a list of arguments passed to \code{terminal_panel} if this is a \code{"grapcon_generator"} object.}
\item{inner_panel}{an optional panel function of the form \code{function(node)} plotting the inner nodes. Alternatively, a panel generating function of class \code{"grapcon_generator"} that is called with arguments \code{x} and \code{ip_args} to set up a panel function.}
\item{ip_args}{a list of arguments passed to \code{inner_panel} if this is a \code{"grapcon_generator"} object.}
\item{edge_panel}{an optional panel function of the form \code{function(split, ordered = FALSE, left = TRUE)} plotting the edges. Alternatively, a panel generating function of class \code{"grapcon_generator"} that is called with arguments \code{x} and \code{ip_args} to set up a panel function.}
\item{ep_args}{a list of arguments passed to \code{edge_panel} if this is a \code{"grapcon_generator"} object.}
\item{drop_terminal}{a logical indicating whether all terminal nodes should be plotted at the bottom.}
\item{tnex}{a numeric value giving the terminal node extension in relation to the inner nodes.}
\item{newpage}{a logical indicating whether \code{grid.newpage()} should be called.
}
\item{pop}{a logical whether the viewport tree should be popped before return.}
\item{gp}{graphical parameters.}
\item{margins}{numeric vector of margin sizes.}
\item{digits}{number of digits to be printed.}
\item{\dots}{additional arguments passed to callees.}
}

\details{
This \code{plot} method for \code{party} objects provides an extensible framework for the visualization of binary regression trees. The user is allowed to specify panel functions for plotting terminal and inner nodes as well as the corresponding edges. Panel functions for plotting inner nodes, edges and terminal nodes are available for the most important cases and can serve as the basis for user-supplied extensions, see \code{\link{node_inner}}.

More details on the ideas and concepts of panel-generating functions and \code{"grapcon_generator"} objects in general can be found in Meyer, Zeileis and Hornik (2006).
}

\references{
Meyer D, Zeileis A, Hornik K (2006). The Strucplot Framework: Visualizing Multi-Way Contingency Tables with vcd. \emph{Journal of Statistical Software}, \bold{17}(3), 1--48. \url{http://www.jstatsoft.org/v17/i03/}
}

\seealso{\code{\link{node_inner}}, \code{\link{node_terminal}}, \code{\link{edge_simple}}, \code{\link{node_barplot}}, \code{\link{node_boxplot}}.}
\keyword{hplot}

partykit/man/extree_data.Rd

\name{extree_data}
\alias{extree_data}

\title{Data Preprocessing for Extensible Trees}

\description{
A routine for preprocessing data before an extensible tree can be grown by \code{extree_fit}.
}

\usage{
extree_data(formula, data, subset, na.action = na.pass, weights, offset,
  cluster, strata, scores = NULL, yx = c("none", "matrix"),
  ytype = c("vector", "data.frame", "matrix"),
  nmax = c(yx = Inf, z = Inf), ...)
}

\arguments{
\item{formula}{a formula describing the model of the form \code{y1 + y2 + ... ~ x1 + x2 + ... | z1 + z2 + ...}.}
\item{data}{an optional data.frame containing the variables in the model.}
\item{subset}{an optional vector specifying a subset of observations to be used in the fitting process.}
\item{na.action}{a function which indicates what should happen when the data contain missing values.}
\item{weights}{an optional vector of weights.}
\item{offset}{an optional offset vector.}
\item{cluster}{an optional factor describing clusters. The interpretation depends on the specific tree algorithm.}
\item{strata}{an optional factor describing strata. The interpretation depends on the specific tree algorithm.}
\item{scores}{an optional named list of numeric scores to be assigned to ordered factors in the \code{z} part of the formula.}
\item{yx}{a character indicating if design matrices shall be computed.}
\item{ytype}{a character indicating how response variables shall be stored.}
\item{nmax}{a numeric vector of length two with the maximal number of bins in the response and \code{x}-part (first element) and the \code{z} part. Use \code{Inf} to switch off binning.}
\item{\dots}{additional arguments.}
}

\details{
This internal functionality will be the basis of implementations of other tree algorithms in future versions. Currently, only \code{ctree} relies on this function.
}

\value{An object of class \code{extree_data}.
} \examples{ data("iris") ed <- extree_data(Species ~ Sepal.Width + Sepal.Length | Petal.Width + Petal.Length, data = iris, nmax = c("yx" = 25, "z" = 10), yx = "matrix") ### the model.frame mf <- model.frame(ed) all.equal(mf, iris[, names(mf)]) ### binned y ~ x part model.frame(ed, yxonly = TRUE) ### binned Petal.Width ed[[4, type = "index"]] ### response ed$yx$y ### model matrix ed$yx$x } \keyword{tree} partykit/man/nodeapply.Rd0000644000176200001440000000551413214760743015172 0ustar liggesusers\name{nodeapply} \alias{nodeapply} \alias{nodeapply.party} \alias{nodeapply.partynode} \title{ Apply Functions Over Nodes } \description{ Returns a list of values obtained by applying a function to \code{party} or \code{partynode} objects. } \usage{ nodeapply(obj, ids = 1, FUN = NULL, \dots) \method{nodeapply}{partynode}(obj, ids = 1, FUN = NULL, \dots) \method{nodeapply}{party}(obj, ids = 1, FUN = NULL, by_node = TRUE, \dots) } \arguments{ \item{obj}{ an object of class \code{\link{partynode}} or \code{\link{party}}.} \item{ids}{ integer vector of node identifiers to apply over.} \item{FUN}{ a function to be applied to nodes. By default, the node itself is returned.} \item{by_node}{ a logical indicating if \code{FUN} is applied to subsets of \code{\link{party}} objects or \code{\link{partynode}} objects (default). } \item{\dots}{ additional arguments.} } \details{ Function \code{FUN} is applied to all nodes with node identifiers in \code{ids} for a \code{partynode} object. The method for \code{party} by default calls the \code{nodeapply} method on it's \code{node} slot. If \code{by_node} is \code{FALSE}, it is applied to a \code{party} object with root node \code{ids}. } \value{ A list of results of length \code{length(ids)}. } \examples{ ## a tree as flat list structure nodelist <- list( # root node list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), kids = 2:3), # V4 <= 1.9, terminal node list(id = 2L, info = "terminal A"), # V4 > 1.9 list(id = 3L, split = partysplit(varid = 5L, breaks = 1.7), kids = c(4L, 7L)), # V5 <= 1.7 list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), kids = 5:6), # V4 <= 4.8, terminal node list(id = 5L, info = "terminal B"), # V4 > 4.8, terminal node list(id = 6L, info = "terminal C"), # V5 > 1.7, terminal node list(id = 7L, info = "terminal D") ) ## convert to a recursive structure node <- as.partynode(nodelist) ## return root node nodeapply(node) ## return info slots of terminal nodes nodeapply(node, ids = nodeids(node, terminal = TRUE), FUN = function(x) info_node(x)) ## fit tree using rpart library("rpart") rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) ## coerce to `constparty' rpk <- as.party(rp) ## extract nodeids nodeids(rpk) unlist(nodeapply(node_party(rpk), ids = nodeids(rpk), FUN = id_node)) unlist(nodeapply(rpk, ids = nodeids(rpk), FUN = id_node)) ## but root nodes of party objects always have id = 1 unlist(nodeapply(rpk, ids = nodeids(rpk), FUN = function(x) id_node(node_party(x)), by_node = FALSE)) } \keyword{tree} partykit/man/HuntingSpiders.Rd0000644000176200001440000000621313467470520016143 0ustar liggesusers\name{HuntingSpiders} \alias{HuntingSpiders} \title{Abundance of Hunting Spiders} \description{ Abundances for 12 species of hunting spiders along with environmental predictors, all rated on a 0--9 scale. } \usage{data("HuntingSpiders")} \format{ A data frame containing 28 observations on 18 variables (12 species abundances and 6 environmental predictors). \describe{ \item{arct.lute}{numeric. 
Abundance of species \emph{Arctosa lutetiana} (on a scale 0--9).}
\item{pard.lugu}{numeric. Abundance of species \emph{Pardosa lugubris} (on a scale 0--9).}
\item{zora.spin}{numeric. Abundance of species \emph{Zora spinimana} (on a scale 0--9).}
\item{pard.nigr}{numeric. Abundance of species \emph{Pardosa nigriceps} (on a scale 0--9).}
\item{pard.pull}{numeric. Abundance of species \emph{Pardosa pullata} (on a scale 0--9).}
\item{aulo.albi}{numeric. Abundance of species \emph{Aulonia albimana} (on a scale 0--9).}
\item{troc.terr}{numeric. Abundance of species \emph{Trochosa terricola} (on a scale 0--9).}
\item{alop.cune}{numeric. Abundance of species \emph{Alopecosa cuneata} (on a scale 0--9).}
\item{pard.mont}{numeric. Abundance of species \emph{Pardosa monticola} (on a scale 0--9).}
\item{alop.acce}{numeric. Abundance of species \emph{Alopecosa accentuata} (on a scale 0--9).}
\item{alop.fabr}{numeric. Abundance of species \emph{Alopecosa fabrilis} (on a scale 0--9).}
\item{arct.peri}{numeric. Abundance of species \emph{Arctosa perita} (on a scale 0--9).}
\item{water}{numeric. Environmental predictor on a scale 0--9.}
\item{sand}{numeric. Environmental predictor on a scale 0--9.}
\item{moss}{numeric. Environmental predictor on a scale 0--9.}
\item{reft}{numeric. Environmental predictor on a scale 0--9.}
\item{twigs}{numeric. Environmental predictor on a scale 0--9.}
\item{herbs}{numeric. Environmental predictor on a scale 0--9.}
}
}

\details{
The data were originally analyzed by Van der Aart and Smeenk-Enserink (1975). De'ath (2002) transformed all variables to the 0--9 scale and employed multivariate regression trees.
}

\source{
Package \pkg{mvpart} (currently archived, see \url{https://CRAN.R-project.org/package=mvpart}).
}

\references{
Van der Aart PJM, Smeenk-Enserink N (1975). Correlations between Distributions of Hunting Spiders (Lycosidae, Ctenidae) and Environmental Characteristics in a Dune Area. \emph{Netherlands Journal of Zoology}, \bold{25}, 1--45.

De'ath G (2002). Multivariate Regression Trees: A New Technique for Modelling Species-Environment Relationships. \emph{Ecology}, \bold{83}(4), 1103--1117.
}

\examples{
## load data
data("HuntingSpiders", package = "partykit")

## fit multivariate tree for 12-dimensional species abundance
## (warnings by mvtnorm are suppressed)
suppressWarnings(sptree <- ctree(arct.lute + pard.lugu + zora.spin +
  pard.nigr + pard.pull + aulo.albi + troc.terr + alop.cune + pard.mont +
  alop.acce + alop.fabr + arct.peri ~ herbs + reft + moss + sand + twigs +
  water, data = HuntingSpiders, teststat = "max", minsplit = 5))
plot(sptree, terminal_panel = node_barplot)
}
\keyword{datasets}

partykit/man/panelfunctions.Rd

\name{panelfunctions}
\alias{panelfunctions}
\alias{node_inner}
\alias{node_terminal}
\alias{edge_simple}
\alias{node_barplot}
\alias{node_bivplot}
\alias{node_boxplot}
\alias{node_surv}
\alias{node_ecdf}
\alias{node_mvar}

\title{Panel-Generators for Visualization of Party Trees}

\description{
The plot methods for \code{party} and \code{constparty} objects are rather flexible and can be extended by panel functions. Some pre-defined panel-generating functions of class \code{grapcon_generator} for the most important cases are documented here.
}

\usage{
node_inner(obj, id = TRUE, pval = TRUE, abbreviate = FALSE, fill = "white",
  gp = gpar())
node_terminal(obj, digits = 3, abbreviate = FALSE,
  fill = c("lightgray", "white"), id = TRUE,
  just = c("center", "top"), top = 0.85,
  align = c("center", "left", "right"), gp = NULL, FUN = NULL,
  height = NULL, width = NULL)
edge_simple(obj, digits = 3, abbreviate = FALSE, justmin = Inf,
  just = c("alternate", "increasing", "decreasing", "equal"),
  fill = "white")
node_boxplot(obj, col = "black", fill = "lightgray", bg = "white",
  width = 0.5, yscale = NULL, ylines = 3, cex = 0.5, id = TRUE,
  mainlab = NULL, gp = gpar())
node_barplot(obj, col = "black", fill = NULL, bg = "white", beside = NULL,
  ymax = NULL, ylines = NULL, widths = 1, gap = NULL, reverse = NULL,
  rot = 0, just = c("center", "top"), id = TRUE, mainlab = NULL,
  text = c("none", "horizontal", "vertical"), gp = gpar())
node_surv(obj, col = "black", bg = "white", yscale = c(0, 1), ylines = 2,
  id = TRUE, mainlab = NULL, gp = gpar(), \dots)
node_ecdf(obj, col = "black", bg = "white", ylines = 2, id = TRUE,
  mainlab = NULL, gp = gpar(), \dots)
node_bivplot(mobobj, which = NULL, id = TRUE, pop = TRUE,
  pointcol = "black", pointcex = 0.5,
  boxcol = "black", boxwidth = 0.5, boxfill = "lightgray",
  bg = "white", fitmean = TRUE, linecol = "red",
  cdplot = FALSE, fivenum = TRUE, breaks = NULL,
  ylines = NULL, xlab = FALSE, ylab = FALSE, margins = rep(1.5, 4),
  mainlab = NULL, \dots)
node_mvar(obj, which = NULL, id = TRUE, pop = TRUE, ylines = NULL,
  mainlab = NULL, varlab = TRUE, bg = "white", ...)
}

\arguments{
\item{obj}{an object of class \code{party}.}
\item{digits}{integer, used for formatting numbers.}
\item{abbreviate}{logical indicating whether strings should be abbreviated.}
\item{col, pointcol, boxcol, linecol}{a color for points and lines.}
\item{fill, boxfill, bg}{a color for filling rectangles and backgrounds.}
\item{id}{logical. Should node IDs be plotted?}
\item{pval}{logical. Should node p values be plotted (if they are available)?}
\item{just}{justification of terminal panel viewport (\code{node_terminal}), or labels (\code{edge_simple}, \code{node_barplot}).}
\item{justmin}{minimum average edge label length to employ justification via \code{just} in \code{edge_panel}, otherwise \code{just = "equal"} is used. Thus, by default \code{"equal"} justification is always used but other justifications could be employed for finite \code{justmin}.}
\item{top}{in case of top justification, the npc coordinate at which the viewport is justified.}
\item{align}{alignment of text within terminal panel viewport.}
\item{ylines}{number of lines for spaces in y-direction.}
\item{widths}{widths in barplots.}
\item{boxwidth}{width in boxplots (called \code{width} in \code{node_boxplot}).}
\item{gap}{gap between bars in a barplot (\code{node_barplot}).}
\item{yscale}{limits in y-direction.}
\item{ymax}{upper limit in y-direction.}
\item{cex, pointcex}{character extension of points in scatter plots.}
\item{beside}{logical indicating if barplots should be side by side or stacked.}
\item{reverse}{logical indicating whether the order of levels should be reversed for barplots.}
\item{rot}{arguments passed to \code{\link[grid]{grid.text}} for the x-axis labeling.}
\item{gp}{graphical parameters.}
\item{FUN}{function for formatting the \code{info}, passed to \code{\link{formatinfo_node}}.}
\item{height, width}{numeric, number of lines/columns for printing text.
}
\item{mobobj}{an object of class \code{modelparty} as computed by \code{\link{mob}}.}
\item{which}{numeric or character. Optional selection of a subset of regressor variables. By default one panel for each regressor variable is drawn.}
\item{pop}{logical. Should the viewports in the individual nodes be popped after drawing?}
\item{fitmean}{logical. Should the fitted mean function be visualized?}
\item{cdplot}{logical. Should a CD plot (or a spineplot) be drawn when the response variable is categorical?}
\item{fivenum}{logical. Should the five-number summary be used for splitting the x-axis in spineplots?}
\item{breaks}{numeric. Optional numeric vector with breaks for the x-axis in spineplots.}
\item{xlab, ylab}{character. Optional annotation for x-axis and y-axis.}
\item{margins}{numeric. Margins around drawing area in viewport.}
\item{mainlab}{character or function. An optional title for the plot. Either a character or a \code{function(id, nobs)}.}
\item{varlab}{logical. Should the individual variable labels be attached to the \code{mainlab} for multivariate responses?}
\item{text}{logical or character. Should percentage labels be drawn for each bar? The default is \code{"none"} or equivalently \code{FALSE}. Can be set to \code{TRUE} (or \code{"horizontal"}) or alternatively \code{"vertical"}.}
\item{\dots}{additional arguments passed to callees (for example to \code{\link[survival]{survfit}}).}
}

\details{
The \code{plot} methods for \code{party} and \code{constparty} objects provide an extensible framework for the visualization of binary regression trees. The user is allowed to specify panel functions for plotting terminal and inner nodes as well as the corresponding edges. The panel functions to be used should depend only on the node being visualized, however, for setting up an appropriate panel function, information from the whole tree is typically required. Hence, \pkg{party} adopts the framework of \code{grapcon_generator} (graphical appearance control) from the \pkg{vcd} package (Meyer, Zeileis and Hornik, 2006) and provides several panel-generating functions. For convenience, the panel-generating functions \code{node_inner} and \code{edge_simple} return panel functions to draw inner nodes and left and right edges.

For drawing terminal nodes, the functions returned by the other panel functions can be used. The panel generating function \code{node_terminal} is a terse text-based representation of terminal nodes. Graphical representations of terminal nodes are available and depend on the kind of model and the measurement scale of the variables modeled. For univariate regressions (typically fitted by \code{ctree}), \code{node_surv} returns a function that plots Kaplan-Meier curves in each terminal node; \code{node_barplot}, \code{node_boxplot}, \code{node_hist}, \code{node_ecdf} and \code{node_density} can be used to plot bar plots, box plots, histograms, empirical cumulative distribution functions and estimated densities into the terminal nodes. For multivariate regressions (typically fitted by \code{mob}), \code{node_bivplot} returns a panel function that creates bivariate plots of the response against all regressors in the model. Depending on the scale of the variables involved, scatter plots, box plots, spinograms (or CD plots) and spine plots are created. For the latter two \code{\link[vcd]{spine}} and \code{\link[vcd]{cd_plot}} from the \pkg{vcd} package are re-used.
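For example (a minimal sketch, using the \code{iris} data purely for illustration), a panel-generating function and its arguments can be supplied to \code{plot()} via \code{terminal_panel} and \code{tp_args}:

\preformatted{
library("partykit")
ct <- ctree(Species ~ ., data = iris)
## bar plots with custom fill colors in the terminal nodes
plot(ct, terminal_panel = node_barplot,
  tp_args = list(fill = gray.colors(3)))
}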
For multivariate responses in \code{\link{ctree}}, the panel function \code{node_mvar} generates one plot for each response. } \references{ Meyer D, Zeileis A, Hornik K (2006). The Strucplot Framework: Visualizing Multi-Way Contingency Tables with vcd. \emph{Journal of Statistical Software}, \bold{17}(3), 1--48. \url{http://www.jstatsoft.org/v17/i03/} } \keyword{hplot} partykit/man/party-coercion.Rd0000644000176200001440000000400613214760754016132 0ustar liggesusers\name{party-coercion} \alias{party-coercion} \alias{as.party} \alias{as.party.rpart} \alias{as.party.Weka_tree} \alias{as.party.XMLNode} \alias{as.constparty} \alias{as.simpleparty} \alias{as.simpleparty.party} \alias{as.simpleparty.simpleparty} \alias{as.simpleparty.XMLNode} \alias{as.simpleparty.constparty} \alias{pmmlTreeModel} \title{Coercion Functions} \description{ Functions coercing various objects to objects of class \code{party}. } \usage{ as.party(obj, \dots) \method{as.party}{rpart}(obj, data = TRUE, \dots) \method{as.party}{Weka_tree}(obj, data = TRUE, \dots) \method{as.party}{XMLNode}(obj, \dots) pmmlTreeModel(file, \dots) as.constparty(obj, \dots) as.simpleparty(obj, \dots) \method{as.simpleparty}{party}(obj, \dots) \method{as.simpleparty}{simpleparty}(obj, \dots) \method{as.simpleparty}{constparty}(obj, \dots) \method{as.simpleparty}{XMLNode}(obj, \dots) } \arguments{ \item{obj}{ an object of class \code{\link[rpart]{rpart}}, \code{\link[RWeka:Weka_classifier_trees]{Weka_tree}}, \code{XMLNode} or objects inheriting from \code{party}.} \item{data}{logical. Should the model frame associated with the fitted \code{obj} be included in the \code{data} of the \code{party}?} \item{file}{ a file name of an XML file containing a PMML description of a tree.} \item{\dots}{ additional arguments.} } \details{ Trees fitted using functions \code{\link[rpart]{rpart}} or \code{\link[RWeka:Weka_classifier_trees]{J48}} are coerced to \code{\link{party}} objects. By default, objects of class \code{constparty} are returned. When information about the learning sample is available, \code{\link{party}} objects can be coerced to objects of class \code{constparty} or \code{simpleparty} (see \code{\link{party}} for details). } \value{ All methods return objects of class \code{\link{party}}. } \examples{ ## fit tree using rpart library("rpart") rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) ## coerce to `constparty' as.party(rp) } \keyword{tree} partykit/man/partynode-methods.Rd0000644000176200001440000000763113023523436016641 0ustar liggesusers\name{partynode-methods} \alias{partynode-methods} \alias{is.partynode} \alias{as.partynode} \alias{as.partynode.partynode} \alias{as.partynode.list} \alias{as.list.partynode} \alias{length.partynode} \alias{[.partynode} \alias{[[.partynode} \alias{is.terminal} \alias{is.terminal.partynode} \alias{depth.partynode} \alias{width} \alias{width.partynode} \alias{print.partynode} \alias{nodeprune.partynode} \title{ Methods for Node Objects} \description{ Methods for computing on \code{partynode} objects.
} \usage{ is.partynode(x) as.partynode(x, \dots) \method{as.partynode}{partynode}(x, from = NULL, recursive = TRUE, \dots) \method{as.partynode}{list}(x, \dots) \method{as.list}{partynode}(x, \dots) \method{length}{partynode}(x) \method{[}{partynode}(x, i, \dots) \method{[[}{partynode}(x, i, \dots) is.terminal(x, \dots) \method{is.terminal}{partynode}(x, \dots) \method{depth}{partynode}(x, root = FALSE, \dots) width(x, \dots) \method{width}{partynode}(x, \dots) \method{print}{partynode}(x, data = NULL, names = NULL, inner_panel = function(node) "", terminal_panel = function(node) " *", prefix = "", first = TRUE, digits = getOption("digits") - 2, \dots) \method{nodeprune}{partynode}(x, ids, ...) } \arguments{ \item{x}{ an object of class \code{partynode} or \code{list}.} \item{from}{ an integer giving the identifier of the root node.} \item{recursive}{ a logical, if \code{FALSE}, only the id of the root node is checked against \code{from}. If \code{TRUE}, the ids of all nodes are checked.} \item{i}{ an integer specifying the kid to extract.} \item{root}{ a logical. Should the root node be counted in \code{depth}? } \item{data}{ an optional \code{data.frame}.} \item{names}{ a vector of names for nodes.} \item{terminal_panel}{ a panel function for printing terminal nodes.} \item{inner_panel}{ a panel function for printing inner nodes.} \item{prefix}{ lines start with this symbol.} \item{first}{ a logical.} \item{digits}{ number of digits to be printed.} \item{ids}{ a vector of node ids to be pruned-off.} \item{\dots}{ additional arguments.} } \details{ \code{is.partynode} checks if the argument is a valid \code{partynode} object. \code{is.terminal} is \code{TRUE} for terminal nodes and \code{FALSE} for inner nodes. The subset methods return the \code{partynode} object corresponding to the \code{i}th kid. The \code{as.partynode} and \code{as.list} methods can be used to convert flat list structures into recursive \code{partynode} objects and vice versa. \code{as.partynode} applied to \code{partynode} objects renumbers the recursive nodes starting with root node identifier \code{from}. \code{length} gives the number of kid nodes of the root node, \code{depth} the depth of the tree and \code{width} the number of terminal nodes.
} \examples{ ## a tree as flat list structure nodelist <- list( # root node list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), kids = 2:3), # V4 <= 1.9, terminal node list(id = 2L), # V4 > 1.9 list(id = 3L, split = partysplit(varid = 1L, breaks = 1.7), kids = c(4L, 7L)), # V1 <= 1.7 list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), kids = 5:6), # V4 <= 4.8, terminal node list(id = 5L), # V4 > 4.8, terminal node list(id = 6L), # V1 > 1.7, terminal node list(id = 7L) ) ## convert to a recursive structure node <- as.partynode(nodelist) ## print raw recursive structure without data print(node) ## print tree along with the associated iris data data("iris", package = "datasets") print(node, data = iris) ## print subtree print(node[2], data = iris) ## print subtree, with root node number one print(as.partynode(node[2], from = 1), data = iris) ## number of kids in root node length(node) ## depth of tree depth(node) ## number of terminal nodes width(node) ## convert back to flat structure as.list(node) } \keyword{tree} partykit/man/prune.modelparty.Rd0000644000176200001440000000634113305165130016474 0ustar liggesusers\name{prune.modelparty} \alias{prune.modelparty} \alias{prune.lmtree} \title{Post-Prune \code{modelparty} Objects} \usage{ prune.modelparty(tree, type = "AIC", ...) } \description{ Post-pruning of \code{modelparty} objects based on information criteria like AIC, BIC, or related user-defined criteria. } \arguments{ \item{tree}{object of class \code{modelparty}.} \item{type}{pruning type. Can be \code{"AIC"}, \code{"BIC"} or a user-defined function (details below).} \item{\dots}{additional arguments.} } \details{ In \code{\link{mob}}-based model trees, pre-pruning based on p-values is used by default and often no post-pruning is necessary in such trees. However, if pre-pruning is switched off (by using a large \code{alpha}) or is not sufficient (e.g., possibly in large samples), the \code{prune} method can be used for subsequent post-pruning based on information criteria. The function \code{prune.modelparty} can be called directly but it is also registered as a method for the generic \code{\link[rpart]{prune}} function from the \pkg{rpart} package. Thus, if \pkg{rpart} is attached, \code{prune(tree, type = "AIC", ...)} also works (see examples below). To customize the post-pruning strategy, \code{type} can be set to a \code{function(objfun, df, nobs)} which returns \code{TRUE} if the current node is to be pruned and \code{FALSE} otherwise. All supplied arguments are of length two: \code{objfun} is the sum of objective function values in the current node and its child nodes, respectively. \code{df} is the degrees of freedom in the current node and its child nodes, respectively. \code{nobs} is a vector with the number of observations in the current node and the total number of observations in the dataset, respectively. For \code{"AIC"} and \code{"BIC"}, \code{type} is transformed internally so that AIC or BIC are computed. However, this assumes that the \code{objfun} used in \code{tree} is actually the negative log-likelihood. The degrees of freedom assumed for a split can be set via the \code{dfsplit} argument in \code{\link{mob_control}} when computing the \code{tree} or manipulated later by changing the value of \code{tree$info$control$dfsplit}.
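As a sketch of such a user-defined criterion (assuming, as described above, that \code{objfun} is the negative log-likelihood), the following hypothetical function \code{my_ic} mimics AIC-based pruning but with a stiffer penalty of 4 per degree of freedom:

\preformatted{
## return TRUE (prune) if the penalized objective function of the
## collapsed current node beats that of the subtree with child nodes
my_ic <- function(objfun, df, nobs)
  (2 * objfun[1] + 4 * df[1]) < (2 * objfun[2] + 4 * df[2])

## applied to a fitted modelparty object such as lm_big below:
## prune.modelparty(lm_big, type = my_ic)
}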
} \seealso{ \code{\link[rpart]{prune}}, \code{\link{lmtree}}, \code{\link{glmtree}}, \code{\link{mob}} } \examples{ set.seed(29) n <- 1000 d <- data.frame( x = runif(n), z = runif(n), z_noise = factor(sample(1:3, size = n, replace = TRUE)) ) d$y <- rnorm(n, mean = d$x * c(-1, 1)[(d$z > 0.7) + 1], sd = 3) ## glm versus lm / logLik versus sum of squared residuals fmla <- y ~ x | z + z_noise lm_big <- lmtree(formula = fmla, data = d, maxdepth = 3, alpha = 1) glm_big <- glmtree(formula = fmla, data = d, maxdepth = 3, alpha = 1) AIC(lm_big) AIC(glm_big) ## load rpart for prune() generic ## (otherwise: use prune.modelparty directly) if (require("rpart")) { ## pruning lm_aic <- prune(lm_big, type = "AIC") lm_bic <- prune(lm_big, type = "BIC") width(lm_big) width(lm_aic) width(lm_bic) glm_aic <- prune(glm_big, type = "AIC") glm_bic <- prune(glm_big, type = "BIC") width(glm_big) width(glm_aic) width(glm_bic) } } partykit/man/ctree_control.Rd0000644000176200001440000002127413266346454016045 0ustar liggesusers\name{ctree_control} \alias{ctree_control} \title{ Control for Conditional Inference Trees } \description{ Various parameters that control aspects of the `ctree' fit. } \usage{ ctree_control(teststat = c("quadratic", "maximum"), splitstat = c("quadratic", "maximum"), splittest = FALSE, testtype = c("Bonferroni", "MonteCarlo", "Univariate", "Teststatistic"), pargs = GenzBretz(), nmax = c(yx = Inf, z = Inf), alpha = 0.05, mincriterion = 1 - alpha, logmincriterion = log(mincriterion), minsplit = 20L, minbucket = 7L, minprob = 0.01, stump = FALSE, lookahead = FALSE, MIA = FALSE, nresample = 9999L, tol = sqrt(.Machine$double.eps), maxsurrogate = 0L, numsurrogate = FALSE, mtry = Inf, maxdepth = Inf, multiway = FALSE, splittry = 2L, intersplit = FALSE, majority = FALSE, caseweights = TRUE, applyfun = NULL, cores = NULL, saveinfo = TRUE, update = NULL, splitflavour = c("ctree", "exhaustive")) } \arguments{ \item{teststat}{ a character specifying the type of the test statistic to be applied for variable selection. } \item{splitstat}{ a character specifying the type of the test statistic to be applied for splitpoint selection. Prior to version 1.2-0, only \code{maximum} was implemented.} \item{splittest}{ a logical changing linear (the default \code{FALSE}) to maximally selected statistics for variable selection. Currently needs \code{testtype = "MonteCarlo"}.} \item{testtype}{ a character specifying how to compute the distribution of the test statistic. The first three options refer to p-values as criterion, \code{Teststatistic} uses the raw statistic as criterion. \code{Bonferroni} and \code{Univariate} relate to p-values from the asymptotic distribution (adjusted or unadjusted). Bonferroni-adjusted Monte-Carlo p-values are computed when both \code{Bonferroni} and \code{MonteCarlo} are given.} \item{pargs}{ control parameters for the computation of multivariate normal probabilities, see \code{\link[mvtnorm]{GenzBretz}}.} \item{nmax}{ an integer of length two defining the number of bins each variable (in the response \code{yx} and the partitioning variables \code{z}) is divided into prior to tree building. The default \code{Inf} does not apply any binning. Highly experimental, use at your own risk.} \item{alpha}{ a double, the significance level for variable selection.} \item{mincriterion}{ the value of the test statistic or 1 - p-value that must be exceeded in order to implement a split.
} \item{logmincriterion}{ the value of the test statistic or 1 - p-value that must be exceeded in order to implement a split on the log-scale. } \item{minsplit}{ the minimum sum of weights in a node in order to be considered for splitting. } \item{minbucket}{ the minimum sum of weights in a terminal node. } \item{minprob}{ proportion of observations needed to establish a terminal node.} \item{stump}{ a logical determining whether a stump (a tree with a maximum of three nodes only) is to be computed. } \item{lookahead}{ a logical determining whether a split is implemented only after checking if tests in both daughter nodes can be performed.} \item{MIA}{ a logical determining the treatment of \code{NA} as a category in splits, see Twala et al. (2008).} \item{nresample}{ number of permutations for \code{testtype = "MonteCarlo"}.} \item{tol}{tolerance for zero variances.} \item{maxsurrogate}{ number of surrogate splits to evaluate.} \item{numsurrogate}{ a logical for backward-compatibility with party. If \code{TRUE}, only variables measured on at least an ordinal scale are considered for surrogate splits.} \item{mtry}{ number of input variables randomly sampled as candidates at each node for random forest like algorithms. The default \code{mtry = Inf} means that no random selection takes place. If \code{\link{ctree_control}} is used in \code{\link{cforest}} this argument is ignored.} \item{maxdepth}{ maximum depth of the tree. The default \code{maxdepth = Inf} means that no restrictions are applied to tree sizes.} \item{multiway}{ a logical indicating if multiway splits for all factor levels are implemented for unordered factors.} \item{splittry}{ number of variables that are inspected for admissible splits if the best split doesn't meet the sample size constraints.} \item{intersplit}{ a logical indicating if splits in numeric variables are simply \code{x <= a} (the default) or interpolated \code{x <= (a + b) / 2}. The latter feature is experimental, see Galili and Meilijson (2016).} \item{majority}{ if \code{FALSE}, observations which can't be classified to a daughter node because of missing information are randomly assigned (following the node distribution). If \code{TRUE}, they go with the majority (the default in \code{\link[party]{ctree}}).} \item{caseweights}{ a logical interpreting \code{weights} as case weights.} \item{applyfun}{an optional \code{\link[base]{lapply}}-style function with arguments \code{function(X, FUN, \dots)}. It is used for computing the variable selection criterion. The default is to use the basic \code{lapply} function unless the \code{cores} argument is specified (see below). If \code{\link{ctree_control}} is used in \code{\link{cforest}} this argument is ignored.} \item{cores}{numeric. If set to an integer, the \code{applyfun} is set to \code{\link[parallel]{mclapply}} with the desired number of \code{cores}. If \code{\link{ctree_control}} is used in \code{\link{cforest}} this argument is ignored.} \item{saveinfo}{logical. Store information about variable selection procedure in \code{info} slot of each \code{partynode}.} \item{update}{logical. If \code{TRUE}, the data transformation is updated in every node. The default always was, and still is, not to update unless \code{ytrafo} is a function.} \item{splitflavour}{use exhaustive search over splits instead of maximally selected statistics (\code{ctree}).
This feature may change.} } \details{ The arguments \code{teststat}, \code{testtype} and \code{mincriterion} determine how the global null hypothesis of independence between all input variables and the response is tested (see \code{\link{ctree}}). The variable with the most extreme p-value or test statistic is selected for splitting. If this isn't possible due to sample size constraints explained in the next paragraph, up to \code{splittry} other variables are inspected for possible splits. A split is established when all of the following criteria are met: 1) the sum of the weights in the current node is larger than \code{minsplit}, 2) a fraction of the sum of weights of more than \code{minprob} will be contained in all daughter nodes, 3) the sum of the weights in all daughter nodes exceeds \code{minbucket}, and 4) the depth of the tree is smaller than \code{maxdepth}. This avoids pathological splits deep down the tree. When \code{stump = TRUE}, a tree with at most two terminal nodes is computed. The argument \code{mtry > 0} means that a random forest like `variable selection', i.e., a random selection of \code{mtry} input variables, is performed in each node. In each inner node, \code{maxsurrogate} surrogate splits are computed (regardless of any missing values in the learning sample). Factors in test samples whose levels were empty in the learning sample are treated as missing when computing predictions (in contrast to \code{\link[party]{ctree}}). Note also the different behaviour of \code{majority} in the two implementations. } \value{ A list. } \references{ B. E. T. H. Twala, M. C. Jones, and D. J. Hand (2008), Good Methods for Coping with Missing Data in Decision Trees, \emph{Pattern Recognition Letters}, \bold{29}(7), 950--956. Tal Galili, Isaac Meilijson (2016), Splitting Matters: How Monotone Transformation of Predictor Variables May Improve the Predictions of Decision Tree Models, \url{https://arxiv.org/abs/1611.04561}. } \keyword{misc} partykit/man/cforest.Rd0000644000176200001440000003352413266350714014636 0ustar liggesusers\name{cforest} \alias{cforest} \alias{gettree} \alias{gettree.cforest} \alias{predict.cforest} \encoding{latin1} \title{Conditional Random Forests} \description{ An implementation of the random forest and bagging ensemble algorithms utilizing conditional inference trees as base learners. } \usage{ cforest(formula, data, weights, subset, offset, cluster, strata, na.action = na.pass, control = ctree_control(teststat = "quad", testtype = "Univ", mincriterion = 0, saveinfo = FALSE, \dots), ytrafo = NULL, scores = NULL, ntree = 500L, perturb = list(replace = FALSE, fraction = 0.632), mtry = ceiling(sqrt(nvar)), applyfun = NULL, cores = NULL, trace = FALSE, \dots) \method{predict}{cforest}(object, newdata = NULL, type = c("response", "prob", "weights", "node"), OOB = FALSE, FUN = NULL, simplify = TRUE, scale = TRUE, \dots) \method{gettree}{cforest}(object, tree = 1L, \dots) } \arguments{ \item{formula}{ a symbolic description of the model to be fit. } \item{data}{ a data frame containing the variables in the model. } \item{subset}{ an optional vector specifying a subset of observations to be used in the fitting process.} \item{weights}{ an optional vector of weights to be used in the fitting process. Non-negative integer valued weights are allowed as well as non-negative real weights. Observations are sampled (with or without replacement) according to probabilities \code{weights / sum(weights)}.
The fraction of observations to be sampled (without replacement) is computed based on the sum of the weights if all weights are integer-valued, and based on the number of weights greater than zero otherwise. Alternatively, \code{weights} can be a double matrix defining case weights for all \code{ncol(weights)} trees in the forest directly. This requires more storage but gives the user more control.} \item{offset}{ an optional vector of offset values.} \item{cluster}{ an optional factor indicating independent clusters. Highly experimental, use at your own risk.} \item{strata}{ an optional factor for stratified sampling.} \item{na.action}{a function which indicates what should happen when the data contain missing values.} \item{control}{a list with control parameters, see \code{\link{ctree_control}}. The default values correspond to those used by \code{\link[party]{cforest}} from the \code{party} package. \code{saveinfo = FALSE} leads to less memory hungry representations of trees. Note that arguments \code{mtry}, \code{cores} and \code{applyfun} in \code{\link{ctree_control}} are ignored for \code{\link{cforest}}, because they are already set.} \item{ytrafo}{an optional named list of functions to be applied to the response variable(s) before testing their association with the explanatory variables. Note that this transformation is only performed once for the root node and does not take weights into account (which means, the forest bootstrap or subsetting is ignored, which is almost certainly not a good idea). Alternatively, \code{ytrafo} can be a function of \code{data} and \code{weights}. In this case, the transformation is computed for every node with the corresponding weights. This feature is experimental and the user interface likely to change.} \item{scores}{an optional named list of scores to be attached to ordered factors.} \item{ntree}{ Number of trees to grow for the forest.} \item{perturb}{ a list with arguments \code{replace} and \code{fraction} determining which type of resampling, with \code{replace = TRUE} referring to the n-out-of-n bootstrap and \code{replace = FALSE} to sample splitting. \code{fraction} is the fraction of observations to draw without replacement. } \item{mtry}{ number of input variables randomly sampled as candidates at each node for random forest like algorithms. Bagging, as a special case of a random forest without random input variable sampling, can be performed by setting \code{mtry} either equal to \code{Inf} or manually equal to the number of input variables.} \item{applyfun}{an optional \code{\link[base]{lapply}}-style function with arguments \code{function(X, FUN, \dots)}. It is used for computing the variable selection criterion. The default is to use the basic \code{lapply} function unless the \code{cores} argument is specified (see below).} \item{cores}{numeric. If set to an integer, the \code{applyfun} is set to \code{\link[parallel]{mclapply}} with the desired number of \code{cores}.} \item{trace}{a logical indicating if a progress bar shall be printed while the forest grows.} \item{object}{ An object as returned by \code{cforest}.} \item{newdata}{ An optional data frame containing test data.} \item{type}{ a character string denoting the type of predicted value returned, ignored when argument \code{FUN} is given. For \code{"response"}, the mean of a numeric response, the predicted class for a categorical response or the median survival time for a censored response is returned.
For \code{"prob"} the matrix of conditional class probabilities (\code{simplify = TRUE}) or a list with the conditional class probabilities for each observation (\code{simplify = FALSE}) is returned for a categorical response. For numeric and censored responses, a list with the empirical cumulative distribution functions and empirical survivor functions (Kaplan-Meier estimate) is returned when \code{type = "prob"}. \code{"weights"} returns an integer vector of prediction weights. For \code{type = "where"}, a list of terminal node ids for each of the trees in the forest ist returned.} \item{OOB}{ a logical defining out-of-bag predictions (only if \code{newdata = NULL}).} \item{FUN}{ a function to compute summary statistics. Predictions for each node have to be computed based on arguments \code{(y, w)} where \code{y} is the response and \code{w} are case weights.} \item{simplify}{ a logical indicating whether the resulting list of predictions should be converted to a suitable vector or matrix (if possible).} \item{scale}{a logical indicating scaling of the nearest neighbor weights by the sum of weights in the corresponding terminal node of each tree. In the simple regression forest, predicting the conditional mean by nearest neighbor weights will be equivalent to (but slower!) the aggregation of means.} \item{tree}{ an integer, the number of the tree to extract from the forest.} \item{\dots}{ additional arguments. } } \details{ This implementation of the random forest (and bagging) algorithm differs from the reference implementation in \code{\link[randomForest]{randomForest}} with respect to the base learners used and the aggregation scheme applied. Conditional inference trees, see \code{\link{ctree}}, are fitted to each of the \code{ntree} perturbed samples of the learning sample. Most of the hyper parameters in \code{\link{ctree_control}} regulate the construction of the conditional inference trees. Hyper parameters you might want to change are: 1. The number of randomly preselected variables \code{mtry}, which is fixed to the square root of the number of input variables. 2. The number of trees \code{ntree}. Use more trees if you have more variables. 3. The depth of the trees, regulated by \code{mincriterion}. Usually unstopped and unpruned trees are used in random forests. To grow large trees, set \code{mincriterion} to a small value. The aggregation scheme works by averaging observation weights extracted from each of the \code{ntree} trees and NOT by averaging predictions directly as in \code{\link[randomForest]{randomForest}}. See Hothorn et al. (2004) and Meinshausen (2006) for a description. Predictions can be computed using \code{\link{predict}}. For observations with zero weights, predictions are computed from the fitted tree when \code{newdata = NULL}. Ensembles of conditional inference trees have not yet been extensively tested, so this routine is meant for the expert user only and its current state is rather experimental. However, there are some things available in \code{\link{cforest}} that can't be done with \code{\link[randomForest]{randomForest}}, for example fitting forests to censored response variables (see Hothorn et al., 2004, 2006a) or to multivariate and ordered responses. Using the rich \code{partykit} infrastructure allows additional functionality in \code{cforest}, such as parallel tree growing and probabilistic forecasting (for example via quantile regression forests). Also plotting of single trees from a forest is much easier now. 
Unlike \code{\link[party]{cforest}}, \code{cforest} is entirely written in R, which makes customisation much easier at the price of longer computing times. However, trees can be grown in parallel with this R-only implementation, which renders speed less of an issue. Note that the default values are different from those used in package \code{party}, most importantly, the default for \code{mtry} is now data-dependent. \code{predict(, type = "node")} replaces the \code{\link[party]{where}} function and \code{predict(, type = "prob")} the \code{\link[party]{treeresponse}} function. Moreover, when predictors vary in their scale of measurement or number of categories, variable selection and computation of variable importance is biased in favor of variables with many potential cutpoints in \code{\link[randomForest]{randomForest}}, while in \code{\link{cforest}} unbiased trees and an adequate resampling scheme are used by default. See Hothorn et al. (2006b) and Strobl et al. (2007) as well as Strobl et al. (2009). } \value{ An object of class \code{cforest}. } \references{ Breiman L (2001). Random Forests. \emph{Machine Learning}, \bold{45}(1), 5--32. Hothorn T, Lausen B, Benner A, Radespiel-Troeger M (2004). Bagging Survival Trees. \emph{Statistics in Medicine}, \bold{23}(1), 77--91. Hothorn T, Bühlmann P, Dudoit S, Molinaro A, Van der Laan MJ (2006a). Survival Ensembles. \emph{Biostatistics}, \bold{7}(3), 355--373. Hothorn T, Hornik K, Zeileis A (2006b). Unbiased Recursive Partitioning: A Conditional Inference Framework. \emph{Journal of Computational and Graphical Statistics}, \bold{15}(3), 651--674. Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909. Meinshausen N (2006). Quantile Regression Forests. \emph{Journal of Machine Learning Research}, \bold{7}, 983--999. Strobl C, Boulesteix AL, Zeileis A, Hothorn T (2007). Bias in Random Forest Variable Importance Measures: Illustrations, Sources and a Solution. \emph{BMC Bioinformatics}, \bold{8}, 25. \url{http://www.biomedcentral.com/1471-2105/8/25} Strobl C, Malley J, Tutz G (2009). An Introduction to Recursive Partitioning: Rationale, Application, and Characteristics of Classification and Regression Trees, Bagging, and Random Forests. \emph{Psychological Methods}, \bold{14}(4), 323--348.
} \examples{ ## basic example: conditional inference forest for cars data cf <- cforest(dist ~ speed, data = cars) ## prediction of fitted mean and visualization nd <- data.frame(speed = 4:25) nd$mean <- predict(cf, newdata = nd, type = "response") plot(dist ~ speed, data = cars) lines(mean ~ speed, data = nd) ## predict quantiles (aka quantile regression forest) myquantile <- function(y, w) quantile(rep(y, w), probs = c(0.1, 0.5, 0.9)) p <- predict(cf, newdata = nd, type = "response", FUN = myquantile) colnames(p) <- c("lower", "median", "upper") nd <- cbind(nd, p) ## visualization with conditional (on speed) prediction intervals plot(dist ~ speed, data = cars, type = "n") with(nd, polygon(c(speed, rev(speed)), c(lower, rev(upper)), col = "lightgray", border = "transparent")) points(dist ~ speed, data = cars) lines(mean ~ speed, data = nd, lwd = 1.5) lines(median ~ speed, data = nd, lty = 2, lwd = 1.5) legend("topleft", c("mean", "median", "10\% - 90\% quantile"), lwd = c(1.5, 1.5, 10), lty = c(1, 2, 1), col = c("black", "black", "lightgray"), bty = "n") ### we may also use predicted conditional (on speed) densities mydensity <- function(y, w) approxfun(density(y, weights = w/sum(w))[1:2], rule = 2) pd <- predict(cf, newdata = nd, type = "response", FUN = mydensity) ## visualization in heatmap (instead of scatterplot) ## with fitted curves as above dist <- -10:150 dens <- t(sapply(seq_along(pd), function(i) pd[[i]](dist))) image(nd$speed, dist, dens, xlab = "speed", col = rev(gray.colors(9))) lines(mean ~ speed, data = nd, lwd = 1.5) lines(median ~ speed, data = nd, lty = 2, lwd = 1.5) lines(lower ~ speed, data = nd, lty = 2) lines(upper ~ speed, data = nd, lty = 2) \dontrun{ ### honest (i.e., out-of-bag) cross-classification of ### true vs. predicted classes data("mammoexp", package = "TH.data") table(mammoexp$ME, predict(cforest(ME ~ ., data = mammoexp, ntree = 50), OOB = TRUE, type = "response")) ### fit forest to censored response if (require("TH.data") && require("survival")) { data("GBSG2", package = "TH.data") bst <- cforest(Surv(time, cens) ~ ., data = GBSG2, ntree = 50) ### estimate conditional Kaplan-Meier curves print(predict(bst, newdata = GBSG2[1:2,], OOB = TRUE, type = "prob")) print(gettree(bst)) } } } \keyword{tree} partykit/man/nodeids.Rd0000644000176200001440000000442213107107552014613 0ustar liggesusers\name{nodeids} \alias{nodeids} \alias{nodeids.party} \alias{nodeids.partynode} \alias{get_paths} \title{ Extract Node Identifiers } \description{ Extract unique identifiers from inner and terminal nodes of a \code{partynode} object. } \usage{ nodeids(obj, \dots) \method{nodeids}{partynode}(obj, from = NULL, terminal = FALSE, \dots) \method{nodeids}{party}(obj, from = NULL, terminal = FALSE, \dots) get_paths(obj, i) } \arguments{ \item{obj}{ an object of class \code{\link{partynode}} or \code{\link{party}}.} \item{from}{ an integer specifying the node to start from.} \item{terminal}{ logical specifying if only node identifiers of terminal nodes are returned. } \item{i}{a vector of node identifiers.} \item{\dots}{ additional arguments.} } \details{ The identifiers of each node are extracted by \code{nodeids}. \code{get_paths} returns the paths for extracting the corresponding nodes using list subsets. } \value{ A vector of node identifiers.
} \examples{ ## a tree as flat list structure nodelist <- list( # root node list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), kids = 2:3), # V4 <= 1.9, terminal node list(id = 2L), # V4 > 1.9 list(id = 3L, split = partysplit(varid = 1L, breaks = 1.7), kids = c(4L, 7L)), # V1 <= 1.7 list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), kids = 5:6), # V4 <= 4.8, terminal node list(id = 5L), # V4 > 4.8, terminal node list(id = 6L), # V1 > 1.7, terminal node list(id = 7L) ) ## convert to a recursive structure node <- as.partynode(nodelist) ## set up party object data("iris") tree <- party(node, data = iris, fitted = data.frame("(fitted)" = fitted_node(node, data = iris), check.names = FALSE)) tree ### ids of all nodes nodeids(tree) ### ids of all terminal nodes nodeids(tree, terminal = TRUE) ### ids of terminal nodes in subtree with root [3] nodeids(tree, from = 3, terminal = TRUE) ### get paths and extract all terminal nodes tr <- unclass(node_party(tree)) lapply(get_paths(tree, nodeids(tree, terminal = TRUE)), function(path) tr[path]) } \keyword{tree} partykit/man/extree_fit.Rd0000644000176200001440000000313513214456161015326 0ustar liggesusers\name{extree_fit} \alias{extree_fit} \title{ Fit Extensible Trees } \description{ Basic infrastructure for fitting extensible trees. } \usage{ extree_fit(data, trafo, converged, selectfun = ctrl$selectfun, splitfun = ctrl$splitfun, svselectfun = ctrl$svselectfun, svsplitfun = ctrl$svsplitfun, partyvars, subset, weights, ctrl, doFit = TRUE) } \arguments{ \item{data}{an object of class \code{extree_data}, see \code{\link{extree_data}}. } \item{trafo}{a function with arguments \code{subset}, \code{weights}, \code{info}, \code{estfun} and \code{object}. } \item{converged}{a function with arguments \code{subset}, \code{weights}. } \item{selectfun}{an optional function for selecting variables. } \item{splitfun}{an optional function for selecting splits. } \item{svselectfun}{an optional function for selecting surrogate variables. } \item{svsplitfun}{an optional function for selecting surrogate splits. } \item{partyvars}{a numeric vector assigning a weight to each partitioning variable (\code{z} in \code{\link{extree_data}}). } \item{subset}{a sorted integer vector describing a subset. } \item{weights}{an optional vector of weights. } \item{ctrl}{control arguments. } \item{doFit}{a logical indicating if the tree shall be grown (\code{TRUE}) or not (\code{FALSE}). } } \details{ This internal functionality will be the basis of implementations of other tree algorithms in future versions. Currently, only \code{ctree} relies on this function. } \value{An object of class \code{partynode}.} \keyword{tree} partykit/man/partysplit.Rd0000644000176200001440000001460213207262553015406 0ustar liggesusers\name{partysplit} \alias{partysplit} \alias{kidids_split} \alias{character_split} \alias{varid_split} \alias{breaks_split} \alias{index_split} \alias{right_split} \alias{prob_split} \alias{info_split} \title{ Binary and Multiway Splits } \description{ A class for representing multiway splits and functions for computing on splits.
} \usage{ partysplit(varid, breaks = NULL, index = NULL, right = TRUE, prob = NULL, info = NULL) kidids_split(split, data, vmatch = 1:length(data), obs = NULL) character_split(split, data = NULL, digits = getOption("digits") - 2) varid_split(split) breaks_split(split) index_split(split) right_split(split) prob_split(split) info_split(split) } \arguments{ \item{varid}{ an integer specifying the variable to split in, i.e., a column number in \code{data}. } \item{breaks}{ a numeric vector of split points. } \item{index}{ an integer vector containing a contiguous sequence from one to the number of kid nodes. May contain \code{NA}s.} \item{right}{ a logical, indicating if the intervals defined by \code{breaks} should be closed on the right (and open on the left) or vice versa.} \item{prob}{ a numeric vector representing a probability distribution over kid nodes. } \item{info}{ additional information. } \item{split}{ an object of class \code{partysplit}.} \item{data}{ a \code{\link{list}} or \code{\link{data.frame}}.} \item{vmatch}{ a permutation of the variable numbers in \code{data}.} \item{obs}{ a logical or integer vector indicating a subset of the observations in \code{data}.} \item{digits}{ minimal number of significant digits.} } \details{ A split is basically a function that maps data, more specifically a partitioning variable, to a set of integers indicating the kid nodes to send observations to. Objects of class \code{partysplit} describe such a function and can be set up via the \code{partysplit()} constructor. The variables are available in a \code{list} or \code{data.frame} (here called \code{data}) and \code{varid} specifies the partitioning variable, i.e., the variable or list element to split in. The constructor \code{partysplit()} doesn't have access to the actual data, i.e., doesn't \emph{estimate} splits. \code{kidids_split(split, data)} actually partitions the data \code{data[obs,varid_split(split)]} and assigns an integer (giving the kid node number) to each observation. If \code{vmatch} is given, the variable \code{vmatch[varid_split(split)]} is used. \code{character_split()} returns a character representation of its \code{split} argument. The remaining functions defined here are accessor functions for \code{partysplit} objects. The numeric vector \code{breaks} defines how the range of the partitioning variable (after coercing to a numeric via \code{\link{as.numeric}}) is divided into intervals (like in \code{\link{cut}}) and may be \code{NULL}. These intervals are represented by the numbers one to \code{length(breaks) + 1}. \code{index} assigns these \code{length(breaks) + 1} intervals to one of at least two kid nodes. Thus, \code{index} is a vector of integers where each element corresponds to one element in a list \code{kids} containing \code{\link{partynode}} objects, see \code{\link{partynode}} for details. The vector \code{index} may contain \code{NA}s; in that case, the corresponding values of the splitting variable are treated as missings (for example factor levels that are not present in the learning sample). Either \code{breaks} or \code{index} must be given. When \code{breaks} is \code{NULL}, it is assumed that the partitioning variable itself has storage mode \code{integer} (e.g., is a \code{\link{factor}}). \code{prob} defines a probability distribution over all kid nodes which is used for random splitting when a deterministic split isn't possible (due to missing values, for example). \code{info} takes arbitrary user-specified information.
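As a small sketch of the \code{index} mechanics described above, the following split maps the three levels of \code{Species} to two kid nodes and treats the second level as missing (as would happen for a factor level that was empty in the learning sample):

\preformatted{
data("iris", package = "datasets")
sp <- partysplit(which(names(iris) == "Species"),
  index = c(1L, NA, 2L))
## kid ids: setosa -> 1, versicolor -> NA, virginica -> 2
table(kidids_split(sp, data = iris), useNA = "ifany")
}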
} \value{ The constructor \code{partysplit()} returns an object of class \code{partysplit}: \item{varid}{ an integer specifying the variable to split in, i.e., a column number in \code{data}, } \item{breaks}{ a numeric vector of split points, } \item{index}{ an integer vector containing a contiguous sequence from one to the number of kid nodes,} \item{right}{ a logical, indicating if the intervals defined by \code{breaks} should be closed on the right (and open on the left) or vice versa,} \item{prob}{ a numeric vector representing a probability distribution over kid nodes, } \item{info}{ additional information. } \code{kidids_split()} returns an integer vector describing the partition of the observations into kid nodes. \code{character_split()} gives a character representation of the split and the remaining functions return the corresponding slots of \code{partysplit} objects. } \seealso{\code{\link{cut}}} \references{ Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909. } \examples{ data("iris", package = "datasets") ## binary split in numeric variable `Sepal.Length' sl5 <- partysplit(which(names(iris) == "Sepal.Length"), breaks = 5) character_split(sl5, data = iris) table(kidids_split(sl5, data = iris), iris$Sepal.Length <= 5) ## multiway split in numeric variable `Sepal.Width', ## higher values go to the first kid, smallest values ## to the last kid sw23 <- partysplit(which(names(iris) == "Sepal.Width"), breaks = c(3, 3.5), index = 3:1) character_split(sw23, data = iris) table(kidids_split(sw23, data = iris), cut(iris$Sepal.Width, breaks = c(-Inf, 3, 3.5, Inf))) ## binary split in factor `Species' sp <- partysplit(which(names(iris) == "Species"), index = c(1L, 1L, 2L)) character_split(sp, data = iris) table(kidids_split(sp, data = iris), iris$Species) ## multiway split in factor `Species' sp <- partysplit(which(names(iris) == "Species"), index = 1:3) character_split(sp, data = iris) table(kidids_split(sp, data = iris), iris$Species) ## multiway split in numeric variable `Sepal.Width' sp <- partysplit(which(names(iris) == "Sepal.Width"), breaks = quantile(iris$Sepal.Width)) character_split(sp, data = iris) } \keyword{tree} partykit/man/model.frame.rpart.Rd0000644000176200001440000000104013214760044016503 0ustar liggesusers\name{model_frame_rpart} \alias{model_frame_rpart} \title{ Model Frame Method for rpart } \description{ A model.frame method for rpart objects. } \usage{ model_frame_rpart(formula, \dots) } \arguments{ \item{formula}{ an object of class \code{\link[rpart]{rpart}}.} \item{\dots}{ additional arguments.} } \details{ A \code{\link{model.frame}} method for \code{\link[rpart]{rpart}} objects. Because it is no longer possible to overwrite existing methods, the function name is a little different here. } \value{ A model frame. } \keyword{tree} partykit/man/party.Rd0000644000176200001440000001350613023523436014330 0ustar liggesusers\name{party} \alias{party} \alias{names.party} \alias{names<-.party} \alias{node_party} \alias{is.constparty} \alias{is.simpleparty} \alias{data_party} \alias{data_party.default} \title{ Recursive Partytioning } \description{ A class for representing decision trees and corresponding accessor functions.
} \usage{ party(node, data, fitted = NULL, terms = NULL, names = NULL, info = NULL) \method{names}{party}(x) \method{names}{party}(x) <- value data_party(party, id = 1L) \method{data_party}{default}(party, id = 1L) node_party(party) is.constparty(party) is.simpleparty(party) } \arguments{ \item{node}{ an object of class \code{\link{partynode}}.} \item{data}{ a (potentially empty) \code{\link{data.frame}}.} \item{fitted}{ an optional \code{\link{data.frame}} with \code{nrow(data)} rows (only if \code{nrow(data) != 0}) and containing at least the fitted terminal node identifiers as element \code{(fitted)}. In addition, weights may be contained as element \code{(weights)} and responses as \code{(response)}.} \item{terms}{ an optional \code{\link{terms}} object. } \item{names}{ an optional vector of names to be assigned to each node of \code{node}. } \item{info}{ additional information. } \item{x}{ an object of class \code{party}.} \item{party}{ an object of class \code{party}.} \item{value}{a character vector of up to the same length as \code{x}, or \code{NULL}.} \item{id}{ a node identifier.} } \details{ Objects of class \code{party} basically consist of a \code{\link{partynode}} object representing the tree structure in a recursive way and data. The \code{data} argument takes a \code{data.frame} which, however, might have zero columns. Optionally, a \code{data.frame} with at least one variable \code{(fitted)} containing the terminal node numbers of data used for fitting the tree may be specified along with a \code{\link{terms}} object or any additional (currently unstructured) information as \code{info}. Argument \code{names} defines names for all nodes in \code{node}. Method \code{names} can be used to extract or alter names for nodes. Function \code{node_party} returns the \code{node} element of a \code{party} object. Further methods for \code{party} objects are documented in \code{\link{party-methods}} and \code{\link{party-predict}}. Trees of various flavors can be coerced to \code{party}, see \code{\link{party-coercion}}. Two classes inherit from class \code{party} and impose additional assumptions on the structure of this object: Class \code{constparty} requires that the \code{fitted} slot contains a partitioning of the learning sample as a factor \code{("fitted")} and the response values of all observations in the learning sample as \code{("response")}. This structure is most flexible and allows for graphical display of the response values in terminal nodes as well as for computing predictions based on arbitrary summary statistics. Class \code{simpleparty} assumes that certain pre-computed information about the distribution of the response variable is contained in the \code{info} slots of the nodes. At the moment, no formal class is used to describe this information. } \value{ The constructor returns an object of class \code{party}: \item{node}{ an object of class \code{\link{partynode}}.} \item{data}{ a (potentially empty) \code{\link{data.frame}}.} \item{fitted}{ an optional \code{\link{data.frame}} with \code{nrow(data)} rows (only if \code{nrow(data) != 0}) and containing at least the fitted terminal node identifiers as element \code{(fitted)}. In addition, weights may be contained as element \code{(weights)} and responses as \code{(response)}.} \item{terms}{ an optional \code{\link{terms}} object. } \item{names}{ an optional vector of names to be assigned to each node of \code{node}. } \item{info}{ additional information.
} \code{names} can be used to set and retrieve names of nodes and \code{node_party} returns an object of class \code{\link{partynode}}. \code{data_party} returns a data frame with observations contained in node \code{id}. } \references{ Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909. } \examples{ ### data ### ## artificial WeatherPlay data data("WeatherPlay", package = "partykit") str(WeatherPlay) ### splits ### ## split in overcast, humidity, and windy sp_o <- partysplit(1L, index = 1:3) sp_h <- partysplit(3L, breaks = 75) sp_w <- partysplit(4L, index = 1:2) ## query labels character_split(sp_o) ### nodes ### ## set up partynode structure pn <- partynode(1L, split = sp_o, kids = list( partynode(2L, split = sp_h, kids = list( partynode(3L, info = "yes"), partynode(4L, info = "no"))), partynode(5L, info = "yes"), partynode(6L, split = sp_w, kids = list( partynode(7L, info = "yes"), partynode(8L, info = "no"))))) pn ### tree ### ## party: associate recursive partynode structure with data py <- party(pn, WeatherPlay) py plot(py) ### variations ### ## tree stump n1 <- partynode(id = 1L, split = sp_o, kids = lapply(2L:4L, partynode)) print(n1, data = WeatherPlay) ## query fitted nodes and kid ids fitted_node(n1, data = WeatherPlay) kidids_node(n1, data = WeatherPlay) ## tree with full data sets t1 <- party(n1, data = WeatherPlay) ## tree with empty data set party(n1, data = WeatherPlay[0, ]) ## constant-fit tree t2 <- party(n1, data = WeatherPlay, fitted = data.frame( "(fitted)" = fitted_node(n1, data = WeatherPlay), "(response)" = WeatherPlay$play, check.names = FALSE), terms = terms(play ~ ., data = WeatherPlay)) t2 <- as.constparty(t2) t2 plot(t2) } \keyword{tree} partykit/man/ctree.Rd0000644000176200001440000001727613317402275014306 0ustar liggesusers\name{ctree} \alias{ctree} \alias{sctest.constparty} \title{Conditional Inference Trees} \description{ Recursive partitioning for continuous, censored, ordered, nominal and multivariate response variables in a conditional inference framework. } \usage{ ctree(formula, data, subset, weights, na.action = na.pass, offset, cluster, control = ctree_control(\dots), ytrafo = NULL, converged = NULL, scores = NULL, doFit = TRUE, \dots) } \arguments{ \item{formula}{ a symbolic description of the model to be fit. } \item{data}{ a data frame containing the variables in the model. } \item{subset}{ an optional vector specifying a subset of observations to be used in the fitting process.} \item{weights}{ an optional vector of weights to be used in the fitting process. Only non-negative integer valued weights are allowed.} \item{offset}{ an optional vector of offset values.} \item{cluster}{ an optional factor indicating independent clusters. Highly experimental, use at your own risk.} \item{na.action}{a function which indicates what should happen when the data contain missing values.} \item{control}{a list with control parameters, see \code{\link{ctree_control}}.} \item{ytrafo}{an optional named list of functions to be applied to the response variable(s) before testing their association with the explanatory variables. Note that this transformation is only performed once for the root node and does not take weights into account. Alternatively, \code{ytrafo} can be a function of \code{data} and \code{weights}. In this case, the transformation is computed for every node with corresponding weights.
This feature is experimental and the user interface likely to change.} \item{converged}{an optional function for checking user-defined criteria before splits are implemented. This is not to be used and very likely to change.} \item{scores}{an optional named list of scores to be attached to ordered factors.} \item{doFit}{a logical, if \code{FALSE}, the tree is not fitted.} \item{\dots}{arguments passed to \code{\link{ctree_control}}.} } \details{ Function \code{partykit::ctree} is a reimplementation of (most of) \code{party::ctree} employing the new \code{\link{party}} infrastructure of the \pkg{partykit} package. The vignette \code{vignette("ctree", package = "partykit")} explains internals of the different implementations. Conditional inference trees estimate a regression relationship by binary recursive partitioning in a conditional inference framework. Roughly, the algorithm works as follows: 1) Test the global null hypothesis of independence between any of the input variables and the response (which may be multivariate as well). Stop if this hypothesis cannot be rejected. Otherwise select the input variable with the strongest association to the response. This association is measured by a p-value corresponding to a test for the partial null hypothesis of a single input variable and the response. 2) Implement a binary split in the selected input variable. 3) Recursively repeat steps 1) and 2). The implementation utilizes a unified framework for conditional inference, or permutation tests, developed by Strasser and Weber (1999). The stop criterion in step 1) is either based on multiplicity adjusted p-values (\code{testtype = "Bonferroni"} in \code{\link{ctree_control}}) or on the univariate p-values (\code{testtype = "Univariate"}). In both cases, the criterion is maximized, i.e., 1 - p-value is used. A split is implemented when the criterion exceeds the value given by \code{mincriterion} as specified in \code{\link{ctree_control}}. For example, when \code{mincriterion = 0.95}, the p-value must be smaller than 0.05 in order to split this node. This statistical approach ensures that the right-sized tree is grown without additional (post-)pruning or cross-validation. The level of \code{mincriterion} can either be specified to be appropriate for the size of the data set (and \code{0.95} is typically appropriate for small to moderately-sized data sets) or could potentially be treated like a hyperparameter (see Section 3.4 in Hothorn, Hornik and Zeileis, 2006). The selection of the input variable to split in is based on the univariate p-values avoiding a variable selection bias towards input variables with many possible cutpoints. The test statistics in each of the nodes can be extracted with the \code{sctest} method. (Note that the generic is in the \pkg{strucchange} package so this either needs to be loaded or \code{sctest.constparty} has to be called directly.) In cases where splitting stops due to the sample size (e.g., \code{minsplit} or \code{minbucket} etc.), the test results may be empty. Predictions can be computed using \code{\link{predict}}, which returns predicted means, predicted classes or median predicted survival times and more information about the conditional distribution of the response, i.e., class probabilities or predicted Kaplan-Meier curves. For observations with zero weights, predictions are computed from the fitted tree when \code{newdata = NULL}.
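For illustration, both the node-wise test results and different types of predictions can be inspected as follows (a minimal sketch using the \code{airct} tree fitted in the examples below; the \pkg{strucchange} package provides the \code{sctest} generic):

\preformatted{
library("strucchange")
sctest(airct, node = 1)  ## test statistics and p-values in the root node

predict(airct, newdata = airq[1:5, ], type = "response")  ## means
predict(airct, newdata = airq[1:5, ], type = "node")      ## node ids
}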
By default, the scores for each ordinal factor \code{x} are \code{1:length(x)}; this may be changed for variables in the formula using \code{scores = list(x = c(1, 5, 6))}, for example. For a general description of the methodology see Hothorn, Hornik and Zeileis (2006) and Hothorn, Hornik, van de Wiel and Zeileis (2006). } \value{ An object of class \code{\link{party}}. } \references{ Hothorn T, Hornik K, Van de Wiel MA, Zeileis A (2006). A Lego System for Conditional Inference. \emph{The American Statistician}, \bold{60}(3), 257--263. Hothorn T, Hornik K, Zeileis A (2006). Unbiased Recursive Partitioning: A Conditional Inference Framework. \emph{Journal of Computational and Graphical Statistics}, \bold{15}(3), 651--674. Hothorn T, Zeileis A (2015). partykit: A Modular Toolkit for Recursive Partytioning in R. \emph{Journal of Machine Learning Research}, \bold{16}, 3905--3909. Strasser H, Weber C (1999). On the Asymptotic Theory of Permutation Statistics. \emph{Mathematical Methods of Statistics}, \bold{8}, 220--250. } \examples{ ### regression airq <- subset(airquality, !is.na(Ozone)) airct <- ctree(Ozone ~ ., data = airq) airct plot(airct) mean((airq$Ozone - predict(airct))^2) ### classification irisct <- ctree(Species ~ ., data = iris) irisct plot(irisct) table(predict(irisct), iris$Species) ### estimated class probabilities, a list tr <- predict(irisct, newdata = iris[1:10,], type = "prob") ### survival analysis if (require("TH.data") && require("survival") && require("coin") && require("Formula")) { data("GBSG2", package = "TH.data") (GBSG2ct <- ctree(Surv(time, cens) ~ ., data = GBSG2)) predict(GBSG2ct, newdata = GBSG2[1:2,], type = "response") plot(GBSG2ct) ### with weight-dependent log-rank scores ### log-rank trafo for observations in this node only (= weights > 0) h <- function(y, x, start = NULL, weights, offset, estfun = TRUE, object = FALSE, ...) { if (is.null(weights)) weights <- rep(1, NROW(y)) s <- logrank_trafo(y[weights > 0,,drop = FALSE]) r <- rep(0, length(weights)) r[weights > 0] <- s list(estfun = matrix(as.double(r), ncol = 1), converged = TRUE) } ### very much the same tree (ctree(Surv(time, cens) ~ ., data = GBSG2, ytrafo = h)) } ### multivariate responses airct2 <- ctree(Ozone + Temp ~ ., data = airq) airct2 plot(airct2) } \keyword{tree} partykit/man/varimp.Rd0000644000176200001440000001223513612550145014466 0ustar liggesusers\name{varimp} \alias{varimp} \alias{varimp.constparty} \alias{varimp.cforest} \title{ Variable Importance } \description{ Standard and conditional variable importance for `cforest', following the permutation principle of the `mean decrease in accuracy' importance in `randomForest'. } \usage{ \method{varimp}{constparty}(object, nperm = 1L, risk = c("loglik", "misclassification"), conditions = NULL, mincriterion = 0, ...) \method{varimp}{cforest}(object, nperm = 1L, OOB = TRUE, risk = c("loglik", "misclassification"), conditional = FALSE, threshold = .2, applyfun = NULL, cores = NULL, ...) } \arguments{ \item{object}{ an object as returned by \code{cforest}.} \item{mincriterion}{ the value of the test statistic or 1 - p-value that must be exceeded in order to include a split in the computation of the importance. The default \code{mincriterion = 0} guarantees that all splits are included.} \item{conditional}{ a logical determining whether unconditional or conditional computation of the importance is performed.
} \item{threshold}{ the value of the test statistic or 1 - p-value of the association between the variable of interest and a covariate that must be exceeded in order to include the covariate in the conditioning scheme for the variable of interest (only relevant if \code{conditional = TRUE}). } \item{nperm}{ the number of permutations performed.} \item{OOB}{ a logical determining whether the importance is computed from the out-of-bag sample or the learning sample (not recommended).} \item{risk}{ a character determining the risk to be evaluated.} \item{conditions}{ a list of conditions. } \item{applyfun}{an optional \code{\link[base]{lapply}}-style function with arguments \code{function(X, FUN, \dots)}. It is used for computing the variable importances for each tree. The default is to use the basic \code{lapply} function unless the \code{cores} argument is specified (see below). Extra care is needed to ensure correct seeds are used in the parallel runs (\code{RNGkind("L'Ecuyer-CMRG")} for example).} \item{cores}{numeric. If set to an integer, the \code{applyfun} is set to \code{\link[parallel]{mclapply}} with the desired number of \code{cores}.} \item{\dots}{additional arguments, not used.} } \details{ Function \code{varimp} can be used to compute variable importance measures similar to those computed by \code{\link[randomForest]{importance}}. Besides the standard version, a conditional version is available that adjusts for correlations between predictor variables. If \code{conditional = TRUE}, the importance of each variable is computed by permuting within a grid defined by the covariates that are associated (with 1 - p-value greater than \code{threshold}) with the variable of interest. The resulting variable importance score is conditional in the sense of beta coefficients in regression models, but represents the effect of a variable in both main effects and interactions. See Strobl et al. (2008) for details. Note, however, that all random forest results are subject to random variation. Thus, before interpreting the importance ranking, check whether the same ranking is achieved with a different random seed -- or otherwise increase the number of trees \code{ntree} in \code{\link{ctree_control}}. Note that in the presence of missings in the predictor variables the procedure described in Hapfelmeier et al. (2012) is performed. } \value{ A vector of `mean decrease in accuracy' importance scores. } \references{ Leo Breiman (2001). Random Forests. \emph{Machine Learning}, \bold{45}(1), 5--32. Alexander Hapfelmeier, Torsten Hothorn, Kurt Ulm, and Carolin Strobl (2012). A New Variable Importance Measure for Random Forests with Missing Data. \emph{Statistics and Computing}, \url{https://dx.doi.org/10.1007/s11222-012-9349-1} Torsten Hothorn, Kurt Hornik, and Achim Zeileis (2006b). Unbiased Recursive Partitioning: A Conditional Inference Framework. \emph{Journal of Computational and Graphical Statistics}, \bold{15}(3), 651--674. Preprint available from \url{http://statmath.wu-wien.ac.at/~zeileis/papers/Hothorn+Hornik+Zeileis-2006.pdf} Carolin Strobl, Anne-Laure Boulesteix, Thomas Kneib, Thomas Augustin, and Achim Zeileis (2008). Conditional Variable Importance for Random Forests. \emph{BMC Bioinformatics}, \bold{9}, 307.
\url{http://www.biomedcentral.com/1471-2105/9/307}
}
\examples{

set.seed(290875)
data("readingSkills", package = "party")
readingSkills.cf <- cforest(score ~ ., data = readingSkills,
    mtry = 2, ntree = 50)

# standard importance
varimp(readingSkills.cf)

# conditional importance, may take a while...
varimp(readingSkills.cf, conditional = TRUE)

}
\keyword{tree}
partykit/man/party-methods.Rd0000644000176200001440000000745313023523436015775 0ustar liggesusers\name{party-methods}
\alias{party-methods}
\alias{length.party}
\alias{print.party}
\alias{print.simpleparty}
\alias{print.constparty}
\alias{[.party}
\alias{[[.party}
\alias{depth.party}
\alias{width.party}
\alias{getCall.party}
\alias{nodeprune}
\alias{nodeprune.party}
\title{ Methods for Party Objects }
\description{

  Methods for computing on \code{party} objects.

}
\usage{
\method{print}{party}(x,
    terminal_panel = function(node)
      formatinfo_node(node, default = "*", prefix = ": "),
    tp_args = list(),
    inner_panel = function(node) "", ip_args = list(),
    header_panel = function(party) "",
    footer_panel = function(party) "",
    digits = getOption("digits") - 2, \dots)
\method{print}{simpleparty}(x, digits = getOption("digits") - 4,
    header = NULL, footer = TRUE, \dots)
\method{print}{constparty}(x, FUN = NULL, digits = getOption("digits") - 4,
    header = NULL, footer = TRUE, \dots)
\method{length}{party}(x)
\method{[}{party}(x, i, \dots)
\method{[[}{party}(x, i, \dots)
\method{depth}{party}(x, root = FALSE, \dots)
\method{width}{party}(x, \dots)
\method{nodeprune}{party}(x, ids, \dots)
}
\arguments{
  \item{x}{ an object of class \code{\link{party}}.}
  \item{i}{ an integer specifying the root of the subtree to extract.}
  \item{terminal_panel}{ a panel function for printing terminal nodes.}
  \item{tp_args}{ a list containing arguments to \code{terminal_panel}.}
  \item{inner_panel}{ a panel function for printing inner nodes.}
  \item{ip_args}{ a list containing arguments to \code{inner_panel}.}
  \item{header_panel}{ a panel function for printing the header.}
  \item{footer_panel}{ a panel function for printing the footer.}
  \item{digits}{ number of digits to be printed.}
  \item{header}{ header to be printed.}
  \item{footer}{ footer to be printed.}
  \item{FUN}{ a function to be applied to nodes.}
  \item{root}{ a logical. Should the root node be counted in \code{depth}? }
  \item{ids}{ a vector of node ids (or their names) to be pruned off.}
  \item{\dots}{ additional arguments.}
}
\details{

  \code{length} gives the number of nodes in the tree (in contrast to
  the \code{length} method for \code{\link{partynode}} objects, which returns
  the number of kid nodes in the root), \code{depth} the depth of the tree,
  and \code{width} the number of terminal nodes. The subset methods
  extract subtrees and the \code{print} method generates a textual
  representation of the tree.

  \code{nodeprune} prunes off nodes and makes sure that the node ids of the
  resulting tree are in pre-order starting with root node id 1. For
  \code{constparty} objects, the \code{fitted} slot is also changed.
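
  As a minimal sketch of how these methods compose (assuming the
  hypothetical \code{party} object \code{tree} constructed in the
  Examples section below, with node names \code{"Node 1"}, \code{"Node 2"}, ...):
  \preformatted{
    depth(tree)                        ## depth of the tree
    width(tree)                        ## number of terminal nodes
    pruned <- nodeprune(tree, ids = 4) ## turn node 4 into a terminal node
    width(pruned)                      ## fewer terminal nodes than before
  }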
}
\examples{
## a tree as flat list structure
nodelist <- list(
    # root node
    list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9),
      kids = 2:3),
    # V4 <= 1.9, terminal node
    list(id = 2L),
    # V4 > 1.9
    list(id = 3L, split = partysplit(varid = 5L, breaks = 1.7),
      kids = c(4L, 7L)),
    # V5 <= 1.7
    list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8),
      kids = 5:6),
    # V4 <= 4.8, terminal node
    list(id = 5L),
    # V4 > 4.8, terminal node
    list(id = 6L),
    # V5 > 1.7, terminal node
    list(id = 7L)
)

## convert to a recursive structure
node <- as.partynode(nodelist)

## set up party object
data("iris")
tree <- party(node, data = iris,
  fitted = data.frame("(fitted)" =
    fitted_node(node, data = iris),
    check.names = FALSE))
names(tree) <- paste("Node", nodeids(tree), sep = " ")

## number of kids in root node
length(tree)

## depth of tree
depth(tree)

## number of terminal nodes
width(tree)

## node number four
tree["Node 4"]
tree[["Node 4"]]

}
\keyword{tree}
partykit/man/lmtree.Rd0000644000176200001440000001156213077330432014462 0ustar liggesusers\name{lmtree}
\alias{lmtree}
\alias{plot.lmtree}
\alias{predict.lmtree}
\alias{print.lmtree}

\title{Linear Model Trees}

\description{
  Model-based recursive partitioning based on least squares regression.
}

\usage{
lmtree(formula, data, subset, na.action, weights, offset, cluster, \dots)
}

\arguments{
  \item{formula}{symbolic description of the model (of type
    \code{y ~ z1 + \dots + zl} or \code{y ~ x1 + \dots + xk | z1 + \dots + zl};
    for details see below).}
  \item{data, subset, na.action}{arguments controlling formula processing
    via \code{\link[stats]{model.frame}}.}
  \item{weights}{optional numeric vector of weights. By default these are
    treated as case weights, but the default can be changed in
    \code{\link{mob_control}}.}
  \item{offset}{optional numeric vector with an a priori known component to be
    included in the model \code{y ~ x1 + \dots + xk} (i.e., only when \code{x}
    variables are specified).}
  \item{cluster}{optional vector (typically numeric or factor) with a
    cluster ID to be employed for clustered covariances in the parameter
    stability tests.}
  \item{\dots}{optional control parameters passed to \code{\link{mob_control}}.}
}

\details{
  Convenience interface for fitting MOBs (model-based recursive partitions) via
  the \code{\link{mob}} function. \code{lmtree} internally sets up a model
  \code{fit} function for \code{mob}, using either \code{\link[stats]{lm.fit}}
  or \code{\link[stats]{lm.wfit}} (depending on whether weights are used or not).
  Then \code{mob} is called using the residual sum of squares as the objective
  function.

  Compared to calling \code{mob} by hand, the implementation tries to avoid
  unnecessary computations while growing the tree. Also, it provides a more
  elaborate plotting function.
}

\value{
  An object of class \code{lmtree} inheriting from \code{\link{modelparty}}.
  The \code{info} element of the overall \code{party} and the individual
  \code{node}s contain various information about the models.
}

\references{
Zeileis A, Hothorn T, Hornik K (2008).
  Model-Based Recursive Partitioning.
  \emph{Journal of Computational and Graphical Statistics}, \bold{17}(2), 492--514.
} \seealso{\code{\link{mob}}, \code{\link{mob_control}}, \code{\link{glmtree}}} \examples{ if(require("mlbench")) { ## Boston housing data data("BostonHousing", package = "mlbench") BostonHousing <- transform(BostonHousing, chas = factor(chas, levels = 0:1, labels = c("no", "yes")), rad = factor(rad, ordered = TRUE)) ## linear model tree bh_tree <- lmtree(medv ~ log(lstat) + I(rm^2) | zn + indus + chas + nox + age + dis + rad + tax + crim + b + ptratio, data = BostonHousing, minsize = 40) ## printing whole tree or individual nodes print(bh_tree) print(bh_tree, node = 7) ## plotting plot(bh_tree) plot(bh_tree, tp_args = list(which = "log(lstat)")) plot(bh_tree, terminal_panel = NULL) ## estimated parameters coef(bh_tree) coef(bh_tree, node = 9) summary(bh_tree, node = 9) ## various ways for computing the mean squared error (on the training data) mean((BostonHousing$medv - fitted(bh_tree))^2) mean(residuals(bh_tree)^2) deviance(bh_tree)/sum(weights(bh_tree)) deviance(bh_tree)/nobs(bh_tree) ## log-likelihood and information criteria logLik(bh_tree) AIC(bh_tree) BIC(bh_tree) ## (Note that this penalizes estimation of error variances, which ## were treated as nuisance parameters in the fitting process.) ## different types of predictions bh <- BostonHousing[c(1, 10, 50), ] predict(bh_tree, newdata = bh, type = "node") predict(bh_tree, newdata = bh, type = "response") predict(bh_tree, newdata = bh, type = function(object) summary(object)$r.squared) } if(require("AER")) { ## Demand for economics journals data data("Journals", package = "AER") Journals <- transform(Journals, age = 2000 - foundingyear, chars = charpp * pages) ## linear regression tree (OLS) j_tree <- lmtree(log(subs) ~ log(price/citations) | price + citations + age + chars + society, data = Journals, minsize = 10, verbose = TRUE) ## printing and plotting j_tree plot(j_tree) ## coefficients and summary coef(j_tree, node = 1:3) summary(j_tree, node = 1:3) } if(require("AER")) { ## Beauty and teaching ratings data data("TeachingRatings", package = "AER") ## linear regression (WLS) ## null model tr_null <- lm(eval ~ 1, data = TeachingRatings, weights = students, subset = credits == "more") ## main effects tr_lm <- lm(eval ~ beauty + gender + minority + native + tenure + division, data = TeachingRatings, weights = students, subset = credits == "more") ## tree tr_tree <- lmtree(eval ~ beauty | minority + age + gender + division + native + tenure, data = TeachingRatings, weights = students, subset = credits == "more", caseweights = FALSE) ## visualization plot(tr_tree) ## beauty slope coefficient coef(tr_lm)[2] coef(tr_tree)[, 2] ## R-squared 1 - deviance(tr_lm)/deviance(tr_null) 1 - deviance(tr_tree)/deviance(tr_null) } } \keyword{tree} partykit/DESCRIPTION0000644000176200001440000000423113614604472013636 0ustar liggesusersPackage: partykit Title: A Toolkit for Recursive Partytioning Date: 2020-01-23 Version: 1.2-6 Authors@R: c(person(given = "Torsten", family = "Hothorn", role = c("aut", "cre"), email = "Torsten.Hothorn@R-project.org", comment = c(ORCID = "0000-0001-8301-0471")), person(given = "Heidi", family = "Seibold", role = "ctb", email = "heidi@seibold.co", comment = c(ORCID = "0000-0002-8960-9642")), person(given = "Achim", family = "Zeileis", role = "aut", email = "Achim.Zeileis@R-project.org", comment = c(ORCID = "0000-0003-0918-3766"))) Description: A toolkit with infrastructure for representing, summarizing, and visualizing tree-structured regression and classification models. 
This unified infrastructure can be used for reading/coercing tree models from
        different sources ('rpart', 'RWeka', 'PMML') yielding objects that share
        functionality for print()/plot()/predict() methods. Furthermore, new and improved
        reimplementations of conditional inference trees (ctree()) and model-based
        recursive partitioning (mob()) from the 'party' package are provided based
        on the new infrastructure. A description of this package was published
        by Hothorn and Zeileis (2015).
Depends: R (>= 3.1.0), graphics, grid, libcoin (>= 1.0-0), mvtnorm
Imports: grDevices, stats, utils, survival, Formula (>= 1.2-1), inum (>=
        1.0-0), rpart (>= 4.1-11), party (>= 1.3-0)
Suggests: XML, pmml, rJava, sandwich, strucchange, vcd, AER, mlbench,
        TH.data (>= 1.0-3), coin (>= 1.1-0), RWeka (>= 0.4-19),
        datasets, parallel, psychotools (>= 0.3-0), psychotree
LazyData: yes
License: GPL-2 | GPL-3
URL: http://partykit.R-Forge.R-project.org/partykit
RoxygenNote: 6.1.1
NeedsCompilation: yes
Packaged: 2020-01-24 13:11:44 UTC; hothorn
Author: Torsten Hothorn [aut, cre] (<https://orcid.org/0000-0001-8301-0471>),
  Heidi Seibold [ctb] (<https://orcid.org/0000-0002-8960-9642>),
  Achim Zeileis [aut] (<https://orcid.org/0000-0003-0918-3766>)
Maintainer: Torsten Hothorn <Torsten.Hothorn@R-project.org>
Repository: CRAN
Date/Publication: 2020-01-30 16:50:02 UTC
partykit/build/0000755000176200001440000000000013612566420013225 5ustar liggesuserspartykit/build/vignette.rds0000644000176200001440000000111313612566420015560 0ustar liggesuserspartykit/tests/0000755000176200001440000000000013612566420013270 5ustar liggesuserspartykit/tests/regtest-party-random.R0000644000176200001440000000131213467227322017503 0ustar liggesuserssuppressWarnings(RNGversion("3.5.2"))

## packages
library("partykit")
library("rpart")

## data-generating process
dgp <- function(n) data.frame(y = gl(4, n), x1 = rnorm(4 * n), x2 = rnorm(4 * n))

## rpart check
learn <- dgp(100)
fit <- as.party(rpart(y ~ ., data = learn))
test <- dgp(100000)

system.time(id <- fitted_node(node_party(fit), test))
system.time(yhat <- predict_party(fit, id = id, newdata = test))

### predictions in info slots
tmp <- data.frame(x = rnorm(100))
pfit <- party(node = partynode(1L, split = partysplit(1L, breaks = 0),
    kids = list(partynode(2L, info = -0.5), partynode(3L, info = 0.5))), data = tmp)
pfit
p <- predict(pfit, newdata = tmp)
p
table(p, sign(tmp$x))
partykit/tests/Examples/0000755000176200001440000000000013467232062015046 5ustar liggesuserspartykit/tests/Examples/partykit-Ex.Rout.save0000644000176200001440000017172013467471066021110 0ustar liggesusers
R version 3.6.0 (2019-04-26) -- "Planting of a Tree"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> pkgname <- "partykit" > source(file.path(R.home("share"), "R", "examples-header.R")) > options(warn = 1) > library('partykit') Loading required package: grid Loading required package: libcoin Loading required package: mvtnorm > > base::assign(".oldSearch", base::search(), pos = 'CheckExEnv') > base::assign(".old_wd", base::getwd(), pos = 'CheckExEnv') > cleanEx() > nameEx("HuntingSpiders") > ### * HuntingSpiders > > flush(stderr()); flush(stdout()) > > ### Name: HuntingSpiders > ### Title: Abundance of Hunting Spiders > ### Aliases: HuntingSpiders > ### Keywords: datasets > > ### ** Examples > > > ## load data > data("HuntingSpiders", package = "partykit") > > ## fit multivariate tree for 12-dimensional species abundance > ## (warnings by mvtnorm are suppressed) > suppressWarnings(sptree <- ctree(arct.lute + pard.lugu + zora.spin + pard.nigr + pard.pull + + aulo.albi + troc.terr + alop.cune + pard.mont + alop.acce + alop.fabr + + arct.peri ~ herbs + reft + moss + sand + twigs + water, data = HuntingSpiders, + teststat = "max", minsplit = 5)) > plot(sptree, terminal_panel = node_barplot) > > > > cleanEx() > nameEx("WeatherPlay") > ### * WeatherPlay > > flush(stderr()); flush(stdout()) > > ### Name: WeatherPlay > ### Title: Weather Conditions and Playing a Game > ### Aliases: WeatherPlay > ### Keywords: datasets > > ### ** Examples > > ## load weather data > data("WeatherPlay", package = "partykit") > WeatherPlay outlook temperature humidity windy play 1 sunny 85 85 false no 2 sunny 80 90 true no 3 overcast 83 86 false yes 4 rainy 70 96 false yes 5 rainy 68 80 false yes 6 rainy 65 70 true no 7 overcast 64 65 true yes 8 sunny 72 95 false no 9 sunny 69 70 false yes 10 rainy 75 80 false yes 11 sunny 75 70 true yes 12 overcast 72 90 true yes 13 overcast 81 75 false yes 14 rainy 71 91 true no > > ## construct simple tree > pn <- partynode(1L, + split = partysplit(1L, index = 1:3), + kids = list( + partynode(2L, + split = partysplit(3L, breaks = 75), + kids = list( + partynode(3L, info = "yes"), + partynode(4L, info = "no"))), + partynode(5L, info = "yes"), + partynode(6L, + split = partysplit(4L, index = 1:2), + kids = list( + partynode(7L, info = "yes"), + partynode(8L, info = "no"))))) > pn [1] root | [2] V1 in (-Inf,1] | | [3] V3 <= 75 * | | [4] V3 > 75 * | [5] V1 in (1,2] * | [6] V1 in (2, Inf] | | [7] V4 <= 1 * | | [8] V4 > 1 * > > ## couple with data > py <- party(pn, WeatherPlay) > > ## print/plot/predict > print(py) [1] root | [2] outlook in sunny | | [3] humidity <= 75: yes | | [4] humidity > 75: no | [5] outlook in overcast: yes | [6] outlook in rainy | | [7] windy in false: yes | | [8] windy in true: no > plot(py) > predict(py, newdata = WeatherPlay) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 4 4 5 7 7 8 5 4 3 7 3 5 5 8 > > ## customize printing > print(py, + terminal_panel = function(node) paste(": play=", info_node(node), sep = "")) [1] root | [2] outlook in sunny | | [3] humidity <= 75: play=yes | | [4] humidity > 75: play=no | [5] outlook in overcast: play=yes | [6] outlook in rainy | | [7] windy in false: play=yes | | [8] windy in true: play=no > > > > cleanEx() > nameEx("cforest") > ### * cforest > > flush(stderr()); flush(stdout()) > > ### Name: cforest > ### Title: Conditional Random Forests > ### Aliases: cforest gettree gettree.cforest predict.cforest > ### Keywords: tree > > ### ** Examples > > ## basic example: conditional inference forest for cars data > cf <- cforest(dist ~ speed, data = cars) > > ## prediction of fitted mean and visualization > nd <- data.frame(speed = 4:25) > 
nd$mean <- predict(cf, newdata = nd, type = "response") > plot(dist ~ speed, data = cars) > lines(mean ~ speed, data = nd) > > ## predict quantiles (aka quantile regression forest) > myquantile <- function(y, w) quantile(rep(y, w), probs = c(0.1, 0.5, 0.9)) > p <- predict(cf, newdata = nd, type = "response", FUN = myquantile) > colnames(p) <- c("lower", "median", "upper") > nd <- cbind(nd, p) > > ## visualization with conditional (on speed) prediction intervals > plot(dist ~ speed, data = cars, type = "n") > with(nd, polygon(c(speed, rev(speed)), c(lower, rev(upper)), + col = "lightgray", border = "transparent")) > points(dist ~ speed, data = cars) > lines(mean ~ speed, data = nd, lwd = 1.5) > lines(median ~ speed, data = nd, lty = 2, lwd = 1.5) > legend("topleft", c("mean", "median", "10% - 90% quantile"), + lwd = c(1.5, 1.5, 10), lty = c(1, 2, 1), + col = c("black", "black", "lightgray"), bty = "n") > > ### we may also use predicted conditional (on speed) densities > mydensity <- function (y, w) approxfun(density(y, weights = w/sum(w))[1:2], rule = 2) > pd <- predict(cf, newdata = nd, type = "response", FUN = mydensity) > > ## visualization in heatmap (instead of scatterplot) > ## with fitted curves as above > dist <- -10:150 > dens <- t(sapply(seq_along(pd), function(i) pd[[i]](dist))) > image(nd$speed, dist, dens, xlab = "speed", col = rev(gray.colors(9))) > lines(mean ~ speed, data = nd, lwd = 1.5) > lines(median ~ speed, data = nd, lty = 2, lwd = 1.5) > lines(lower ~ speed, data = nd, lty = 2) > lines(upper ~ speed, data = nd, lty = 2) > > ## Not run: > ##D > ##D ### honest (i.e., out-of-bag) cross-classification of > ##D ### true vs. predicted classes > ##D data("mammoexp", package = "TH.data") > ##D table(mammoexp$ME, predict(cforest(ME ~ ., data = mammoexp, ntree = 50), > ##D OOB = TRUE, type = "response")) > ##D > ##D ### fit forest to censored response > ##D if (require("TH.data") && require("survival")) { > ##D > ##D data("GBSG2", package = "TH.data") > ##D bst <- cforest(Surv(time, cens) ~ ., data = GBSG2, ntree = 50) > ##D > ##D ### estimate conditional Kaplan-Meier curves > ##D print(predict(bst, newdata = GBSG2[1:2,], OOB = TRUE, type = "prob")) > ##D > ##D print(gettree(bst)) > ##D } > ## End(Not run) > > > > cleanEx() > nameEx("ctree") > ### * ctree > > flush(stderr()); flush(stdout()) > > ### Name: ctree > ### Title: Conditional Inference Trees > ### Aliases: ctree sctest.constparty > ### Keywords: tree > > ### ** Examples > > ### regression > airq <- subset(airquality, !is.na(Ozone)) > airct <- ctree(Ozone ~ ., data = airq) > airct Model formula: Ozone ~ Solar.R + Wind + Temp + Month + Day Fitted party: [1] root | [2] Temp <= 82 | | [3] Wind <= 6.9: 55.600 (n = 10, err = 21946.4) | | [4] Wind > 6.9 | | | [5] Temp <= 77: 18.479 (n = 48, err = 3956.0) | | | [6] Temp > 77: 31.143 (n = 21, err = 4620.6) | [7] Temp > 82 | | [8] Wind <= 10.3: 81.633 (n = 30, err = 15119.0) | | [9] Wind > 10.3: 48.714 (n = 7, err = 1183.4) Number of inner nodes: 4 Number of terminal nodes: 5 > plot(airct) > mean((airq$Ozone - predict(airct))^2) [1] 403.6668 > > ### classification > irisct <- ctree(Species ~ .,data = iris) > irisct Model formula: Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width Fitted party: [1] root | [2] Petal.Length <= 1.9: setosa (n = 50, err = 0.0%) | [3] Petal.Length > 1.9 | | [4] Petal.Width <= 1.7 | | | [5] Petal.Length <= 4.8: versicolor (n = 46, err = 2.2%) | | | [6] Petal.Length > 4.8: versicolor (n = 8, err = 50.0%) | | [7] Petal.Width > 1.7: 
virginica (n = 46, err = 2.2%) Number of inner nodes: 3 Number of terminal nodes: 4 > plot(irisct) > table(predict(irisct), iris$Species) setosa versicolor virginica setosa 50 0 0 versicolor 0 49 5 virginica 0 1 45 > > ### estimated class probabilities, a list > tr <- predict(irisct, newdata = iris[1:10,], type = "prob") > > ### survival analysis > if (require("TH.data") && require("survival") && + require("coin") && require("Formula")) { + + data("GBSG2", package = "TH.data") + (GBSG2ct <- ctree(Surv(time, cens) ~ ., data = GBSG2)) + predict(GBSG2ct, newdata = GBSG2[1:2,], type = "response") + plot(GBSG2ct) + + ### with weight-dependent log-rank scores + ### log-rank trafo for observations in this node only (= weights > 0) + h <- function(y, x, start = NULL, weights, offset, estfun = TRUE, object = FALSE, ...) { + if (is.null(weights)) weights <- rep(1, NROW(y)) + s <- logrank_trafo(y[weights > 0,,drop = FALSE]) + r <- rep(0, length(weights)) + r[weights > 0] <- s + list(estfun = matrix(as.double(r), ncol = 1), converged = TRUE) + } + + ### very much the same tree + (ctree(Surv(time, cens) ~ ., data = GBSG2, ytrafo = h)) + } Loading required package: TH.data Loading required package: survival Loading required package: MASS Attaching package: ‘TH.data’ The following object is masked from ‘package:MASS’: geyser Loading required package: coin Loading required package: Formula Model formula: Surv(time, cens) ~ horTh + age + menostat + tsize + tgrade + pnodes + progrec + estrec Fitted party: [1] root | [2] pnodes <= 3 | | [3] horTh in no: 2093.000 (n = 248) | | [4] horTh in yes: Inf (n = 128) | [5] pnodes > 3 | | [6] progrec <= 20: 624.000 (n = 144) | | [7] progrec > 20: 1701.000 (n = 166) Number of inner nodes: 3 Number of terminal nodes: 4 > > ### multivariate responses > airct2 <- ctree(Ozone + Temp ~ ., data = airq) > airct2 Model formula: ~Ozone + Temp + (Solar.R + Wind + Month + Day) Fitted party: [1] root | [2] Wind <= 6.3: * | [3] Wind > 6.3 | | [4] Month <= 5: * | | [5] Month > 5 | | | [6] Wind <= 9.7: * | | | [7] Wind > 9.7 | | | | [8] Day <= 13: * | | | | [9] Day > 13: * Number of inner nodes: 4 Number of terminal nodes: 5 > plot(airct2) > > > > cleanEx() detaching ‘package:Formula’, ‘package:coin’, ‘package:TH.data’, ‘package:MASS’, ‘package:survival’ > nameEx("extree_data") > ### * extree_data > > flush(stderr()); flush(stdout()) > > ### Name: extree_data > ### Title: Data Preprocessing for Extensible Trees. 
> ### Aliases: extree_data > ### Keywords: tree > > ### ** Examples > > > data("iris") > > ed <- extree_data(Species ~ Sepal.Width + Sepal.Length | Petal.Width + Petal.Length, + data = iris, nmax = c("yx" = 25, "z" = 10), yx = "matrix") > > ### the model.frame > mf <- model.frame(ed) > all.equal(mf, iris[, names(mf)]) [1] "Attributes: < Length mismatch: comparison on first 2 components >" > > ### binned y ~ x part > model.frame(ed, yxonly = TRUE) Species Sepal.Width Sepal.Length 1 setosa 2.3 4.600 2 setosa 2.9 4.600 3 setosa 3.0 4.600 4 setosa 3.1 4.600 5 setosa 3.2 4.600 6 setosa 3.4 4.600 7 setosa 3.6 4.600 8 setosa 3.0 4.800 9 setosa 3.1 4.800 10 setosa 3.2 4.800 11 setosa 3.4 4.800 12 versicolor 2.4 4.900 13 virginica 2.5 4.900 14 setosa 3.0 4.900 15 setosa 3.1 4.900 16 setosa 3.6 4.900 17 versicolor 2.0 5.000 18 versicolor 2.3 5.000 19 setosa 3.0 5.000 20 setosa 3.2 5.000 21 setosa 3.3 5.000 22 setosa 3.4 5.000 23 setosa 3.5 5.000 24 setosa 3.6 5.000 25 versicolor 2.5 5.100 26 setosa 3.3 5.100 27 setosa 3.4 5.100 28 setosa 3.5 5.100 29 setosa 3.7 5.100 30 setosa 3.8 5.100 31 versicolor 2.7 5.200 32 setosa 3.4 5.200 33 setosa 3.5 5.200 34 setosa 4.1 5.200 35 versicolor 3.0 5.400 36 setosa 3.4 5.400 37 setosa 3.7 5.400 38 setosa 3.9 5.400 39 versicolor 2.3 5.500 40 versicolor 2.4 5.500 41 versicolor 2.5 5.500 42 versicolor 2.6 5.500 43 setosa 3.5 5.500 44 setosa 4.2 5.500 45 versicolor 2.5 5.600 46 versicolor 2.7 5.600 47 virginica 2.8 5.600 48 versicolor 2.9 5.600 49 versicolor 3.0 5.600 50 virginica 2.5 5.700 51 versicolor 2.6 5.700 52 versicolor 2.8 5.700 53 versicolor 2.9 5.700 54 versicolor 3.0 5.700 55 setosa 3.8 5.700 56 setosa 4.4 5.700 57 versicolor 2.6 5.800 58 versicolor 2.7 5.800 59 virginica 2.7 5.800 60 virginica 2.8 5.800 61 setosa 4.0 5.800 62 versicolor 2.2 6.000 63 virginica 2.2 6.000 64 versicolor 2.7 6.000 65 versicolor 2.9 6.000 66 versicolor 3.0 6.000 67 virginica 3.0 6.000 68 versicolor 3.2 6.000 69 versicolor 3.4 6.000 70 virginica 2.6 6.100 71 versicolor 2.8 6.100 72 versicolor 2.9 6.100 73 versicolor 3.0 6.100 74 virginica 3.0 6.100 75 versicolor 2.2 6.200 76 virginica 2.8 6.200 77 versicolor 2.9 6.200 78 virginica 3.4 6.200 79 versicolor 2.3 6.300 80 versicolor 2.5 6.300 81 virginica 2.5 6.300 82 virginica 2.7 6.300 83 virginica 2.8 6.300 84 virginica 2.9 6.300 85 versicolor 3.3 6.300 86 virginica 3.3 6.300 87 virginica 3.4 6.300 88 virginica 2.7 6.400 89 virginica 2.8 6.400 90 versicolor 2.9 6.400 91 virginica 3.1 6.400 92 versicolor 3.2 6.400 93 virginica 3.2 6.400 94 versicolor 2.8 6.520 95 virginica 3.0 6.520 96 virginica 3.2 6.520 97 virginica 2.5 6.700 98 versicolor 2.9 6.700 99 versicolor 3.0 6.700 100 virginica 3.0 6.700 101 versicolor 3.1 6.700 102 virginica 3.1 6.700 103 virginica 3.3 6.700 104 versicolor 2.8 6.800 105 virginica 3.0 6.800 106 virginica 3.2 6.800 107 versicolor 3.1 7.008 108 virginica 3.1 7.008 109 versicolor 3.2 7.008 110 virginica 3.2 7.008 111 virginica 2.8 7.408 112 virginica 2.9 7.408 113 virginica 3.0 7.408 114 virginica 3.2 7.408 115 virginica 3.6 7.408 116 virginica 2.6 7.900 117 virginica 2.8 7.900 118 virginica 3.0 7.900 119 virginica 3.8 7.900 > > ### binned Petal.Width > ed[[4, type = "index"]] [1] 0.2 0.2 0.2 0.2 0.2 0.4 0.4 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 [16] 0.4 0.4 0.4 0.4 0.4 0.2 0.4 0.2 1.16 0.2 0.2 0.4 0.2 0.2 0.2 [31] 0.2 0.4 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4 0.4 0.2 1.16 0.4 [46] 0.4 0.2 0.2 0.2 0.2 1.5 1.5 1.5 1.3 1.5 1.3 1.8 1.16 1.3 1.5 [61] 1.16 1.5 1.16 1.5 1.3 1.5 1.5 1.16 1.5 1.16 1.8 1.3 1.5 1.3 1.3 
[76] 1.5 1.5 1.8 1.5 1.16 1.16 1.16 1.3 1.8 1.5 1.8 1.5 1.3 1.3 1.3 [91] 1.3 1.5 1.3 1.16 1.3 1.3 1.3 1.3 1.16 1.3 2.5 1.9 2.2 1.8 2.2 [106] 2.2 1.8 1.8 1.8 2.5 2.2 1.9 2.2 2.2 2.5 2.5 1.8 2.2 2.5 1.5 [121] 2.5 2.2 2.2 1.8 2.2 1.8 1.8 1.8 2.2 1.8 1.9 2.2 2.2 1.5 1.5 [136] 2.5 2.5 1.8 1.8 2.2 2.5 2.5 1.9 2.5 2.5 2.5 1.9 2.2 2.5 1.8 Levels: 0.2 0.4 1.16 1.3 1.5 1.8 1.9 2.2 2.5 > > ### response > ed$yx$y [1] setosa setosa setosa setosa setosa setosa [7] setosa setosa setosa setosa setosa versicolor [13] virginica setosa setosa setosa versicolor versicolor [19] setosa setosa setosa setosa setosa setosa [25] versicolor setosa setosa setosa setosa setosa [31] versicolor setosa setosa setosa versicolor setosa [37] setosa setosa versicolor versicolor versicolor versicolor [43] setosa setosa versicolor versicolor virginica versicolor [49] versicolor virginica versicolor versicolor versicolor versicolor [55] setosa setosa versicolor versicolor virginica virginica [61] setosa versicolor virginica versicolor versicolor versicolor [67] virginica versicolor versicolor virginica versicolor versicolor [73] versicolor virginica versicolor virginica versicolor virginica [79] versicolor versicolor virginica virginica virginica virginica [85] versicolor virginica virginica virginica virginica versicolor [91] virginica versicolor virginica versicolor virginica virginica [97] virginica versicolor versicolor virginica versicolor virginica [103] virginica versicolor virginica virginica versicolor virginica [109] versicolor virginica virginica virginica virginica virginica [115] virginica virginica virginica virginica virginica Levels: setosa versicolor virginica > > ### model matrix > ed$yx$x (Intercept) Sepal.Width Sepal.Length 1 1 2.3 4.600 2 1 2.9 4.600 3 1 3.0 4.600 4 1 3.1 4.600 5 1 3.2 4.600 6 1 3.4 4.600 7 1 3.6 4.600 8 1 3.0 4.800 9 1 3.1 4.800 10 1 3.2 4.800 11 1 3.4 4.800 12 1 2.4 4.900 13 1 2.5 4.900 14 1 3.0 4.900 15 1 3.1 4.900 16 1 3.6 4.900 17 1 2.0 5.000 18 1 2.3 5.000 19 1 3.0 5.000 20 1 3.2 5.000 21 1 3.3 5.000 22 1 3.4 5.000 23 1 3.5 5.000 24 1 3.6 5.000 25 1 2.5 5.100 26 1 3.3 5.100 27 1 3.4 5.100 28 1 3.5 5.100 29 1 3.7 5.100 30 1 3.8 5.100 31 1 2.7 5.200 32 1 3.4 5.200 33 1 3.5 5.200 34 1 4.1 5.200 35 1 3.0 5.400 36 1 3.4 5.400 37 1 3.7 5.400 38 1 3.9 5.400 39 1 2.3 5.500 40 1 2.4 5.500 41 1 2.5 5.500 42 1 2.6 5.500 43 1 3.5 5.500 44 1 4.2 5.500 45 1 2.5 5.600 46 1 2.7 5.600 47 1 2.8 5.600 48 1 2.9 5.600 49 1 3.0 5.600 50 1 2.5 5.700 51 1 2.6 5.700 52 1 2.8 5.700 53 1 2.9 5.700 54 1 3.0 5.700 55 1 3.8 5.700 56 1 4.4 5.700 57 1 2.6 5.800 58 1 2.7 5.800 59 1 2.7 5.800 60 1 2.8 5.800 61 1 4.0 5.800 62 1 2.2 6.000 63 1 2.2 6.000 64 1 2.7 6.000 65 1 2.9 6.000 66 1 3.0 6.000 67 1 3.0 6.000 68 1 3.2 6.000 69 1 3.4 6.000 70 1 2.6 6.100 71 1 2.8 6.100 72 1 2.9 6.100 73 1 3.0 6.100 74 1 3.0 6.100 75 1 2.2 6.200 76 1 2.8 6.200 77 1 2.9 6.200 78 1 3.4 6.200 79 1 2.3 6.300 80 1 2.5 6.300 81 1 2.5 6.300 82 1 2.7 6.300 83 1 2.8 6.300 84 1 2.9 6.300 85 1 3.3 6.300 86 1 3.3 6.300 87 1 3.4 6.300 88 1 2.7 6.400 89 1 2.8 6.400 90 1 2.9 6.400 91 1 3.1 6.400 92 1 3.2 6.400 93 1 3.2 6.400 94 1 2.8 6.520 95 1 3.0 6.520 96 1 3.2 6.520 97 1 2.5 6.700 98 1 2.9 6.700 99 1 3.0 6.700 100 1 3.0 6.700 101 1 3.1 6.700 102 1 3.1 6.700 103 1 3.3 6.700 104 1 2.8 6.800 105 1 3.0 6.800 106 1 3.2 6.800 107 1 3.1 7.008 108 1 3.1 7.008 109 1 3.2 7.008 110 1 3.2 7.008 111 1 2.8 7.408 112 1 2.9 7.408 113 1 3.0 7.408 114 1 3.2 7.408 115 1 3.6 7.408 116 1 2.6 7.900 117 1 2.8 7.900 118 1 3.0 7.900 119 1 3.8 7.900 attr(,"assign") [1] 0 1 
2 attr(,"formula") Species ~ Sepal.Width + Sepal.Length attr(,"terms") Species ~ Sepal.Width + Sepal.Length attr(,"variables") list(Species, Sepal.Width, Sepal.Length) attr(,"factors") Sepal.Width Sepal.Length Species 0 0 Sepal.Width 1 0 Sepal.Length 0 1 attr(,"term.labels") [1] "Sepal.Width" "Sepal.Length" attr(,"order") [1] 1 1 attr(,"intercept") [1] 1 attr(,"response") [1] 1 attr(,".Environment") > > > > > cleanEx() > nameEx("glmtree") > ### * glmtree > > flush(stderr()); flush(stdout()) > > ### Name: glmtree > ### Title: Generalized Linear Model Trees > ### Aliases: glmtree plot.glmtree predict.glmtree print.glmtree > ### Keywords: tree > > ### ** Examples > > if(require("mlbench")) { + + ## Pima Indians diabetes data + data("PimaIndiansDiabetes", package = "mlbench") + + ## recursive partitioning of a logistic regression model + pid_tree2 <- glmtree(diabetes ~ glucose | pregnant + + pressure + triceps + insulin + mass + pedigree + age, + data = PimaIndiansDiabetes, family = binomial) + + ## printing whole tree or individual nodes + print(pid_tree2) + print(pid_tree2, node = 1) + + ## visualization + plot(pid_tree2) + plot(pid_tree2, tp_args = list(cdplot = TRUE)) + plot(pid_tree2, terminal_panel = NULL) + + ## estimated parameters + coef(pid_tree2) + coef(pid_tree2, node = 5) + summary(pid_tree2, node = 5) + + ## deviance, log-likelihood and information criteria + deviance(pid_tree2) + logLik(pid_tree2) + AIC(pid_tree2) + BIC(pid_tree2) + + ## different types of predictions + pid <- head(PimaIndiansDiabetes) + predict(pid_tree2, newdata = pid, type = "node") + predict(pid_tree2, newdata = pid, type = "response") + predict(pid_tree2, newdata = pid, type = "link") + + } Loading required package: mlbench Generalized linear model tree (family: binomial) Model formula: diabetes ~ glucose | pregnant + pressure + triceps + insulin + mass + pedigree + age Fitted party: [1] root | [2] mass <= 26.3: n = 167 | (Intercept) glucose | -9.95150963 0.05870786 | [3] mass > 26.3 | | [4] age <= 30: n = 304 | | (Intercept) glucose | | -6.70558554 0.04683748 | | [5] age > 30: n = 297 | | (Intercept) glucose | | -2.77095386 0.02353582 Number of inner nodes: 2 Number of terminal nodes: 3 Number of parameters per node: 2 Objective function (negative log-likelihood): 355.4578 Generalized linear model tree (family: binomial) -- Node 1 -- Estimated parameters: (Intercept) glucose -5.35008039 0.03787304 Objective function: 404.3598 Parameter instability tests: pregnant pressure triceps insulin mass pedigree statistic 2.988542e+01 7.5024235 15.9409542 6.5969297 4.880982e+01 18.33476114 p.value 9.778517e-05 0.9104325 0.0647362 0.9701412 8.316815e-09 0.02252955 age statistic 4.351412e+01 p.value 1.182811e-07 Loading required namespace: vcd 1 2 3 4 5 6 0.7123469 -0.7704095 0.7920297 -2.5370501 0.4534529 -3.1413973 > > > > cleanEx() detaching ‘package:mlbench’ > nameEx("lmtree") > ### * lmtree > > flush(stderr()); flush(stdout()) > > ### Name: lmtree > ### Title: Linear Model Trees > ### Aliases: lmtree plot.lmtree predict.lmtree print.lmtree > ### Keywords: tree > > ### ** Examples > > if(require("mlbench")) { + + ## Boston housing data + data("BostonHousing", package = "mlbench") + BostonHousing <- transform(BostonHousing, + chas = factor(chas, levels = 0:1, labels = c("no", "yes")), + rad = factor(rad, ordered = TRUE)) + + ## linear model tree + bh_tree <- lmtree(medv ~ log(lstat) + I(rm^2) | zn + + indus + chas + nox + age + dis + rad + tax + crim + b + ptratio, + data = BostonHousing, minsize = 40) + + ## 
printing whole tree or individual nodes + print(bh_tree) + print(bh_tree, node = 7) + + ## plotting + plot(bh_tree) + plot(bh_tree, tp_args = list(which = "log(lstat)")) + plot(bh_tree, terminal_panel = NULL) + + ## estimated parameters + coef(bh_tree) + coef(bh_tree, node = 9) + summary(bh_tree, node = 9) + + ## various ways for computing the mean squared error (on the training data) + mean((BostonHousing$medv - fitted(bh_tree))^2) + mean(residuals(bh_tree)^2) + deviance(bh_tree)/sum(weights(bh_tree)) + deviance(bh_tree)/nobs(bh_tree) + + ## log-likelihood and information criteria + logLik(bh_tree) + AIC(bh_tree) + BIC(bh_tree) + ## (Note that this penalizes estimation of error variances, which + ## were treated as nuisance parameters in the fitting process.) + + ## different types of predictions + bh <- BostonHousing[c(1, 10, 50), ] + predict(bh_tree, newdata = bh, type = "node") + predict(bh_tree, newdata = bh, type = "response") + predict(bh_tree, newdata = bh, type = function(object) summary(object)$r.squared) + + } Loading required package: mlbench Linear model tree Model formula: medv ~ log(lstat) + I(rm^2) | zn + indus + chas + nox + age + dis + rad + tax + crim + b + ptratio Fitted party: [1] root | [2] tax <= 432 | | [3] ptratio <= 15.2: n = 72 | | (Intercept) log(lstat) I(rm^2) | | 9.2348804 -4.9390955 0.6859136 | | [4] ptratio > 15.2 | | | [5] ptratio <= 19.6 | | | | [6] tax <= 265: n = 63 | | | | (Intercept) log(lstat) I(rm^2) | | | | 3.9637196 -2.7662873 0.6881287 | | | | [7] tax > 265: n = 162 | | | | (Intercept) log(lstat) I(rm^2) | | | | -1.7983871 -0.2677070 0.6538864 | | | [8] ptratio > 19.6: n = 56 | | | (Intercept) log(lstat) I(rm^2) | | | 17.5864899 -4.6189750 0.3386744 | [9] tax > 432: n = 153 | (Intercept) log(lstat) I(rm^2) | 68.2970874 -16.3540061 -0.1477939 Number of inner nodes: 4 Number of terminal nodes: 5 Number of parameters per node: 3 Objective function (residual sum of squares): 6089.803 Linear model tree -- Node 7 -- Estimated parameters: (Intercept) log(lstat) I(rm^2) -1.7983871 -0.2677070 0.6538864 Objective function: 1118.535 Parameter instability tests: zn indus chas nox age dis statistic 11.998039 7.3971233 7.227770 9.2936189 14.3023962 8.9239826 p.value 0.574642 0.9931875 0.522447 0.9119621 0.2886603 0.9389895 rad tax crim b ptratio statistic 33.1746444 16.6666129 11.7143758 9.9050903 11.5927528 p.value 0.3926249 0.1206412 0.6153455 0.8539893 0.6328381 1 10 50 0.8172892 0.9220070 0.8176030 > > > if(require("AER")) { + + ## Demand for economics journals data + data("Journals", package = "AER") + Journals <- transform(Journals, + age = 2000 - foundingyear, + chars = charpp * pages) + + ## linear regression tree (OLS) + j_tree <- lmtree(log(subs) ~ log(price/citations) | price + citations + + age + chars + society, data = Journals, minsize = 10, verbose = TRUE) + + ## printing and plotting + j_tree + plot(j_tree) + + ## coefficients and summary + coef(j_tree, node = 1:3) + summary(j_tree, node = 1:3) + + } Loading required package: AER Loading required package: car Loading required package: carData Loading required package: lmtest Loading required package: zoo Attaching package: ‘zoo’ The following objects are masked from ‘package:base’: as.Date, as.Date.numeric Loading required package: sandwich Loading required package: survival -- Node 1 --------------------------------- Number of observations: 180 Parameter instability tests: price citations age chars society statistic 6.5617160 5.261443 4.219816e+01 4.5638410 3.2797248 p.value 0.9217588 0.988134 
1.629130e-07 0.9976804 0.6598605 Best splitting variable: age Perform split? yes Selected split: <= 18 | > 18 -- Node 2 --------------------------------- Number of observations: 53 Parameter instability tests: price citations age chars society statistic 3.3415225 3.7259445 5.6132430 6.0399915 0.6495396 p.value 0.9996007 0.9984132 0.9353808 0.8978925 0.9983602 Best splitting variable: chars Perform split? no -- Node 3 --------------------------------- Number of observations: 127 Parameter instability tests: price citations age chars society statistic 3.3695027 6.839065 5.986772 3.6768804 0.6083476 p.value 0.9999824 0.894352 0.959765 0.9999056 0.9987591 Best splitting variable: citations Perform split? no $`1` Call: lm(formula = log(subs) ~ log(price/citations)) Residuals: Min 1Q Median 3Q Max -2.72478 -0.53609 0.03721 0.46619 1.84808 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 4.76621 0.05591 85.25 <2e-16 *** log(price/citations) -0.53305 0.03561 -14.97 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.7497 on 178 degrees of freedom Multiple R-squared: 0.5573, Adjusted R-squared: 0.5548 F-statistic: 224 on 1 and 178 DF, p-value: < 2.2e-16 $`2` Call: lm(formula = log(subs) ~ log(price/citations)) Residuals: Min 1Q Median 3Q Max -2.12974 -0.37882 -0.05063 0.35693 1.57877 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 4.35278 0.11687 37.244 < 2e-16 *** log(price/citations) -0.60486 0.07483 -8.083 1.08e-10 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.6682 on 51 degrees of freedom Multiple R-squared: 0.5616, Adjusted R-squared: 0.553 F-statistic: 65.34 on 1 and 51 DF, p-value: 1.078e-10 $`3` Call: lm(formula = log(subs) ~ log(price/citations)) Residuals: Min 1Q Median 3Q Max -2.16039 -0.38995 0.08398 0.41365 1.52063 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 5.01127 0.05985 83.74 <2e-16 *** log(price/citations) -0.40298 0.03804 -10.59 <2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.659 on 125 degrees of freedom Multiple R-squared: 0.4731, Adjusted R-squared: 0.4689 F-statistic: 112.2 on 1 and 125 DF, p-value: < 2.2e-16 > > > if(require("AER")) { + + ## Beauty and teaching ratings data + data("TeachingRatings", package = "AER") + + ## linear regression (WLS) + ## null model + tr_null <- lm(eval ~ 1, data = TeachingRatings, weights = students, + subset = credits == "more") + ## main effects + tr_lm <- lm(eval ~ beauty + gender + minority + native + tenure + division, + data = TeachingRatings, weights = students, subset = credits == "more") + ## tree + tr_tree <- lmtree(eval ~ beauty | minority + age + gender + division + native + tenure, + data = TeachingRatings, weights = students, subset = credits == "more", + caseweights = FALSE) + + ## visualization + plot(tr_tree) + + ## beauty slope coefficient + coef(tr_lm)[2] + coef(tr_tree)[, 2] + + ## R-squared + 1 - deviance(tr_lm)/deviance(tr_null) + 1 - deviance(tr_tree)/deviance(tr_null) + } [1] 0.3820419 > > > > > cleanEx() detaching ‘package:AER’, ‘package:survival’, ‘package:sandwich’, ‘package:lmtest’, ‘package:zoo’, ‘package:car’, ‘package:carData’, ‘package:mlbench’ > nameEx("mob") > ### * mob > > flush(stderr()); flush(stdout()) > > ### Name: mob > ### Title: Model-based Recursive Partitioning > ### Aliases: mob modelparty coef.modelparty deviance.modelparty > ### fitted.modelparty formula.modelparty getCall.modelparty > ### logLik.modelparty model.frame.modelparty nobs.modelparty > ### plot.modelparty predict.modelparty print.modelparty > ### residuals.modelparty summary.modelparty weights.modelparty > ### refit.modelparty sctest.modelparty > ### Keywords: tree > > ### ** Examples > > if(require("mlbench")) { + + ## Pima Indians diabetes data + data("PimaIndiansDiabetes", package = "mlbench") + + ## a simple basic fitting function (of type 1) for a logistic regression + logit <- function(y, x, start = NULL, weights = NULL, offset = NULL, ...) { + glm(y ~ 0 + x, family = binomial, start = start, ...) 
+ } + + ## set up a logistic regression tree + pid_tree <- mob(diabetes ~ glucose | pregnant + pressure + triceps + insulin + + mass + pedigree + age, data = PimaIndiansDiabetes, fit = logit) + ## see lmtree() and glmtree() for interfaces with more efficient fitting functions + + ## print tree + print(pid_tree) + + ## print information about (some) nodes + print(pid_tree, node = 3:4) + + ## visualization + plot(pid_tree) + + ## coefficients and summary + coef(pid_tree) + coef(pid_tree, node = 1) + summary(pid_tree, node = 1) + + ## average deviance computed in different ways + mean(residuals(pid_tree)^2) + deviance(pid_tree)/sum(weights(pid_tree)) + deviance(pid_tree)/nobs(pid_tree) + + ## log-likelihood and information criteria + logLik(pid_tree) + AIC(pid_tree) + BIC(pid_tree) + + ## predicted nodes + predict(pid_tree, newdata = head(PimaIndiansDiabetes, 6), type = "node") + ## other types of predictions are possible using lmtree()/glmtree() + } Loading required package: mlbench Model-based recursive partitioning (logit) Model formula: diabetes ~ glucose | pregnant + pressure + triceps + insulin + mass + pedigree + age Fitted party: [1] root | [2] mass <= 26.3: n = 167 | x(Intercept) xglucose | -9.95150963 0.05870786 | [3] mass > 26.3 | | [4] age <= 30: n = 304 | | x(Intercept) xglucose | | -6.70558554 0.04683748 | | [5] age > 30: n = 297 | | x(Intercept) xglucose | | -2.77095386 0.02353582 Number of inner nodes: 2 Number of terminal nodes: 3 Number of parameters per node: 2 Objective function: 355.4578 Model-based recursive partitioning (logit) -- Node 3 -- Estimated parameters: x(Intercept) xglucose -4.61015031 0.03426267 Objective function: 344.225 Parameter instability tests: pregnant pressure triceps insulin mass pedigree statistic 2.673912e+01 6.1757583 7.346804 7.8963977 9.1545915 17.96438828 p.value 4.434356e-04 0.9845137 0.922646 0.8700398 0.7033477 0.02646585 age statistic 3.498466e+01 p.value 8.098640e-06 -- Node 4 -- Estimated parameters: x(Intercept) xglucose -6.70558554 0.04683748 Objective function: 140.4905 Parameter instability tests: pregnant pressure triceps insulin mass pedigree age statistic 4.3749991 9.4006532 7.661457 9.0583568 5.4287861 5.640420 6.3088818 p.value 0.9998989 0.6656073 0.893893 0.7168659 0.9967316 0.994611 0.9804133 1 2 3 4 5 6 5 5 2 4 5 2 > > > > cleanEx() detaching ‘package:mlbench’ > nameEx("nodeapply") > ### * nodeapply > > flush(stderr()); flush(stdout()) > > ### Name: nodeapply > ### Title: Apply Functions Over Nodes > ### Aliases: nodeapply nodeapply.party nodeapply.partynode > ### Keywords: tree > > ### ** Examples > > > ## a tree as flat list structure > nodelist <- list( + # root node + list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), + kids = 2:3), + # V4 <= 1.9, terminal node + list(id = 2L, info = "terminal A"), + # V4 > 1.9 + list(id = 3L, split = partysplit(varid = 5L, breaks = 1.7), + kids = c(4L, 7L)), + # V5 <= 1.7 + list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), + kids = 5:6), + # V4 <= 4.8, terminal node + list(id = 5L, info = "terminal B"), + # V4 > 4.8, terminal node + list(id = 6L, info = "terminal C"), + # V5 > 1.7, terminal node + list(id = 7L, info = "terminal D") + ) > > ## convert to a recursive structure > node <- as.partynode(nodelist) > > ## return root node > nodeapply(node) [[1]] [1] root | [2] V4 <= 1.9 * | [3] V4 > 1.9 | | [4] V5 <= 1.7 | | | [5] V4 <= 4.8 * | | | [6] V4 > 4.8 * | | [7] V5 > 1.7 * > > ## return info slots of terminal nodes > nodeapply(node, ids = nodeids(node, terminal = 
TRUE), + FUN = function(x) info_node(x)) [[1]] [1] "terminal A" [[2]] [1] "terminal B" [[3]] [1] "terminal C" [[4]] [1] "terminal D" > > ## fit tree using rpart > library("rpart") > rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) > > ## coerce to `constparty' > rpk <- as.party(rp) > > ## extract nodeids > nodeids(rpk) [1] 1 2 3 4 5 6 7 8 9 > unlist(nodeapply(node_party(rpk), ids = nodeids(rpk), + FUN = id_node)) [1] 1 2 3 4 5 6 7 8 9 > unlist(nodeapply(rpk, ids = nodeids(rpk), FUN = id_node)) 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 > > ## but root nodes of party objects always have id = 1 > unlist(nodeapply(rpk, ids = nodeids(rpk), FUN = function(x) + id_node(node_party(x)), by_node = FALSE)) 1 2 3 4 5 6 7 8 9 1 1 1 1 1 1 1 1 1 > > > > cleanEx() detaching ‘package:rpart’ > nameEx("nodeids") > ### * nodeids > > flush(stderr()); flush(stdout()) > > ### Name: nodeids > ### Title: Extract Node Identifiers > ### Aliases: nodeids nodeids.party nodeids.partynode get_paths > ### Keywords: tree > > ### ** Examples > > > ## a tree as flat list structure > nodelist <- list( + # root node + list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), + kids = 2:3), + # V4 <= 1.9, terminal node + list(id = 2L), + # V4 > 1.9 + list(id = 3L, split = partysplit(varid = 1L, breaks = 1.7), + kids = c(4L, 7L)), + # V1 <= 1.7 + list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), + kids = 5:6), + # V4 <= 4.8, terminal node + list(id = 5L), + # V4 > 4.8, terminal node + list(id = 6L), + # V1 > 1.7, terminal node + list(id = 7L) + ) > > ## convert to a recursive structure > node <- as.partynode(nodelist) > > ## set up party object > data("iris") > tree <- party(node, data = iris, + fitted = data.frame("(fitted)" = + fitted_node(node, data = iris), + check.names = FALSE)) > tree [1] root | [2] Petal.Width <= 1.9: * | [3] Petal.Width > 1.9 | | [4] Sepal.Length <= 1.7 | | | [5] Petal.Width <= 4.8: * | | | [6] Petal.Width > 4.8: * | | [7] Sepal.Length > 1.7: * > > ### ids of all nodes > nodeids(tree) [1] 1 2 3 4 5 6 7 > > ### ids of all terminal nodes > nodeids(tree, terminal = TRUE) [1] 2 5 6 7 > > ### ids of terminal nodes in subtree with root [3] > nodeids(tree, from = 3, terminal = TRUE) [1] 5 6 7 > > ### get paths and extract all terminal nodes > tr <- unclass(node_party(tree)) > lapply(get_paths(tree, nodeids(tree, terminal = TRUE)), + function(path) tr[path]) [[1]] [[1]]$kids [[1]]$kids[[1]] [2] root * [[1]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[1]]$id [1] 1 [[2]] [[2]]$kids [[2]]$kids[[1]] [2] root * [[2]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[2]]$split $varid [1] 4 $breaks [1] 1.9 $index NULL $right [1] TRUE $prob NULL $info NULL attr(,"class") [1] "partysplit" [[2]]$kids [[2]]$kids[[1]] [2] root * [[2]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[2]]$id [1] 1 [[2]]$kids [[2]]$kids[[1]] [2] root * [[2]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[2]]$id [1] 1 [[3]] [[3]]$kids [[3]]$kids[[1]] [2] root * [[3]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[3]]$split $varid [1] 4 $breaks [1] 1.9 $index NULL $right [1] TRUE $prob NULL $info NULL attr(,"class") [1] "partysplit" [[3]]$kids [[3]]$kids[[1]] [2] root * [[3]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[3]]$id [1] 1 [[3]]$kids 
[[3]]$kids[[1]] [2] root * [[3]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[3]]$split $varid [1] 4 $breaks [1] 1.9 $index NULL $right [1] TRUE $prob NULL $info NULL attr(,"class") [1] "partysplit" [[4]] [[4]]$kids [[4]]$kids[[1]] [2] root * [[4]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[4]]$split $varid [1] 4 $breaks [1] 1.9 $index NULL $right [1] TRUE $prob NULL $info NULL attr(,"class") [1] "partysplit" [[4]]$kids [[4]]$kids[[1]] [2] root * [[4]]$kids[[2]] [3] root | [4] V1 <= 1.7 | | [5] V4 <= 4.8 * | | [6] V4 > 4.8 * | [7] V1 > 1.7 * [[4]]$split $varid [1] 4 $breaks [1] 1.9 $index NULL $right [1] TRUE $prob NULL $info NULL attr(,"class") [1] "partysplit" > > > > > cleanEx() > nameEx("party-coercion") > ### * party-coercion > > flush(stderr()); flush(stdout()) > > ### Name: party-coercion > ### Title: Coercion Functions > ### Aliases: party-coercion as.party as.party.rpart as.party.Weka_tree > ### as.party.XMLNode as.constparty as.simpleparty as.simpleparty.party > ### as.simpleparty.simpleparty as.simpleparty.XMLNode > ### as.simpleparty.constparty pmmlTreeModel > ### Keywords: tree > > ### ** Examples > > ## fit tree using rpart > library("rpart") > rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) > > ## coerce to `constparty' > as.party(rp) Model formula: Kyphosis ~ Age + Number + Start Fitted party: [1] root | [2] Start >= 8.5 | | [3] Start >= 14.5: absent (n = 29, err = 0.0%) | | [4] Start < 14.5 | | | [5] Age < 55: absent (n = 12, err = 0.0%) | | | [6] Age >= 55 | | | | [7] Age >= 111: absent (n = 14, err = 14.3%) | | | | [8] Age < 111: present (n = 7, err = 42.9%) | [9] Start < 8.5: present (n = 19, err = 42.1%) Number of inner nodes: 4 Number of terminal nodes: 5 > > > > cleanEx() detaching ‘package:rpart’ > nameEx("party-methods") > ### * party-methods > > flush(stderr()); flush(stdout()) > > ### Name: party-methods > ### Title: Methods for Party Objects > ### Aliases: party-methods length.party print.party print.simpleparty > ### print.constparty [.party [[.party depth.party width.party > ### getCall.party nodeprune nodeprune.party > ### Keywords: tree > > ### ** Examples > > > ## a tree as flat list structure > nodelist <- list( + # root node + list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), + kids = 2:3), + # V4 <= 1.9, terminal node + list(id = 2L), + # V4 > 1.9 + list(id = 3L, split = partysplit(varid = 5L, breaks = 1.7), + kids = c(4L, 7L)), + # V5 <= 1.7 + list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), + kids = 5:6), + # V4 <= 4.8, terminal node + list(id = 5L), + # V4 > 4.8, terminal node + list(id = 6L), + # V5 > 1.7, terminal node + list(id = 7L) + ) > > ## convert to a recursive structure > node <- as.partynode(nodelist) > > ## set up party object > data("iris") > tree <- party(node, data = iris, + fitted = data.frame("(fitted)" = + fitted_node(node, data = iris), + check.names = FALSE)) > names(tree) <- paste("Node", nodeids(tree), sep = " ") > > ## number of kids in root node > length(tree) [1] 7 > > ## depth of tree > depth(tree) [1] 3 > > ## number of terminal nodes > width(tree) [1] 4 > > ## node number four > tree["Node 4"] [Node 4] root | [Node 5] Petal.Width <= 4.8: * | [Node 6] Petal.Width > 4.8: * > tree[["Node 4"]] [Node 4] root | [Node 5] Petal.Width <= 4.8: * | [Node 6] Petal.Width > 4.8: * > > > > > cleanEx() > nameEx("party-predict") > ### * party-predict > > flush(stderr()); flush(stdout()) > > ### Name: 
party-predict > ### Title: Tree Predictions > ### Aliases: party-predict predict.party predict_party > ### predict_party.default predict_party.constparty > ### predict_party.simpleparty > ### Keywords: tree > > ### ** Examples > > > ## fit tree using rpart > library("rpart") > rp <- rpart(skips ~ Opening + Solder + Mask + PadType + Panel, + data = solder, method = 'anova') > > ## coerce to `constparty' > pr <- as.party(rp) > > ## mean predictions > predict(pr, newdata = solder[c(3, 541, 640),]) 3 541 640 1.030952 1.030952 1.030952 > > ## ecdf > predict(pr, newdata = solder[c(3, 541, 640),], type = "prob") $`3` Empirical CDF Call: ecdf(y) x[1:11] = 0, 1, 2, ..., 9, 12 $`541` Empirical CDF Call: ecdf(y) x[1:11] = 0, 1, 2, ..., 9, 12 $`640` Empirical CDF Call: ecdf(y) x[1:11] = 0, 1, 2, ..., 9, 12 > > ## terminal node identifiers > predict(pr, newdata = solder[c(3, 541, 640),], type = "node") 3 541 640 3 3 3 > > ## median predictions > predict(pr, newdata = solder[c(3, 541, 640),], + FUN = function(y, w = 1) median(y)) 3 541 640 0 0 0 > > > > > cleanEx() detaching ‘package:rpart’ > nameEx("party") > ### * party > > flush(stderr()); flush(stdout()) > > ### Name: party > ### Title: Recursive Partytioning > ### Aliases: party names.party names<-.party node_party is.constparty > ### is.simpleparty data_party data_party.default > ### Keywords: tree > > ### ** Examples > > ### data ### > ## artificial WeatherPlay data > data("WeatherPlay", package = "partykit") > str(WeatherPlay) 'data.frame': 14 obs. of 5 variables: $ outlook : Factor w/ 3 levels "sunny","overcast",..: 1 1 2 3 3 3 2 1 1 3 ... $ temperature: num 85 80 83 70 68 65 64 72 69 75 ... $ humidity : num 85 90 86 96 80 70 65 95 70 80 ... $ windy : Factor w/ 2 levels "false","true": 1 2 1 1 1 2 2 1 1 1 ... $ play : Factor w/ 2 levels "yes","no": 2 2 1 1 1 2 1 2 1 1 ... 
> > > ### splits ### > ## split in overcast, humidity, and windy > sp_o <- partysplit(1L, index = 1:3) > sp_h <- partysplit(3L, breaks = 75) > sp_w <- partysplit(4L, index = 1:2) > > ## query labels > character_split(sp_o) $name [1] "V1" $levels [1] "(-Inf,1]" "(1,2]" "(2, Inf]" > > > ### nodes ### > ## set up partynode structure > pn <- partynode(1L, split = sp_o, kids = list( + partynode(2L, split = sp_h, kids = list( + partynode(3L, info = "yes"), + partynode(4L, info = "no"))), + partynode(5L, info = "yes"), + partynode(6L, split = sp_w, kids = list( + partynode(7L, info = "yes"), + partynode(8L, info = "no"))))) > pn [1] root | [2] V1 in (-Inf,1] | | [3] V3 <= 75 * | | [4] V3 > 75 * | [5] V1 in (1,2] * | [6] V1 in (2, Inf] | | [7] V4 <= 1 * | | [8] V4 > 1 * > > > ### tree ### > ## party: associate recursive partynode structure with data > py <- party(pn, WeatherPlay) > py [1] root | [2] outlook in sunny | | [3] humidity <= 75: yes | | [4] humidity > 75: no | [5] outlook in overcast: yes | [6] outlook in rainy | | [7] windy in false: yes | | [8] windy in true: no > plot(py) > > > ### variations ### > ## tree stump > n1 <- partynode(id = 1L, split = sp_o, kids = lapply(2L:4L, partynode)) > print(n1, data = WeatherPlay) [1] root | [2] outlook in sunny * | [3] outlook in overcast * | [4] outlook in rainy * > > ## query fitted nodes and kids ids > fitted_node(n1, data = WeatherPlay) [1] 2 2 3 4 4 4 3 2 2 4 2 3 3 4 > kidids_node(n1, data = WeatherPlay) [1] 1 1 2 3 3 3 2 1 1 3 1 2 2 3 > > ## tree with full data sets > t1 <- party(n1, data = WeatherPlay) > > ## tree with empty data set > party(n1, data = WeatherPlay[0, ]) [1] root | [2] outlook in sunny: * | [3] outlook in overcast: * | [4] outlook in rainy: * > > ## constant-fit tree > t2 <- party(n1, + data = WeatherPlay, + fitted = data.frame( + "(fitted)" = fitted_node(n1, data = WeatherPlay), + "(response)" = WeatherPlay$play, + check.names = FALSE), + terms = terms(play ~ ., data = WeatherPlay), + ) > t2 <- as.constparty(t2) > t2 Model formula: play ~ outlook + temperature + humidity + windy Fitted party: [1] root | [2] outlook in sunny: no (n = 5, err = 40.0%) | [3] outlook in overcast: yes (n = 4, err = 0.0%) | [4] outlook in rainy: yes (n = 5, err = 40.0%) Number of inner nodes: 1 Number of terminal nodes: 3 > plot(t2) > > > > cleanEx() > nameEx("partynode-methods") > ### * partynode-methods > > flush(stderr()); flush(stdout()) > > ### Name: partynode-methods > ### Title: Methods for Node Objects > ### Aliases: partynode-methods is.partynode as.partynode > ### as.partynode.partynode as.partynode.list as.list.partynode > ### length.partynode [.partynode [[.partynode is.terminal > ### is.terminal.partynode depth.partynode width width.partynode > ### print.partynode nodeprune.partynode > ### Keywords: tree > > ### ** Examples > > ## a tree as flat list structure > nodelist <- list( + # root node + list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9), + kids = 2:3), + # V4 <= 1.9, terminal node + list(id = 2L), + # V4 > 1.9 + list(id = 3L, split = partysplit(varid = 1L, breaks = 1.7), + kids = c(4L, 7L)), + # V1 <= 1.7 + list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8), + kids = 5:6), + # V4 <= 4.8, terminal node + list(id = 5L), + # V4 > 4.8, terminal node + list(id = 6L), + # V1 > 1.7, terminal node + list(id = 7L) + ) > > ## convert to a recursive structure > node <- as.partynode(nodelist) > > ## print raw recursive structure without data > print(node) [1] root | [2] V4 <= 1.9 * | [3] V4 > 1.9 | | [4] V1 <= 1.7 | | | 
> 
> 
> 
> cleanEx()
> nameEx("partynode-methods")
> ### * partynode-methods
> 
> flush(stderr()); flush(stdout())
> 
> ### Name: partynode-methods
> ### Title: Methods for Node Objects
> ### Aliases: partynode-methods is.partynode as.partynode
> ###   as.partynode.partynode as.partynode.list as.list.partynode
> ###   length.partynode [.partynode [[.partynode is.terminal
> ###   is.terminal.partynode depth.partynode width width.partynode
> ###   print.partynode nodeprune.partynode
> ### Keywords: tree
> 
> ### ** Examples
> 
> ## a tree as flat list structure
> nodelist <- list(
+   # root node
+   list(id = 1L, split = partysplit(varid = 4L, breaks = 1.9),
+     kids = 2:3),
+   # V4 <= 1.9, terminal node
+   list(id = 2L),
+   # V4 > 1.9
+   list(id = 3L, split = partysplit(varid = 1L, breaks = 1.7),
+     kids = c(4L, 7L)),
+   # V1 <= 1.7
+   list(id = 4L, split = partysplit(varid = 4L, breaks = 4.8),
+     kids = 5:6),
+   # V4 <= 4.8, terminal node
+   list(id = 5L),
+   # V4 > 4.8, terminal node
+   list(id = 6L),
+   # V1 > 1.7, terminal node
+   list(id = 7L)
+ )
> 
> ## convert to a recursive structure
> node <- as.partynode(nodelist)
> 
> ## print raw recursive structure without data
> print(node)
[1] root
|   [2] V4 <= 1.9 *
|   [3] V4 > 1.9
|   |   [4] V1 <= 1.7
|   |   |   [5] V4 <= 4.8 *
|   |   |   [6] V4 > 4.8 *
|   |   [7] V1 > 1.7 *
> 
> ## print tree along with the associated iris data
> data("iris", package = "datasets")
> print(node, data = iris)
[1] root
|   [2] Petal.Width <= 1.9 *
|   [3] Petal.Width > 1.9
|   |   [4] Sepal.Length <= 1.7
|   |   |   [5] Petal.Width <= 4.8 *
|   |   |   [6] Petal.Width > 4.8 *
|   |   [7] Sepal.Length > 1.7 *
> 
> ## print subtree
> print(node[2], data = iris)
[3] root
|   [4] Sepal.Length <= 1.7
|   |   [5] Petal.Width <= 4.8 *
|   |   [6] Petal.Width > 4.8 *
|   [7] Sepal.Length > 1.7 *
> 
> ## print subtree, with root node number one
> print(as.partynode(node[2], from = 1), data = iris)
[1] root
|   [2] Sepal.Length <= 1.7
|   |   [3] Petal.Width <= 4.8 *
|   |   [4] Petal.Width > 4.8 *
|   [5] Sepal.Length > 1.7 *
> 
> ## number of kids in root node
> length(node)
[1] 2
> 
> ## depth of tree
> depth(node)
[1] 3
> 
> ## number of terminal nodes
> width(node)
[1] 4
> 
> ## convert back to flat structure
> as.list(node)
[[1]]
[[1]]$id
[1] 1

[[1]]$split
$varid
[1] 4

$breaks
[1] 1.9

$index
NULL

$right
[1] TRUE

$prob
NULL

$info
NULL

attr(,"class")
[1] "partysplit"

[[1]]$kids
[1] 2 3


[[2]]
[[2]]$id
[1] 2

[[2]]$info
NULL


[[3]]
[[3]]$id
[1] 3

[[3]]$split
$varid
[1] 1

$breaks
[1] 1.7

$index
NULL

$right
[1] TRUE

$prob
NULL

$info
NULL

attr(,"class")
[1] "partysplit"

[[3]]$kids
[1] 4 7


[[4]]
[[4]]$id
[1] 4

[[4]]$split
$varid
[1] 4

$breaks
[1] 4.8

$index
NULL

$right
[1] TRUE

$prob
NULL

$info
NULL

attr(,"class")
[1] "partysplit"

[[4]]$kids
[1] 5 6


[[5]]
[[5]]$id
[1] 5

[[5]]$info
NULL


[[6]]
[[6]]$id
[1] 6

[[6]]$info
NULL


[[7]]
[[7]]$id
[1] 7

[[7]]$info
NULL

> 
> 
> 
> cleanEx()
> nameEx("partynode")
> ### * partynode
> 
> flush(stderr()); flush(stdout())
> 
> ### Name: partynode
> ### Title: Inner and Terminal Nodes
> ### Aliases: partynode kidids_node fitted_node id_node split_node
> ###   surrogates_node kids_node info_node formatinfo_node
> ### Keywords: tree
> 
> ### ** Examples
> 
> data("iris", package = "datasets")
> 
> ## a stump defined by a binary split in Sepal.Length
> stump <- partynode(id = 1L,
+   split = partysplit(which(names(iris) == "Sepal.Length"),
+     breaks = 5),
+   kids = lapply(2:3, partynode))
> 
> ## textual representation
> print(stump, data = iris)
[1] root
|   [2] Sepal.Length <= 5 *
|   [3] Sepal.Length > 5 *
> 
> ## list element number and node id of the two terminal nodes
> table(kidids_node(stump, iris),
+   fitted_node(stump, data = iris))
   
      2   3
  1  32   0
  2   0 118
> 
> ## assign terminal nodes with probability 0.5
> ## to observations with missing `Sepal.Length'
> iris_NA <- iris
> iris_NA[sample(1:nrow(iris), 50), "Sepal.Length"] <- NA
> table(fitted_node(stump, data = iris_NA,
+   obs = !complete.cases(iris_NA)))

 2  3 
23 27 
> 
> ## a stump defined by a primary split in `Sepal.Length'
> ## and a surrogate split in `Sepal.Width' which
> ## determines terminal nodes for observations with
> ## missing `Sepal.Length'
> stump <- partynode(id = 1L,
+   split = partysplit(which(names(iris) == "Sepal.Length"),
+     breaks = 5),
+   kids = lapply(2:3, partynode),
+   surrogates = list(partysplit(
+     which(names(iris) == "Sepal.Width"), breaks = 3)))
> f <- fitted_node(stump, data = iris_NA,
+   obs = !complete.cases(iris_NA))
> tapply(iris_NA$Sepal.Width[!complete.cases(iris_NA)], f, range)
$`2`
[1] 2.3 3.0

$`3`
[1] 3.1 4.2
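## Further illustration (a sketch, not evaluated in this run): the
## exported accessor functions can inspect the stump programmatically;
## all of the calls below only read slots that the example created.
sp1 <- split_node(stump)    ## primary partysplit of the root node
varid_split(sp1)            ## column index of Sepal.Length, i.e. 1
breaks_split(sp1)           ## the cutpoint, i.e. 5
surrogates_node(stump)      ## list holding the surrogate split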
> 
> 
> 
> 
> cleanEx()
> nameEx("partysplit")
> ### * partysplit
> 
> flush(stderr()); flush(stdout())
> 
> ### Name: partysplit
> ### Title: Binary and Multiway Splits
> ### Aliases: partysplit kidids_split character_split varid_split
> ###   breaks_split index_split right_split prob_split info_split
> ### Keywords: tree
> 
> ### ** Examples
> 
> data("iris", package = "datasets")
> 
> ## binary split in numeric variable `Sepal.Length'
> sl5 <- partysplit(which(names(iris) == "Sepal.Length"),
+   breaks = 5)
> character_split(sl5, data = iris)
$name
[1] "Sepal.Length"

$levels
[1] "<= 5" "> 5" 

> table(kidids_split(sl5, data = iris), iris$Sepal.Length <= 5)
   
    FALSE TRUE
  1     0   32
  2   118    0
> 
> ## multiway split in numeric variable `Sepal.Width',
> ## higher values go to the first kid, smallest values
> ## to the last kid
> sw23 <- partysplit(which(names(iris) == "Sepal.Width"),
+   breaks = c(3, 3.5), index = 3:1)
> character_split(sw23, data = iris)
$name
[1] "Sepal.Width"

$levels
[1] "(3.5, Inf]" "(3,3.5]"    "(-Inf,3]"  

> table(kidids_split(sw23, data = iris),
+   cut(iris$Sepal.Width, breaks = c(-Inf, 2, 3, Inf)))
   
    (-Inf,2] (2,3] (3, Inf]
  1        0     0       19
  2        0     0       48
  3        1    82        0
> 
> ## binary split in factor `Species'
> sp <- partysplit(which(names(iris) == "Species"),
+   index = c(1L, 1L, 2L))
> character_split(sp, data = iris)
$name
[1] "Species"

$levels
[1] "setosa, versicolor" "virginica"         

> table(kidids_split(sp, data = iris), iris$Species)
   
    setosa versicolor virginica
  1     50         50         0
  2      0          0        50
> 
> ## multiway split in factor `Species'
> sp <- partysplit(which(names(iris) == "Species"), index = 1:3)
> character_split(sp, data = iris)
$name
[1] "Species"

$levels
[1] "setosa"     "versicolor" "virginica" 

> table(kidids_split(sp, data = iris), iris$Species)
   
    setosa versicolor virginica
  1     50          0         0
  2      0         50         0
  3      0          0        50
> 
> ## multiway split in numeric variable `Sepal.Width'
> sp <- partysplit(which(names(iris) == "Sepal.Width"),
+   breaks = quantile(iris$Sepal.Width))
> character_split(sp, data = iris)
$name
[1] "Sepal.Width"

$levels
[1] "(-Inf,2]"   "(2,2.8]"    "(2.8,3]"    "(3,3.3]"    "(3.3,4.4]" 
[6] "(4.4, Inf]"

> 
> 
> 
> 
> cleanEx()
> nameEx("prune.modelparty")
> ### * prune.modelparty
> 
> flush(stderr()); flush(stdout())
> 
> ### Name: prune.modelparty
> ### Title: Post-Prune 'modelparty' Objects
> ### Aliases: prune.modelparty prune.lmtree
> 
> ### ** Examples
> 
> set.seed(29)
> n <- 1000
> d <- data.frame(
+   x = runif(n),
+   z = runif(n),
+   z_noise = factor(sample(1:3, size = n, replace = TRUE))
+ )
> d$y <- rnorm(n, mean = d$x * c(-1, 1)[(d$z > 0.7) + 1], sd = 3)
> 
> ## glm versus lm / logLik versus sum of squared residuals
> fmla <- y ~ x | z + z_noise
> lm_big <- lmtree(formula = fmla, data = d, maxdepth = 3, alpha = 1)
> glm_big <- glmtree(formula = fmla, data = d, maxdepth = 3, alpha = 1)
> 
> AIC(lm_big)
[1] 5129.346
> AIC(glm_big)
[1] 5126.784
> 
> ## load rpart for prune() generic
> ## (otherwise: use prune.modelparty directly)
> if (require("rpart")) {
+ 
+ ## pruning
+ lm_aic <- prune(lm_big, type = "AIC")
+ lm_bic <- prune(lm_big, type = "BIC")
+ 
+ width(lm_big)
+ width(lm_aic)
+ width(lm_bic)
+ 
+ glm_aic <- prune(glm_big, type = "AIC")
+ glm_bic <- prune(glm_big, type = "BIC")
+ 
+ width(glm_big)
+ width(glm_aic)
+ width(glm_bic)
+ 
+ }
Loading required package: rpart
[1] 2
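## Further illustration (a sketch, not evaluated in this run):
## modelparty objects provide logLik() and nobs() methods, so the
## criteria driving the pruning can be recomputed directly; the AIC of
## the AIC-pruned tree should not exceed that of the full tree.
AIC(lm_big)
AIC(lm_aic)
BIC(lm_big)
BIC(lm_bic)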
> 
> 
> 
> cleanEx()
detaching ‘package:rpart’

> nameEx("varimp")
> ### * varimp
> 
> flush(stderr()); flush(stdout())
> 
> ### Name: varimp
> ### Title: Variable Importance
> ### Aliases: varimp varimp.constparty varimp.cforest
> ### Keywords: tree
> 
> ### ** Examples
> 
> 
> set.seed(290875)
> data("readingSkills", package = "party")
> readingSkills.cf <- cforest(score ~ ., data = readingSkills,
+     mtry = 2, ntree = 50)
> 
> # standard importance
> varimp(readingSkills.cf)
nativeSpeaker           age      shoeSize 
     12.17150      65.18361      16.09479 
> 
> # conditional importance, may take a while...
> varimp(readingSkills.cf, conditional = TRUE)
nativeSpeaker           age      shoeSize 
   12.3017295    39.5830711     0.4435327 
> 
> 
> 
> 
> ### *
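## Further illustration (a sketch, not evaluated in this run): both
## importance variants in the varimp example above rest on random
## permutations, so the reported numbers change between runs unless the
## random seed is fixed immediately beforehand.
set.seed(290875)
varimp(readingSkills.cf)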