Formula/0000755000175400001440000000000012127334604012054 5ustar zeileisusersFormula/MD50000644000175400001440000000134412127334604012366 0ustar zeileisusers0e06e9f93521a83d11d5959cddb04e4d *DESCRIPTION 3e144418e763582429edb2abd1dac97e *NAMESPACE 100f22205baad7d891a094ccd90f9073 *NEWS 3a254068f8b8e9900c4b7e0bec7fa11c *R/Formula.R cecc325558bfb9e8a1396e05dbcf9c23 *inst/CITATION ae4c4bf2ff5a0d0adbdfbb30bc0d23bf *inst/doc/Formula.R 0e62223f1646a8021221d03e42ea4a26 *inst/doc/Formula.Rnw 4cf99be17945e8171356a7aec051d5bd *inst/doc/Formula.pdf 7253145dfeaac6e10d37a3b00d21b957 *man/Formula.Rd 286599e5c9ab3a0526a5a8e39d7e1f44 *man/model.frame.Formula.Rd d580b42f75b07ab16cc40a6057127649 *tests/Examples/Formula-Ex.Rout.save 0e62223f1646a8021221d03e42ea4a26 *vignettes/Formula.Rnw e99ed6af2b803dc98ec4c6b2f6c042b4 *vignettes/Formula.Rout.save 4bab7df5607b24ff56c23a73e67c625e *vignettes/Formula.bib Formula/vignettes/0000755000175400001440000000000012127334421014061 5ustar zeileisusersFormula/vignettes/Formula.bib0000644000175400001440000001137011710474652016156 0ustar zeileisusers@Book{Formula:Chambers+Hastie:1992, editor = {John M. Chambers and Trevor J. Hastie}, title = {Statistical Models in \proglang{S}}, publisher = {Chapman \& Hall}, year = {1992}, address = {London} } @Manual{Formula:Coelli+Henningsen:2010, title = {\pkg{frontier}: Stochastic Frontier Analysis}, author = {Tim Coelli and Arne Henningsen}, year = {2010}, note = {\proglang{R}~package version~0.996-6}, url = {http://CRAN.R-project.org/package=frontier}, } @Article{Formula:Cribari-Neto+Zeileis:2010, author = {Francisco Cribari-Neto and Achim Zeileis}, title = {Beta Regression in \proglang{R}}, journal = {Journal of Statistical Software}, year = {2010}, volume = {34}, number = {2}, pages = {1--24}, url = {http://www.jstatsoft.org/v34/i02/} } @Manual{Formula:Croissant:2010, title = {\pkg{mlogit}: Multinomial Logit Model}, author = {Yves Croissant}, year = {2010}, note = {\proglang{R}~package version~0.1-5}, url = {http://CRAN.R-project.org/package=mlogit} } @Article{Formula:Croissant+Millo:2008, author = {Yves Croissant and Giovanni Millo}, title = {Panel Data Econometrics in \proglang{R}: The \pkg{plm} Package}, journal = {Journal of Statistical Software}, year = {2008}, volume = {27}, number = {2}, pages = {1--43}, url = {http://www.jstatsoft.org/v27/i02/} } @Article{Formula:Hothorn+Hornik+VanDeWiel:2006, author = {Torsten Hothorn and Kurt Hornik and Mark A. van de Wiel and Achim Zeileis}, title = {A {L}ego System for Conditional Inference}, journal = {The American Statistician}, year = {2006}, volume = {60}, number = {3}, pages = {257--263}, } @Article{Formula:Hothorn+Hornik+VanDeWiel:2008, author = {Torsten Hothorn and Kurt Hornik and Mark A. van de Wiel and Achim Zeileis}, title = {Implementing a Class of Permutation Tests: The \pkg{coin} Package}, year = {2008}, volume = {28}, number = {8}, pages = {1--23}, journal = {Journal of Statistical Software}, url = {http://www.jstatsoft.org/v28/i08/} } @Manual{Formula:Hothorn+Hornik+Zeileis:2009, title = {\pkg{party}: A Laboratory for Recursive Partytioning}, author = {Torsten Hothorn and Kurt Hornik and Achim Zeileis}, year = {2009}, note = {\proglang{R}~package version 0.9-998}, url = {http://CRAN.R-project.org/package=party} } @Book{Formula:Kleiber+Zeileis:2008, title = {Applied Econometrics with \proglang{R}}, author = {Christian Kleiber and Achim Zeileis}, year = {2008}, publisher = {Springer-Verlag}, address = {New York} } @Manual{Formula:R:2009, title = {\proglang{R}: {A} Language and Environment for Statistical Computing}, author = {{\proglang{R} Development Core Team}}, organization = {\proglang{R} Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2009}, note = {{ISBN} 3-900051-07-0}, url = {http://www.R-project.org/} } @Article{Formula:Stasinopoulos+Rigby:2007, author = {D. Mikis Stasinopoulos and Robert A. Rigby}, title = {Generalized Additive Models for Location Scale and Shape ({GAMLSS}) in \proglang{R}}, journal = {Journal of Statistical Software}, year = {2007}, volume = {23}, number = {7}, pages = {1--46}, url = {http://www.jstatsoft.org/v23/i07/} } @Manual{Formula:Therneau+Lumley:2009, title = {\pkg{survival}: Survival Analysis Including Penalised Likelihood}, author = {Terry Therneau and Thomas Lumley}, year = {2009}, note = {\proglang{R}~package version~2.35-4}, url = {http://CRAN.R-project.org/package=survival} } @Article{Formula:Wilkinson+Rogers:1973, author = {G. N. Wilkinson and C. E. Rogers}, title = {Symbolic Description of Factorial Models for Analysis of Variance}, journal = {Applied Statistics}, year = {1973}, volume = {22}, pages = {392--399} } @Article{Formula:Zeileis+Croissant:2010, author = {Achim Zeileis and Yves Croissant}, title = {Extended Model Formulas in \proglang{R}: Multiple Parts and Multiple Responses}, journal = {Journal of Statistical Software}, year = {2010}, volume = {34}, number = {1}, pages = {1--13}, url = {http://www.jstatsoft.org/v34/i01/} } @Article{Formula:Zeileis+Hothorn+Hornik:2008, author = {Achim Zeileis and Torsten Hothorn and Kurt Hornik}, title = {Model-Based Recursive Partitioning}, journal = {Journal of Computational and Graphical Statistics}, year = {2008}, volume = {17}, number = {2}, pages = {492--514}, } @Article{Formula:Zeileis+Kleiber+Jackman:2008, author = {Achim Zeileis and Christian Kleiber and Simon Jackman}, title = {Regression Models for Count Data in \proglang{R}}, journal = {Journal of Statistical Software}, year = {2008}, volume = {27}, number = {8}, pages = {1--25}, url = {http://www.jstatsoft.org/v27/i08/} } Formula/vignettes/Formula.Rout.save0000644000175400001440000000763511710514405017310 0ustar zeileisusers > options(width = 70, prompt = "R> ", continue = "+ ") > library("Formula") > set.seed(1090) > dat <- as.data.frame(matrix(round(runif(21), digits = 2), + ncol = 7)) > colnames(dat) <- c("y1", "y2", "y3", "x1", "x2", "x3", + "x4") > for (i in c(2, 6:7)) dat[[i]] <- factor(dat[[i]] < + 0.5, labels = c("a", "b")) > dat$y2[1] <- NA > dat y1 y2 y3 x1 x2 x3 x4 1 0.82 0.27 0.09 0.22 a b 2 0.70 a 0.17 0.26 0.46 b b 3 0.65 b 0.28 0.03 0.37 a a > F1 <- Formula(log(y1) ~ x1 + x2 | I(x1^2)) > length(F1) [1] 1 2 > mf1 <- model.frame(F1, data = dat) > mf1 log(y1) x1 x2 I(x1^2) 1 -0.1984509 0.09 0.22 0.0081 2 -0.3566749 0.26 0.46 0.0676 3 -0.4307829 0.03 0.37 9e-04 > model.response(mf1) 1 2 3 -0.1984509 -0.3566749 -0.4307829 > model.matrix(F1, data = mf1, rhs = 1) (Intercept) x1 x2 1 1 0.09 0.22 2 1 0.26 0.46 3 1 0.03 0.37 attr(,"assign") [1] 0 1 2 > model.matrix(F1, data = mf1, rhs = 2) (Intercept) I(x1^2) 1 1 0.0081 2 1 0.0676 3 1 0.0009 attr(,"assign") [1] 0 1 > F2 <- Formula(y1 + y2 ~ x3) > length(F2) [1] 1 1 > mf2 <- model.frame(F2, data = dat) > mf2 y1 y2 x3 2 0.70 a b 3 0.65 b a > model.response(mf2) NULL > model.part(F2, data = mf2, lhs = 1) y1 y2 2 0.70 a 3 0.65 b > model.part(F1, data = mf1, lhs = 1, drop = TRUE) 1 2 3 -0.1984509 -0.3566749 -0.4307829 > F3 <- Formula(y1 + y2 | log(y3) ~ x1 + I(x2^2) | 0 + + log(x1) | x3/x4) > F3 y1 + y2 | log(y3) ~ x1 + I(x2^2) | 0 + log(x1) | x3/x4 > length(F3) [1] 2 3 > attr(F3, "lhs") [[1]] y1 + y2 [[2]] log(y3) > formula(F3) y1 + y2 | log(y3) ~ x1 + I(x2^2) | 0 + log(x1) | x3/x4 > formula(F3, lhs = 2, rhs = -2) log(y3) ~ x1 + I(x2^2) | x3/x4 > formula(F3, lhs = c(TRUE, FALSE), rhs = 0) y1 + y2 ~ 0 > terms(F3) ~y1 + y2 + log(y3) + (x1 + I(x2^2)) + (0 + log(x1)) + x3/x4 attr(,"variables") list(y1, y2, log(y3), x1, I(x2^2), log(x1), x3, x4) attr(,"factors") y1 y2 log(y3) x1 I(x2^2) log(x1) x3 x3:x4 y1 1 0 0 0 0 0 0 0 y2 0 1 0 0 0 0 0 0 log(y3) 0 0 1 0 0 0 0 0 x1 0 0 0 1 0 0 0 0 I(x2^2) 0 0 0 0 1 0 0 0 log(x1) 0 0 0 0 0 1 0 0 x3 0 0 0 0 0 0 1 2 x4 0 0 0 0 0 0 0 1 attr(,"term.labels") [1] "y1" "y2" "log(y3)" "x1" "I(x2^2)" "log(x1)" [7] "x3" "x3:x4" attr(,"order") [1] 1 1 1 1 1 1 1 2 attr(,"intercept") [1] 0 attr(,"response") [1] 0 attr(,".Environment") > formula(terms(F3)) ~y1 + y2 + log(y3) + (x1 + I(x2^2)) + (0 + log(x1)) + x3/x4 > formula(terms(F3, lhs = 2, rhs = -2)) log(y3) ~ x1 + I(x2^2) + x3/x4 > formula(terms(F3, lhs = c(TRUE, FALSE), rhs = 0)) ~y1 + y2 > mf3 <- model.frame(F3, data = dat, subset = y1 < 0.75, + weights = x1) > mf3 y1 y2 log(y3) x1 I(x2^2) log(x1) x3 x4 (weights) 2 0.70 a -1.771957 0.26 0.2116 -1.347074 b b 0.26 3 0.65 b -1.272966 0.03 0.1369 -3.506558 a a 0.03 > model.matrix(F3, data = mf3, rhs = 2) log(x1) 2 -1.347074 3 -3.506558 attr(,"assign") [1] 1 > model.part(F3, data = mf3, lhs = 1) y1 y2 2 0.70 a 3 0.65 b > model.part(F3, data = mf3, lhs = 2) log(y3) 2 -1.771957 3 -1.272966 > model.weights(mf3) [1] 0.26 0.03 > update(F1, . ~ . - x1 | . + x1) log(y1) ~ x2 | I(x1^2) + x1 > update(F1, . + y2 | y3 ~ .) log(y1) + y2 | y3 ~ x1 + x2 | I(x1^2) > as.Formula(y1 ~ x1, y2 ~ x2, ~x3) y1 | y2 ~ x1 | x2 | x3 > ivcoef <- function(formula, data, subset, na.action, + ...) { + mf <- match.call(expand.dots = FALSE) + m <- match(c("formula", "data", .... [TRUNCATED] > ivcoef(log(y1) ~ x1 | x2, data = dat) (Intercept) x1 -0.169027 -1.260073 *** Run successfully completed *** Formula/vignettes/Formula.Rnw0000644000175400001440000006136511710474652016201 0ustar zeileisusers\documentclass[nojss]{jss} \usepackage{thumbpdf} %% need no \usepackage{Sweave} %% additional commands \newcommand{\squote}[1]{`{#1}'} \newcommand{\dquote}[1]{``{#1}''} \newcommand{\fct}[1]{{\texttt{#1()}}} \newcommand{\class}[1]{\dquote{\texttt{#1}}} %% for internal use \newcommand{\fixme}[1]{\emph{\marginpar{FIXME} (#1)}} \newcommand{\readme}[1]{\emph{\marginpar{README} (#1)}} \author{Achim Zeileis\\Universit\"at Innsbruck \And Yves Croissant\\Universit{\'e} de la R{\'e}union} \Plainauthor{Achim Zeileis, Yves Croissant} \title{Extended Model Formulas in \proglang{R}: Multiple Parts and Multiple Responses} \Plaintitle{Extended Model Formulas in R: Multiple Parts and Multiple Responses} \Shorttitle{Extended Model Formulas in \proglang{R}} \Keywords{formula processing, model frame, model matrix, \proglang{R}} \Plainkeywords{formula processing, model frame, model matrix, R} \Abstract{ This introduction to the \proglang{R} package \pkg{Formula} is a (slightly) modified version of \cite{Formula:Zeileis+Croissant:2010}, published in the \emph{Journal of Statistical Software}. Model formulas are the standard approach for specifying the variables in statistical models in the \proglang{S} language. Although being eminently useful in an extremely wide class of applications, they have certain limitations including being confined to single responses and not providing convenient support for processing formulas with multiple parts. The latter is relevant for models with two or more sets of variables, e.g., different equations for different model parameters (such as mean and dispersion), regressors and instruments in instrumental variable regressions, two-part models such as hurdle models, or alternative-specific and individual-specific variables in choice models among many others. The \proglang{R}~package \pkg{Formula} addresses these two problems by providing a new class \class{Formula} (inheriting from \class{formula}) that accepts an additional formula operator \code{|} separating multiple parts and by allowing all formula operators (including the new \code{|}) on the left-hand side to support multiple responses. } \Address{ Achim Zeileis\\ Department of Statistics\\ Universit\"at Innsbruck\\ Universit\"atsstr. 15\\ 6020 Innsbruck, Austria\\ E-mail: \email{Achim.Zeileis@R-project.org}\\ URL: \url{http://statmath.wu.ac.at/~zeileis/}\\ Yves Croissant\\ Universit{\'e} de la R{\'e}union\\ Facult{\'e} de Droit et d'Economie\\ 15, avenue Ren{\'e} Cassin\\ BP7151\\ 97715 Saint-Denis Messag Cedex 9, France\\ E-mail: \email{Yves.Croissant@univ-reunion.fr}\\ } \begin{document} \SweaveOpts{engine = R, eps = FALSE, keep.source = TRUE} %\VignetteIndexEntry{Extended Model Formulas in R: Multiple Parts and Multiple Responses} %\VignetteDepends{stats} %\VignetteKeywords{formula processing, model frame, model matrix, R} %\VignettePackage{Formula} <>= options(width = 70, prompt = "R> ", continue = "+ ") library("Formula") @ \section{Introduction} \label{sec:intro} Since publication of the seminal ``white book'' \citep{Formula:Chambers+Hastie:1992} the standard approach for fitting statistical models in the \proglang{S} language is to apply some model-fitting function (such as \fct{lm} or \fct{glm}) to a \class{formula} description of the variables involved in the model and typically stored in a \class{data.frame}. The semantics of formula processing are based on the ideas of the \cite{Formula:Wilkinson+Rogers:1973} notation which in turn was targeted at specification of analysis of variance models. Despite this emphasis on specification of terms in models with linear predictors, formula notation has always been used much more generally in \proglang{S}, e.g., for specifying variables in classification and regression trees, margins in contingency tables, or variables in graphical displays. In such applications, the precise meaning of a particular formula depends on the function that processes it. Typically, the standard formula processing approach would encompass extraction of the specified terms using \fct{terms}, preparation of a preprocessed data frame using \fct{model.frame}, and computation of a ``design'' or ``regressor'' matrix using \fct{model.matrix}. However, there are certain limitations to these standard formula processing tools in \proglang{S} that can be rather inconvenient in certain applications: \begin{enumerate} \item The formula notation can just be used on the right-hand side (RHS) of a formula (to the right of the \code{~}) while it has its original arithmetic meaning on the left-hand side (LHS). This makes it difficult to specify multiple responses, especially if these are not numeric (e.g., factors). This feature would be useful for specifying multivariate outcomes of mixed types in independence tests \citep[e.g., in the \pkg{coin} package,][]{Formula:Hothorn+Hornik+VanDeWiel:2006,Formula:Hothorn+Hornik+VanDeWiel:2008} or in models with multivariate responses \citep[e.g., supported in the \pkg{party} package][]{Formula:Zeileis+Hothorn+Hornik:2008}. \item There is no simple construct in standard formula notation that allows one to separate several groups of variables from which separate model matrices can be derived. This task occurs in many types of models, e.g., when processing separate sets of variables for mean and dispersion \citep[e.g., in the \pkg{betareg} package,][]{Formula:Cribari-Neto+Zeileis:2010}, separate equations for location, scatter, and shape \citep[e.g., in the \pkg{gamlss} package,][]{Formula:Stasinopoulos+Rigby:2007}, regressors and instruments in instrumental variable regressions (e.g., in the \pkg{plm} package, \citealp{Formula:Croissant+Millo:2008}, or the \pkg{AER} package, \citealp{Formula:Kleiber+Zeileis:2008}), variables in two-part models such as hurdle models or zero-inflated regressions \citep[e.g., in the \pkg{pscl} package,][]{Formula:Zeileis+Kleiber+Jackman:2008}, alternative-specific and individual-specific variables in choice models \citep[e.g., in the \pkg{mlogit} package,][]{Formula:Croissant:2010}, efficiency level variables in stochastic frontier analysis \citep[e.g., in the \pkg{frontier} package,][]{Formula:Coelli+Henningsen:2010}, or modeling variables and partitioning variables in model-based recursive partitioning techniques \citep[e.g., in the \pkg{party} package,][]{Formula:Zeileis+Hothorn+Hornik:2008}. \end{enumerate} In many of the aformentioned packages, standard \class{formula} objects are employed but their processing is generalized, e.g., by using multiple formulas, multiple terms, by adding new formula operators or even more elaborate solutions. However, in many situations it is not easy to reuse these generalizations outside the package/function they were designed for. Therefore, as we repeatedly needed such generalizations in our own packages and addressed this in the past by various different solutions, it seemed natural to provide a more general unified approach by means of our new \pkg{Formula} package. This has already been reused in some of our own packages (including \pkg{AER}, \pkg{betareg}, \pkg{mlogit}, and \pkg{plm}) but can also be easily employed by other package developers (e.g., as in the \pkg{frontier} package). More applications in our own and other packages will hopefully follow. In the remainder of this paper we discuss how multiple responses and multiple parts (both on the LHS and RHS) are enabled in the \pkg{Formula} package written in the \proglang{R} system for statistical computing \citep{Formula:R:2009} and available from the Comprehensive \proglang{R} Archive Network (CRAN) at \url{http://CRAN.R-project.org/package=Formula}. Built on top of basic \class{formula} objects, the \class{Formula} class just adds a thin additional layer which is based on a single additional operator, namely \code{|}, that can be used to separate different parts (or groups) of variables. \pkg{Formula} essentially just handles the different formula parts and leverages the existing methods for \class{formula} objects for all remaining operations.\footnote{Throughout the paper, no terminological distinction is made between classic \class{formula} objects as such and the way they are interpreted by standard processing tools (e.g., \fct{terms}, \fct{model.frame}, \fct{model.matrix}). The reasons for this are twofold: First, it reflects their use as described in \cite{Formula:Chambers+Hastie:1992} and, second, generalizations generally require extra effort from the user.} In Section~\ref{sec:motivation} we show two small motivating examples that convey the main ideas implemented in the package and how easily they can be employed. The details of the \class{Formula} class and its associated methods are discussed in Section~\ref{sec:implementation} and Section~\ref{sec:usage} illustrates the usage of \pkg{Formula} in developing new model-fitting functions. A short summary in Section~\ref{sec:summary} concludes the paper. \section{Motivating examples} \label{sec:motivation} To illustrate the basic ideas of the \pkg{Formula} package, we first generate a small artificial data set (with both numeric and categorical variables) and subsequently illustrate its usage with a multi-part and a multi-response \class{Formula}, respectively. <>= set.seed(1090) dat <- as.data.frame(matrix(round(runif(21), digits = 2), ncol = 7)) colnames(dat) <- c("y1", "y2", "y3", "x1", "x2", "x3", "x4") for(i in c(2, 6:7)) dat[[i]] <- factor(dat[[i]] < 0.5, labels = c("a", "b")) dat$y2[1] <- NA dat @ \subsection{Multiple parts} We start out with a simple formula \verb/log(y1) ~ x1 + x2 | I(x1^2)/ which has a single response \code{log(y1)} on the LHS and two parts on the RHS, separated by \code{|}. The first part contains \code{x1} and \code{x2}, the second contains \verb/I(x1^2)/, i.e., the squared values of \code{x1}. The initial \class{formula} can be transformed to a \class{Formula} using the constructor function \fct{Formula}: % <>= F1 <- Formula(log(y1) ~ x1 + x2 | I(x1^2)) length(F1) @ % The \fct{length} method indicates that there is one part on the LHS and two parts on the RHS. The first step of processing data using a formula is typically the construction of a so-called model frame containing only the variables required by the formula. As usual, this can be obtained with the \fct{model.frame} method. % <>= mf1 <- model.frame(F1, data = dat) mf1 @ % As this model just has a single response (as in base \class{formula} objects), the extractor function \fct{model.response} can be employed: % <>= model.response(mf1) @ % For constructing separate model matrices for the two parts on the RHS, the \fct{model.matrix} can be employed and additionally specifying the argument \code{rhs}: % <>= model.matrix(F1, data = mf1, rhs = 1) model.matrix(F1, data = mf1, rhs = 2) @ \subsection{Multiple responses} To accommodate multiple responses, all formula operators can be employed on the LHS in \class{Formula} objects (whereas they would have their original arithmetic meaning in \class{formula} objects). This also includes the new \code{|} operator for separating different parts. Thus, one could specify a two-part response via \code{y1 | y2 ~ x3} or a single part with two variables via \code{y1 + y2 ~ x3}. We do the latter in the following illustration. % <>= F2 <- Formula(y1 + y2 ~ x3) length(F2) @ % As usual, the model frame can be derived by % <>= mf2 <- model.frame(F2, data = dat) mf2 @ % However, there is an important difference to the model frame \code{mf1} derived in the previous example. As the (non-generic) \fct{model.response} function would only be able to extract a single response column from a model frame, multi-response model frames in \pkg{Formula} are implemented to have no response at all in the \fct{model.response} sense: % <>= model.response(mf2) @ % As \fct{model.response} cannot be extended (without overloading the base \proglang{R} function), \pkg{Formula} provides a new generic function \fct{model.part} which can be used to extract all variables from a model frame pertaining to specific parts. This can also be used to extract multiple responses. Its syntax is modeled after \fct{model.matrix} taking a \class{Formula} as the first argument. For further details see Section~\ref{sec:implementation}. Its application is straightforward and all LHS variables (in the first and only part of the LHS) can be extracted via % <>= model.part(F2, data = mf2, lhs = 1) @ % The same method also works for single response models as in the previous example: % <>= model.part(F1, data = mf1, lhs = 1, drop = TRUE) @ \section{Implementation} \label{sec:implementation} Below we discuss the ideas for the design of the \class{Formula} class and methods. As all tools are built on top of the \class{formula} class and its associated methods whose most important feature are briefly outlined as well. \subsection[Working with classic ``formula'' objects]{Working with classic \class{formula} objects} Classic \class{formula} objects \citep{Formula:Chambers+Hastie:1992} are constructed by using \code{~} to separate LHS and RHS, typically (but not necessarily) interpreted as ``dependent'' and ``explanatory'' variables. For describing relationships between variables on the RHS, several operators can be used: \code{+}, \code{-}, \code{*}, \code{/}, \code{:}, \verb+%in%+, \verb+^+. Thus, these do not have their original meaning in the top-level RHS while they keep their original arithmetic meaning on higher levels of the RHS (thus, within function calls such as \verb+I(x1^2)+) and on the LHS in general. A first step in using \class{formula} objects is often to compute the associated \class{terms} using the function \fct{terms}. Based on the formula or the associated terms and a suitable set of variables (typically either in a data frame or in the global environment) \fct{model.frame} can build a so-called model frame that contains all variables in the formula/terms. This might include processing of missing values (\code{NA}s), carrying out variable transformations (such as logs, squares, or other functions of one or more variables) and providing further variables like weights, offset, etc. A model frame is simply a \class{data.frame} with additional attributes (including \class{terms}) but without a specific class. From this preprocessed model frame several components can be extracted using \fct{model.extract} or \fct{model.response}, \fct{model.weights}, and \fct{model.offset}, all of which are non-generic. Last not least, \fct{model.matrix} (which is generic) can compute ``design'' or ``regressor'' matrices based on the formula/terms and the associated model frame. \subsection[Constructing ``Formula'' objects]{Constructing \class{Formula} objects} To accomplish the main objectives of the \class{Formula} class, the following design principles have been used: reuse of \class{formula} objects, introduction of a single new operator \code{|}, and support of all formula operators on the LHS. Thus, \code{|} loses its original meaning (logical ``or'') on the first level of formulas but can still be used with its original meaning on higher levels, e.g., \code{factor(x1 > 0.5 | x3 == "a")} still works as before. For assigning a new class to formulas containing \code{|}, the constructor function \fct{Formula} is used: % <>= F3 <- Formula(y1 + y2 | log(y3) ~ x1 + I(x2^2) | 0 + log(x1) | x3 / x4) F3 length(F3) @ % In this example, \code{F3} is an artificially complex formula with two parts on the LHS and three parts on the RHS, both containing multiple terms, transformations or other formula operators. Apart from assigning the new class \class{Formula} (in addition to the old \class{formula} class), \fct{Formula} also splits up the formula into LHS and RHS parts which are stored as list attributes \code{"lhs"} and \code{"rhs"}, respectively, e.g., % <>= attr(F3, "lhs") @ and analogously \code{attr(F3, "rhs")}. The user never has to compute on these attributes directly, but many methods for \class{Formula} objects take \code{lhs} and/or \code{rhs} arguments. These always refer to index vectors for the two respective lists. It would have been conceivable to generalize not only the notion of formulas but also of terms or model frames. However, there are few generics with methods for \class{terms} objects and there is no particular class for model frames at all. Hence, computing with generalized versions of these concepts would have required much more overhead for users of \pkg{Formula}. Hence, it was decided not to do so and keep the package interface as simple as possible. \subsection[Extracting ``formula'' and ``terms'' objects]{Extracting \class{formula} and \class{terms} objects} As subsequent computations typically require a \class{formula} or a \class{terms} object, \pkg{Formula} provides suitable \fct{formula} and \fct{terms} extractors for \class{Formula} objects. For the former, the idea is to be able to switch back and forth between the \class{Formula} and \class{formula} representation, e.g., \code{formula(Formula(...))} should recover the original input formula. For the latter, the objective is somewhat different: \fct{terms} should always return a \class{terms} object that can be processed by \fct{model.frame} and similar functions. Thus, the terms must not contain multiple responses and/or the new \code{|} operator. The \fct{formula} method is straightforward. When no additional arguments are supplied it recovers the original \class{formula}. Furthermore, there are two optional additional arguments \code{lhs} and \code{rhs}. With these arguments subsets of formulas can be chosen, indexing the LHS and RHS parts. The default value for both is \code{NULL}, meaning that all parts are employed. % <>= formula(F3) formula(F3, lhs = 2, rhs = -2) formula(F3, lhs = c(TRUE, FALSE), rhs = 0) @ Similarly, \fct{terms} computes a \class{terms} object, by default using all parts in the formula, but \code{lhs} and \code{rhs} can be used as above. To remove the \code{|} operator, all parts are collapsed using the \code{+} operator. Furthermore, the LHS variables can only be kept on the LHS if they contain a single term. Otherwise, to stop subsequent computations from interpreting the formula operators as arithmetic operators, all LHS components are added on the RHS as well. Thus, for \code{F3} we obtain % <>= terms(F3) @ % Instead of using all parts, subsets can again be selected. We illustrate this below but only show the associated \class{formula} to save output space: <>= formula(terms(F3)) formula(terms(F3, lhs = 2, rhs = -2)) formula(terms(F3, lhs = c(TRUE, FALSE), rhs = 0)) @ \subsection{Computing model frames, matrices, and responses} Given that suitable \class{terms} can be extracted from \class{Formula} objects, it is straightforward to set up the corresponding model frame. The \fct{model.frame} method simply first calls the \fct{terms} method and then applies the default \fct{model.frame}. Hence, all further arguments are processed as usual, e.g., % <>= mf3 <- model.frame(F3, data = dat, subset = y1 < 0.75, weights = x1) mf3 @ % All subsequent computations are then based on this preprocessed model frame (and possibly the original \class{Formula}). Thus, the model matrices for each RHS part can be easily computed, again setting the \code{rhs} argument: % <>= model.matrix(F3, data = mf3, rhs = 2) @ % Typically, just a single RHS will be selected and hence \code{rhs = 1} and not \code{rhs = NULL} is the default in this method. However, multiple RHS parts are also supported. Also, there is a \code{lhs} argument available (defaulting to \code{NULL}) which might seem unnecessary at first sight but it is important in case the selected RHS part(s) contain(s) a ``\code{.}'' that needs to be resolved (see \code{?model.matrix.Formula} for an example). The LHS parts can be extracted using the method for the new \fct{model.part} generic, employing a syntax similar to \fct{model.matrix}: % <>= model.part(F3, data = mf3, lhs = 1) model.part(F3, data = mf3, lhs = 2) @ % As argued above, introduction of a new generic is necessary for supporting multi-response formulas because \fct{model.response} is non-generic. For model frames derived from single-response formulas, \fct{model.response} can be used as usual. The remaining extractors work as usual: % <>= model.weights(mf3) @ \subsection{Further methods} To conclude the suite of methods available for the new \class{Formula} class, \pkg{Formula} provides an \fct{update} method and a new \fct{as.Formula} generic with suitable methods. The former updates the formula part by part, adding new parts if necessary: % <>= update(F1, . ~ . - x1 | . + x1) update(F1, . + y2 | y3 ~ .) @ % The \fct{as.Formula} method coerces to \class{Formula}, possibly also processing multiple arguments: % <>= as.Formula(y1 ~ x1, y2 ~ x2, ~ x3) @ \section{Usage in model fitting functions} \label{sec:usage} A typical application of \pkg{Formula} is to provide the workhorse for formula processing in model-fitting functions that require specification of multiple parts or multiple responses. To provide a very brief and simple example, we show how such a function can be set up. For illustration, we compute the coefficients in an instrumental variables regression using two-stage least squares. The \fct{ivcoef} function below takes the usual arguments \code{formula}, \code{data}, \code{subset}, and\linebreak \code{na.action} (and further arguments \code{weights} and \code{offset} could be included in the same way). The \code{formula} should be a two-part formula like \code{y ~ x1 + x2 | z1 + z2 + z3}. There is a single response on the LHS, one RHS part with the regressors and a second RHS part with the instruments. The function \fct{ivcoef} uses the typical workflow of model-fitting functions and processes its arguments in the following four steps: (1)~process the call, (2)~set up the model frame (using the \class{Formula} method), (3)~extract response and regressors from the model frame, (4)~estimate the model (by calling \fct{lm.fit} twice to compute the two-stage least squares coefficients). % <>= ivcoef <- function(formula, data, subset, na.action, ...) { mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data", "subset", "na.action"), names(mf), 0) mf <- mf[c(1, m)] f <- Formula(formula) mf[[1]] <- as.name("model.frame") mf$formula <- f mf <- eval(mf, parent.frame()) y <- model.response(mf) x <- model.matrix(f, data = mf, rhs = 1) z <- model.matrix(f, data = mf, rhs = 2) xz <- as.matrix(lm.fit(z, x)$fitted.values) lm.fit(xz, y)$coefficients } @ % The resulting function can then be applied easily (albeit not very meaningfully) to the \code{dat} data frame: % <>= ivcoef(log(y1) ~ x1 | x2, data = dat) @ % The same coefficients can be derived along with all the usual inference using the \fct{ivreg} function from the \pkg{AER} package \citep{Formula:Kleiber+Zeileis:2008}, which also uses the \pkg{Formula} tools in its latest release. Apart from providing inference and many other details, \fct{ivreg} also supports \code{weights}, \code{offsets} etc. Finally, for backward compatibility, the function also allows separate formulas for regressors and instruments (i.e., \code{formula = y ~ x1 + x2} and \code{instruments = ~ z1 + z2 + z3}) which can be easily incorporated using the \pkg{Formula} tools, e.g., replacing \code{f <- Formula(formula)} by % \begin{Sinput} + f <- if(!is.null(instruments)) as.Formula(formula, instruments) + else as.Formula(formula) + stopifnot(isTRUE(all.equal(length(f), c(1, 2)))) \end{Sinput} % In summary, the usage of \pkg{Formula} should reduce the overhead for the developers of model-fitting functions in \proglang{R} with multiple responses and/or multiple parts and make the resulting programs more intelligible. Further \proglang{R} packages employing the \class{Formula} approach can be obtained from CRAN, including \pkg{betareg}, \pkg{frontier}, \pkg{mlogit}, and \pkg{plm}. \section{Summary} \label{sec:summary} The \pkg{Formula} package provides tools for processing multi-response and multi-part formulas in the \proglang{R} system for statistical computing. The new class \class{Formula} inherits from the existing \class{formula} class, only adds a single new formula operator \code{|}, and enables the usage of formula operators on the left-hand side of formulas. The methods provided for \class{Formula} objects are as similar as possible to the classic methods, facilitating their usage in model-fitting functions that require support for multiple responses and/or multiple parts of variables. \bibliography{Formula} \end{document} Formula/tests/0000755000175400001440000000000012127334421013213 5ustar zeileisusersFormula/tests/Examples/0000755000175400001440000000000012127334421014771 5ustar zeileisusersFormula/tests/Examples/Formula-Ex.Rout.save0000644000175400001440000001747512127327302020576 0ustar zeileisusers R version 3.0.0 (2013-04-03) -- "Masked Marvel" Copyright (C) 2013 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > pkgname <- "Formula" > source(file.path(R.home("share"), "R", "examples-header.R")) > options(warn = 1) > library('Formula') > > base::assign(".oldSearch", base::search(), pos = 'CheckExEnv') > cleanEx() > nameEx("Formula") > ### * Formula > > flush(stderr()); flush(stdout()) > > ### Name: Formula > ### Title: Extended Formulas: Multiple Responses and Multiple Regressor > ### Parts > ### Aliases: Formula formula.Formula as.Formula as.Formula.default > ### as.Formula.formula as.Formula.Formula is.Formula print.Formula > ### update.Formula length.Formula all.equal.Formula > ### Keywords: classes > > ### ** Examples > > ## create a simple Formula with one response and two regressor parts > f1 <- y ~ x1 + x2 | z1 + z2 + z3 > F1 <- Formula(f1) > class(F1) [1] "Formula" "formula" > length(F1) [1] 1 2 > > ## switch back to original formula > formula(F1) y ~ x1 + x2 | z1 + z2 + z3 > > ## create formula with various transformations > formula(F1, rhs = 1) y ~ x1 + x2 > formula(F1, collapse = TRUE) y ~ x1 + x2 + (z1 + z2 + z3) > formula(F1, lhs = 0, rhs = 2) ~z1 + z2 + z3 > > ## put it together from its parts > as.Formula(y ~ x1 + x2, ~ z1 + z2 + z3) y ~ x1 + x2 | z1 + z2 + z3 > > ## update the formula > update(F1, . ~ . + I(x1^2) | . - z2 - z3) y ~ x1 + x2 + I(x1^2) | z1 > update(F1, . | y2 + y3 ~ .) y | y2 + y3 ~ x1 + x2 | z1 + z2 + z3 > > # create a multi-response multi-part formula > f2 <- y1 | y2 + y3 ~ x1 + I(x2^2) | 0 + log(x1) | x3 / x4 > F2 <- Formula(f2) > length(F2) [1] 2 3 > > ## obtain various subsets using standard indexing > ## no lhs, first/seconde rhs > formula(F2, lhs = 0, rhs = 1:2) ~x1 + I(x2^2) | 0 + log(x1) > formula(F2, lhs = 0, rhs = -3) ~x1 + I(x2^2) | 0 + log(x1) > formula(F2, lhs = 0, rhs = c(TRUE, TRUE, FALSE)) ~x1 + I(x2^2) | 0 + log(x1) > ## first lhs, third rhs > formula(F2, lhs = c(TRUE, FALSE), rhs = 3) y1 ~ x3/x4 > > > > cleanEx() > nameEx("model.frame.Formula") > ### * model.frame.Formula > > flush(stderr()); flush(stdout()) > > ### Name: model.frame.Formula > ### Title: Model Frame/Matrix/Response Construction for Extended Formulas > ### Aliases: terms.Formula model.matrix.Formula model.frame.Formula > ### model.part model.part.formula model.part.Formula > ### Keywords: models > > ### ** Examples > > ## artificial example data > set.seed(1090) > dat <- as.data.frame(matrix(round(runif(21), digits = 2), ncol = 7)) > colnames(dat) <- c("y1", "y2", "y3", "x1", "x2", "x3", "x4") > for(i in c(2, 6:7)) dat[[i]] <- factor(dat[[i]] > 0.5, labels = c("a", "b")) > dat$y2[1] <- NA > dat y1 y2 y3 x1 x2 x3 x4 1 0.82 0.27 0.09 0.22 b a 2 0.70 b 0.17 0.26 0.46 a a 3 0.65 a 0.28 0.03 0.37 b b > > ###################################### > ## single response and two-part RHS ## > ###################################### > > ## single response with two-part RHS > F1 <- Formula(log(y1) ~ x1 + x2 | I(x1^2)) > length(F1) [1] 1 2 > > ## set up model frame > mf1 <- model.frame(F1, data = dat) > mf1 log(y1) x1 x2 I(x1^2) 1 -0.1984509 0.09 0.22 0.0081 2 -0.3566749 0.26 0.46 0.0676 3 -0.4307829 0.03 0.37 9e-04 > > ## extract single response > model.part(F1, data = mf1, lhs = 1, drop = TRUE) 1 2 3 -0.1984509 -0.3566749 -0.4307829 > model.response(mf1) 1 2 3 -0.1984509 -0.3566749 -0.4307829 > ## model.response() works as usual > > ## extract model matrices > model.matrix(F1, data = mf1, rhs = 1) (Intercept) x1 x2 1 1 0.09 0.22 2 1 0.26 0.46 3 1 0.03 0.37 attr(,"assign") [1] 0 1 2 > model.matrix(F1, data = mf1, rhs = 2) (Intercept) I(x1^2) 1 1 0.0081 2 1 0.0676 3 1 0.0009 attr(,"assign") [1] 0 1 > > ######################################### > ## multiple responses and multiple RHS ## > ######################################### > > ## set up Formula > F2 <- Formula(y1 + y2 | log(y3) ~ x1 + I(x2^2) | 0 + log(x1) | x3 / x4) > length(F2) [1] 2 3 > > ## set up full model frame > mf2 <- model.frame(F2, data = dat) > mf2 y1 y2 log(y3) x1 I(x2^2) log(x1) x3 x4 2 0.70 b -1.771957 0.26 0.2116 -1.347074 a a 3 0.65 a -1.272966 0.03 0.1369 -3.506558 b b > > ## extract responses > model.part(F2, data = mf2, lhs = 1) y1 y2 2 0.70 b 3 0.65 a > model.part(F2, data = mf2, lhs = 2) log(y3) 2 -1.771957 3 -1.272966 > ## model.response(mf2) does not give correct results! > > ## extract model matrices > model.matrix(F2, data = mf2, rhs = 1) (Intercept) x1 I(x2^2) 2 1 0.26 0.2116 3 1 0.03 0.1369 attr(,"assign") [1] 0 1 2 > model.matrix(F2, data = mf2, rhs = 2) log(x1) 2 -1.347074 3 -3.506558 attr(,"assign") [1] 1 > model.matrix(F2, data = mf2, rhs = 3) (Intercept) x3b x3a:x4b x3b:x4b 2 1 0 0 0 3 1 1 0 1 attr(,"assign") [1] 0 1 2 2 attr(,"contrasts") attr(,"contrasts")$x3 [1] "contr.treatment" attr(,"contrasts")$x4 [1] "contr.treatment" > > ####################### > ## Formulas with '.' ## > ####################### > > ## set up Formula > F3 <- Formula(y1 | y2 ~ .) > mf3 <- model.frame(F3, data = dat) > ## without y1 or y2 > model.matrix(F3, data = mf3) (Intercept) y3 x1 x2 x3b x4b 2 1 0.17 0.26 0.46 0 0 3 1 0.28 0.03 0.37 1 1 attr(,"assign") [1] 0 1 2 3 4 5 attr(,"contrasts") attr(,"contrasts")$x3 [1] "contr.treatment" attr(,"contrasts")$x4 [1] "contr.treatment" > ## without y1 but with y2 > model.matrix(F3, data = mf3, lhs = 1) (Intercept) y2b y3 x1 x2 x3b x4b 2 1 1 0.17 0.26 0.46 0 0 3 1 0 0.28 0.03 0.37 1 1 attr(,"assign") [1] 0 1 2 3 4 5 6 attr(,"contrasts") attr(,"contrasts")$y2 [1] "contr.treatment" attr(,"contrasts")$x3 [1] "contr.treatment" attr(,"contrasts")$x4 [1] "contr.treatment" > ## without y2 but with y1 > model.matrix(F3, data = mf3, lhs = 2) (Intercept) y1 y3 x1 x2 x3b x4b 2 1 0.70 0.17 0.26 0.46 0 0 3 1 0.65 0.28 0.03 0.37 1 1 attr(,"assign") [1] 0 1 2 3 4 5 6 attr(,"contrasts") attr(,"contrasts")$x3 [1] "contr.treatment" attr(,"contrasts")$x4 [1] "contr.treatment" > > ############################## > ## Process multiple offsets ## > ############################## > > ## set up Formula > F4 <- Formula(y1 ~ x3 + offset(x1) | x4 + offset(log(x2))) > mf4 <- model.frame(F4, data = dat) > ## model.part can be applied as above and includes offset! > model.part(F4, data = mf4, rhs = 1) x3 offset(x1) 1 b 0.09 2 a 0.26 3 b 0.03 > ## additionally, the corresponding corresponding terms can be included > model.part(F4, data = mf4, rhs = 1, terms = TRUE) x3 offset(x1) 1 b 0.09 2 a 0.26 3 b 0.03 > ## hence model.offset() can be applied to extract offsets > model.offset(model.part(F4, data = mf4, rhs = 1, terms = TRUE)) [1] 0.09 0.26 0.03 > model.offset(model.part(F4, data = mf4, rhs = 2, terms = TRUE)) [1] -1.5141277 -0.7765288 -0.9942523 > > > > ### *