# File: merTools/inst/shiny-apps/shinyMer/ui.R

# ShinyMer
shinyUI(fluidPage(
shiny::titlePanel("Explore your merMod interactively"),
shiny::sidebarLayout(
shiny::sidebarPanel(
shiny::radioButtons("newdataType",
"Simulated data scenario",
choices=df.choices,
selected=NULL),
# conditionalPanel(condition = "input.newdataType!='orig'",
# selectInput("filter", "Filter",
# choices = names(merMod@frame))
# ),
shiny::numericInput("n.sims",
label="Simulations (Max=1,000)",
value=100,
min=1,
max=1000),
conditionalPanel(condition="input.conditionedPanels==3",
helpText("Here you can compare impact of
changing input variables on the
outcome variable for selected cases."),
selectInput("group", "Group Factor:",
choices = names(ranef(merMod)),
selected = NULL),
uiOutput("re.ui"),
sliderInput("nbin", "Effect Bins", min = 3,
max = 10, value = 4, step = 1),
helpText("And modify your fixed effects"),
selectInput("fixef", "Fixed Effect:",
choices = all.vars(nobars(formula(merMod)))[-1],
selected = NULL)
),
shiny::numericInput("alpha",
label="Credible Interval (%)",
value=95,
min=0,
max=100),
shiny::radioButtons("stat",
"Measure of central tendency",
choices=c("Median"="median", "Mean"="mean"),
selected=NULL),
shiny::radioButtons("predMetric",
"Prediction metric",
choices=c("Linear Predictor"="linear.prediction",
"Probability"="probability"),
selected=NULL),
shiny::checkboxInput("resid.var",
label="Include Residual Variation",
value=TRUE)
),
shiny::mainPanel(
shiny::tabsetPanel(type="tabs",
shiny::tabPanel("Prediction uncertainty",
shiny::h3("Prediction Intervals:"),
plotOutput("predPlot"),
shiny::h3("All Predictions"),
if ("DT" %in% rownames(installed.packages())) {
dataTableOutput("dt") } else {
tableOutput("dt")  # server renders output$dt in both branches
},
shiny::downloadButton("downloadData", "Download predict interval data")
,value = 1
),
shiny::tabPanel("Parameters",
shiny::h3("Original call"),
textOutput("call"),
shiny::h3("Fixed Effects"),
plotOutput("feplot"),
shiny::h3("Group Effects"),
plotOutput("replot"), value = 2
),
shiny::tabPanel("Substantive Effect",
shiny::h3("Effect Sizes"),
plotOutput("gPlot"),
shiny::h3("Fixef Effect Impact"),
plotOutput("wigglePlot"), value = 3
), id = "conditionedPanels"
)
)
)
))
# File: merTools/inst/shiny-apps/shinyMer/server.r

# SERVER ----
server = function(input, output){
output$text1 <- renderText({
paste("You have selected", input$stat)
})
predInput <- reactive({
data <- switch(input$newdataType,
"orig" = merMod@frame,
"mean" = draw(merMod, type = "average"),
"rand" = draw(merMod, type = "random"),
"user" = newdata)
cbind(predictInterval(merMod, newdata = data, level = input$alpha/100,
type = input$predMetric,
include.resid.var = input$resid.var,
n.sims = input$n.sims, stat = input$stat), data)
})
if ("DT" %in% rownames(installed.packages())) {
output$dt <- renderDataTable({
predInput()
})
} else {
output$dt <- renderTable({
predInput()
})
}
output$downloadData <- shiny::downloadHandler(
filename = "predictIntervalResults.csv",
content = function(file) {
write.csv(shiny::isolate(predInput()), file)
}
)
output$predPlot <- renderPlot({
data <- predInput()
data$x <- factor(seq_len(nrow(data)))
ggplot(data, aes(x = x, y = fit, ymin = lwr, ymax = upr)) +
geom_pointrange() +
theme_bw() + theme(axis.text.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.ticks.x = element_blank())
})
feData <- reactive({
data <- FEsim(merMod, n.sims = input$n.sims)
return(data)
})
output$feplot <- renderPlot({
plotdf <- feData()
scale <- input$alpha/100
vartmp <- input$stat
plotFEsim(plotdf, level = scale, stat = vartmp, sd = TRUE,
intercept = FALSE)
})
reData <- reactive({
data <- REsim(merMod, n.sims = input$n.sims)
return(data)
})
output$replot <- renderPlot({
plotdf <- reData()
scale <- input$alpha/100
vartmp <- input$stat
plotREsim(plotdf, level = scale, stat = vartmp, sd = TRUE)
})
output$call <- renderPrint({
merMod@call
})
reEffInput <- reactive({
data <- switch(input$newdataType,
"orig" = merMod@frame,
"mean" = draw(merMod, type = "average"),
"rand" = draw(merMod, type = "random"),
"user" = newdata)
if(nrow(data) > 12){
warning("Too much data selected, only using top 12 rows.")
data <- data[1:12, ]
}
return(data)
})
groupData <- reactive({
plotdf <- REimpact(merMod, newdata = reEffInput(),
groupFctr = input$group,
term = input$term,
level = input$alpha/100,
breaks = input$nbin,
type = input$predMetric,
include.resid.var = input$resid.var,
n.sims = input$n.sims, stat = input$stat)
# use the two-sided normal quantile so alpha = 95 yields a central 95%
# interval (qnorm(0.975) = 1.96), rather than qnorm(0.95) = 1.64
zed <- qnorm(1 - (1 - input$alpha/100)/2)
plotdf$upr <- plotdf$AvgFit + zed * plotdf$AvgFitSE
plotdf$lwr <- plotdf$AvgFit - zed * plotdf$AvgFitSE
plotdf$bin <- factor(plotdf$bin)
return(plotdf)
})
output$gPlot <- renderPlot({
ggplot(groupData(), aes(x = bin, y = AvgFit, ymin = lwr, ymax = upr)) +
geom_pointrange() + facet_wrap(~case) +
theme_bw() + labs(x = "Bin", y = "Value of DV",
title = "Impact of grouping term for selected case")
})
wiggleData <- reactive({
valLookup <- unique(merMod@frame[, input$fixef])
if(class(valLookup) %in% c("numeric", "integer")){
newvals <- seq(min(valLookup), max(valLookup), length.out = 20)
} else{
if(length(valLookup) < 50){
newvals <- valLookup  # use all observed levels
} else{
newvals <- sample(valLookup, 50)  # cap at 50 randomly sampled levels
}
}
plotdf <- wiggle(reEffInput(), input$fixef, values = list(newvals))
plotdf <- cbind(plotdf, predictInterval(merMod, newdata=plotdf,
type = input$predMetric,
level = input$alpha/100,
include.resid.var = input$resid.var,
n.sims = input$n.sims, stat = input$stat))
plotdf$X <- plotdf[, input$fixef]
plotdf$case <- rep(1:length(newvals), length = nrow(reEffInput()))
return(plotdf)
})
output$re.ui <- renderUI({
choices <- names(ranef(merMod)[[input$group]])
selectInput("term", "Group Term:",
choices = choices,
selected = choices[1])
})
output$wigglePlot <- renderPlot({
ggplot(wiggleData(), aes(x = X, y = fit, ymin = lwr,
ymax = upr)) +
geom_pointrange() + facet_wrap(~case) +
theme_bw() + labs(y = "Simulated Value of DV",
title = "Impact of selected fixed effect for
selected cases.")
})
}
# File: merTools/inst/shiny-apps/shinyMer/global.R

# Import variables from function
library(ggplot2)
library(shiny)
merMod <<- .shinyMerPar$merMod
if (!is.null(.shinyMerPar$simData)) {
newdata <<- .shinyMerPar$simData
}
if (!exists("newdata")) {
df.choices <- c("Model Frame" = "orig",
"Random Obs" = "rand",
"Average Obs" = "mean")
} else {
df.choices <- c("User Supplied" = "user",
"Model Frame" = "orig",
"Random Obs" = "rand",
"Average Obs" = "mean")
}
# File: merTools/inst/doc/marginal_effects.html
Using merTools to Marginalize Over Random Effect Levels
Jared Knowles and Carl Frederick
2019-05-12
## Marginalizing Random Effects

One of the most common questions about multilevel models is how much influence grouping terms have on the outcome. One way to explore this is to simulate the predicted values of an observation across the distribution of random effects for a specific grouping variable and term. This can be described as "marginalizing" predictions over the distribution of random effects. This allows you to explore the influence of the grouping term and grouping levels on the outcome scale by simulating predictions for simulated values of each observation across the distribution of effect sizes.

The `REmargins()` function allows you to do this. Here, we take the example `sleepstudy` model and marginalize predictions for all of the random effect terms (Subject:Intercept, Subject:Days). By default, the function will marginalize over the quartiles of the expected rank (see the expected rank vignette) of the effect distribution for each term.

The new data frame output from `REmargins` contains a lot of information. The first few columns contain the original data passed to `newdata`. Each observation in `newdata` is identified by a `case` number, because the function repeats each observation by the number of random effect terms and number of breaks to simulate each term over. Then the `grouping_var`
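The companion R script for this vignette (reproduced later in this document) shows the call:

```r
library(merTools)
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10, ])
head(mfx)
```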
## Summarizing

## Plotting

Finally, you can plot the results of the marginalization to evaluate the effect of the random effect terms graphically.
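The companion script draws the figure like so, with one panel per random effect term:

```r
library(ggplot2)
ggplot(mfx) + aes(x = breaks, y = fit_Subject, group = case) +
  geom_line() +
  facet_wrap(~term)
```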

# File: merTools/inst/doc/Using_predictInterval.html
Prediction Intervals from merMod Objects
Jared Knowles and Carl Frederick
2019-05-12
## Introduction

Fitting (generalized) linear mixed models, (G)LMM, to very large data sets is becoming increasingly easy, but understanding and communicating the uncertainty inherent in those models is not. As the documentation for `lme4::predict.merMod()` notes:

> There is no option for computing standard errors of predictions because it is difficult to define an efficient method that incorporates uncertainty in the variance parameters; we recommend `lme4::bootMer()` for this task.

We agree that, short of a fully Bayesian analysis, bootstrapping is the gold standard for deriving prediction intervals from a (G)LMM, but the time required to obtain even a respectable number of replications from `bootMer()` quickly becomes prohibitive when the initial model fit is on the order of hours instead of seconds. The only other alternative we have identified for these situations is to use the `arm::sim()` function to simulate values. Unfortunately, this only takes variation of the fixed coefficients and residuals into account, and assumes the conditional modes of the random effects are fixed.

We developed the `predictInterval()` function to incorporate the variation in the conditional modes of the random effects (CMRE, a.k.a. BLUPs in the LMM case) into calculating prediction intervals. Ignoring the variance in the CMRE results in overly confident estimates of predicted values and, in cases where the precision of the grouping term varies across levels of grouping terms, creates the illusion of difference where none may exist. The importance of accounting for this variance comes into play sharply when comparing the predictions of different models across observations.

We take the warning from `lme4::predict.merMod()` seriously, but view this method as a decent first approximation to the full bootstrap analysis for (G)LMMs fit to very large data sets.
## Conceptual description

In order to generate a proper prediction interval, a prediction must account for three sources of uncertainty in mixed models:

1. the residual (observation-level) variance,
2. the uncertainty in the fixed coefficients, and
3. the uncertainty in the variance parameters for the grouping factors.

A fourth, uncertainty about the data, is beyond the scope of any prediction method.

As we mentioned above, the `arm::sim()` function incorporates the first two sources of variation but not the third, while bootstrapping using `lme4::bootMer()` does incorporate all three sources of uncertainty because it re-estimates the model using random samples of the data.

When inference about the values of the CMREs is of interest, it would be nice to incorporate some degree of uncertainty in those estimates when comparing observations across groups. `predictInterval()` does this by drawing values of the CMREs from the conditional variance-covariance matrix of the random effects accessible from `lme4::ranef(model, condVar = TRUE)`. Thus, `predictInterval()` incorporates all of the uncertainty from sources one and two, and part of the variance from source three, but the variance parameters themselves are treated as fixed.
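For illustration, the conditional variance-covariance information that `predictInterval()` draws on can be inspected directly; here `fm1` is the `sleepstudy` model estimated in Step 1 below:

```r
re <- lme4::ranef(fm1, condVar = TRUE)
# lme4 stores the per-group conditional variance-covariance arrays in the
# "postVar" attribute: one q x q matrix per group (q = number of RE terms)
pv <- attr(re$Subject, "postVar")
dim(pv)  # 2 x 2 x 18: intercept and Days, for each of the 18 subjects
```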
To do this, `predictInterval()` takes an estimated model of class `merMod` and, like `predict()`, a data.frame upon which to make those predictions and:

1. extracts the fixed and random coefficients
2. takes `n` draws from the multivariate normal distribution of the fixed and random coefficients (separately)
3. calculates the linear predictor for each row in `newdata` based on these draws, and
4. optionally incorporates the residual variation (per the `arm::sim()` function), and,
5. returns newdata with the lower and upper limits of the prediction interval and the mean or median of the simulated predictions

Currently, the supported model types are linear mixed models and mixed logistic regression models.

The prediction data set *can* include levels that are not in the estimation model frame. The prediction intervals for such observations only incorporate uncertainty from fixed coefficient estimates and the residual level of variation.
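A rough sketch of step 2 for the fixed coefficients alone (illustrative only, not the package's actual implementation; `mvtnorm` is one of several ways to take the draws):

```r
library(mvtnorm)

beta_hat <- lme4::fixef(fm1)             # fixed coefficient point estimates
V_beta <- as.matrix(vcov(fm1))           # their variance-covariance matrix
beta_sims <- rmvnorm(1000, mean = beta_hat, sigma = V_beta)

# linear predictor draws for a few rows of newdata
X <- model.matrix(~ Days, data = sleepstudy[1:5, ])
eta_sims <- X %*% t(beta_sims)           # 5 rows x 1000 simulations
```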
## Comparison to existing methods

What do the differences between `predictInterval()` and the other methods for constructing prediction intervals mean in practice? We would expect `predictInterval()` to produce prediction intervals that are wider than all methods except for the `bootMer()` method. We would also hope that the prediction point estimate from other methods falls within the prediction interval produced by `predictInterval()`. Ideally, the predicted point estimate produced by `predictInterval()` would fall close to that produced by `bootMer()`.

This section compares the results of `predictInterval()` with those obtained using `arm::sim()` and `lme4::bootMer()` using the sleepstudy data from `lme4`. These data contain reaction time observations for 10 days on 18 subjects. The data are sorted such that the first 10 observations are days one through ten for subject 1, the next 10 are days one through ten for subject 2, and so on. The example model that we are estimating below estimates random intercepts and a random slope for the number of days.
### Step 1: Estimating the model and using `predictInterval()`

First, we will load the required packages and data and estimate the model:
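The corresponding chunk, from the vignette's Rmd source (included later in this package):

```r
set.seed(271828)
data(sleepstudy)
fm1 <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
display(fm1)
```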
Then, calculate prediction intervals using `predictInterval()`. The `predictInterval` function has a number of user-configurable options. In this example, we use the original data `sleepstudy` as the newdata. We pass the function the `fm1` model we fit above. We also choose a 95% interval with `level = 0.95`, though we could choose a less conservative prediction interval. We make 1,000 simulations for each observation (`n.sims = 1000`). We set the point estimate to be the median of the simulated values, instead of the mean. We ask for the linear predictor back; if we had fit a logistic regression, we could have asked for our predictions on the probability scale instead. Finally, we indicate that we want the predictions to incorporate the residual variance from the model – an option only available for `lmerMod` objects.
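The call, from the vignette source:

```r
PI.time <- system.time(
  PI <- predictInterval(merMod = fm1, newdata = sleepstudy,
                        level = 0.95, n.sims = 1000,
                        stat = "median", type = "linear.prediction",
                        include.resid.var = TRUE)
)
```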
Here are the first few rows of the object `PI`:
| fit      | upr      | lwr      |
|---------:|---------:|---------:|
| 251.6679 | 311.3148 | 196.4089 |
| 271.4794 | 330.9183 | 214.2174 |
| 292.6803 | 350.9868 | 237.7709 |
| 311.6967 | 369.2907 | 254.2236 |
| 331.8317 | 389.7439 | 278.1873 |
| 350.7461 | 408.1392 | 294.8502 |
The three columns are the median (`fit`) and limits of the 95% prediction interval (`upr` and `lwr`) because we set `level = 0.95`. The following figure displays the output graphically for the first 30 observations.

#### Step 1a: Adjusting for correlation between fixed and random effects

The prediction intervals above do not correct for correlations between fixed and random effects. This tends to lead to predictive intervals that are too conservative, especially for existing groups when there is a lot of data on relatively few groups. In that case, a significant portion of the uncertainty in the prediction can be due to variance in the fixed intercept, which is anti-correlated with variance in the random intercept effects. For instance, it does not actually matter if the fixed intercept is 5 and the random intercept effects are -2, 1, and 1, versus a fixed intercept of 6 and random intercept effects of -3, 0, and 0. (The latter situation will never be the MLE, but it can occur in this package's simulations.)

To show this issue, we'll use the sleep study model, predicting the reaction times of subjects after experiencing sleep deprivation. Let's use the model to give an interval for the expected reaction time of subjects like the first one in the study (subject 308 at day 0):
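From the vignette source (the model refit and the interval call; the original comment mistakenly referred to body fat and is corrected here to match this model):

```r
fm1 <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
display(fm1)

sleepstudy[1, ]
predictInterval(fm1, sleepstudy[1, ], include.resid.var = 0)
# predict the expected reaction time for subjects like subject 308 at day 0
```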
There are two ways to get `predictInterval` to create less-conservative intervals to deal with this. The first is just to tell it to consider certain fixed effects as fully known (that is, with an effectively 0 variance). This is done using the `ignore.fixed.terms` argument.
```r
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1)
#> fit upr lwr
#> 1 253.8527 268.5283 239.627
# predict the average reaction time for a subject at day 0, taking the global intercept
# (mean reaction time) as fully known
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = "(Intercept)")
#> fit upr lwr
#> 1 254.2344 269.3859 239.3111
#Same as above
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1:2)
#> fit upr lwr
#> 1 253.6733 269.8759 237.9836
# as above, taking the first two fixed effects (intercept and days effect) as fully known
```
The second way is to use an ad-hoc variance adjustment, with the `fix.intercept.variance` argument. This takes the model's intercept variance \(\hat\sigma^2_\mu\) and adjusts it to:

\[\hat\sigma\prime^2_\mu = \hat\sigma^2_\mu - \Sigma_{levels}\frac{1}{\Sigma_{groups(level)} 1/(\hat\sigma^2_{level} + \sigma^2_{group})}\]

In other words, it assumes the given intercept variance incorporates spurious variance for each level, where each of the spurious variance terms has a precision equal to the sum of the precisions due to the individual groups at that level.
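From the vignette source:

```r
predictInterval(fm1, sleepstudy[1, ], include.resid.var = 0,
                fix.intercept.variance = TRUE)
# predict the average reaction time for a subject at day 0, using an ad-hoc
# correction for the covariance of the intercept with the random intercept effects.
```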
A few notes about these two arguments:

- `fix.intercept.variance = TRUE` is redundant with `ignore.fixed.terms = 1`, but not vice versa.
- These corrections should NOT be used when predicting outcomes for groups not present in the original data.
### Step 2: Comparison with `arm::sim()`

How does the output above compare to what we could get from `arm::sim()`?
```r
PI.arm.time <- system.time(
PI.arm.sims <- arm::sim(fm1, 1000)
)
PI.arm <- data.frame(
fit=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.500)),
upr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.975)),
lwr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.025))
)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="arm::sim()", x=(1:nrow(PI.arm))+0.1, PI.arm))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```

The prediction intervals from `arm::sim()` are much smaller, and the random slopes for days vary more than they do for `predictInterval`. Both results are as expected, given the small number of subjects and observations per subject in these data. Because `predictInterval()` is incorporating uncertainty in the CMREs (but not the variance parameters of the random coefficients themselves), the Days slopes are closer to the overall or pooled regression slope.
### Step 3: Comparison with `lme4::bootMer()`

As quoted above, the developers of lme4 suggest that users interested in uncertainty estimates around their predictions use `lme4::bootMer()` to calculate them. The documentation for `lme4::bootMer()` goes on to describe three implemented flavors of bootstrapped estimates:

1. parametrically resampling both the “spherical” random effects u and the i.i.d. errors \(\epsilon\)
2. treating the random effects as fixed and parametrically resampling the i.i.d. errors
3. treating the random effects as fixed and semi-parametrically resampling the i.i.d. errors from the distribution of residuals.

We will compare the results from `predictInterval()` with each method, in turn.
#### Step 3a: `lme4::bootMer()` method 1
```r
## Functions for bootMer() and objects
####Return predicted values from bootstrap
mySumm <- function(.) {
predict(., newdata=sleepstudy, re.form=NULL)
}
####Collapse bootstrap into median, 95% PI
sumBoot <- function(merBoot) {
return(
data.frame(fit = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE)))
)
)
}
##lme4::bootMer() method 1
PI.boot1.time <- system.time(
boot1 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=FALSE, type="parametric")
)
PI.boot1 <- sumBoot(boot1)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 1", x=(1:nrow(PI.boot1))+0.1, PI.boot1))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```

The intervals produced by `predictInterval`, represented in green, cover the point estimates produced by `bootMer` in every case for these 30 observations. Additionally, in almost every case, the `predictInterval` encompasses the entire interval presented by `bootMer`. Here, the estimates produced by `bootMer` are re-estimating the group terms, but by refitting the model, they are also taking into account the conditional variance of these terms, or `theta`, and provide tighter prediction intervals than the `predictInterval` method.
#### Step 3b: `lme4::bootMer()` method 2
```r
## lme4::bootMer() method 2
PI.boot2.time <- system.time(
boot2 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="parametric")
)
PI.boot2 <- sumBoot(boot2)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 2", x=(1:nrow(PI.boot2))+0.1, PI.boot2))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```

Here, the results for `predictInterval` in green again encompass the results from `bootMer`, but are much wider. The `bootMer` estimates are ignoring the variance in the group effects, and as such, are only incorporating the residual variance and the variance in the fixed effects – similar to the `arm::sim()` function.
#### Step 3c: `lme4::bootMer()` method 3
```r
## lme4::bootMer() method 3
PI.boot3.time <- system.time(
boot3 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="semiparametric")
)
PI.boot3 <- sumBoot(boot3)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 3", x=(1:nrow(PI.boot3))+0.1, PI.boot3))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```

These results are virtually identical to those above.
#### Step 3d: Comparison to `rstanarm`
```r
PI.time.stan <- system.time({
fm_stan <- stan_lmer(Reaction ~ Days + (Days|Subject), data = sleepstudy,
verbose = FALSE, open_progress = FALSE, refresh = -1,
show_messages=FALSE, chains = 1)
zed <- posterior_predict(fm_stan)
PI.stan <- cbind(apply(zed, 2, median), central_intervals(zed, prob=0.95))
})
#> Chain 1:
#> Chain 1: Gradient evaluation took 0 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1:
#> Chain 1:
#> Chain 1:
#> Chain 1: Elapsed Time: 6.919 seconds (Warm-up)
#> Chain 1: 3.744 seconds (Sampling)
#> Chain 1: 10.663 seconds (Total)
#> Chain 1:
print(fm_stan)
#> stan_lmer
#> family: gaussian [identity]
#> formula: Reaction ~ Days + (Days | Subject)
#> observations: 180
#> ------
#> Median MAD_SD
#> (Intercept) 251.9 6.3
#> Days 10.4 1.7
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 25.9 1.5
#>
#> Error terms:
#> Groups Name Std.Dev. Corr
#> Subject (Intercept) 24
#> Days 7 0.07
#> Residual 26
#> Num. levels: Subject 18
#>
#> Sample avg. posterior predictive distribution of y:
#> Median MAD_SD
#> mean_PPD 298.5 2.8
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
PI.stan <- as.data.frame(PI.stan)
names(PI.stan) <- c("fit", "lwr", "upr")
PI.stan <- PI.stan[, c("fit", "upr", "lwr")]
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="rstanArm", x=(1:nrow(PI.stan))+0.1, PI.stan))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```

## Computation time

Our initial motivation for writing this function was to develop a method for incorporating uncertainty in the CMREs for mixed models estimated on very large samples. Even for models with only modest degrees of complexity, using `lme4::bootMer()` quickly becomes time prohibitive because it involves re-estimating the model for each simulation. We have seen how each alternative compares to `predictInterval()` substantively, but how do they compare in terms of computational time? The table below lists the output of `system.time()` for all of the methods for calculating prediction intervals for `merMod` objects.
| Method                    | user.self | sys.self | elapsed |
|---------------------------|----------:|---------:|--------:|
| predictInterval()         |      0.33 |     0.03 |    0.36 |
| arm::sim()                |      0.64 |     0.00 |    0.64 |
| lme4::bootMer()-Method 1  |      5.46 |     0.00 |    5.47 |
| lme4::bootMer()-Method 2  |      5.50 |     0.00 |    5.50 |
| lme4::bootMer()-Method 3  |      5.54 |     0.00 |    5.53 |
| rstanarm:predict          |     11.03 |     0.02 |   11.06 |
For this simple example, we see that `arm::sim()` is the fastest, nearly five times faster than `predictInterval()`. However, `predictInterval()` is nearly six times faster than any of the bootstrapping options via `lme4::bootMer`. This may not seem like a lot, but consider that the computational time required for bootstrapping is roughly proportional to the number of bootstrapped simulations requested, while `predictInterval()` is not, because it is just a series of draws from various multivariate normal distributions. So the time ratios in the table above represent a lower bound on the computation time ratio of bootstrapping to `predictInterval()`.
# File: merTools/inst/doc/imputation.html
Analyzing Imputed Data with Multilevel Models and merTools
Jared Knowles
2019-05-12
## Introduction

Multilevel models are valuable in a wide array of problem areas that involve non-experimental, or observational, data. In many of these cases the data on individual observations may be incomplete. In these situations, the analyst may turn to one of many methods for filling in missing data depending on the specific problem at hand, disciplinary norms, and prior research.

One of the most common cases is to use multiple imputation. Multiple imputation involves fitting a model to the data and estimating the missing values for observations. For details on multiple imputation, and a discussion of some of the main implementations in R, look at the documentation and vignettes for the `mice` and `Amelia` packages.

The key difficulty multiple imputation creates for users of multilevel models is that the result of multiple imputation is K replicated datasets corresponding to different estimated values for the missing data in the original dataset.

For the purposes of this vignette, I will describe how to use one flavor of multiple imputation and the function in `merTools` to obtain estimates from a multilevel model in the presence of missing and multiply imputed data.
## Missing Data and its Discontents

To demonstrate this workflow, we will use the `hsb` dataset in the `merTools` package, which includes data on the math achievement of a wide sample of students nested within schools. The data has no missingness, so first we will simulate some missing data.
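A sketch of one way to do this (the column subset, missingness rate, and `Amelia` settings here are illustrative assumptions, not the vignette's exact code):

```r
library(merTools)
library(Amelia)  # mice would work as well
data(hsb)

# keep the modeled columns and knock out ~20% of ses at random (illustrative)
d <- hsb[, c("schid", "minority", "female", "ses", "meanses", "mathach")]
set.seed(1234)
d$ses[sample(seq_len(nrow(d)), floor(nrow(d) * 0.2))] <- NA

# five imputed datasets; schid is an identifier, not an imputation predictor
impList <- amelia(d, m = 5, idvars = "schid")
```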
## Fitting and Summarizing a Model List

Fitting a model is very similar to fitting a single `lmer()` model:
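A sketch of the fit, matching the formula shown in the output below (`lmerModList()` fits the same model to each imputed dataset):

```r
modList <- lmerModList(mathach ~ minority + female + ses + meanses + (1 + ses | schid),
                       data = impList$imputations)
```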
The resulting object `modList` is a list of `merMod` objects the same length as the number of imputation datasets. This object is assigned the class of `merModList`, and `merTools` provides some convenience functions for reporting the results of this object.

Using this, we can directly compare the model fit with missing data excluded to the aggregate from the imputed models:

If you want to inspect the individual models, or you do not like taking the mean across the imputation replications, you can take the `merModList` apart easily:

And, you can always operate on any single element of the list:
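For instance (illustrative calls; `modelFixedEff()` pools the fixed effects across replications):

```r
modelFixedEff(modList)   # pooled fixed effects across the imputations
fixef(modList[[1]])      # fixed effects from the first replication only
```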
## Output of a Model List

```r
print(modList)
#> $imp1
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: mathach ~ minority + female + ses + meanses + (1 + ses | schid)
#> Data: d
#>
#> REML criterion at convergence: 46341.7
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -3.2121 -0.7269 0.0318 0.7616 2.9230
#>
#> Random effects:
#> Groups Name Variance Std.Dev. Corr
#> schid (Intercept) 2.3241 1.5245
#> ses 0.4518 0.6722 -0.45
#> Residual 35.7631 5.9802
#> Number of obs: 7185, groups: schid, 160
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 13.9640 0.1736 80.455
#> minority -2.5060 0.1983 -12.634
#> female -1.1875 0.1586 -7.489
#> ses 1.9564 0.1207 16.215
#> meanses 3.1769 0.3606 8.810
#>
#> Correlation of Fixed Effects:
#> (Intr) minrty female ses
#> minority -0.318
#> female -0.483 0.015
#> ses -0.200 0.140 0.043
#> meanses -0.091 0.121 0.023 -0.235
#>
#> $imp2
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: mathach ~ minority + female + ses + meanses + (1 + ses | schid)
#> Data: d
#>
#> REML criterion at convergence: 46354.9
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -3.2035 -0.7211 0.0323 0.7596 2.9226
#>
#> Random effects:
#> Groups Name Variance Std.Dev. Corr
#> schid (Intercept) 2.3326 1.5273
#> ses 0.3909 0.6252 -0.52
#> Residual 35.8580 5.9882
#> Number of obs: 7185, groups: schid, 160
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 13.9658 0.1733 80.570
#> minority -2.4763 0.1993 -12.427
#> female -1.2070 0.1584 -7.620
#> ses 1.9413 0.1193 16.273
#> meanses 3.1770 0.3591 8.846
#>
#> Correlation of Fixed Effects:
#> (Intr) minrty female ses
#> minority -0.315
#> female -0.480 0.007
#> ses -0.211 0.140 0.038
#> meanses -0.096 0.124 0.023 -0.235
#> convergence code: 0
#> Model failed to converge with max|grad| = 0.00243706 (tol = 0.002, component 1)
#>
#>
#> $imp3
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: mathach ~ minority + female + ses + meanses + (1 + ses | schid)
#> Data: d
#>
#> REML criterion at convergence: 46342.5
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -3.2214 -0.7222 0.0331 0.7602 2.9194
#>
#> Random effects:
#> Groups Name Variance Std.Dev. Corr
#> schid (Intercept) 2.3567 1.5352
#> ses 0.4808 0.6934 -0.46
#> Residual 35.7520 5.9793
#> Number of obs: 7185, groups: schid, 160
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 13.9685 0.1743 80.142
#> minority -2.5731 0.1987 -12.947
#> female -1.1729 0.1586 -7.393
#> ses 1.9349 0.1215 15.928
#> meanses 3.2066 0.3613 8.874
#>
#> Correlation of Fixed Effects:
#> (Intr) minrty female ses
#> minority -0.321
#> female -0.481 0.019
#> ses -0.208 0.138 0.042
#> meanses -0.093 0.123 0.025 -0.229
#>
#> $imp4
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: mathach ~ minority + female + ses + meanses + (1 + ses | schid)
#> Data: d
#>
#> REML criterion at convergence: 46359.6
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -3.2097 -0.7245 0.0341 0.7595 2.9047
#>
#> Random effects:
#> Groups Name Variance Std.Dev. Corr
#> schid (Intercept) 2.3002 1.5166
#> ses 0.4605 0.6786 -0.51
#> Residual 35.8664 5.9889
#> Number of obs: 7185, groups: schid, 160
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 13.9611 0.1729 80.758
#> minority -2.4274 0.1981 -12.251
#> female -1.2301 0.1584 -7.765
#> ses 1.9276 0.1213 15.895
#> meanses 3.2456 0.3574 9.082
#>
#> Correlation of Fixed Effects:
#> (Intr) minrty female ses
#> minority -0.317
#> female -0.481 0.010
#> ses -0.219 0.140 0.037
#> meanses -0.095 0.121 0.022 -0.233
#>
#> $imp5
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: mathach ~ minority + female + ses + meanses + (1 + ses | schid)
#> Data: d
#>
#> REML criterion at convergence: 46360.4
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -3.2003 -0.7248 0.0339 0.7606 2.9253
#>
#> Random effects:
#> Groups Name Variance Std.Dev. Corr
#> schid (Intercept) 2.3192 1.5229
#> ses 0.4721 0.6871 -0.54
#> Residual 35.8665 5.9889
#> Number of obs: 7185, groups: schid, 160
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 13.9635 0.1735 80.473
#> minority -2.5255 0.1980 -12.756
#> female -1.1876 0.1589 -7.472
#> ses 1.8948 0.1212 15.637
#> meanses 3.2880 0.3568 9.216
#>
#> Correlation of Fixed Effects:
#> (Intr) minrty female ses
#> minority -0.319
#> female -0.483 0.016
#> ses -0.233 0.138 0.043
#> meanses -0.097 0.118 0.023 -0.231
```
The standard errors reported for the model list include Rubin's correction (see the documentation), which adjusts for the variance both within and between the imputation sets.
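For reference, Rubin's rules pool a parameter estimate and its variance across the \(m\) imputations as:

\[\bar{\theta} = \frac{1}{m}\sum_{k=1}^{m}\hat{\theta}_k, \qquad T = \bar{W} + \left(1 + \frac{1}{m}\right)B\]

where \(\bar{W}\) is the average within-imputation variance and \(B\) is the between-imputation variance of the \(\hat{\theta}_k\); the reported standard errors are \(\sqrt{T}\).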
## Cautions and Notes

Often it is desirable to include aggregate values in the level two or level three part of the model, such as level 1 SES and level 2 mean SES for the group. In cases where there is missingness in either the level 1 SES values or in the level 2 mean SES values, caution and careful thought need to be given to how to proceed with the imputation routine.
# File: merTools/inst/doc/marginal_effects.R

## ----setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide', cache=FALSE----
library(ggplot2); library(knitr); library(merTools)
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo = TRUE
)
## ------------------------------------------------------------------------
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
head(mfx)
## ------------------------------------------------------------------------
ggplot(mfx) + aes(x = breaks, y = fit_Subject, group = case) +
geom_line() +
facet_wrap(~term)
# File: merTools/inst/doc/merToolsIntro.html
An Introduction to merTools
Jared Knowles and Carl Frederick
2019-05-12
## Introduction

Working with generalized linear mixed models (GLMM) and linear mixed models (LMM) has become increasingly easy with the advances in the `lme4` package recently. As we have found ourselves using these models more and more within our work, we, the authors, have developed a set of tools for simplifying and speeding up common tasks for interacting with `merMod` objects from `lme4`. This package provides those tools.
## Illustrating Model Effects

As the complexity of the model fit grows, it becomes harder and harder to interpret the substantive effect of parameters in the model.

Let's start with a medium-sized example model using the `InstEval` data provided by the `lme4` package. These data represent university lecture evaluations at ETH Zurich made by students. In this data, `s` is an individual student, `d` is an individual lecturer, `studage` is the semester the student is enrolled, `lectage` is how many semesters back the lecture with the rating took place, `dept` is the department of the lecture, and `y` is an integer 1:5 representing the ratings of the lecture from “poor” to “very good”:
```r
library(lme4)
head(InstEval)
#> s d studage lectage service dept y
#> 1 1 1002 2 2 0 2 5
#> 2 1 1050 2 1 1 6 2
#> 3 1 1582 2 2 0 2 5
#> 4 1 2050 2 2 1 3 3
#> 5 2 115 2 1 0 5 2
#> 6 2 756 2 1 0 5 4
str(InstEval)
#> 'data.frame': 73421 obs. of 7 variables:
#> $ s : Factor w/ 2972 levels "1","2","3","4",..: 1 1 1 1 2 2 3 3 3 3 ...
#> $ d : Factor w/ 1128 levels "1","6","7","8",..: 525 560 832 1068 62 406 3 6 19 75 ...
#> $ studage: Ord.factor w/ 4 levels "2"<"4"<"6"<"8": 1 1 1 1 1 1 1 1 1 1 ...
#> $ lectage: Ord.factor w/ 6 levels "1"<"2"<"3"<"4"<..: 2 1 2 2 1 1 1 1 1 1 ...
#> $ service: Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 1 1 1 ...
#> $ dept : Factor w/ 14 levels "15","5","10",..: 14 5 14 12 2 2 13 3 3 3 ...
#> $ y : int 5 2 5 3 2 4 4 5 5 4 ...
```
Starting with a simple model:
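The model-fitting chunk was lost in extraction; a sketch consistent with the variables discussed below (the exact formula in the original vignette may differ):

```r
m1 <- lmer(y ~ service + lectage + studage + (1 | d) + (1 | s), data = InstEval)
```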
After fitting the model we can make use of the first function provided by `merTools`, `fastdisp`, which modifies the function `arm:::display` to more quickly display a summary of the model without calculating the model sigma:
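For example:

```r
fastdisp(m1)
```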
We see some interesting effects. First, our decision to include student and lecturer effects seems justified as there is substantial variance within these groups. Second, there do appear to be some effects by age and for lectures given as a service by an outside lecturer. Let's look at these in more detail. One way to do this would be to plot the coefficients together in a line to see which deviate from 0 and in what direction. To get a confidence interval for our fixed effect coefficients we have a number of options that represent a tradeoff between coverage and computation time – see `confint.merMod` for details.

An alternative is to simulate values of the fixed effects from the posterior using the function `arm::sim`. Our next tool, `FEsim`, is a convenience wrapper to do this and provide an informative data frame of the results.
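A sketch of the call:

```r
feEx <- FEsim(m1, n.sims = 200)
head(feEx)
```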
We can present these results graphically, using `ggplot2`:
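The original chunk was dropped in extraction; one way to draw it, assuming the `median` and `sd` columns that `FEsim` returns:

```r
library(ggplot2)
ggplot(feEx[feEx$term != "(Intercept)", ]) +
  aes(x = term, y = median, ymin = median - 1.96 * sd, ymax = median + 1.96 * sd) +
  geom_pointrange() +
  geom_hline(yintercept = 0, linetype = 2, color = "red") +
  coord_flip() + theme_bw()
```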

However, an easier option is:
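For example:

```r
plotFEsim(feEx)
```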

## Random Effects

Next, we might be interested in exploring the random effects. Again, we create a dataframe of the values of the simulation of these effects for the individual levels.
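A sketch of the call (the original chunk was lost in extraction):

```r
reEx <- REsim(m1)
head(reEx)
```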
The result is a dataframe with estimates of the values of each of the random effects provided by the `arm::sim()` function. `groupID` represents the identifiable level for the variable for one random effect, `term` represents whether the simulated values are for an intercept or which slope, and `groupFctr` identifies which of the `(1|x)` terms the values represent. To make unique identifiers for each term, we need to use both the `groupID` and the `groupFctr` term in case these two variables use overlapping label names for their groups. In this case:
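(The original example was dropped; something like the following shows the two grouping factors side by side, though the exact code is an assumption:)

```r
head(reEx[reEx$groupFctr == "s", ])
head(reEx[reEx$groupFctr == "d", ])
```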
Most important is producing caterpillar or dotplots of these terms to explore their variation. This is easily accomplished with the `dotplot` function:
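For example, via `lattice`:

```r
lattice::dotplot(ranef(m1, condVar = TRUE))
```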
However, these graphics do not provide much control over the results. Instead, we can use the `plotREsim` function in `merTools` to gain more control over plotting of the random effect simulations.
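For example:

```r
plotREsim(reEx)
```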

The result is a ggplot2 object which can be modified however the user sees fit. Here, we’ve established that most student and professor effects are indistinguishable from zero, but there do exist extreme outliers with both high and low averages that need to be accounted for.
## Substantive Effects

A logical next line of questioning is to see how much of the variation in a rating can be caused by changing the student rater and how much is due to the fixed effects we identified above. This is a very difficult problem to solve, but using simulation we can examine the model behavior under a range of scenarios to understand how the model is reflecting changes in the data. To do this, we use another set of functions available in `merTools`.

The simplest option is to pick an observation at random and then modify its values deliberately to see how the prediction changes in response. `merTools` makes this task very simple:
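A sketch of the call:

```r
example1 <- draw(m1, type = "random")
head(example1)
```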
The `draw` function takes a random observation from the data in the model and extracts it as a dataframe. We can now do a number of operations to this observation:
More interesting, let's programmatically modify this observation to see how the predicted value changes if we hold everything but one variable constant. The function `wiggle` allows us to create a new dataframe with copies of the variable that modify just one value. Chaining together `wiggle` calls, we can see how the variable behaves under a number of different scenarios simultaneously.
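A sketch of the kind of chain described (the exact values wiggled in the original were lost in extraction):

```r
example2 <- wiggle(example1, varlist = "lectage",
                   valueslist = list(levels(InstEval$lectage)))
example2$yhat <- predict(m1, newdata = example2)
```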

The result allows us to graphically display the effect of each level of `lectage` on an observation that is otherwise identical. This is plotted here against a horizontal line representing the mean of the observed ratings, and two finer lines showing plus or minus one standard deviation of the mean.
This is nice, but selecting a random observation is not very satisfying as it may not be very meaningful. To address this, we can instead take the average observation:
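Judging from the `example1` reused in the next chunk, the dropped code was presumably:

```r
example1 <- draw(m1, type = "average")
```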
Here, the average observation is identified based on either the modal observation for factors or on the mean for numeric variables. Then, the random effect terms are set to the level equivalent to the median effect – very close to 0.
```r
example3 <- wiggle(example1, varlist = "service",
valueslist = list(c("0", "1")))
example3$yhat <- predict(m1, newdata = example3)
ggplot(example3, aes(x = service, y = yhat)) + geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)

Here we can see that for the average observation, whether the lecture is outside of the home department has a very slight negative effect on the overall rating. Might the individual professor or student have more of an impact on the overall rating? To answer this question we need to wiggle the same observation across a wide range of student or lecturer effects.
How do we identify this range? `merTools` provides the `REquantile` function which helps to identify which levels of the grouping terms correspond to which quantile of the magnitude of the random effects:
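A sketch of the calls behind the levels quoted below:

```r
REquantile(m1, quantile = 0.25, groupFctr = "s")
REquantile(m1, quantile = 0.25, groupFctr = "d")
```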
Here we can see that group level 446 corresponds to the 25th percentile of the effect for the student groups, and the level returned by `REquantile(m1, quantile = 0.25, groupFctr = "d")` corresponds to the 25th percentile for the instructor group. Using this information we can reassign a specific observation to varying magnitudes of grouping term effects to see how much they might influence our final prediction.
```r
example4 <- draw(m1, type = 'average')
example4 <- wiggle(example4, varlist = "s",
list(REquantile(m1, quantile = seq(0.1, 0.9, .1),
groupFctr = "s")))
example4$yhat <- predict(m1, newdata = example4)
ggplot(example4, aes(x = reorder(s, -yhat), y = yhat)) +
geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
```

This figure is very interesting because it shows that moving across the range of student effects can have a larger impact on the score than the fixed effects we observed above. That is, getting a “generous” or a “stingy” rater can have a substantial impact on the final rating.
But, we can do even better. First, we can move beyond the average observation by taking advantage of the `varList` option to the function, which allows us to specify a subset of the data to compute an average for.
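A sketch of the dropped call, matching the subgroup described next (the exact `varList` values are assumptions):

```r
example5 <- draw(m1, type = "average",
                 varList = list("studage" = "2", "lectage" = "4"))
```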
Now we have the average observation with a student age of 2 and a lecture age of 4. We can then follow the same procedure as before to explore the effects on our subsamples. Before we do that, let’s fit a slightly more complex model that includes a random slope.
```r
data(VerbAgg)
m2 <- glmer(r2 ~ Anger + Gender + btype + situ +
(1|id) + (1 + Gender|item), family = binomial,
data = VerbAgg)
#> Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
#> control$checkConv, : Model failed to converge with max|grad| = 0.0071146
#> (tol = 0.001, component 1)
example6 <- draw(m2, type = 'average', varList = list("id" = "149"))
example6$btype <- "scold"
example6$situ <- "self"
tempdf <- wiggle(example6, varlist = "Gender", list(c("M", "F")))
tempdf <- wiggle(tempdf, varlist = "item",
list(unique(VerbAgg$item)))
tempdf$yhat <- predict(m2, newdata = tempdf, type = "response",
allow.new.levels = TRUE)
ggplot(tempdf, aes(x = item, y = yhat, group = Gender)) +
geom_line(aes(color = Gender))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20, hjust=1),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
```

Here we've shown that the effect of both the intercept and the gender slope on item simultaneously affect our predicted value. This results in the two lines for predicted values across the items not being parallel. While we can see this by looking at the results of the summary of the model object (using `fastdisp` in the `merTools` package for larger models), it is not intuitive what that effect looks like across different scenarios. `merTools` has given us the machinery to investigate this.
## Uncertainty

The above examples make use of simulation to show the model behavior after changing some values in a dataset. However, until now, we've focused on using point estimates to represent these changes. The use of predicted point estimates without incorporating any uncertainty can lead to overconfidence in the precision of the model.

In the `predictInterval` function, discussed in more detail in another package vignette, we provide a way to incorporate three out of the four types of uncertainty inherent in a model. These are:

1. Overall model uncertainty
2. Uncertainty in fixed effect values
3. Uncertainty in random effect values
4. Uncertainty in the distribution of the random effects

1-3 are incorporated in the results of `predictInterval`, while capturing 4 would require making use of the `bootMer` function – options discussed in greater detail elsewhere. The main advantage of `predictInterval` is that it is fast. By leveraging the power of the `arm::sim()` function, we are able to generate prediction intervals for individual observations from very large models very quickly. And, it works a lot like `predict`:
```r
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability", level = 0.8)
tempdf <- cbind(tempdf, exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
```

Here we can see there is barely any gender difference in terms of area of potential prediction intervals. However, by default, this approach includes the residual variance of the model. If we instead focus just on the uncertainty of the random and fixed effects, we get:
```r
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability",
include.resid.var = FALSE, level = 0.8)
tempdf <- cbind(tempdf[, 1:8], exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
geom_line(aes(color = Gender)) +
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
```

Here, more difference emerges, but we see that the differences are not very precise.
# File: merTools/inst/doc/Using_predictInterval.Rmd

---
title: "Prediction Intervals from merMod Objects"
author: "Jared Knowles and Carl Frederick"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteIndexEntry{Prediction Intervals from merMod Objects}
  %\VignetteEncoding{UTF-8}
---
```{r setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide', cache=FALSE}
library(ggplot2); library(knitr); library(merTools)
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo = TRUE
)
rstanarm_eval <- "rstanarm" %in% rownames(installed.packages())
```
## Introduction
Fitting (generalized) linear mixed models, (G)LMM, to very large data sets is
becoming increasingly easy, but understanding and communicating the uncertainty
inherent in those models is not. As the documentation for
`lme4::predict.merMod()` notes:
> There is no option for computing standard errors of predictions because it is
> difficult to define an efficient method that incorporates uncertainty in the
> variance parameters; we recommend `lme4::bootMer()` for this task.
We agree that, short of a fully Bayesian analysis, bootstrapping is the
gold standard for deriving prediction intervals from a (G)LMM, but
the time required to obtain even a respectable number of replications from
`bootMer()` quickly becomes prohibitive when the initial model fit is on the
order of hours instead of seconds. The only other alternative we have identified
for these situations is to use the `arm::sim()` function to simulate values.
Unfortunately, this only takes variation of the fixed coefficients and residuals
into account, and assumes the conditional modes of the random effects are fixed.
We developed the `predictInterval()` function to incorporate the variation in
the conditional modes of the random effects (CMRE, a.k.a. BLUPs in the LMM case)
into calculating prediction intervals. Ignoring the variance in the CMRE results
in overly confident estimates of predicted values and in cases where the precision
of the grouping term varies across levels of grouping terms, creates the illusion
of difference where none may exist. The importance of accounting for this variance
comes into play sharply when comparing the predictions of different models across
observations.
We take the warning from `lme4::predict.merMod()`
seriously, but view this method as a decent first approximation to the full bootstrap
analysis for (G)LMMs fit to very large data sets.
## Conceptual description
In order to generate a proper prediction interval, a prediction must account for
three sources of uncertainty in mixed models:
1. the residual (observation-level) variance,
2. the uncertainty in the fixed coefficients, and
3. the uncertainty in the variance parameters for the grouping factors.
A fourth, uncertainty about the data, is beyond the scope of any prediction method.
As we mentioned above, the `arm:sim()` function incorporates the first two sources
of variation but not the third , while bootstrapping using `lme4::bootMer()`
does incorporate all three sources of uncertainty because it re-estimates the
model using random samples of the data.
When inference about the values of the CMREs is of interest, it would be nice to
incorporate some degree of uncertainty in those estimates when comparing
observations across groups. `predictInterval()` does this by drawing values of
the CMREs from the conditional variance-covariance matrix of the random effects
accessible from `lme4::ranef(model, condVar=TRUE)`. Thus, `predictInterval()`
incorporates all of the uncertainty from sources one and two, and part of the
variance from source 3, but the variance parameters themselves are treated as
fixed.
To do this, `predictInterval()` takes an estimated model of class `merMod` and,
like `predict()`, a data.frame upon which to make those predictions and:
1. extracts the fixed and random coefficients
2. takes `n` draws from the multivariate normal distribution of the fixed and
random coefficients (separately)
3. calculates the linear predictor for each row in `newdata` based on these draws,
and
4. optionally incorporates the residual variation (per the `arm::sim()` function),
and,
5. returns newdata with the lower and upper limits of the prediction interval
and the mean or median of the simulated predictions
Currently, the supported model types are linear mixed models and mixed logistic
regression models.
The prediction data set *can* include levels that are not in the estimation model
frame. The prediction intervals for such observations only incorporate
uncertainty from fixed coefficient estimates and the residual level of variation.
## Comparison to existing methods
What do the differences between `predictInterval()` and the other methods for
constructing prediction intervals mean in practice? We would expect
`predictInterval()` to produce prediction intervals that are wider than all methods
except for the `bootMer()` method. We would also hope that the prediction point
estimate from other methods falls within the prediction interval produced by
`predictInterval()`. Ideally, the predicted point estimate produced by
`predictInterval()` would fall close to that produced by `bootMer()`.
This section compares the results of `predictInterval()` with those obtained
using `arm::sim()` and `lme4::bootMer()` using the sleepstudy data from
`lme4`. These data contain reaction time observations for 10 days on 18 subjects.
The data are sorted such that the first 10 observations are days one through
ten for subject 1, the next 10 are days one through ten for subject 2 and so on.
The example model that we are estimating below estimates random intercepts and a
random slope for the number of days.
### Step 1: Estimating the model and using `predictInterval()`
First, we will load the required packages and data and estimate the model:
```{r Prep, message=FALSE, warning=FALSE}
set.seed(271828)
data(sleepstudy)
fm1 <- lmer(Reaction ~ Days + (Days|Subject), data=sleepstudy)
display(fm1)
```
Then, calculate prediction intervals using `predictInterval()`. The `predictInterval`
function has a number of user configurable options. In this example, we use the
original data `sleepstudy` as the newdata. We pass the function the `fm1` model
we fit above. We also choose a 95% interval with `level = 0.95`, though
we could choose a less conservative prediction interval. We make 1,000 simulations
for each observation `n.sims = 1000`. We set the point estimate to be the median
of the simulated values, instead of the mean. We ask for the linear predictor back;
if we had fit a logistic regression, we could have asked for our predictions
on the probability scale instead. Finally, we indicate that we want the predictions
to incorporate the residual variance from the model -- an option only available
for `lmerMod` objects.
```{r predInt}
PI.time <- system.time(
PI <- predictInterval(merMod = fm1, newdata = sleepstudy,
level = 0.95, n.sims = 1000,
stat = "median", type="linear.prediction",
include.resid.var = TRUE)
)
```
Here are the first few rows of the object `PI`:
```{r Inspect predInt, results="asis", echo=FALSE}
kable(head(PI))
```
The three columns are the median (`fit`) and limits of the 95% prediction
interval (`upr` and `lwr`) because we set `level=0.95`. The following figure
displays the output graphically for the first 30 observations.
```{r Inspect predInt 2, fig.width=7, fig.align="center"}
library(ggplot2);
ggplot(aes(x=1:30, y=fit, ymin=lwr, ymax=upr), data=PI[1:30,]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") + theme_bw()
```
#### Step 1a: Adjusting for correlation between fixed and random effects
The prediction intervals above do not correct for correlations between fixed
and random effects. This tends to lead to predictive intervals that are too
conservative, especially for existing groups when there is a lot of data on
relatively few groups. In that case, a significant portion of the uncertainty
in the prediction can be due to variance in the fixed intercept which is
anti-correlated with variance in the random intercept effects. For instance,
it does not actually matter if the fixed intercept is 5 and the random
intercept effects are -2, 1, and 1, versus a fixed intercept of 6 and
random intercept effects of -3, 0, and 0. (The latter situation will never be
the MLE, but it can occur in this package's simulations.)
To show this issue, we'll use the sleep study model, predicting the reaction times
of subjects after experiencing sleep deprivation:
```{r prepSleep, message=FALSE, warning=FALSE}
fm1 <- lmer(Reaction ~ Days + (Days|Subject), data=sleepstudy)
display(fm1)
```
Let's use the model to give an interval for the expected reaction time of a
subject like the first one in the study, Subject 308 on day 0:
```{r showSleep, message=FALSE, warning=FALSE}
sleepstudy[1,]
predictInterval(fm1, sleepstudy[1,], include.resid.var=0) # predict the expected reaction time for Subject 308 at day 0
```
There are two ways to get `predictInterval()` to create less conservative intervals
to deal with this. The first is simply to tell it to treat certain fixed
effects as fully known (that is, as having effectively zero variance). This is
done using the `ignore.fixed.terms` argument.
```{r ShowFat2, message=FALSE, warning=FALSE}
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1)
# predict the average reaction time for a subject at day 0, taking the global intercept
# (mean reaction time) as fully known
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = "(Intercept)")
#Same as above
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1:2)
# as above, taking the first two fixed effects (intercept and days effect) as fully known
```
The second way is to use an ad-hoc variance adjustment, with the
`fix.intercept.variance` argument. This takes the model's intercept variance
$\hat\sigma^2_\mu$ and adjusts it to:
$$\hat\sigma^{\prime 2}_\mu = \hat\sigma^2_\mu - \sum_{\mathrm{levels}}\frac{1}{\sum_{\mathrm{groups(level)}} 1/(\hat\sigma^2_{\mathrm{level}}+\hat\sigma^2_{\mathrm{group}})}$$
In other words, it assumes the given intercept variance incorporates spurious
variance for each level, where each of the spurious variance terms has a
precision equal to the sum of the precisions due to the individual groups at
that level.
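As a rough illustration only (this is not the package's internal code), the
adjustment for a model with a single grouping level could be written as follows,
where `sigma2_mu`, `sigma2_level`, and `sigma2_group_vec` are assumed stand-ins
for the variance terms in the formula above:
```{r adjustSketch, eval=FALSE}
# Sketch of the ad-hoc intercept-variance adjustment for one grouping level.
# sigma2_mu:        variance of the fixed intercept
# sigma2_level:     variance attributed to the grouping level
# sigma2_group_vec: one variance term per group at that level
adjust_intercept_var <- function(sigma2_mu, sigma2_level, sigma2_group_vec) {
  precisions <- 1 / (sigma2_level + sigma2_group_vec)
  sigma2_mu - 1 / sum(precisions)  # subtract the spurious variance term
}
```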
```{r ShowFat3, message=FALSE, warning=FALSE}
predictInterval(fm1, sleepstudy[1,], include.resid.var=0,
fix.intercept.variance = TRUE)
# predict the average reaction time for a subject at day 0, using an ad-hoc
# correction for the covariance of the intercept with the random intercept effects.
```
A few notes about these two arguments:
* `fix.intercept.variance = TRUE` is redundant with `ignore.fixed.terms = 1`, but not
vice versa.
* These corrections should NOT be used when predicting outcomes for groups not
present in the original data.
### Step 2: Comparison with `arm::sim()`
How does the output above compare to what we could get from `arm::sim()`?
```{r arm.Sim, fig.width=7, fig.height=4, fig.align="center"}
PI.arm.time <- system.time(
PI.arm.sims <- arm::sim(fm1, 1000)
)
PI.arm <- data.frame(
fit=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.500)),
upr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.975)),
lwr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.025))
)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="arm::sim()", x=(1:nrow(PI.arm))+0.1, PI.arm))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```
The prediction intervals from `arm::sim()` are much smaller, and the random slopes
for `Days` vary more than they do for `predictInterval()`. Both results are as
expected, given the small number of subjects and observations per subject in
these data. Because `predictInterval()` is incorporating uncertainty in the CMFEs
(but not the variance parameters of the random coefficients themselves), the
Days slopes are closer to the overall or pooled regression slope.
### Step 3: Comparison with `lme4::bootMer()`
As quoted above, the developers of `lme4` suggest that users interested in
uncertainty estimates around their predictions use `lme4::bootMer()` to calculate
them. The documentation for `lme4::bootMer()` goes on to describe three
implemented flavors of bootstrapped estimates:
1. parametrically resampling both the *"spherical"* random effects
*u* and the i.i.d. errors $\epsilon$
2. treating the random effects as fixed and parametrically resampling the
i.i.d. errors
3. treating the random effects as fixed and semi-parametrically resampling
the i.i.d. errors from the distribution of residuals.
We will compare the results from `predictInterval()` with each method, in turn.
#### Step 3a: `lme4::bootMer()` method 1
```{r bootMer.1, fig.width=7, fig.height=4, fig.align="center"}
##Functions for bootMer() and objects
####Return predicted values from bootstrap
mySumm <- function(.) {
predict(., newdata=sleepstudy, re.form=NULL)
}
####Collapse bootstrap into median, 95% PI
sumBoot <- function(merBoot) {
return(
data.frame(fit = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE)))
)
)
}
##lme4::bootMer() method 1
PI.boot1.time <- system.time(
boot1 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=FALSE, type="parametric")
)
PI.boot1 <- sumBoot(boot1)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 1", x=(1:nrow(PI.boot1))+0.1, PI.boot1))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```
The intervals produced by `predictInterval`, represented in green, cover the
point estimates produced by `bootMer` in every case for these 30 observations.
Additionally, in almost every case, the `predictInterval` encompasses the entire
interval presented by `bootMer`. Here, the `bootMer` estimates re-estimate the
group-level terms, but because the model is refit in each simulation they also
account for the conditional variance of these terms, `theta`, and so provide
tighter prediction intervals than the `predictInterval` method.
#### Step 3b: `lme4::bootMer()` method 2
```{r bootMer.2, fig.width=7, fig.height=4, fig.align="center"}
##lme4::bootMer() method 2
PI.boot2.time <- system.time(
boot2 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="parametric")
)
PI.boot2 <- sumBoot(boot2)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 2", x=(1:nrow(PI.boot2))+0.1, PI.boot2))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```
Here, the results for `predictInterval` in green again encompass the results from
`bootMer`, but are much wider. The `bootMer` estimates are ignoring the variance
in the group effects, and as such, are only incorporating the residual variance
and the variance in the fixed effects -- similar to the `arm::sim()` function.
#### Step 3c: `lme4::bootMer()` method 3
```{r bootMer.3, fig.width=7, fig.height=4, fig.align="center"}
##lme4::bootMer() method 3
PI.boot3.time <- system.time(
boot3 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="semiparametric")
)
PI.boot3 <- sumBoot(boot3)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 3", x=(1:nrow(PI.boot3))+0.1, PI.boot3))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```
These results are virtually identical to those above.
### Step 4: Comparison with `rstanarm`
```{r echo=FALSE, message=FALSE, eval = rstanarm_eval}
library(rstanarm)
central_intervals <- function(x, prob) {
if (!identical(length(prob), 1L) || prob <= 0 || prob >= 1)
stop("'prob' should be a single number greater than 0 and less than 1.",
call. = FALSE)
alpha <- (1 - prob) / 2
probs <- c(alpha, 1 - alpha)
labs <- paste0(100 * probs, "%")
out <- t(apply(x, 2L, quantile, probs = probs))
structure(out, dimnames = list(colnames(x), labs))
}
```
```{r, message=FALSE, fig.width=7, fig.height=4, fig.align="center", eval = rstanarm_eval}
PI.time.stan <- system.time({
fm_stan <- stan_lmer(Reaction ~ Days + (Days|Subject), data = sleepstudy,
verbose = FALSE, open_progress = FALSE, refresh = -1,
show_messages=FALSE, chains = 1)
zed <- posterior_predict(fm_stan)
PI.stan <- cbind(apply(zed, 2, median), central_intervals(zed, prob=0.95))
})
print(fm_stan)
PI.stan <- as.data.frame(PI.stan)
names(PI.stan) <- c("fit", "lwr", "upr")
PI.stan <- PI.stan[, c("fit", "upr", "lwr")]
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="rstanArm", x=(1:nrow(PI.stan))+0.1, PI.stan))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
```
### Computation time
Our initial motivation for writing this function was to develop a method for
incorporating uncertainty in the CMFEs for mixed models estimated on very large
samples. Even for models with only modest degrees of complexity, using
`lme4::bootMer()` quickly becomes time prohibitive because it involves
re-estimating the model for each simulation. We have seen how each alternative
compares to `predictInterval()` substantively, but how do they compare in terms
of computational time? The table below lists the output of `system.time()` for
all five methods for calculating prediction intervals for `merMod` objects.
```{r, echo=FALSE}
if (rstanarm_eval) {
times <-
rbind(PI.time,
PI.arm.time,
PI.boot1.time,
PI.boot2.time,
PI.boot3.time,
PI.time.stan)[, 1:3]
rownames(times) <-
c(
"predictInterval()",
"arm::sim()",
"lme4::bootMer()-Method 1",
"lme4::bootMer()-Method 2",
"lme4::bootMer()-Method 3",
"rstanarm:predict"
)
kable(times)
} else {
times <-
rbind(PI.time,
PI.arm.time,
PI.boot1.time,
PI.boot2.time,
PI.boot3.time)[, 1:3]
rownames(times) <-
c(
"predictInterval()",
"arm::sim()",
"lme4::bootMer()-Method 1",
"lme4::bootMer()-Method 2",
"lme4::bootMer()-Method 3"
)
kable(times)
}
```
For this simple example, we see that `arm::sim()` is the fastest--nearly five
times faster than `predictInterval()`. However, `predictInterval()` is nearly
six times faster than any of the bootstrapping options via `lme4::bootMer()`. This
may not seem like a lot, but consider that the computation time required for
bootstrapping is roughly proportional to the number of bootstrapped simulations
requested, while `predictInterval()` is not, because it is just a series of draws
from various multivariate normal distributions. The time ratios in the table above
therefore represent a lower bound on the computation time ratio of bootstrapping
to `predictInterval()`.
## Simulation
TBC.
merTools/inst/doc/Using_predictInterval.R 0000644 0001762 0000144 00000020675 13466137505 020253 0 ustar ligges users ## ----setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide', cache=FALSE----
library(ggplot2); library(knitr); library(merTools)
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo = TRUE
)
rstanarm_eval <- "rstanarm" %in% rownames(installed.packages())
## ----Prep, message=FALSE, warning=FALSE----------------------------------
set.seed(271828)
data(sleepstudy)
fm1 <- lmer(Reaction ~ Days + (Days|Subject), data=sleepstudy)
display(fm1)
## ----predInt-------------------------------------------------------------
PI.time <- system.time(
PI <- predictInterval(merMod = fm1, newdata = sleepstudy,
level = 0.95, n.sims = 1000,
stat = "median", type="linear.prediction",
include.resid.var = TRUE)
)
## ----Inspect predInt, results="asis", echo=FALSE-------------------------
kable(head(PI))
## ----Inspect predInt 2, fig.width=7, fig.align="center"------------------
library(ggplot2);
ggplot(aes(x=1:30, y=fit, ymin=lwr, ymax=upr), data=PI[1:30,]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") + theme_bw()
## ----prepSleep, message=FALSE, warning=FALSE-----------------------------
fm1 <- lmer(Reaction ~ Days + (Days|Subject), data=sleepstudy)
display(fm1)
## ----showSleep, message=FALSE, warning=FALSE-----------------------------
sleepstudy[1,]
predictInterval(fm1, sleepstudy[1,], include.resid.var=0) # predict the expected reaction time for Subject 308 at day 0
## ----ShowFat2, message=FALSE, warning=FALSE------------------------------
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1)
# predict the average reaction time for a subject at day 0, taking the global intercept
# (mean reaction time) as fully known
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = "(Intercept)")
#Same as above
predictInterval(fm1, sleepstudy[1,], include.resid.var=0, ignore.fixed.terms = 1:2)
# as above, taking the first two fixed effects (intercept and days effect) as fully known
## ----ShowFat3, message=FALSE, warning=FALSE------------------------------
predictInterval(fm1, sleepstudy[1,], include.resid.var=0,
fix.intercept.variance = TRUE)
# predict the average reaction time for a subject at day 0, using an ad-hoc
# correction for the covariance of the intercept with the random intercept effects.
## ----arm.Sim, fig.width=7, fig.height=4, fig.align="center"--------------
PI.arm.time <- system.time(
PI.arm.sims <- arm::sim(fm1, 1000)
)
PI.arm <- data.frame(
fit=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.500)),
upr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.975)),
lwr=apply(fitted(PI.arm.sims, fm1), 1, function(x) quantile(x, 0.025))
)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="arm::sim()", x=(1:nrow(PI.arm))+0.1, PI.arm))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
## ----bootMer.1, fig.width=7, fig.height=4, fig.align="center"------------
##Functions for bootMer() and objects
####Return predicted values from bootstrap
mySumm <- function(.) {
predict(., newdata=sleepstudy, re.form=NULL)
}
####Collapse bootstrap into median, 95% PI
sumBoot <- function(merBoot) {
return(
data.frame(fit = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE)))
)
)
}
##lme4::bootMer() method 1
PI.boot1.time <- system.time(
boot1 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=FALSE, type="parametric")
)
PI.boot1 <- sumBoot(boot1)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 1", x=(1:nrow(PI.boot1))+0.1, PI.boot1))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
## ----bootMer.2, fig.width=7, fig.height=4, fig.align="center"------------
##lme4::bootMer() method 2
PI.boot2.time <- system.time(
boot2 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="parametric")
)
PI.boot2 <- sumBoot(boot2)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 2", x=(1:nrow(PI.boot2))+0.1, PI.boot2))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
## ----bootMer.3, fig.width=7, fig.height=4, fig.align="center"------------
##lme4::bootMer() method 3
PI.boot3.time <- system.time(
boot3 <- lme4::bootMer(fm1, mySumm, nsim=250, use.u=TRUE, type="semiparametric")
)
PI.boot3 <- sumBoot(boot3)
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="lme4::bootMer() - Method 3", x=(1:nrow(PI.boot3))+0.1, PI.boot3))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
## ----echo=FALSE, message=FALSE, eval = rstanarm_eval---------------------
library(rstanarm)
central_intervals <- function(x, prob) {
if (!identical(length(prob), 1L) || prob <= 0 || prob >= 1)
stop("'prob' should be a single number greater than 0 and less than 1.",
call. = FALSE)
alpha <- (1 - prob) / 2
probs <- c(alpha, 1 - alpha)
labs <- paste0(100 * probs, "%")
out <- t(apply(x, 2L, quantile, probs = probs))
structure(out, dimnames = list(colnames(x), labs))
}
## ---- message=FALSE, fig.width=7, fig.height=4, fig.align="center", eval = rstanarm_eval----
PI.time.stan <- system.time({
fm_stan <- stan_lmer(Reaction ~ Days + (Days|Subject), data = sleepstudy,
verbose = FALSE, open_progress = FALSE, refresh = -1,
show_messages=FALSE, chains = 1)
zed <- posterior_predict(fm_stan)
PI.stan <- cbind(apply(zed, 2, median), central_intervals(zed, prob=0.95))
})
print(fm_stan)
PI.stan <- as.data.frame(PI.stan)
names(PI.stan) <- c("fit", "lwr", "upr")
PI.stan <- PI.stan[, c("fit", "upr", "lwr")]
comp.data <- rbind(data.frame(Predict.Method="predictInterval()", x=(1:nrow(PI))-0.1, PI),
data.frame(Predict.Method="rstanArm", x=(1:nrow(PI.stan))+0.1, PI.stan))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=Predict.Method), data=comp.data[c(1:30,181:210),]) +
geom_point() +
geom_linerange() +
labs(x="Index", y="Prediction w/ 95% PI") +
theme_bw() + theme(legend.position="bottom") +
scale_color_brewer(type = "qual", palette = 2)
## ---- echo=FALSE---------------------------------------------------------
if (rstanarm_eval) {
times <-
rbind(PI.time,
PI.arm.time,
PI.boot1.time,
PI.boot2.time,
PI.boot3.time,
PI.time.stan)[, 1:3]
rownames(times) <-
c(
"predictInterval()",
"arm::sim()",
"lme4::bootMer()-Method 1",
"lme4::bootMer()-Method 2",
"lme4::bootMer()-Method 3",
"rstanarm:predict"
)
kable(times)
} else {
times <-
rbind(PI.time,
PI.arm.time,
PI.boot1.time,
PI.boot2.time,
PI.boot3.time)[, 1:3]
rownames(times) <-
c(
"predictInterval()",
"arm::sim()",
"lme4::bootMer()-Method 1",
"lme4::bootMer()-Method 2",
"lme4::bootMer()-Method 3"
)
kable(times)
}
merTools/inst/doc/imputation.R 0000644 0001762 0000144 00000006743 13466137510 016134 0 ustar ligges users ## ----setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide'----
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo=TRUE,
fig.width = 7
)
library(knitr); library(merTools)
amelia_eval <- "Amelia" %in% rownames(installed.packages())
amelia_uneval <- !amelia_eval
## ------------------------------------------------------------------------
data(hsb)
# Create a function to randomly assign NA values
add_NA <- function(x, prob){
z <- rbinom(length(x), 1, prob = prob)
x[z==1] <- NA
return(x)
}
hsb$minority <- add_NA(hsb$minority, prob = 0.05)
table(is.na(hsb$minority))
hsb$female <- add_NA(hsb$female, prob = 0.05)
table(is.na(hsb$female))
hsb$ses <- add_NA(hsb$ses, prob = 0.05)
table(is.na(hsb$ses))
hsb$size <- add_NA(hsb$size, prob = 0.05)
table(is.na(hsb$size))
## ----impute, message=FALSE, eval = amelia_eval---------------------------
# Load imputation library
library(Amelia)
# Declare the variables to include in the imputation data
varIndex <- names(hsb)
# Declare ID variables to be excluded from imputation
IDS <- c("schid", "meanses")
# Impute
impute.out <- amelia(hsb[, varIndex], idvars = IDS,
noms = c("minority", "female"),
m = 5)
summary(impute.out)
## ----boot, message=FALSE, eval = amelia_uneval---------------------------
# # Amelia is not available so let's just bootstrap resample our data
# impute.out <- vector(mode = "list", 5)
#
# for (i in 1:5) {
# impute.out[[i]] <- hsb[sample(nrow(hsb), nrow(hsb), replace = TRUE), ]
# }
#
# # Declare the variables to include in the imputation data
# summary(impute.out)
## ------------------------------------------------------------------------
fmla <- "mathach ~ minority + female + ses + meanses + (1 + ses|schid)"
mod <- lmer(fmla, data = hsb)
if(amelia_eval) {
modList <- lmerModList(fmla, data = impute.out$imputations)
} else {
# Use bootstrapped data instead
modList <- lmerModList(fmla, data = impute.out)
}
## ------------------------------------------------------------------------
fixef(mod) # model with dropped missing
fixef(modList)
## ------------------------------------------------------------------------
VarCorr(mod) # model with dropped missing
VarCorr(modList) # aggregate of imputed models
## ------------------------------------------------------------------------
lapply(modList, fixef)
## ------------------------------------------------------------------------
fixef(modList[[1]])
fixef(modList[[2]])
## ------------------------------------------------------------------------
print(modList)
## ------------------------------------------------------------------------
summary(modList)
## ------------------------------------------------------------------------
fastdisp(modList)
## ------------------------------------------------------------------------
modelRandEffStats(modList)
modelFixedEff(modList)
VarCorr(modList)
## ------------------------------------------------------------------------
modelInfo(mod)
## ------------------------------------------------------------------------
lapply(modList, modelInfo)
## ------------------------------------------------------------------------
summary(modList)
## ------------------------------------------------------------------------
modelFixedEff(modList)
## ------------------------------------------------------------------------
ranef(modList)
merTools/inst/doc/imputation.Rmd 0000644 0001762 0000144 00000013271 13466135366 016456 0 ustar ligges users ---
title: "Analyzing Imputed Data with Multilevel Models and merTools"
author: "Jared Knowles"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Imputation}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide'}
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo=TRUE,
fig.width = 7
)
library(knitr); library(merTools)
amelia_eval <- "Amelia" %in% rownames(installed.packages())
amelia_uneval <- !amelia_eval
```
## Introduction
Multilevel models are valuable in a wide array of problem areas that involve
non-experimental, or observational data. In many of these cases the data on
individual observations may be incomplete. In these situations, the analyst may
turn to one of many methods for filling in missing data depending on the specific
problem at hand, disciplinary norms, and prior research.
One of the most common cases is to use multiple imputation. Multiple imputation
involves fitting a model to the data and estimating the missing values for
observations. For details on multiple imputation, and a discussion of some of
the main implementations in R, look at the documentation and vignettes for the
`mice` and `Amelia` packages.
The key difficulty multiple imputation creates for users of multilevel models
is that the result of multiple imputation is K replicated datasets corresponding
to different estimated values for the missing data in the original dataset.
For the purposes of this vignette, I will describe how to use one flavor of
multiple imputation and the function in `merTools` to obtain estimates from a
multilevel model in the presence of missing and multiply imputed data.
## Missing Data and its Discontents
To demonstrate this workflow, we will use the `hsb` dataset in the `merTools`
package which includes data on the math achievement of a wide sample of students
nested within schools. The data has no missingness, so first we will simulate
some missing data.
```{r}
data(hsb)
# Create a function to randomly assign NA values
add_NA <- function(x, prob){
z <- rbinom(length(x), 1, prob = prob)
x[z==1] <- NA
return(x)
}
hsb$minority <- add_NA(hsb$minority, prob = 0.05)
table(is.na(hsb$minority))
hsb$female <- add_NA(hsb$female, prob = 0.05)
table(is.na(hsb$female))
hsb$ses <- add_NA(hsb$ses, prob = 0.05)
table(is.na(hsb$ses))
hsb$size <- add_NA(hsb$size, prob = 0.05)
table(is.na(hsb$size))
```
```{r impute, message=FALSE, eval = amelia_eval}
# Load imputation library
library(Amelia)
# Declare the variables to include in the imputation data
varIndex <- names(hsb)
# Declare ID variables to be excluded from imputation
IDS <- c("schid", "meanses")
# Impute
impute.out <- amelia(hsb[, varIndex], idvars = IDS,
noms = c("minority", "female"),
m = 5)
summary(impute.out)
```
```{r boot, message=FALSE, eval = amelia_uneval}
# Amelia is not available so let's just bootstrap resample our data
impute.out <- vector(mode = "list", 5)
for (i in 1:5) {
impute.out[[i]] <- hsb[sample(nrow(hsb), nrow(hsb), replace = TRUE), ]
}
# Declare the variables to include in the imputation data
summary(impute.out)
```
## Fitting and Summarizing a Model List
Fitting a model to the list of imputed datasets is very similar to fitting a
single model:
```{r}
fmla <- "mathach ~ minority + female + ses + meanses + (1 + ses|schid)"
mod <- lmer(fmla, data = hsb)
if(amelia_eval) {
modList <- lmerModList(fmla, data = impute.out$imputations)
} else {
# Use bootstrapped data instead
modList <- lmerModList(fmla, data = impute.out)
}
```
The resulting object `modList` is a list of `merMod` objects the same length as the number
of imputation datasets. This object is assigned the class of `merModList` and
`merTools` provides some convenience functions for reporting the results of
this object.
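A quick way to confirm the structure (a sketch; the values in the comments are
what we would expect with five imputations):
```{r, eval=FALSE}
class(modList)   # "merModList"
length(modList)  # 5, one merMod per imputed dataset
```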
Using this, we can directly compare the model fit with missing data excluded to
the aggregate from the imputed models:
```{r}
fixef(mod) # model with dropped missing
fixef(modList)
```
```{r}
VarCorr(mod) # model with dropped missing
VarCorr(modList) # aggregate of imputed models
```
If you want to inspect the individual models, or you do not like taking the
mean across the imputation replications, you can take the `merModList` apart
easily:
```{r}
lapply(modList, fixef)
```
And, you can always operate on any single element of the list:
```{r}
fixef(modList[[1]])
fixef(modList[[2]])
```
## Output of a Model List
```{r}
print(modList)
```
```{r}
summary(modList)
```
```{r}
fastdisp(modList)
```
The standard errors reported for the model list include Rubin's correction
(see the documentation), which accounts for both the within-imputation and
the between-imputation variance.
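As a rough illustration of what this pooling does, here is a minimal sketch of
Rubin's rules for a single coefficient; `ests` and `ses` are assumed vectors of
per-imputation estimates and standard errors, and `pool_rubin()` is a
hypothetical helper, not a `merTools` function:
```{r rubinSketch, eval=FALSE}
# Rubin's rules for pooling one coefficient across m imputations.
pool_rubin <- function(ests, ses) {
  m <- length(ests)
  qbar <- mean(ests)              # pooled point estimate
  ubar <- mean(ses^2)             # average within-imputation variance
  b <- var(ests)                  # between-imputation variance
  total <- ubar + (1 + 1 / m) * b # total variance with Rubin's correction
  c(estimate = qbar, se = sqrt(total))
}
```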
## Specific Model Information Summaries
```{r}
modelRandEffStats(modList)
modelFixedEff(modList)
VarCorr(modList)
```
### Diagnostics of List Components
```{r}
modelInfo(mod)
```
Let's apply this to our model list.
```{r}
lapply(modList, modelInfo)
```
### Model List Generics
```{r}
summary(modList)
```
```{r}
modelFixedEff(modList)
```
```{r}
ranef(modList)
```
## Cautions and Notes
Often it is desirable to include aggregate values in the level two or level three
part of the model such as level 1 SES and level 2 mean SES for the group. In
cases where there is missingness in either the level 1 SES values, or in the
level 2 mean SES values, caution and careful thought need to be given to how to
proceed with the imputation routine.
merTools/inst/doc/merToolsIntro.R 0000644 0001762 0000144 00000015327 13466137707 016571 0 ustar ligges users ## ----setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide'----
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo=TRUE,
fig.width = 7
)
library(knitr); library(merTools)
## ------------------------------------------------------------------------
library(lme4)
head(InstEval)
str(InstEval)
## ------------------------------------------------------------------------
m1 <- lmer(y ~ service + lectage + studage + (1|d) + (1|s), data=InstEval)
## ------------------------------------------------------------------------
library(merTools)
fastdisp(m1)
## ------------------------------------------------------------------------
feEx <- FEsim(m1, 1000)
cbind(feEx[,1] , round(feEx[, 2:4], 3))
## ------------------------------------------------------------------------
library(ggplot2)
ggplot(feEx[feEx$term!= "(Intercept)", ]) +
aes(x = term, ymin = median - 1.96 * sd,
ymax = median + 1.96 * sd, y = median) +
geom_pointrange() +
geom_hline(yintercept = 0, size = I(1.1), color = I("red")) +
coord_flip() +
theme_bw() + labs(title = "Coefficient Plot of InstEval Model",
x = "Median Effect Estimate", y = "Evaluation Rating")
## ------------------------------------------------------------------------
plotFEsim(feEx) +
theme_bw() + labs(title = "Coefficient Plot of InstEval Model",
x = "Median Effect Estimate", y = "Evaluation Rating")
## ------------------------------------------------------------------------
reEx <- REsim(m1)
head(reEx)
## ------------------------------------------------------------------------
table(reEx$term)
table(reEx$groupFctr)
## ---- eval=FALSE, echo = TRUE--------------------------------------------
# lattice::dotplot(ranef(m1, condVar=TRUE))
## ------------------------------------------------------------------------
p1 <- plotREsim(reEx)
p1
## ------------------------------------------------------------------------
example1 <- draw(m1, type = 'random')
head(example1)
## ------------------------------------------------------------------------
# predict it
predict(m1, newdata = example1)
# change values
example1$service <- "1"
predict(m1, newdata = example1)
## ------------------------------------------------------------------------
example2 <- wiggle(example1, varlist = "lectage",
valueslist = list(c("1", "2", "3", "4", "5", "6")))
example2
## ------------------------------------------------------------------------
example2$yhat <- predict(m1, newdata = example2)
ggplot(example2, aes(x = lectage, y = yhat)) + geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
## ------------------------------------------------------------------------
example3 <- draw(m1, type = 'average')
example3
## ------------------------------------------------------------------------
example3 <- wiggle(example1, varlist = "service",
valueslist = list(c("0", "1")))
example3$yhat <- predict(m1, newdata = example3)
ggplot(example3, aes(x = service, y = yhat)) + geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
## ------------------------------------------------------------------------
REquantile(m1, quantile = 0.25, groupFctr = "s")
REquantile(m1, quantile = 0.25, groupFctr = "d")
## ------------------------------------------------------------------------
example4 <- draw(m1, type = 'average')
example4 <- wiggle(example4, varlist = "s",
list(REquantile(m1, quantile = seq(0.1, 0.9, .1),
groupFctr = "s")))
example4$yhat <- predict(m1, newdata = example4)
ggplot(example4, aes(x = reorder(s, -yhat), y = yhat)) +
geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
## ------------------------------------------------------------------------
subExample <- list(studage = "2", lectage = "4")
example5 <- draw(m1, type = 'average', varList = subExample)
example5
## ------------------------------------------------------------------------
data(VerbAgg)
m2 <- glmer(r2 ~ Anger + Gender + btype + situ +
(1|id) + (1 + Gender|item), family = binomial,
data = VerbAgg)
example6 <- draw(m2, type = 'average', varList = list("id" = "149"))
example6$btype <- "scold"
example6$situ <- "self"
tempdf <- wiggle(example6, varlist = "Gender", list(c("M", "F")))
tempdf <- wiggle(tempdf, varlist = "item",
list(unique(VerbAgg$item)))
tempdf$yhat <- predict(m2, newdata = tempdf, type = "response",
allow.new.levels = TRUE)
ggplot(tempdf, aes(x = item, y = yhat, group = Gender)) +
geom_line(aes(color = Gender))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20, hjust=1),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
## ------------------------------------------------------------------------
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability", level = 0.8)
tempdf <- cbind(tempdf, exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom")+ labs(x = "Item", y = "Probability")
## ------------------------------------------------------------------------
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability",
include.resid.var = FALSE, level = 0.8)
tempdf <- cbind(tempdf[, 1:8], exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
geom_line(aes(color = Gender)) +
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
merTools/inst/doc/marginal_effects.Rmd 0000644 0001762 0000144 00000004456 13466135366 017563 0 ustar ligges users ---
title: "Using merTools to Marginalize Over Random Effect Levels"
author: "Jared Knowles and Carl Frederick"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteEngine{knitr::rmarkdown}
%\VignetteIndexEntry{Marginalizing Random Effect Levels}
%\VignetteEncoding{UTF-8}
---
```{r setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide', cache=FALSE}
library(ggplot2); library(knitr); library(merTools)
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo = TRUE
)
```
# Marginalizing Random Effects
One of the most common questions about multilevel models is how much influence grouping terms have
on the outcome. One way to explore this is to simulate the predicted values of an observation
across the distribution of random effects for a specific grouping variable and term. This can
be described as "marginalizing" predictions over the distribution of random effects. This allows
you to explore the influence of the grouping term and grouping levels on the outcome scale by
simulating predictions for simulated values of each observation across the distribution of
effect sizes.
The `REmargins()` function allows you to do this. Here, we take the example `sleepstudy` model and
marginalize predictions for all of the random effect terms (Subject:Intercept, Subject:Days). By
default, the function will marginalize over the *quartiles* of the expected rank (see expected
rank vignette) of the effect distribution for each term.
```{r}
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
head(mfx)
```
The new data frame output from `REmargins` contains a lot of information. The first few columns
contain the original data passed to `newdata`. Each observation in `newdata` is identified by a
`case` number, because the function repeats each observation by the number of random effect terms
and number of breaks to simulate each term over. Then the `grouping_var` and `term` columns
identify the random effect being marginalized, and `breaks` records which point of the effect
distribution each simulated row represents.
# Summarizing
# Plotting
Finally, you can plot the results of the marginalization to evaluate the effect of the random
effect terms graphically.
```{r}
ggplot(mfx) + aes(x = breaks, y = fit_Subject, group = case) +
geom_line() +
facet_wrap(~term)
```
merTools/inst/doc/merToolsIntro.Rmd 0000644 0001762 0000144 00000036153 13402510755 017076 0 ustar ligges users ---
title: "An Introduction to merTools"
author: "Jared Knowles and Carl Frederick"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteEngine{knitr::rmarkdown}
%\VignetteIndexEntry{An Introduction to merTools}
%\VignetteEncoding{UTF-8}
---
```{r setup, echo = FALSE, message=FALSE, warning=FALSE, results='hide'}
knitr::opts_chunk$set(
cache=FALSE,
comment="#>",
collapse=TRUE,
echo=TRUE,
fig.width = 7
)
library(knitr); library(merTools)
```
## Introduction
Working with generalized linear mixed models (GLMM) and linear mixed models (LMM)
has become increasingly easy with the advances in the `lme4` package recently.
As we have found ourselves using these models more and more within our work, we,
the authors, have developed a set of tools for simplifying and speeding up common
tasks for interacting with `merMod` objects from `lme4`. This package provides
those tools.
## Illustrating Model Effects
As the complexity of the model fit grows, it becomes harder and harder to
interpret the substantive effect of parameters in the model.
Let's start with a medium-sized example model using the `InstEval` data provided
by the `lme4` package. These data represent university lecture evaluations at
ETH Zurich made by students. In this data, `s` is an individual student,
`d` is an individual lecturer, `studage` is the semester the student
is enrolled, `lectage` is how many semesters back the lecture with the
rating took place, `dept` is the department of the lecture, and `y` is an integer
1:5 representing the ratings of the lecture from "poor" to "very good":
```{r}
library(lme4)
head(InstEval)
str(InstEval)
```
Starting with a simple model:
```{r}
m1 <- lmer(y ~ service + lectage + studage + (1|d) + (1|s), data=InstEval)
```
After fitting the model we can make use of the first function provided by
`merTools`, `fastdisp` which modifies the function `arm:::display` to more
quickly display a summary of the model without calculating the model sigma:
```{r}
library(merTools)
fastdisp(m1)
```
We see some interesting effects. First, our decision to include student and
lecturer effects seems justified as there is substantial variance within these
groups. Second, there do appear to be some effects by age and for lectures
given as a service by an outside lecturer. Let's look at these in more detail.
One way to do this would be to plot the coefficients together in a line to
see which deviate from 0 and in what direction. To get a confidence interval
for our fixed effect coefficients we have a number of options that represent a
tradeoff between coverage and computation time -- see `confint.merMod` for
details.
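For example, a quick sketch of two of those options (not evaluated here, since
profiling a model of this size can be quite slow):
```{r, eval=FALSE}
confint(m1, method = "Wald")     # fast, based on asymptotic standard errors
confint(m1, method = "profile")  # slower profile-likelihood intervals
```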
An alternative is to simulate values of the fixed effects from the posterior
using the function `arm::sim`. Our next tool, `FEsim`, is a convenience wrapper
to do this and provide an informative data frame of the results.
```{r}
feEx <- FEsim(m1, 1000)
cbind(feEx[,1] , round(feEx[, 2:4], 3))
```
We can present these results graphically, using `ggplot2`:
```{r}
library(ggplot2)
ggplot(feEx[feEx$term!= "(Intercept)", ]) +
aes(x = term, ymin = median - 1.96 * sd,
ymax = median + 1.96 * sd, y = median) +
geom_pointrange() +
geom_hline(yintercept = 0, size = I(1.1), color = I("red")) +
coord_flip() +
theme_bw() + labs(title = "Coefficient Plot of InstEval Model",
x = "Median Effect Estimate", y = "Evaluation Rating")
```
However, an easier option is:
```{r}
plotFEsim(feEx) +
theme_bw() + labs(title = "Coefficient Plot of InstEval Model",
x = "Median Effect Estimate", y = "Evaluation Rating")
```
## Random Effects
Next, we might be interested in exploring the random effects. Again, we create a
dataframe of the values of the simulation of these effects for the individual
levels.
```{r}
reEx <- REsim(m1)
head(reEx)
```
The result is a dataframe with estimates of the values of each of the random
effects provided by the `arm::sim()` function. *groupID* represents the identifiable
level of the grouping variable for one random effect, *term* represents whether the
simulated values are for an intercept or which slope, and *groupFctr* identifies
which of the `(1|x)` terms the values represent. To make unique identifiers
for each term, we need to use both the `groupID` and the `groupFctr` term in case
these two variables use overlapping label names for their groups. In this case:
```{r}
table(reEx$term)
table(reEx$groupFctr)
```
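If a unique key is needed, the two columns can simply be combined; a one-line
sketch (the `effectID` name is ours, not a package convention):
```{r, eval=FALSE}
# Combine grouping factor and group ID into a unique identifier per effect
reEx$effectID <- paste(reEx$groupFctr, reEx$groupID, sep = ":")
```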
Most important is producing caterpillar or dotplots of these terms to explore
their variation. This is easily accomplished with the `dotplot` function:
```{r, eval=FALSE, echo = TRUE}
lattice::dotplot(ranef(m1, condVar=TRUE))
```
However, these graphics do not provide much control over the results. Instead,
we can use the `plotREsim` function in `merTools` to gain more control over
plotting of the random effect simulations.
```{r}
p1 <- plotREsim(reEx)
p1
```
The result is a ggplot2 object which can be modified however the user sees
fit. Here, we've established that most student and professor effects are
indistinguishable from zero, but there do exist extreme outliers with both
high and low averages that need to be accounted for.
## Substantive Effects
A logical next line of questioning is to see how much of the variation in a
rating can be caused by changing the student rater and how much is due to the
fixed effects we identified above. This is a very difficult problem to solve,
but using simulation we can examine the model behavior under a range of scenarios
to understand how the model is reflecting changes in the data. To do this,
we use another set of functions available in `merTools`.
The simplest option is to pick an observation at random and then modify its
values deliberately to see how the prediction changes in response. `merTools`
makes this task very simple:
```{r}
example1 <- draw(m1, type = 'random')
head(example1)
```
The `draw` function takes a random observation from the data in the model
and extracts it as a dataframe. We can now do a number of operations to this
observation:
```{r}
# predict it
predict(m1, newdata = example1)
# change values
example1$service <- "1"
predict(m1, newdata = example1)
```
More interestingly, let's programmatically modify this observation to see how the
predicted value changes if we hold everything but one variable constant.
```{r}
example2 <- wiggle(example1, varlist = "lectage",
valueslist = list(c("1", "2", "3", "4", "5", "6")))
example2
```
The function `wiggle` allows us to create a new dataframe with copies of the
variable that modify just one value. Chaining together `wiggle` calls, we
can see how the variable behaves under a number of different scenarios
simultaneously.
```{r}
example2$yhat <- predict(m1, newdata = example2)
ggplot(example2, aes(x = lectage, y = yhat)) + geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
```
The result allows us to graphically display the effect of each level of `lectage`
on an observation that is otherwise identical. This is plotted here against a
horizontal line representing the mean of the observed ratings, and two finer
lines showing plus or minus one standard deviation of the mean.
This is nice, but selecting a random observation is not very satisfying as it
may not be very meaningful. To address this, we can instead take the average
observation:
```{r}
example3 <- draw(m1, type = 'average')
example3
```
Here, the average observation is identified based on either the modal observation
for factors or on the mean for numeric variables. Then, the random effect
terms are set to the level equivalent to the median effect -- very close
to 0.
```{r}
example3 <- wiggle(example1, varlist = "service",
valueslist = list(c("0", "1")))
example3$yhat <- predict(m1, newdata = example3)
ggplot(example3, aes(x = service, y = yhat)) + geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
```
Here we can see that for the average observation, whether the lecture is outside
of the home department has a very slight negative effect on the overall rating.
Might the individual professor or student have more of an impact on the overall
rating? To answer this question we need to wiggle the same observation across
a wide range of student or lecturer effects.
How do we identify this range? `merTools` provides the `REquantile` function
which helps to identify which levels of the grouping terms correspond to which
quantile of the magnitude of the random effects:
```{r}
REquantile(m1, quantile = 0.25, groupFctr = "s")
REquantile(m1, quantile = 0.25, groupFctr = "d")
```
Here we can see that group level `r REquantile(m1, quantile = 0.25, groupFctr = "s")`
corresponds to the 25th percentile of the effect for the student groups, and level
`r REquantile(m1, quantile = 0.25, groupFctr = "d")` corresponds to the 25th
percentile for the instructor group. Using this information we can reassign
a specific observation to varying magnitudes of grouping term effects to see
how much they might influence our final prediction.
```{r}
example4 <- draw(m1, type = 'average')
example4 <- wiggle(example4, varlist = "s",
list(REquantile(m1, quantile = seq(0.1, 0.9, .1),
groupFctr = "s")))
example4$yhat <- predict(m1, newdata = example4)
ggplot(example4, aes(x = reorder(s, -yhat), y = yhat)) +
geom_line(aes(group = 1)) +
theme_bw() + ylim(c(1, 5)) +
geom_hline(yintercept = mean(InstEval$y), linetype = 2) +
geom_hline(yintercept = mean(InstEval$y) + sd(InstEval$y), linetype = 3) +
geom_hline(yintercept = mean(InstEval$y) - sd(InstEval$y), linetype = 3)
```
This figure is very interesting because it shows that moving across the range
of student effects can have a larger impact on the score than the fixed effects
we observed above. That is, getting a "generous" or a "stingy" rater can have
a substantial impact on the final rating.
But, we can do even better. First, we can move beyond the average observation
by taking advantage of the `varList` option to the function which allows us to
specify a subset of the data to compute an average for.
```{r}
subExample <- list(studage = "2", lectage = "4")
example5 <- draw(m1, type = 'average', varList = subExample)
example5
```
Now we have the average observation with a student age of 2 and a lecture age
of 4. We can then follow the same procedure as before to explore the effects
on our subsamples. Before we do that, let's fit a slightly more complex model
that includes a random slope.
```{r}
data(VerbAgg)
m2 <- glmer(r2 ~ Anger + Gender + btype + situ +
(1|id) + (1 + Gender|item), family = binomial,
data = VerbAgg)
example6 <- draw(m2, type = 'average', varList = list("id" = "149"))
example6$btype <- "scold"
example6$situ <- "self"
tempdf <- wiggle(example6, varlist = "Gender", list(c("M", "F")))
tempdf <- wiggle(tempdf, varlist = "item",
list(unique(VerbAgg$item)))
tempdf$yhat <- predict(m2, newdata = tempdf, type = "response",
allow.new.levels = TRUE)
ggplot(tempdf, aes(x = item, y = yhat, group = Gender)) +
geom_line(aes(color = Gender))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20, hjust=1),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
```
Here we've shown that the effect of both the intercept and the gender slope on
item simultaneously affect our predicted value. This results in the two lines
for predicted values across the items not being parallel. While we can see this
by looking at the results of the summary of the model object, using `fastdisp`
in the `merTools` package for larger models, it is not intuitive what that
effect looks like across different scenarios. `merTools` has given us the
machinery to investigate this.
## Uncertainty
The above examples make use of simulation to show the model behavior after
changing some values in a dataset. However, until now, we've focused on using
point estimates to represent these changes. The use of predicted point estimates
without incorporating any uncertainty can lead to overconfidence in the precision
of the model.
In the `predictInterval` function, discussed in more detail in another package
vignette, we provide a way to incorporate three out of the four types of
uncertainty inherent in a model. These are:
1. Overall model uncertainty
2. Uncertainty in fixed effect values
3. Uncertainty in random effect values
4. Uncertainty in the distribution of the random effects
1-3 are incorporated in the results of `predictInterval`, while capturing 4
would require making use of the `bootMer` function -- options discussed in
greater detail elsewhere. The main advantage of `predictInterval` is that it is
fast. By leveraging the power of the `arm::sim()` function, we are able to
generate prediction intervals for individual observations from very large models
very quickly. And, it works a lot like `predict`:
```{r}
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability", level = 0.8)
tempdf <- cbind(tempdf, exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom")+ labs(x = "Item", y = "Probability")
```
Here we can see there is barely any gender difference in terms of area of
potential prediction intervals. However, by default, this approach includes the
residual variance of the model. If we instead focus just on the uncertainty of
the random and fixed effects, we get:
```{r}
exampPreds <- predictInterval(m2, newdata = tempdf,
type = "probability",
include.resid.var = FALSE, level = 0.8)
tempdf <- cbind(tempdf[, 1:8], exampPreds)
ggplot(tempdf, aes(x = item, y = fit, ymin = lwr, ymax = upr,
group = Gender)) +
geom_ribbon(aes(fill = Gender), alpha = I(0.2), color = I("black"))+
geom_line(aes(color = Gender)) +
theme_bw() + ylim(c(0, 1)) +
theme(axis.text.x = element_text(angle = 20),
legend.position = "bottom") + labs(x = "Item", y = "Probability")
```
Here, more difference emerges, but we see that the differences are not very
precise.
merTools/inst/modelFigure.R 0000644 0001762 0000144 00000007543 13402510754 015432 0 ustar ligges users # # Not working currently
#
# # Diagramming a model
# library(DiagrammeR)
#
#
# # Get the 'nycflights13' package if not already installed
# # install.packages('nycflights13')
#
# # Get the 'lubridate' package if not already installed
# # install.packages('lubridate')
#
# # Get the latest build of the 'DiagrammeR' package from GitHub
# devtools::install_github('rich-iannone/DiagrammeR')
#
# library("nycflights13")
# library("lubridate")
# library("DiagrammeR")
# library("pipeR")
#
# # Choose a day from 2013 for NYC flight data
# # (You can choose any Julian day, it's interesting to see results for different days)
# day_of_year <- 10
#
# # Get a data frame of complete cases (e.g., flights have departure and arrival times)
# nycflights13 <-
# nycflights13::flights[which(complete.cases(nycflights13::flights) == TRUE), ]
#
# # Generate a POSIXct vector of dates using the 'ISOdatetime' function
# # Columns 1, 2, and 3 are year, month, and day columns
# # Column 4 is a 4-digit combination of hours (00-23) and minutes (00-59)
# date_time <-
# data.frame("date_time" =
# ISOdatetime(year = nycflights13[,1],
# month = nycflights13[,2],
# day = nycflights13[,3],
# hour = gsub("[0-9][0-9]$", "", nycflights13[,4]),
# min = gsub(".*([0-9][0-9])$", "\\1", nycflights13[,4]),
# sec = 0, tz = "GMT"))
#
# # Add the POSIXct vector 'date_time' to the 'nycflights13' data frame
# nycflights13 <- cbind(date_time, nycflights13)
#
# # Select flights only from the specified day of the year 2013
# nycflights13_day <-
# subset(nycflights13,
# date_time >= ymd('2013-01-01', tz = "GMT") + days(day_of_year - 1) &
# date_time < ymd('2013-01-01', tz = "GMT") + days(day_of_year))
#
# # Create the 'nodes' data frame where at least one column is named "nodes" or "node_id"
# # Column 12 is the 3-letter code for the airport departing from
# # Column 13 is for the airport arriving to
# # (Option: change df to 'nycflights13_day' and only airports used for the day will be included)
# nodes_df <- create_nodes(nodes = unique(c(nycflights13[,12],
# nycflights13[,13])),
# label = FALSE)
#
# # The 'edges' data frame must have columns named 'edge_from' and 'edge_to'
# # The color attribute is determined with an 'ifelse' statement, where
# # column 8 is the minutes early (negative values) or minutes late (positive values)
# # for the flight arrival
# edges_df <- create_edges(edge_from = nycflights13_day[,12],
# edge_to = nycflights13_day[,13],
# color = ifelse(nycflights13_day[,8] < 0,
# "green", "red"))
#
# # Set the graph diagram's default attributes for...
#
# # ...nodes
# node_attrs <- c("style = filled", "fillcolor = lightblue",
# "color = gray", "shape = circle", "fontname = Helvetica",
# "width = 1")
#
# # ...edges
# edge_attrs <- c("arrowhead = dot")
#
# # ...and the graph itself
# graph_attrs <- c("layout = circo",
# "overlap = false",
# "fixedsize = true",
# "ranksep = 3",
# "outputorder = edgesfirst")
#
# # Generate the graph diagram in the RStudio Viewer.
# # The green lines show flights that weren't late (red indicates late arrivals)
# # This graph is for a single day of flights, airports that are unconnected on a
# # given day may be destinations on another day
# create_graph(nodes_df = nodes_df, edges_df = edges_df,
# graph_attrs = graph_attrs, node_attrs = node_attrs,
# edge_attrs = edge_attrs, directed = TRUE) %>>%
# render_graph(width = 1200, height = 800)
merTools/tests/ 0000755 0001762 0000144 00000000000 13403040357 013217 5 ustar ligges users merTools/tests/testthat-p_z.R 0000644 0001762 0000144 00000000125 13402510755 015771 0 ustar ligges users library(testthat)
library(merTools)
test_check("merTools", filter = "^[m-z]")
merTools/tests/shinyAppTests/ 0000755 0001762 0000144 00000000000 13402510755 016040 5 ustar ligges users merTools/tests/shinyAppTests/test-shinyApps.R 0000644 0001762 0000144 00000003570 13402510755 021123 0 ustar ligges users #------------------------------------------------------------------------------
# ShinyMer
#-----------------------------------------------------------------------------
## Data and models
n <- 20
x <- y <- rnorm(n)
z <- rnorm(n)
r <- sample(1:5, size=n, replace=TRUE)
d <- data.frame(x,y,z,r)
d2 <- expand.grid(a=factor(1:4),b=factor(1:4),rep=1:10)
n <- nrow(d2)
d2 <- transform(d2,r=sample(1:5, size=n, replace=TRUE),
z=rnorm(n))
d2 <- subset(d2,!(a=="4" & b=="4"))
fm <- lmer( z ~ a*b + (1|r), data=d2)
fm2 <- lmer( z ~ a*b + (1+b|r), data=d2)
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + (1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
set.seed(3845)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10), fac2 = LETTERS[10:20],
obs=1:25)
d$x <- runif(nrow(d))
d$y <- simulate(~ x + fac1 + fac2 + (1 + fac1|grp) + (1|obs), family = binomial,
newdata=d,
newparams=list(beta = rnorm(16),
theta = rnorm(16, 5, 1)))[[1]]
subD <- d[sample(row.names(d), 5000),]
g1 <- glmer(y ~ x + fac1 + fac2 + (1+fac1|grp) + (1|obs), data = subD, family = 'binomial')
g2 <- lmer(y ~ lectage + studage + (1+lectage|d) + (1|dept), data=InstEval)
# ----------------------------------------------------
# shinyMer blocks
shinyMer(fm)
shinyMer(fm2)
shinyMer(g1)
shinyMer(g2)
shinyMer(glmer3LevSlope)
shinyMer(glmer3LevSlope, simData = glmer3LevSlope@frame[22:25,])
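# Each shinyMer() call above launches the app and blocks the session, so this
# file is a manual smoke test to step through interactively, not an automated
# check run by testthat.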
# ---- merTools/tests/comparisons/wheelReinvention.R ----
#What exactly are the values spit out by fitted(sim(merMod), merMod)???
#Does it produce what we are trying to produce with predictInterval?
#The answer is no because it doesn't re-sample the RFX (I am fairly certain)
#The following does show the arm::sim() side-by-side with our method. It looks
#like the effect of the differences is that our within-group regression lines are
#closer to the population average regression line.
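#In outline: for each simulation s the script forms yhat_s = X %*% (beta_s + b_s),
#expanding the simulated group effects b_s to the observation level, and then
#compares the 2.5/50/97.5 percent quantiles of those draws to predictInterval().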
library(arm); library(abind); library(mvtnorm); library(ggplot2)
set.seed(95371)
data(sleepstudy)
model <- lmer(Reaction ~ Days + (Days|Subject), data=sleepstudy)
#Take 5 simulations
test.sim <- sim(model, 1000)
#Display simulated coefs
#coef(test.sim)
ranef.array <- coef(test.sim)$ranef[[1]]
fixef.matrix <- coef(test.sim)$fixef
#Expand and permute fixefs to conform to ranefs for elementwise addition
dim(ranef.array)
dim(fixef.matrix)
#For now do it by hand
fixef.array <- array(data = rep(fixef.matrix, dim(ranef.array)[2]),
dim = c(dim(fixef.matrix), dim(ranef.array)[2]))
fixef.array <- aperm(fixef.array, c(1,3,2))
dim(fixef.array)
combo.array <- fixef.array+ranef.array
#Extract model.matrix for our own multiplication
model.matrix <- lFormula(model@call, data=model@frame)$X
dim(model.matrix)
expanded.combo.array <- combo.array[,model@flist[[1]],]
expanded.combo.array <- aperm(expanded.combo.array, c(3,2,1))
#Matrix multiplication with arrays ???
##The following yields a 180x180 matrix for each prediction because I have
##multiplied ALL obs by ALL possible group coefficients
myCalc.large <- abind(
lapply(1:dim(expanded.combo.array)[3],
function(i) model.matrix %*% expanded.combo.array[,,i]),
along=3)
##I only need the diagonal of each 180x180 matrix, i.e. the prediction for
##obs i computed with the coefficients drawn for obs i's own group
myCalc <- abind(lapply(1:dim(myCalc.large)[3],
function(x) diag(myCalc.large[,,x])), along=2)
isTRUE(all.equal(myCalc,fitted(test.sim, model), check.attributes=FALSE))
##Compare with predictInterval(), updated here to the current merTools API:
##returnSims = TRUE attaches the raw simulation draws as the "sim.results" attribute
checkPI <- predictInterval(model, sleepstudy, n.sims = 1000,
                           type = "linear.prediction", returnSims = TRUE)
checkPI <- attr(checkPI, "sim.results")
myCalc.lwr <- apply(myCalc,1,function(x) as.numeric(quantile(x, .025)))
myCalc.fit <- apply(myCalc,1,function(x) as.numeric(quantile(x, .500)))
myCalc.upr <- apply(myCalc,1,function(x) as.numeric(quantile(x, .975)))
checkPI.lwr <- apply(checkPI,1,function(x) as.numeric(quantile(x, .025)))
checkPI.fit <- apply(checkPI,1,function(x) as.numeric(quantile(x, .500)))
checkPI.upr <- apply(checkPI,1,function(x) as.numeric(quantile(x, .975)))
plot.data <- rbind(
data.frame(model="Arm.sim",
x=(1:180)-.15,
lwr=myCalc.lwr,
fit=myCalc.fit,
upr=myCalc.upr),
data.frame(model="predictInterval",
x=(1:180)+.15,
lwr=checkPI.lwr,
fit=checkPI.fit,
upr=checkPI.upr))
ggplot(aes(x=x, y=fit, ymin=lwr, ymax=upr, color=model, position="dodge"), data=plot.data) +
geom_point() +
geom_linerange()
library(dplyr)
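# Draw sigma directly: sigma^2 follows an inverse-gamma with shape df/2 and
# rate pwrss/2, which should match how arm::sim() simulates the residual scale.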
calc.sigma <- sqrt(1/rgamma(1000, 0.5*lme4:::df.residual.merMod(model), 0.5*getME(model, "devcomp")$cmp[["pwrss"]]))
sim.sigma <- test.sim@sigma
data.frame(
model=c(rep("ARM",1000),rep("Carl",1000)),
sigma=c(sim.sigma, calc.sigma)
) %>%
qplot(sigma, color=model, data=., geom="density")
# ---- merTools/tests/testthat-a_p.R ----
library(testthat)
library(merTools)
test_check("merTools", filter = "^[a-m]")
# ---- merTools/tests/timings/predictSpeed.R ----
library(microbenchmark)
library(lme4)
library(merTools)
# ClassFilter <- function(x) inherits(get(x), 'lm' ) & !inherits(get(x), 'gl
set.seed(101)
# Small
lmerSlope1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
bench1 <- microbenchmark(
predict(lmerSlope1, newdata = sleepstudy[1:100,]),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 100, stat = 'median',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 100, stat = 'mean',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 100, stat = 'median',
include.resid.var = FALSE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 1000, stat = 'median',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.8, n.sims = 1000, stat = 'median',
include.resid.var = TRUE),
times = 10, unit = "s"
)
bench2 <- microbenchmark(
predict(lmerSlope1, newdata = sleepstudy[1:100,]),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 100, stat = 'median',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 200, stat = 'mean',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 400, stat = 'median',
include.resid.var = FALSE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 800, stat = 'median',
include.resid.var = TRUE),
predictInterval(lmerSlope1, newdata = sleepstudy[1:100,],
level = 0.9, n.sims = 1600, stat = 'median',
include.resid.var = TRUE),
times = 10, unit = "s"
)
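# A quick way to inspect the results: summary() reports timing quantiles per
# expression in the unit chosen above, and boxplot() has a microbenchmark
# method that draws the timing distributions.
summary(bench1)
summary(bench2)
boxplot(bench2)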
# Medium
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
g1 <- lmer(y~fac1+(1|grp), data=d)
bench3 <- microbenchmark(predictInterval(g1, newdata = d[1:100, ], level = 0.9,
n.sims = 50,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g1, newdata = d[1:200, ], level = 0.9,
n.sims = 50,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g1, newdata = d[1:400, ], level = 0.9,
n.sims = 50,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g1, newdata = d[1:800, ], level = 0.9,
n.sims = 50,
stat = 'mean', include.resid.var = TRUE),
times = 10, unit = "s")
# Large
g2 <- lmer(y ~ lectage + studage + (1+lectage|d) + (1|dept), data=InstEval)
d2 <- InstEval[1:1000, ]
bench4 <- microbenchmark(predictInterval(g2, newdata = d2[1:100, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:200, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:400, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:800, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
times = 10, unit = "s")
d2 <- d2[order(d2$d, d2$dept),]
bench5 <- microbenchmark(predictInterval(g2, newdata = d2[1:100, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:200, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:400, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
predictInterval(g2, newdata = d2[1:800, ], level = 0.9,
n.sims = 500,
stat = 'mean', include.resid.var = TRUE),
times = 10, unit = "s")
g3 <- lmer(y ~ lectage + studage + (1|s) + (1+lectage|d) + (1|dept), data=InstEval)
library(profvis)
p2 <- profvis({
predictInterval(g2, level = 0.9, newdata = InstEval[1:100,],
n.sims = 7500,
stat = 'mean', include.resid.var = TRUE)
})
# View it with:
p2
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl, 4)
zzz <- predictInterval(g3, level = 0.9, newdata = InstEval,
n.sims = 7500,
stat = 'mean', include.resid.var = TRUE, .parallel = TRUE)
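# Shut down the parallel workers now that the parallel run is finished
stopCluster(cl)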
# set.seed(101)
# d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
# obs=1:50)
# d$y <- simulate(~fac1+(1|grp),family = binomial,
# newdata=d,
# newparams=list(beta=c(2,-1,3,-2,1.2), theta=c(.33)))[[1]]
# subD <- d[sample(row.names(d), 1200),]
# g1 <- glmer(y~fac1+(1|grp), data=subD, family = 'binomial')
# d$fitted <- predict(g1, d)
#
#
# outs <- predictInterval(g1, newdata = d, level = 0.95, n.sims = 500,
# stat = 'mean', include.resid.var = FALSE,
# type = 'linear.prediction')
#
#
# g2 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
# d1 <- InstEval[1:100, ]
#
# ---- merTools/tests/timings/Compare_bootMer_KF.R ----
# #This file compares the prediction interval from the "correct" bootMer method
# #to our quick and dirty method to see how they differ.
#
# #I renamed some things to ease my typing
#
# #Prep R####
# #rm(list=ls())
# library(lme4)
# library(arm)
# library(mvtnorm)
# library(dplyr)
# library(tidyr)
# library(ggplot2)
# library(knitr)
# library(RPushbullet)
#
# set.seed(51315)
# data(InstEval)
# data(sleepstudy)
# data(VerbAgg)
#
# #Canonical Models for testing purposes
# ##1) Sleepstudy eg - lmer with random slope and random intercept
# m1.form <- Reaction ~ Days + (Days | Subject)
# m1.df <- sleepstudy
# m1.new.df <- m1.df
# ##2) Verbal Aggression eg - lmer (could to glmer(logit)) with 2 levels
# m2.form <- r2 ~ (Anger + Gender + btype + situ)^2 + (1|id) + (1|item)
# m2.df <- VerbAgg
# m2.new.df <- m2.df
# ##3) Verbal Aggression eg - glmer(logit) with 2 levels
# m3.form <- r2 ~ (Anger + Gender + btype + situ)^2 + (1|id) + (1|item)
# m3.df <- VerbAgg
# m3.new.df <- m3.df
# ##4) Instructer Evaluation et - lmer cross-classified, with two different random slopes
# m4.form <- y ~ studage + lectage + service + dept + (studage|s) + (lectage|d)
# m4.df <- InstEval
# m4.new.df <- m4.df
#
# #Step 1: Our method ####
# # predictInterval <- function(model, newdata, level = 0.95, nsim=1000, stat="median", predict.type="link"){
# # #Depends
# # require(mvtnorm)
# # require(lme4)
# # #Prep Output
# # outs <- newdata
# #
# # #Sort out all the levels
# # reTerms <- names(ngrps(model))
# # n.reTerms = length(reTerms)
# #
# # ##The following 3 lines produce a matrix of linear predictors created from the fixed coefs
# # betaSim <- rmvnorm(nsim, mean = fixef(model), sigma = as.matrix(vcov(model)))
# # newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
# # fixed.xb <- newdata.modelMatrix %*% t(betaSim)
# #
# # ##Draw from random effects distributions for each level and merge onto newdata
# # reSim <- NULL
# # for (j in seq_along(reTerms)) {
# # group=reTerms[j]
# # reMeans <- array(ranef(model)[[group]])
# # reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
# # reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans), ncol=nsim))
# # colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
# # for (k in 1:nrow(reMeans)) {
# # lvl = rownames(reMeans)[k]
# # reSim[[group]][k,2:ncol(reSim[[group]])] <- rmvnorm(nsim, mean=as.matrix(reMeans[k,]), sigma=as.matrix(reMatrix[,,k]))
# # }
# # cnames <- colnames(reSim[[group]])
# # reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
# # reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
# # }
# #
# # #Calculate yhat as sum of components
# # yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)
# #
# # #Output prediction intervals
# # if (stat=="median") {
# # outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
# # }
# # if (stat=="mean") {
# # outs$fit <- apply(yhat,1,mean)
# # }
# # outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
# # outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
# # if (predict.type=="response") {
# # outs$fit <- model@resp$family$linkinv(outs$fit)
# # outs$upr <- model@resp$family$linkinv(outs$upr)
# # outs$lwr <- model@resp$family$linkinv(outs$lwr)
# # }
# # #Close it out
# # return(outs)
# # }
#
# #Step 2: Unit Test Function####
# predictInterval.test <- function(model.form, model.df, model.type="lmer",
# predict.df, predict.type="link", stat="median",
# idvar=NULL, nSims=1000, ...) {
# require(lme4); require(dplyr); require(tidyr); require(ggplot2);
# ##Estimate model
# if (model.type=="lmer") {
# modelEstimation.time <- system.time(
# m1 <- lmer(model.form, data=model.df, ...)
# )
# }
# if (model.type=="glmer") {
# modelEstimation.time <- system.time(
# m1 <- glmer(model.form, data=model.df, ...)
# )
# }
# if (model.type=="blmer") {
# modelEstimation.time <- system.time(
# m1 <- blmer(model.form, data=model.df, ...)
# )
# }
# if (model.type=="bglmer") {
# modelEstimation.time <- system.time(
# m1 <- bglmer(model.form, data=model.df, ...)
# )
# }
# ##If it does not have one, add unique identifier to predict.df
# if (is.null(idvar)) {
# predict.df$.newID <- paste("newID", rownames(predict.df), sep="")
# }
# ##Functions for bootMer() and objects
# ####Return predicted values from bootstrap
# mySumm <- function(.) {
# predict(., newdata=predict.df, re.form=NULL, type=predict.type)
# }
# ####Collapse bootstrap into median, 95% PI
# sumBoot <- function(merBoot, stat) {
# if (stat=="median") {
# fit = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE)))
# }
# if (stat=="mean") {
# fit = apply(merBoot$t, 2, function(x) mean(x, na.rm=TRUE))
# }
# return(
# data.frame(merBoot$data,
# fit = fit,
# lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
# upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE))),
# colNAs = apply(merBoot$t, 2, function(x) sum(is.na(x)))
# )
# )
# }
# ##Bootstrap
# ####Method 1: parametric, re-estimate BLUPS
# cat("\n Bootstrap Method 1")
# boot1.time <- system.time(
# boot1 <- bootMer(m1, mySumm, nsim=nSims,
# use.u=FALSE, type="parametric",
# .progress = "txt", PBargs=list(style=3))
# )
# ####Method 2: parametric, conditional on estimated BLUPS
# cat("\n Bootstrap Method 2")
# boot2.time <- system.time(
# boot2 <- bootMer(m1, mySumm, nsim=nSims,
# use.u=TRUE, type="parametric",
# .progress = "txt", PBargs=list(style=3))
# )
# ####Method 3: semiparametric (draw from resid), conditional on estimated BLUPS
# cat("\n Bootstrap Method 3")
# boot3.time <- system.time(
# boot3 <- bootMer(m1, mySumm, nsim=nSims,
# use.u=TRUE, type="semiparametric",
# .progress = "txt", PBargs=list(style=3))
# )
# ##Our Method
# kf.time <- system.time(
# kf.method <- predictInterval(m1, predict.df)
# )
# ##Compare Times
# compare.time <- rbind(modelEstimation.time, kf.time, boot1.time, boot2.time, boot3.time)
#
# ##Summarize and compare results
# boot1.sum <- sumBoot(boot1, stat=stat)
# boot2.sum <- sumBoot(boot2, stat=stat)
# boot3.sum <- sumBoot(boot3, stat=stat)
#
# eval <- merge(predict.df, boot1.sum) %>%
# rename(boot1.fit=fit, boot1.lwr=lwr, boot1.upr=upr)
# eval <- merge(eval, boot2.sum) %>%
# rename(boot2.fit=fit, boot2.lwr=lwr, boot2.upr=upr)
# eval <- merge(eval, boot3.sum) %>%
# rename(boot3.fit=fit, boot3.lwr=lwr, boot3.upr=upr)
# eval <- merge(eval, kf.method) %>%
# rename(KF.fit=fit, KF.lwr=lwr, KF.upr=upr)
#
# #Check if nrow(eval) still equals nrow(predict.df) because it should
# if (nrow(eval)!=nrow(predict.df)) {
# stop("Something happened when merging bootstrap summaries together ...")
# }
#
# ##Add lmer yhats (predict.merMod) on there
# eval$with.u <- predict(m1, newdata=predict.df, re.form=NULL, type=predict.type)
# eval$no.u <- predict(m1, newdata=predict.df, re.form=NA, type=predict.type)
#
# ##Create summary statistics
# piCoveragePct <- function(ref.upr, ref.lwr, new.upr, new.lwr) {
# pct <- ifelse(ref.upr < new.lwr | ref.lwr > new.upr, 0,
# ifelse(ref.upr < new.upr & ref.lwr > new.lwr, 1,
# ifelse(ref.upr > new.upr & ref.lwr < new.lwr, (new.upr-new.lwr)/(ref.upr-ref.lwr),
# ifelse(ref.upr < new.upr, (ref.upr-new.lwr)/(ref.upr-ref.lwr),
# ifelse(ref.lwr > new.lwr, (new.upr-ref.lwr)/(ref.upr-ref.lwr), NA)))))
# return(pct)
# }
# ###Pct of other PI covered by KF PI
# eval$boot1.coverage <- piCoveragePct(eval$boot1.upr, eval$boot1.lwr, eval$KF.upr, eval$KF.lwr)
# eval$boot2.coverage <- piCoveragePct(eval$boot2.upr, eval$boot2.lwr, eval$KF.upr, eval$KF.lwr)
# eval$boot3.coverage <- piCoveragePct(eval$boot3.upr, eval$boot3.lwr, eval$KF.upr, eval$KF.lwr)
# ###Does KF PI contain point estimates
# eval$contain.boot1 <- piCoveragePct(eval$boot1.fit, eval$boot1.fit, eval$KF.upr, eval$KF.lwr)
# eval$contain.boot2 <- piCoveragePct(eval$boot2.fit, eval$boot2.fit, eval$KF.upr, eval$KF.lwr)
# eval$contain.boot3 <- piCoveragePct(eval$boot3.fit, eval$boot3.fit, eval$KF.upr, eval$KF.lwr)
# eval$contain.with.u <- piCoveragePct(eval$with.u , eval$with.u , eval$KF.upr, eval$KF.lwr)
# eval$contain.no.u <- piCoveragePct(eval$no.u , eval$no.u , eval$KF.upr, eval$KF.lwr)
# ###Point estimate "bias"
# eval$distance.boot1 <- eval$KF.fit - eval$boot1.fit
# eval$distance.boot2 <- eval$KF.fit - eval$boot2.fit
# eval$distance.boot3 <- eval$KF.fit - eval$boot3.fit
# eval$distance.with.u <- eval$KF.fit - eval$with.u
# eval$distance.no.u <- eval$KF.fit - eval$no.u
#
# ##Close it out
# return(
# list(
# compareTimes=compare.time,
# bootstraps = list(boot1, boot2, boot3),
# kf.method = kf.method,
# model=m1,
# evalData = eval
# )
# )
# }
#
# #Step 2b: Post processing predictInterval.test()
# PIE.graphics <- function(eval.df, response, grouping.factors, seed=314) {
# require(dplyr); require(tidyr); require(ggplot2); require(grid); require(gridExtra);
# ####Data prep
# set.seed(seed)
# eval$random <- runif(nrow(eval))
# Eval <- eval %>%
# mutate(
# random = row_number(random)
# ) %>%
# gather(var, value, starts_with("boot"), starts_with("KF")) %>%
# mutate(
# simMethod = sub("[[:punct:]][0-9A-Za-z]*", "", var),
# stat = sub("[0-9A-Za-z]*[[:punct:]]", "", var)
# ) %>%
# select(-var) %>%
# spread(stat, value) %>%
# group_by(.newID) %>%
# arrange(simMethod) %>%
# mutate(
# x=row_number(simMethod),
# x=(x-mean(x))/10
# ) %>%
# group_by(simMethod) %>%
# mutate(
# x=row_number(random)+x
# )
#
# ####Direct Comparison Plot
# Eval.small <- arrange(Eval, random) %>% filter(random<=30) %>% arrange(x)
# p1 <- ggplot(aes(y=fit,x=x, color=simMethod), data=Eval.small) +
# geom_point(size=I(3)) +
# geom_linerange(aes(ymax=upr, ymin=lwr), size=I(1)) +
# geom_point(shape="with.u", color="black", size=I(4),
# data=summarize(group_by(Eval.small, .newID), x=mean(x), fit=mean(with.u))) +
# geom_point(shape="no.u", color="black", size=I(4),
# data=summarize(group_by(Eval.small, .newID), x=mean(x), fit=mean(no.u))) +
# theme_bw() +
# labs(x="Index", y="Prediction Interval", title="95% Prediction interval by method for 30 random obs") +
# scale_x_discrete(breaks=1:30, labels=Eval.small$.newID[Eval.small$simMethod=="KF"]) +
# theme(axis.text.x = element_text(angle=90))
#
# #####Distribution of fitted values
# mean.statment <- paste("mean(",response,")", sep="")
# p2 <- ggplot(aes(x=fit), data=Eval) +
# geom_density(aes(color=simMethod)) +
# # geom_vline(xintercept = mean(unlist(Eval[,response]))) +
# geom_density(data=summarize_(
# group_by_(Eval, grouping.factors),
# fit = mean.statment), color="black") +
# theme_bw() +
# labs(x = "Estimate", y="Density",
# title="Average point estimates by prediction type \n (vertical black line is sample grand mean, \n black density is distribution of sample group means)")
#
# #####Bar Graph of KF PI containing other point estimates
# p3 <- eval %>% select(starts_with("contain.")) %>%
# gather(simMethod, value) %>%
# mutate(
# simMethod = sub("[[:alpha:]]*.", "", simMethod)
# ) %>%
# group_by(simMethod) %>%
# summarize(
# value=100*mean(value)
# ) %>%
# qplot(value, x=simMethod, fill=simMethod, data=., geom="bar", position="dodge", stat="identity") +
# labs(y="Percent", title="Percent of observations in which our P.I. contains other point estimates") +
# theme_bw()
#
# ##Summarizing Bias
# SD <- sd(eval[,response])
# bias.data <- eval %>% select(starts_with("distance.")) %>%
# gather(simMethod, value) %>%
# mutate(
# simMethod = sub("[[:alpha:]]*.", "", simMethod),
# value = value/SD,
# ymax = max(density(value)$y),
# xmin = min(value)
# ) %>%
# group_by(simMethod) %>%
# summarize(
# mean.Distance = round(mean(value),4),
# mad.Distance = round(mad(value),2),
# ymax = max(ymax),
# xmin = min(xmin)
# )
# bias.xmin <- min(bias.data$xmin, na.rm=TRUE)
# bias.xmax <- 0.2 * bias.xmin
# bias.ymax <- max(bias.data$ymax, na.rm=TRUE)
# bias.ymin <- .5 * bias.ymax
# bias.data <- bias.data[,1:3]
#
# p4a <- eval %>% select(starts_with("distance.")) %>%
# gather(simMethod, value) %>%
# mutate(
# simMethod = sub("[[:alpha:]]*.", "", simMethod),
# value = value/SD
# ) %>%
# qplot(x=value, color=simMethod, data=., geom="density") +
# labs(x="Standard deviations of response variable") +
# theme_bw()
#
# p4b <- tableGrob(bias.data, show.rownames = FALSE)
# p4 <- arrangeGrob(p4a, p4b, ncol=1, main="Distribution of distance from KF point estimates to other point estimates")
#
#
# ##Summarizing Bias
# coverage.data <- eval %>%
# select(ends_with(".coverage")) %>%
# gather(simMethod, value) %>%
# mutate(
# simMethod = gsub(".coverage","", simMethod, fixed=TRUE),
# TotalCoverage=value==1,
# Cov.90to100 = value>0.9 & value < 1,
# Cov.80to90 = value>0.8 & value <= 0.9,
# Cov.50to80 = value>0.5 & value <= 0.8,
# Cov.0to50 = value>0 & value <= 0.5,
# ZeroCoverage= value==0
# ) %>%
# group_by(simMethod) %>%
# summarize(
# TotalCoverage= round(100*sum(TotalCoverage)/n(),1),
# Cover.90to100 = round(100*sum(Cov.90to100)/n(),1),
# Cover.80to90 = round(100*sum(Cov.80to90)/n(),1),
# Cover.50to80 = round(100*sum(Cov.50to80)/n(),1),
# Cover.0to50 = round(100*sum(Cov.0to50)/n(),1),
# ZeroCoverage = round(100*sum(ZeroCoverage)/n(),1)
# )
# p5a <- eval %>%
# select(ends_with(".coverage")) %>%
# gather(simMethod, value) %>%
# mutate(
# simMethod = gsub(".coverage","", simMethod, fixed=TRUE)
# ) %>%
# ggplot(data=., aes(x=100*value, fill=simMethod)) +
# geom_bar(aes(y=300*(..count..)/sum(..count..)), binwidth=5) +
# facet_wrap(~simMethod, ncol=3) +
# labs(y="Percent of Observations", x="Coverage Percentage") +
# theme_bw()
# p5b <- tableGrob(coverage.data, show.rownames = FALSE)
# p5 <- arrangeGrob(p5a, p5b, ncol=1, main="Distribution of PI coverage percentages")
#
# ##Wrap-up
# return(
# list(
# CompareRandomObs=p1,
# FitDistributions=p2,
# PointEstimateCoverage=p3,
# BiasSummary=p4,
# CoverageSummary=p5
# )
# )
# }
# # PIE.graphics(cannonical.1$evalData, response="Reaction", grouping.factors = "Subject")
#
#
# #Step 3: Summarize and Compare Results####
# #debug(predictInterval.test)
# cannonical.1 <- predictInterval.test(model.form = m1.form, model.df = m1.df, predict.df = m1.new.df, nSims=2500)
# pbPost("note", title="Finished Cannonical Model 1", body=paste(kable(cannonical.1$compareTimes[,1:3]), collapse ="\n"))
# cannonical.2 <- predictInterval.test(model.form = m2.form, model.df = m2.df, predict.df = m2.new.df, nSims=2500)
# pbPost("note", title="Finished Cannonical Model 2", body=paste(kable(cannonical.2$compareTimes[,1:3]), collapse ="\n"))
# cannonical.4 <- predictInterval.test(model.form = m4.form, model.df = m4.df, predict.df = m4.new.df, nSims=2500)
# pbPost("note", title="Finished Cannonical Model 4", body=paste(kable(cannonical.4$compareTimes[,1:3]), collapse ="\n"))
# cannonical.3 <- predictInterval.test(model.form = m3.form, model.df = m3.df, model.type="glmer",
# predict.df=m3.new.df, predict.type="response", nSims=2500)
# pbPost("note", title="Finished Cannonical Model 3", body=paste(kable(cannonical.3$compareTimes[,1:3]), collapse ="\n"))
#
#
# # save.image()
#
# #Step 4: checking math with a fine-toothed comb ####
# ##Pull pieces to create toy example
# KF <- cannonical.1$kf.method
# model <- cannonical.1$model
# nsim=5
# ---- merTools/tests/timings/test_fastdisp.R ----
# Test fastdisp
set.seed(51315)
library(testthat); library(lme4); library(arm); library(merTools)
# Sleepstudy
lmerSlope1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
###############################################
context("Fast display")
################################################
test_that("fastdisp pulls out a list", {
# hack to avoid console output
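# (sink("NUL") targets the Windows null device; on Unix-alikes the
# equivalent would be sink("/dev/null"))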
{sink("NUL"); zz <- fastdisp(lmerSlope1); sink()}
expect_is(zz, "list")
expect_identical(names(zz), c("call", "t.value", "coef", "se",
"ngrps", "AIC", "n"))
})
test_that("fastdisp speed is good", {
{sink("NUL"); t1 <- system.time(force(fastdisp(lmerSlope1)))["elapsed"];
sink()}
{sink("NUL"); t2 <- system.time(force(display(lmerSlope1)))["elapsed"];
sink()}
expect_lt(t1, t2)
expect_lt(t1, 0.1)
})
# ---- merTools/tests/testthat.R ----
library(testthat)
library(merTools)
test_check("merTools")
# ---- merTools/tests/testthat/test-substEff.R ----
# Test substantive effects
library(lme4)
set.seed(157)
# Test all user parameters for REimpact----
context("Test all user parameters for REimpact")
test_that("REimpact parameters are respected", {
skip_on_cran()
g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
#Warning is about %dopar% call in predictInterval
zed <- suppressWarnings(REimpact(g1, newdata = InstEval[9:12, ], groupFctr = "d", n.sims = 50,
include.resid.var = TRUE))
expect_identical(names(zed), c("case", "bin", "AvgFit", "AvgFitSE", "nobs"))
zed2 <- REimpact(g1, newdata = InstEval[9:12, ], groupFctr = "s", n.sims = 50,
include.resid.var = TRUE)
expect_equal(nrow(zed), 3 * nrow(InstEval[9:12, ]))
expect_false(all(zed$AvgFit == zed2$AvgFit))
expect_false(all(zed$AvgFitSE == zed2$AvgFitSE))
expect_identical(names(zed2), c("case", "bin", "AvgFit", "AvgFitSE", "nobs"))
zed <- REimpact(g1, newdata = InstEval[9:12, ], groupFctr = "d", breaks = 5,
n.sims = 50, include.resid.var = TRUE)
expect_equal(nrow(zed), 5 * nrow(InstEval[9:12, ]))
})
test_that("REimpact respects passed values for predictInterval", {
skip_on_cran()
skip_on_travis()
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:30),
obs=1:100)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
subD <- d[sample(row.names(d), 1000),]
g1 <- lmer(y ~ fac1 + (1|grp), data=subD)
zed <- REimpact(g1, newdata = subD[23:25, ], groupFctr = "grp", breaks = 5,
include.resid.var = FALSE, n.sims = 100, level = 0.8)
zed2 <- REimpact(g1, newdata = subD[23:25, ], groupFctr = "grp", breaks = 5,
n.sims = 500, include.resid.var = TRUE, level = 0.99)
# expect_true(all(zed2$AvgFitSE > zed$AvgFitSE))
expect_true(!all(zed2$AvgFit > zed$AvgFit))
expect_identical(names(zed), c("case", "bin", "AvgFit", "AvgFitSE", "nobs"))
expect_identical(names(zed2), c("case", "bin", "AvgFit", "AvgFitSE", "nobs"))
})
# Test for slopes, intercepts, and combinations----
context("Test for slopes, intercepts, and combinations")
test_that("Multiple terms can be accessed", {
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + HEIGHT + (1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
# This is the same issue of zero mean zero variance in the predict interval call
zed1 <- suppressWarnings(REimpact(glmer3LevSlope, newdata = grouseticks[5, ], groupFctr = "BROOD",
term = "HEIGHT", n.sims = 500,
include.resid.var = FALSE, breaks = 4, type = "probability"))
# This is the same issue of zero mean zero variance in the predict interval call
zed2 <- suppressWarnings(REimpact(glmer3LevSlope, newdata = grouseticks[5, ], groupFctr = "BROOD",
term = "Intercept",
n.sims = 500,
include.resid.var = FALSE, breaks = 4, type = "probability"))
# This is the same issue of zero mean zero variance in the predict interval call
zed4 <- suppressWarnings(REimpact(glmer3LevSlope, newdata = grouseticks[5, ], groupFctr = "LOCATION",
n.sims = 500,
include.resid.var = FALSE, breaks = 4))
expect_true(all(zed4$AvgFit < zed2$AvgFit))
expect_true(all(zed4$AvgFit < zed1$AvgFit))
expect_false(identical(zed1, zed2))
expect_false(identical(zed1, zed2))
# No longer an error after revision 0.2.3
# expect_error(zed3 <- suppressWarnings(REimpact(glmer3LevSlope, newdata = grouseticks[5, ], groupFctr = "BROOD",
# n.sims = 500,
# include.resid.var = FALSE, breaks = 4)), "Must specify which")
# Don't think we need this ... it throws an subsetting error
expect_error(zed5 <- suppressWarnings(REimpact(glmer3LevSlope, newdata = grouseticks[5, ], groupFctr = "LOCATION",
term = "HEIGHT",
n.sims = 500,
include.resid.var = FALSE, breaks = 4)), "undefined columns selected")
})
# Custom breaks----
context("Custom breaks")
test_that("Custom breakpoints can be set", {
skip_on_cran()
g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
zed <- REimpact(g1, newdata = InstEval[9, ], breaks = c(0, 10, 50, 90, 100),
groupFctr = "d", n.sims = 50,
include.resid.var = TRUE)
zed2 <- REimpact(g1, newdata = InstEval[9, ], breaks = c(1, 20, 40, 60, 80, 100),
groupFctr = "d", n.sims = 50,
include.resid.var = TRUE)
zed3 <- REimpact(g1, newdata = InstEval[9, ], breaks = 5,
groupFctr = "d", n.sims = 50,
include.resid.var = TRUE)
expect_false(nrow(zed) == nrow(zed2))
expect_gt(sd(zed$nobs), sd(zed2$nobs))
expect_gt(mean(zed$nobs), mean(zed2$nobs))
expect_equal(zed3$nobs, zed2$nobs, tolerance = .05)
})
# ---- merTools/tests/testthat/test-REmargins.R ----
# Test REmargins
library(lme4)
set.seed(51315)
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#
context("Test random effect marginalization works")
#
mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
#
test_that("Text marginalized effects object has the correct dimensions", {
skip_on_travis()
skip_on_cran()
# mfx has a row for each unique combo of row in newdata, breaks, grouping_var, and term
expect_equal(nrow(mfx), 10 * length(unique(mfx$breaks)) * length(unique(mfx$grouping_var)) *
length(unique(mfx$term)))
expect_equal(ncol(mfx), 17)
})
# ggplot(out_w) + aes(x = obs, y = fit_Subject) +
# geom_line() +
# facet_wrap(~case)
# ---- merTools/tests/testthat/test-expectedRank.R ----
# Test expected rank
#Using 2 of sample models from test_merExtract.R
set.seed(51315)
library(lme4)
# Sleepstudy
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
m2 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
# Wackier example
data(Orthodont,package = "nlme")
Orthodont$nsex <- as.numeric(Orthodont$Sex == "Male")
Orthodont$nsexage <- with(Orthodont, nsex*age)
m3 <- lmer(distance ~ age + (0 + age + nsex|Subject), data=Orthodont)
# two grouping factors
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
form <- TICKS ~ YEAR + HEIGHT +(HEIGHT|BROOD) + (1|INDEX)
m4 <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="Nelder_Mead",
optCtrl=list(maxfun = 1e5)))
form <- TICKS ~ YEAR + HEIGHT + (0 + HEIGHT|BROOD) + (1|INDEX)
m5 <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="Nelder_Mead",
optCtrl=list(maxfun = 1e5)))
#Custom Expectation Functions
expect_correct_dim <- function(merMod, groupFctr=NULL, term=NULL) {
if (is.null(groupFctr)) {
n.levels <- nrow(ranef(merMod)[[1]])
} else {
n.levels <- nrow(ranef(merMod)[[groupFctr]])
}
ER <- expectedRank(merMod, groupFctr, term)
testthat::expect_true(nrow(ER) == n.levels &
ncol(ER) == 7 &
all(colnames(ER)[6:7] == c("ER", "pctER")) &
class(ER) == "data.frame")
}
###############################################
# Testing expected rank----
context("Testing expected rank")
###############################################
test_that("expectedRank parameters work and dont work as intended", {
expect_correct_dim(m1)
expect_correct_dim(m1, groupFctr="Subject")
expect_correct_dim(m1, term="(Intercept)")
expect_correct_dim(m1, groupFctr="Subject", term="(Intercept)")
expect_correct_dim(m2, term="(Intercept)")
expect_correct_dim(m2, term="Days")
expect_correct_dim(m3, groupFctr="Subject", term="age")
expect_correct_dim(m3, groupFctr="Subject", term="nsex")
expect_correct_dim(m4, groupFctr="BROOD", term="(Intercept)")
expect_correct_dim(m4, groupFctr="INDEX", term="(Intercept)")
expect_correct_dim(m5, groupFctr="BROOD")
expect_correct_dim(m5, groupFctr="INDEX")
# expect_error(expectedRank(m4), "Must specify which grouping factor when there are more than one")
# expect_error(expectedRank(m4, groupFctr="BROOD"), "Must specify which random coefficient when there are more than one per selected grouping factor")
# expect_error(expectedRank(m3, groupFctr="Subject"), "Must specify which random coefficient when there are more than one per selected grouping factor")
# expect_error(expectedRank(m3, term="int"), "undefined columns selected")
})
test_that("Ranks have the correct range", {
numGrps <- nrow(ranef(m1)[[1]])
expect_true(max(expectedRank(m1)$ER) <= numGrps)
expect_true(min(expectedRank(m1)$ER) >= 1)
expect_equal(cor(expectedRank(m1)$ER, rank(ranef(m1)[[1]])), 0.99, tolerance = .01)
})
test_that("Percentile ranks have the correct range", {
expect_true(max(expectedRank(m1)$pctER) <= 100)
expect_true(min(expectedRank(m1)$pctER) >= 0)
})
# ---- merTools/tests/testthat/test-merModList.R ----
# Test merModList functions
#Do merModList objects get built and work----
context("Do merModList objects get built and work")
old_warn <- getOption("warn")
options(warn = -1)
set.seed(432422)
test_that("simple cases work", {
# skip_on_cran()
library(blme)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
split <- sample(x = LETTERS[9:15], size = nrow(d), replace=TRUE)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
out <- split(d, split)
rm(split)
# TODO change tolerances
g1 <- lmerModList(formula = y~fac1+(1|grp), data=out,
control= lmerControl(check.conv.grad = .makeCC("warning", tol= 2e-3)))
expect_is(g1, "merModList")
g2 <- blmerModList(formula = y~fac1+(1|grp), data=out,
control= lmerControl(check.conv.grad = .makeCC("warning", tol= 2e-3)))
expect_is(g2, "merModList")
expect_false(class(g1[[1]]) == class(g2[[1]]))
split <- sample(x = LETTERS[1:20], size = nrow(InstEval), replace=TRUE)
out <- split(InstEval, split)
rm(split)
g1 <- lmerModList(formula = y ~ lectage + studage + (1|d) + (1|dept),
data=out,
control= lmerControl(check.conv.grad = .makeCC("warning", tol = 1e-2)))
expect_is(g1, "merModList")
})
test_that("print methods work for merModList", {
# skip_on_cran()
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
split <- sample(x = LETTERS[9:15], size = nrow(d), replace=TRUE)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
out <- split(d, split)
rm(split);
g1 <- lmerModList(formula = y~fac1+(1|grp), data=out,
control= lmerControl(check.conv.grad = .makeCC("warning", tol= 1e-2)));
{sink("NUL"); zz <- print(g1);
sink()}
expect_is(zz, "list")
zz <- summary(g1)
expect_is(zz, "summary.merModList")
})
# Numerical accuracy of merModList print method----
context("Numerical accuracy of merModList print method")
test_that("print method for merModList works in general case", {
# skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
form <- TICKS_BIN ~ HEIGHT +(1 + HEIGHT|BROOD) + (1|YEAR)
modDat <- vector(5, mode="list")
for(i in 1:length(modDat)){
modDat[[i]] <- grouseticks[sample(x=1:nrow(grouseticks), size = nrow(grouseticks),
replace=FALSE),]
}
g1 <- glmerModList(formula = form,
data = modDat, family = "binomial",
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e6),
check.conv.grad = .makeCC("warning", tol= 1e-2)))
g1T <- glmer(form, family = "binomial", data = grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e6),
check.conv.grad = .makeCC("warning", tol= 1e-2)))
expect_equal(VarCorr(g1)$stddev$BROOD, attr(VarCorr(g1T)$BROOD, "stddev"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$stddev$YEAR, attr(VarCorr(g1T)$YEAR, "stddev"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$correlation$BROOD, attr(VarCorr(g1T)$BROOD, "corre"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$correlation$YEAR, attr(VarCorr(g1T)$YEAR, "corre"),
tolerance = 0.0001)
form <- TICKS_BIN ~ HEIGHT +(1|BROOD)
g1 <- glmerModList(formula = form,
data = modDat, family = "binomial",
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e6),
check.conv.grad = .makeCC("warning", tol= 1e-2)))
g1T <- glmer(form, family = "binomial", data = grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e6),
check.conv.grad = .makeCC("warning", tol= 1e-2)))
expect_equal(VarCorr(g1)$stddev$BROOD, attr(VarCorr(g1T)$BROOD, "stddev"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$stddev$YEAR, attr(VarCorr(g1T)$YEAR, "stddev"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$correlation$BROOD, attr(VarCorr(g1T)$BROOD, "corre"),
tolerance = 0.0001)
expect_equal(VarCorr(g1)$correlation$YEAR, attr(VarCorr(g1T)$YEAR, "corre"),
tolerance = 0.0001)
})
#ICC function----
context("ICC function")
test_that("ICC function works", {
ICC1 <- ICC(outcome = "Reaction", group = "Subject", data = sleepstudy)
expect_is(ICC1, "numeric")
expect_equal(ICC1, 0.3948896, tol = .001)
})
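# For reference, the value checked above is the standard one-way ICC. A sketch
# of recovering it by hand from the variance components, assuming ICC() fits
# the intercept-only model:
# m <- lmer(Reaction ~ 1 + (1 | Subject), data = sleepstudy)
# vc <- as.data.frame(VarCorr(m))
# vc$vcov[1] / sum(vc$vcov)  # between-subject variance over total variance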
options(warn= old_warn)
# ---- merTools/tests/testthat/test-helpers.R ----
# Test helper functions
set.seed(51315)
# Trimming data frame----
context("Trimming data frame")
test_that("Trimming results in correct size", {
data(InstEval)
trimDat <- merTools:::trimModelFrame(InstEval)
expect_gt(nrow(InstEval), nrow( merTools:::trimModelFrame(InstEval)))
expect_equal(nrow(trimDat), 4065)
cbpp$obs <- 1:nrow(cbpp)
d1 <- cbpp
d1$y <- d1$incidence / d1$size
gm2 <- glmer(y ~ period +
(1 | herd),
family = binomial, data = d1, nAGQ = 9, weights = d1$size)
trimDat <- merTools:::trimModelFrame(gm2@frame)
expect_is(trimDat, "data.frame")
expect_equal(nrow(trimDat), 18)
})
test_that("Trimming does not corrupt order", {
tmp <- InstEval[1:10, ]
trimDat <- merTools:::trimModelFrame(InstEval)
trimDat <- rbind(tmp, trimDat)
expect_lt(nrow(trimDat), nrow(tmp) + nrow(InstEval))
row.names(tmp) <- NULL
row.names(trimDat) <- NULL
expect_identical(tmp, trimDat[1:10, ])
})
# subBoot and Theta----
context("subBoot and Theta")
test_that("Can extract theta from a fit model", {
set.seed(404)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
subD <- d[sample(row.names(d), 1000),]
g1 <- lmer(y~fac1+(1|grp), data=subD)
g1b <- lm(y ~ fac1, data = subD)
expect_equal(thetaExtract(g1), 0.2085, tolerance = .05)
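# (theta is lme4's relative covariance parameter, i.e. the random-intercept SD
# scaled by the residual SD; this should match getME(g1, "theta"))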
expect_error(thetaExtract(g1b))
z1 <- subBoot(g1, 500, FUN = thetaExtract, R = 10)
expect_is(z1, "data.frame")
expect_equal(nrow(z1), 11)
expect_equal(ncol(z1), 2)
})
# Test formula Build-----
context("Test formula build")
test_that("Formula works for additive functions", {
n <- 20
x <- y <- rnorm(n)
z <- rnorm(n)
r <- sample(1:5, size=n, replace=TRUE)
d <- data.frame(x,y,z,r)
d2 <- expand.grid(a=factor(1:4),b=factor(1:4),rep=1:10)
n <- nrow(d2)
d2 <- transform(d2,r=sample(1:5, size=n, replace=TRUE),
z=rnorm(n))
d2 <- subset(d2,!(a=="4" & b=="4"))
fm <- lmer( z ~ a + b + (1|r), data=d2)
expect_is(merTools:::formulaBuild(fm), "formula")
expect_identical(merTools:::formulaBuild(fm), as.formula("z ~ a + b"))
})
test_that("Formula works for interactions", {
n <- 200
x <- y <- rnorm(n)
z <- rnorm(n)
r <- sample(1:5, size=n, replace=TRUE)
d <- data.frame(x,y,z,r)
d2 <- expand.grid(a=factor(1:4),b=factor(1:4), c = factor(1:4), rep=1:10)
n <- nrow(d2)
d2 <- transform(d2,r=sample(1:5, size=n, replace=TRUE),
z=rnorm(n))
d2 <- subset(d2,!(a=="4" & b=="4"))
d2$x <- rnorm(nrow(d2))
fm <- lmer( z ~ a * b + c + (1|r), data=d2)
expect_is(merTools:::formulaBuild(fm), "formula")
expect_identical(merTools:::formulaBuild(fm), as.formula("z ~ a * b + c"))
fm <- lmer( z ~ a * b * c + (1|r), data=d2)
expect_is(merTools:::formulaBuild(fm), "formula")
expect_identical(merTools:::formulaBuild(fm), as.formula("z ~ a * b * c"))
fm <- lmer( z ~ a * b * c + x + I(x^2) + (1 + c|r), data=d2)
expect_is(merTools:::formulaBuild(fm), "formula")
expect_identical(merTools:::formulaBuild(fm), as.formula("z ~ a * b * c + x + I(x^2)"))
})
test_that("Build model matrix produces matrices of the right size", {
d <- expand.grid(fac1 = LETTERS[1:5],
grp = letters[11:20],
obs = 1:50)
d$y <- simulate(~fac1 + (1 | grp), family = binomial,
newdata = d,
newparams = list( beta = c(2,-1,3,-2,1.2),
theta = c(.33)),
seed =634)[[1]]
subD <- d[sample(row.names(d), 1200), ]
g1 <- glmer(y~fac1+(1|grp), data=subD, family = 'binomial')
d$fitted <- predict(g1, d)
mm <- merTools:::buildModelMatrix(g1, newdata = d, which = "full")
expect_is(mm, "matrix")
expect_equal(dim(mm), c(2500, 15))
}
)
# ---- merTools/tests/testthat/test-merExtract.R ----
# -----------------------------------------------------
# Test framework includes tests for multiple intercepts and
# multiple slopes to ensure extraction of random effects
# works in these scenarios
#-------------------------------------------------------
set.seed(51315)
library(lme4)
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
# Build out models
form <- TICKS ~ YEAR + HEIGHT +(1|BROOD) + (1|INDEX) + (1|LOCATION)
glmer3Lev <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="Nelder_Mead",
optCtrl=list(maxfun = 1e5)))
# GLMER 3 level + slope
form <- TICKS ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
# Sleepstudy
lmerSlope1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
# Wackier example
data(Orthodont,package="nlme")
Orthodont$nsex <- as.numeric(Orthodont$Sex=="Male")
Orthodont$nsexage <- with(Orthodont, nsex*age)
lmerSlope2 <- lmer(distance ~ age + (0 + age + nsex|Subject), data=Orthodont)
###############################################
# Extract Random Effects from merMod----
context("Extract Random Effects from merMod")
################################################
test_that("REextract pulls out a data frame", {
expect_is(REextract(lmerSlope2), "data.frame")
expect_is(REextract(glmer3LevSlope), "data.frame")
expect_is(REextract(glmer3Lev), "data.frame")
expect_is(REextract(lmerSlope1), "data.frame")
})
test_that("REextract issues error with non merMod objects", {
expect_error(REextract(lm(Reaction ~ Days, sleepstudy)))
expect_error(REextract(glm(TICKS ~ YEAR + HEIGHT,
family="poisson", data=grouseticks)))
})
test_that("REextract gets correct dimensions", {
expect_equal(ncol(REextract(glmer3Lev)), 4)
expect_equal(ncol(REextract(lmerSlope1)), 6)
expect_equal(ncol(REextract(lmerSlope2)), 6)
expect_equal(ncol(REextract(glmer3LevSlope)), 6)
expect_equal(nrow(REextract(glmer3Lev)), 584)
expect_equal(nrow(REextract(lmerSlope1)), 18)
expect_equal(nrow(REextract(lmerSlope2)), 27)
expect_equal(nrow(REextract(glmer3LevSlope)), 584)
})
# Check names
# Check numerics
###############################################
# Fixed effect estimates from posterior----
context("Fixed effect estimates from posterior")
################################################
test_that("FEsim produces data.frames", {
expect_is(FEsim(lmerSlope1, n.sims=100), "data.frame")
expect_is(FEsim(lmerSlope2, n.sims=100), "data.frame")
expect_is(FEsim(glmer3Lev, n.sims=100), "data.frame")
expect_is(FEsim(glmer3LevSlope, n.sims=100), "data.frame")
})
test_that("n.sims changes simulation results", {
expect_false(identical(FEsim(lmerSlope1, n.sims = 1000),
FEsim(lmerSlope1, n.sims = 10)))
})
# numeric checks
###############################################
# Random effect estimates from posterior----
context("Random effect estimates from posterior")
################################################
test_that("REsim produces data.frames", {
expect_is(REsim(lmerSlope1, n.sims=100), "data.frame")
expect_is(REsim(lmerSlope2, n.sims=100), "data.frame")
expect_is(REsim(glmer3Lev, n.sims=100), "data.frame")
expect_is(REsim(glmer3LevSlope, n.sims=100), "data.frame")
})
###############################################
# RMSE estimates----
context("RMSE estimates")
################################################
test_that("RMSE produces correct variable types", {
expect_is(RMSE.merMod(lmerSlope1), "numeric")
expect_is(RMSE.merMod(lmerSlope2), "numeric")
expect_is(RMSE.merMod(lmerSlope1, scale = TRUE), "numeric")
expect_is(RMSE.merMod(lmerSlope2, scale = TRUE), "numeric")
})
test_that("RMSE respects scale parameter", {
expect_false(identical(RMSE.merMod(lmerSlope1),
RMSE.merMod(lmerSlope1, scale = TRUE)))
expect_false(identical(RMSE.merMod(lmerSlope2),
RMSE.merMod(lmerSlope2, scale = TRUE)))
expect_lt(RMSE.merMod(lmerSlope2, scale = TRUE),
RMSE.merMod(lmerSlope2))
expect_lt(RMSE.merMod(lmerSlope1, scale = TRUE),
RMSE.merMod(lmerSlope1))
})
# ---- merTools/tests/testthat/test-plots.R ----
# Test plotting functions
# Plot functions return gg objects? ----
context("Plot functions return gg objects?")
test_that("Prediction intervals work for simple linear example", {
skip_on_cran()
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)))[[1]]
subD <- d[sample(row.names(d), 1000),]
g1 <- lmer(y~fac1+(1|grp), data=subD)
FE1 <- FEsim(g1)
p1 <- plotFEsim(FE1)
expect_is(p1, "gg")
p1 <- plotREsim(REsim(g1))
expect_is(p1, "gg")
})
# ---- merTools/tests/testthat/test-merData.R ----
# -----------------------------------------------------
#-------------------------------------------------------
set.seed(51315)
library(lme4)
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
# Build out models
form <- TICKS ~ YEAR + HEIGHT +(1|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3Lev <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="Nelder_Mead",
optCtrl=list(maxfun = 1e5)))
# GLMER 3 level + slope
form <- TICKS ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="poisson",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
# GLMER 2 level
# data(VerbAgg)
# fmVA <- glmer(r2 ~ Anger + Gender + btype + situ +
# (1|id) + (1|item), family = binomial, data =
# VerbAgg)
# Sleepstudy
lmerSlope1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
# Wackier example
data(Orthodont,package="nlme")
Orthodont$nsex <- as.numeric(Orthodont$Sex=="Male")
Orthodont$nsexage <- with(Orthodont, nsex*age)
lmerSlope2 <- lmer(distance ~ age + (0 + age + nsex|Subject), data=Orthodont)
###############################################
#Sanitize Names----
context("Sanitize Names")
################################################
test_that("Sanitize names renames variables in data.frame", {
badMod <- lmer(distance ~ factor(Sex) + (0 + age + nsex|Subject),
data=Orthodont, control =
lmerControl(check.conv.grad = .makeCC("warning", tol= 8e-3)))
expect_false(identical(names(badMod@frame),
names(merTools:::sanitizeNames(badMod@frame))))
expect_is(merTools:::sanitizeNames(badMod@frame), "data.frame")
expect_identical(names(merTools:::sanitizeNames(badMod@frame))[2], "Sex")
expect_identical(names(badMod@frame)[2], "factor(Sex)")
})
###############################################
#Strip Attributes----
context("Strip attributes")
################################################
test_that("Attributes can be stripped from data.frame", {
full <- names(attributes(lmerSlope1@frame))
redu <- names(attributes(merTools:::stripAttributes(lmerSlope1@frame)))
redu2 <- names(attributes(merTools:::stripAttributes(glmer3LevSlope@frame)))
expect_true(length(full) > length(redu))
expect_true(all(redu %in% full))
expect_true(all(redu %in% c("names", "row.names", "class")))
expect_true(all(redu2 %in% c("names", "row.names", "class")))
})
###############################################
#Random Observation----
context("Random observation")
################################################
test_that("A random observation can be sampled from a merMod", {
data1 <- draw(glmer3Lev, type = 'random')
data2 <- draw(lmerSlope2, type = 'random')
data3 <- draw(lmerSlope1, type = 'random')
data4 <- draw(glmer3LevSlope, type = 'random')
expect_equal(nrow(data1), 1)
expect_equal(nrow(data2), 1)
expect_equal(nrow(data3), 1)
expect_equal(nrow(data4), 1)
expect_equal(ncol(data1), 6)
expect_equal(ncol(data2), 4)
expect_equal(ncol(data3), 3)
expect_equal(ncol(data4), 6)
expect_identical(names(data1), names(glmer3Lev@frame))
expect_identical(names(data2), names(lmerSlope2@frame))
expect_identical(names(data3), names(lmerSlope1@frame))
expect_identical(names(data4), names(glmer3LevSlope@frame))
expect_false(identical(names(attributes(data1)), names(attributes(glmer3Lev@frame))))
expect_false(identical(names(attributes(data2)), names(attributes(lmerSlope2@frame))))
expect_false(identical(names(attributes(data3)), names(attributes(lmerSlope1@frame))))
expect_false(identical(names(attributes(data4)), names(attributes(glmer3LevSlope@frame))))
expect_false("formula" %in% names(attributes(data1)))
expect_false("formula" %in% names(attributes(data2)))
expect_false("formula" %in% names(attributes(data3)))
expect_false("formula" %in% names(attributes(data4)))
})
test_that("Random observation preserves factor levels", {
data1 <- draw(glmer3Lev, type = 'random')
data2 <- draw(lmerSlope2, type = 'random')
data3 <- draw(lmerSlope1, type = 'random')
data4 <- draw(glmer3LevSlope, type = 'random')
expect_true(length(levels(data1$YEAR)) > length(unique(data1$YEAR)))
expect_true(length(levels(data1$BROOD)) > length(unique(data1$BROOD)))
expect_true(length(levels(data1$LOCATION)) > length(unique(data1$LOCATION)))
expect_true(length(levels(data4$YEAR)) > length(unique(data4$YEAR)))
expect_true(length(levels(data4$BROOD)) > length(unique(data4$BROOD)))
expect_true(length(levels(data4$LOCATION)) > length(unique(data4$LOCATION)))
# test levels are correct levels as well
})
###############################################
#Collapse frame----
context("Collapse frame")
################################################
test_that("Collapsing a dataframe results in single row", {
data1 <- merTools:::collapseFrame(Orthodont)
data2 <- merTools:::collapseFrame(grouseticks)
expect_equal(length(data1), length(Orthodont))
expect_equal(length(data2), length(grouseticks))
expect_equal(nrow(data1), 1)
expect_equal(nrow(data2), 1)
expect_equal(data1$distance, mean(Orthodont$distance))
expect_equal(data1$distance, mean(Orthodont$distance))
expect_equal(data1$age, mean(Orthodont$age))
expect_equal(data1$nsex, mean(Orthodont$nsex))
expect_equal(data1$nsexage, mean(Orthodont$nsexage))
expect_equal(data2$TICKS, mean(grouseticks$TICKS))
expect_equal(data2$HEIGHT, mean(grouseticks$HEIGHT))
expect_equal(data2$cHEIGHT, mean(grouseticks$cHEIGHT))
expect_equal(data2$meanTICKS, mean(grouseticks$meanTICKS))
})
###############################################
context("Subset by a list")
################################################
test_that("Data can be subset by a list", {
list11 <- list("Sex" = "Male")
list12 <- list("Sex" = "Male", "Subject" = "M05")
list13 <- list("Sex" == "Male")
list14 <- list("Sex" == "Male", "Subject" == "M05")
list15 <- list("Sex" = "Male", "Subject" == "M05")
data11 <- merTools:::subsetList(Orthodont, list11)
data12 <- merTools:::subsetList(Orthodont, list12)
expect_error(merTools:::subsetList(Orthodont, list13))
expect_error(merTools:::subsetList(Orthodont, list14))
expect_error(merTools:::subsetList(Orthodont, list15))
list21 <- list("YEAR" = "95")
list22 <- list("LOCATION" = "32", "BROOD" = "503")
data21 <- merTools:::subsetList(grouseticks, list21)
data22 <- merTools:::subsetList(grouseticks, list22)
expect_equal(length(data11), length(Orthodont))
expect_equal(length(data21), length(grouseticks))
expect_equal(length(data12), length(Orthodont))
expect_equal(length(data22), length(grouseticks))
expect_equal(nrow(data11), 64)
expect_equal(nrow(data21), 117)
expect_equal(nrow(data12), 4)
expect_equal(nrow(data22), 0)
})
###############################################
#Super factor ----
context("Super factor")
################################################
test_that("Unobserved factor levels can be respected", {
fac1 <- factor(c("502", "503"))
fac1a <- superFactor(fac1, fullLev = unique(grouseticks$BROOD))
fac2 <- factor(c("M16", "M02", "M05"))
fac2a <- superFactor(fac2, fullLev = unique(Orthodont$Subject))
expect_false(identical(levels(fac1), levels(fac1a)))
expect_false(identical(levels(fac2), levels(fac2a)))
expect_true(identical(levels(grouseticks$BROOD), levels(fac1a)))
expect_true(identical(levels(Orthodont$Subject), levels(fac2a)))
expect_equal(length(levels(fac1a)), 118)
expect_equal(length(levels(fac2a)), 27)
})
test_that("SuperFactor handles new factor levels correctly", {
fac1 <- factor(c("999", "888"))
fac1a <- superFactor(fac1, fullLev = unique(grouseticks$BROOD))
fac2 <- factor(c("Z16", "Z02", "Z05"))
fac2a <- superFactor(fac2, fullLev = unique(Orthodont$Subject))
expect_false(identical(levels(fac1), levels(fac1a)))
expect_false(identical(levels(fac2), levels(fac2a)))
expect_false(identical(levels(grouseticks$BROOD), levels(fac1a)))
expect_false(identical(levels(Orthodont$Subject), levels(fac2a)))
expect_equal(length(levels(fac1a)), length(levels(grouseticks$BROOD)) + 2)
expect_equal(length(levels(fac2a)), length(levels(Orthodont$Subject)) + 3)
expect_true(identical(levels(fac1a)[1:118], levels(grouseticks$BROOD)))
expect_true(identical(levels(fac2a)[1:27], levels(Orthodont$Subject)))
})
###############################################
#Shuffle----
context("Shuffle")
################################################
test_that("Data can be shuffled", {
expect_equal(nrow(Orthodont), nrow(merTools:::shuffle(Orthodont)))
expect_equal(ncol(Orthodont), ncol(merTools:::shuffle(Orthodont)))
expect_equal(nrow(grouseticks), nrow(merTools:::shuffle(grouseticks)))
expect_equal(ncol(grouseticks), ncol(merTools:::shuffle(grouseticks)))
})
###############################################
#Find RE Quantiles----
context("Find RE Quantiles")
################################################
test_that("Errors and messages are issued", {
expect_error(REquantile(glmer3Lev, 23, groupFctr = "BROOD"))
expect_warning(REquantile(glmer3Lev, .23, groupFctr = "BROOD", term = "Cat"))
expect_error(REquantile(glmer3Lev, .23, groupFctr = "Cat"))
expect_error(REquantile(glmer3Lev, c(23, .56, .75), "BROOD"))
expect_error(REquantile(glmer3Lev, c(.23, 56, .75), "BROOD"))
expect_error(REquantile(glmer3Lev, c(.23, .56, 75), "BROOD"))
expect_error(REquantile(glmer3Lev, c(.23, .56, 107), "BROOD"))
expect_error(REquantile(glmer3Lev, c(-2, .56, .7), "BROOD"))
expect_message(REquantile(lmerSlope1, .25, groupFctr = "Subject"))
expect_warning(REquantile(lmerSlope2, c(.24), "Subject"))
expect_warning(REquantile(lmerSlope2, c(.24), "Subject", term = "Cat"))
})
# what to do without intercepts (REquantile(lmerSlope2), c(.24), "Subject")
# test_that("Quantiles are returned correctly", {
# myRE <- ranef(glmer3Lev)[["BROOD"]]
# myRE <- myRE[order(myRE[, "(Intercept)"]), ,drop = FALSE]
# rownames(myRE)[floor(23 / nrow(myRE)*100)]
#
#
# })
###############################################
#Test observation wiggle----
context("Test observation wiggle")
################################################
test_that("Row and column lengths are correct -- single_wiggle", {
data1 <- grouseticks[5:9, ]
data1a <- wiggle(data1, var = "BROOD", values = list(c("606", "602", "537")))
data1b <- wiggle(data1a, var = "YEAR", values = list(c("96", "97")))
data2 <- grouseticks[3, ]
data2a <- wiggle(data2, var = "BROOD", values = list(c("606", "602", "537")))
data2b <- wiggle(data2a, var = "YEAR", values = list(c("96", "97")))
data3 <- grouseticks[12:14, ]
data3a <- wiggle(data3, var = "BROOD", values = list(c("606")))
data3b <- wiggle(data3a, var = "YEAR", values = list(c("96", "97")))
expect_equal(nrow(data1), 5)
expect_equal(nrow(data1a), 15)
expect_equal(nrow(data1b), 30)
expect_equal(nrow(data2), 1)
expect_equal(nrow(data2a), 3)
expect_equal(nrow(data2b), 6)
expect_equal(nrow(data3), 3)
expect_equal(nrow(data3a), 3)
expect_equal(nrow(data3b), 6)
expect_equal(length(data1), length(data1a))
expect_equal(length(data1a), length(data1b))
expect_equal(length(data2), length(data2a))
expect_equal(length(data2a), length(data2b))
expect_equal(length(data3), length(data3a))
expect_equal(length(data3a), length(data3b))
data4 <- wiggle(data3, var = "BROOD",
values = list(REquantile(glmer3Lev,
quantile = c(0.25, 0.5, 0.75),
group = "BROOD")))
expect_true(all(table(as.character(data4$BROOD),
as.character(data4$INDEX)) ==1))
})
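# The row counts above follow from a cross-product expansion: every input row
# is copied once per supplied value, so nrow(out) = nrow(data) * length(values).
# A base R sketch of that idea for a single variable (assumed semantics, not
# the package source):
sketchWiggle <- function(data, var, vals) {
  out <- data[rep(seq_len(nrow(data)), times = length(vals)), , drop = FALSE]
  out[[var]] <- rep(vals, each = nrow(data))
  out
}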
test_that("Values are placed correctly -- single_wiggle", {
data1 <- grouseticks[5:9, ]
data1a <- wiggle(data1, var = "BROOD", values = list(c("606", "602", "537")))
data1b <- wiggle(data1a, var = "YEAR", values = list(c("96", "97")))
data2 <- grouseticks[3, ]
data2a <- wiggle(data2, var = "BROOD", values = list(c("606", "602", "537")))
data2b <- wiggle(data2a, var = "YEAR", values = list(c("96", "97")))
data3 <- grouseticks[12:14, ]
data3a <- wiggle(data3, var = "BROOD", values = list(c("606")))
data3b <- wiggle(data3a, var = "YEAR", values = list(c("96", "97")))
data4 <- Orthodont[15, ]
data4a <- wiggle(data4, var = "age", values = list(c(10, 11, 12)))
data4b <- wiggle(data4a, var = "Sex", values = list(c("Male", "Female")))
expect_false(any(unique(data1$BROOD) %in% unique(data1a$BROOD)))
expect_false(any(unique(data1$BROOD) %in% unique(data1b$BROOD)))
expect_false(any(unique(data1a$YEAR) %in% unique(data1b$YEAR)))
expect_false(any(unique(data2$BROOD) %in% unique(data2a$BROOD)))
expect_false(any(unique(data2$BROOD) %in% unique(data2b$BROOD)))
expect_false(any(unique(data2a$YEAR) %in% unique(data2b$YEAR)))
expect_false(any(unique(data3$BROOD) %in% unique(data3a$BROOD)))
expect_false(any(unique(data3$BROOD) %in% unique(data3b$BROOD)))
expect_false(any(unique(data3a$YEAR) %in% unique(data3b$YEAR)))
expect_true(all(unique(data1a$BROOD) %in% c("606", "602", "537")))
expect_true(all(unique(data1b$BROOD) %in% c("606", "602", "537")))
expect_true(all(unique(data2a$BROOD) %in% c("606", "602", "537")))
expect_true(all(unique(data2b$BROOD) %in% c("606", "602", "537")))
expect_true(all(unique(data3a$BROOD) %in% c("606")))
expect_true(all(unique(data3b$BROOD) %in% c("606")))
expect_true(all(unique(data4a$age) %in% c(10, 11, 12)))
expect_true(all(unique(data4b$age) %in% c(10, 11, 12)))
expect_true(all(!unique(data1a$YEAR) %in% c("96", "97")))
expect_true(all(unique(data1b$YEAR) %in% c("96", "97")))
expect_true(all(!unique(data2a$YEAR) %in% c("96", "97")))
expect_true(all(unique(data2b$YEAR) %in% c("96", "97")))
expect_true(all(!unique(data3a$YEAR) %in% c("96", "97")))
expect_true(all(unique(data3b$YEAR) %in% c("96", "97")))
expect_true(all(unique(data4a$Sex) %in% c("Male", "Female")))
expect_true(all(unique(data4b$Sex) %in% c("Male", "Female")))
})
test_that("we can use wiggle for multiple variables", {
data1 <- grouseticks[5:9, ]
data1a <- wiggle(data1, var = c("BROOD", "YEAR"),
list(c("606", "602", "537"), c("96", "97")))
data3 <- grouseticks[12:14, ]
data3a <- wiggle(data3, var = c("BROOD", "YEAR"), list(c("606"), c("96", "97")))
data4 <- Orthodont[15, ]
data4a <- wiggle(data4, var = c("age", "Sex"), list(c(10, 11, 12), c("Male", "Female")))
# tests 1 -- row and columns
expect_equal(nrow(data1a), nrow(data1) * 3 * 2)
expect_equal(nrow(data3a), nrow(data3) * 1 * 2)
expect_equal(nrow(data4a), nrow(data4) * 3 * 2)
expect_equal(ncol(data1a), ncol(data1))
expect_equal(ncol(data3a), ncol(data3))
expect_equal(ncol(data4a), ncol(data4))
# tests 2 -- values
expect_false(any(unique(data1$BROOD) %in% unique(data1a$BROOD)))
expect_false(any(unique(data1$YEAR) %in% unique(data1a$YEAR)))
expect_true(all.equal(sort(as.character(unique(data1a$BROOD))), c("537", "602", "606")))
expect_true(all.equal(sort(as.character(unique(data1a$YEAR))), c("96", "97")))
expect_false(any(unique(data3$BROOD) %in% unique(data3a$BROOD)))
expect_false(any(unique(data3$YEAR) %in% unique(data3a$YEAR)))
expect_true(all.equal(sort(as.character(unique(data3a$BROOD))), "606"))
expect_true(all.equal(sort(as.character(unique(data3a$YEAR))), c("96", "97")))
expect_true(all(unique(data4a$age) %in% 10:12))
expect_true(all(unique(data4a$Sex) %in% (c("Female", "Male"))))
expect_true(all.equal(sort(unique(data4a$age)), 10:12))
expect_true(all.equal(sort(as.character(unique(data4a$Sex))), c("Female", "Male")))
})
###############################################
#Test average observation extraction----
context("Test average observation extraction")
################################################
test_that("Returns a single row", {
data1 <- draw(glmer3Lev, type = 'average')
data1a <- draw(glmer3LevSlope, type = 'average')
data2 <- draw(lmerSlope1, type = 'average')
expect_equal(nrow(data1), 1)
expect_equal(nrow(data1a), 1)
expect_equal(nrow(data2), 1)
})
test_that("Warnings and errors are correct", {
expect_message(draw(lmerSlope1, type = 'average'))
expect_warning(draw(lmerSlope2, type = 'average'))
mylist2 <- list("YEAR" = "97", "LOCATION" = "16")
expect_warning(draw(glmer3LevSlope, type = 'average', varList = mylist2))
mylist3 <- list("YEAR" = "97", "LOCATION" = c("16", "56"))
expect_warning(draw(glmer3LevSlope, type = 'average', varList = mylist3))
})
test_that("Subsets work", {
mylist1 <- list("YEAR" = "97")
data1 <- draw(glmer3LevSlope, type = 'average', varList = mylist1)
data1a <- draw(glmer3LevSlope, type = 'average')
expect_false(identical(data1, data1a))
expect_equal(data1$TICKS, mean(grouseticks$TICKS[grouseticks$YEAR == "97"]))
expect_equal(data1a$TICKS, mean(grouseticks$TICKS))
mylist2 <- list("YEAR" = "97", "LOCATION" = "16")
expect_warning(draw(glmer3LevSlope, type = 'average', varList = mylist2),
"less than 20 rows, averages may be problematic")
mylist3 <- list("YEAR" = "97", "LOCATION" = c("16", "56"))
expect_warning(draw(glmer3LevSlope, type = 'average', varList = mylist3),
"fewer than 3 rows, computing global average instead")
})
test_that("Nested specifications work", {
library(ggplot2)
mod1 <- lmer(sleep_total ~ bodywt + (1|vore/order), data=msleep)
data1 <- draw(mod1, "random")
expect_is(data1, "data.frame")
data2 <- draw(mod1, "average")
expect_is(data2, "data.frame")
mylist1 <- list("vore" = "carni")
mylist2 <- list("order" = "Cetacea")
data1 <- draw(mod1, "random", varList = mylist1)
expect_is(data1, "data.frame")
expect_identical(as.character(data1$vore), "carni")
data1 <- draw(mod1, "random", varList = mylist2)
expect_is(data1, "data.frame")
expect_identical(as.character(data1$order), "Cetacea")
data1 <- suppressWarnings(draw(mod1, "average", varList = mylist1))
expect_is(data1, "data.frame")
expect_identical(as.character(data1$vore), "carni")
data1 <- suppressWarnings(draw(mod1, "average", varList = mylist2))
expect_is(data1, "data.frame")
expect_identical(as.character(data1$order), "Cetacea")
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
data1 <- suppressWarnings(draw(fm1, type = "average", varList = list("Subject" = "308")))
expect_is(data1, "data.frame")
expect_identical(as.character(data1$Subject), "308")
})
test_that("findFormFuns works", {
#Replicable toy data
set.seed(72167)
play <- data.frame(
a = runif(1000),
b = rnorm(1000),
c = rbinom(1000, 1, .35),
d = rpois(1000, 2)
)
play$d <- factor(play$d, labels = LETTERS[seq_along(unique(play$d))])
play$y <- play$a + 0.5*play$b + 2*play$c -1.8*(play$d=="B") + .43*(play$d == "C") + runif(100, 0, .35)
play$grp <- factor(sample(x = paste("Group", 1:43), size = 1000, replace = TRUE))
statmode <- function(x){
z <- table(as.vector(x))
m <- names(z)[z == max(z)]
if (length(m) == 1) {
return(m)
}
return(".")
}
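# statmode returns the single modal level of a vector, or "." on ties;
# it presumably mirrors how collapseFrame summarizes categorical columns
# when building the average observation tested below.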
trueMeans <- merTools:::collapseFrame(play)
#Estimate toy models
##. Scenario 1: I()
s1 <- lmer(y ~ a + b + I(b^2) + c + d + (1|grp), data=play)
expect_equal(findFormFuns(s1)[names(trueMeans)], trueMeans)
expect_equal(findFormFuns(s1)$b^2, findFormFuns(s1)$`I(b^2)`)
expect_length(findFormFuns(s1), 7L)
##. Scenario 2: log and no regular a
s2 <- lmer(y ~ log(a) + b + c + d + (1|grp), data=play)
expect_warning(findFormFuns(s2))
expect_false(suppressWarnings(findFormFuns(s2)$`log(a)` == log(trueMeans$a)))
expect_silent(findFormFuns(s2, origData = play))
expect_equal(findFormFuns(s2, origData = play)$`log(a)`, log(trueMeans$a))
##. Scenario 3: 2 continuous interaction with *
s3 <- lmer(y ~ a*b + c + d + (1|grp), data=play)
expect_equal(findFormFuns(s3)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s3), 6L)
##. Scenario 4: 2 continuous interaction with :
s4 <- lmer(y ~ a:b + c + d + (1|grp), data=play)
expect_equal(findFormFuns(s4)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s4), 6L)
##. Scenario 5: 1 cont 1 cat interaction with *
s5 <- lmer(y ~ a + c + b*d + (1|grp), data = play)
expect_equal(findFormFuns(s5)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s5), 6L)
##. Scenario 6: 1 cont 1 cat interaction with :
s6 <- lmer(y ~ a + c + b:d + (1|grp), data = play)
expect_equal(findFormFuns(s6)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s6), 6L)
##. Scenario 7: 2 cat interaction with *
s7 <- lmer(y ~ a + b + c*d + (1|grp), data = play)
expect_equal(findFormFuns(s7)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s7), 6L)
##. Scenario 8: 2 cat interaction with :
s8 <- lmer(y ~ a + b + c:d + (1|grp), data = play)
expect_equal(findFormFuns(s8)[names(trueMeans)], trueMeans)
expect_length(findFormFuns(s8), 6L)
##. Scenario 9: function in random slope
s9 <- lmer(y ~ a + b + c + d + (1 + sqrt(abs(b))|grp), data = play)
expect_equal(findFormFuns(s9)[names(trueMeans)], trueMeans)
expect_equal(findFormFuns(s9)$`sqrt(abs(b))`, sqrt(abs(trueMeans$b)))
expect_length(findFormFuns(s9), 7L)
##. Scenario 10: two columns in I with no main effects
s10 <- lmer(y ~ I(log(a) + b^3) + c + d + (1|grp), data=play)
expect_warning(findFormFuns(s10))
expect_false(suppressWarnings(findFormFuns(s10)$`I(log(a) + b^3)`) == log(trueMeans$a) + trueMeans$b^3)
expect_silent(findFormFuns(s10, origData = play))
expect_equal(findFormFuns(s10, origData = play)$`I(log(a) + b^3)`, log(trueMeans$a) + trueMeans$b^3)
##. Test that draw, draw.merMod and averageObs accept origData and issue warning if appropriate
expect_warning(averageObs(s10))
expect_silent(averageObs(s10, origData = play))
expect_warning(merTools:::draw.merMod(s10, type = "average"))
expect_silent(merTools:::draw.merMod(s10, origData = play, type = "average"))
expect_silent(merTools:::draw.merMod(s10, type = "random"))
})
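# Taken together, the scenarios above pin down findFormFuns' contract: for a
# formula containing transformed terms (I(), log(), interactions, functions in
# random slopes), it returns the average observation with each function
# re-applied to the collapsed column means, warning (and requiring origData)
# only when a variable never appears untransformed in the model frame.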
merTools/tests/testthat/test-subboot.R 0000644 0001762 0000144 00000004233 13466135366 017654 0 ustar ligges users # test subboot
set.seed(51315)
library(lme4)
# Sleepstudy
lmerSlope1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
###############################################
# Extract theta----
context("Extract theta using subBoot")
################################################
test_that("extract theta produces a vector", {
expect_is(thetaExtract(lmerSlope1), "numeric")
expect_equal(length(thetaExtract(lmerSlope1)), 3)
})
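# length 3: (Days | Subject) implies a 2x2 random-effect covariance matrix,
# i.e. 2 * (2 + 1) / 2 = 3 lower-triangular Cholesky (theta) parameters.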
test_that("thetaExtract throws errors for non-merMod objects", {
expect_error(thetaExtract(lmerSlope1@frame))
m1 <- lm(mpg ~ disp + hp, data = mtcars)
expect_error(thetaExtract(m1))
})
###############################################
# subBoot----
context("subBoot")
################################################
test_that("subBoot produces correct output", {
skip_on_cran()
# subBoot returns errors here
out1 <- subBoot(lmerSlope1, n = 100, FUN = thetaExtract, R = 100)
expect_is(out1, "data.frame")
expect_equal(ncol(out1), 4)
expect_equal(nrow(out1), 101)
out2 <- subBoot(lmerSlope1, n = 100,
FUN = function(x) getME(x, "fixef"),
R = 100)
expect_is(out2, "data.frame")
expect_equal(ncol(out2), 3)
expect_equal(nrow(out2), 101)
})
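# The expected shapes read as parameters plus a replicate identifier
# (ncol = 3 theta or 2 fixef values + 1), with R + 1 rows, suggesting the
# full-sample estimate is stored alongside the R subsample refits.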
context("subBoot glmer models")
d <- expand.grid(fac1 = LETTERS[1:5],
grp = letters[11:20],
obs = 1:50)
d$y <- simulate(~fac1 + (1 | grp), family = binomial,
newdata = d,
newparams = list( beta = c(2,-1,3,-2,1.2),
theta = c(.33)),
seed =634)[[1]]
subD <- d[sample(row.names(d), 1200), ]
g1 <- glmer(y~fac1+(1|grp), data=subD, family = 'binomial')
test_that("subBoot produces correct glmer output", {
out1 <- subBoot(g1, n = 1000, FUN = thetaExtract, R = 10)
expect_is(out1, "data.frame")
expect_equal(ncol(out1), 2)
expect_equal(nrow(out1), 11)
#
out2 <- subBoot(g1, n = 500,
FUN = function(x) getME(x, "fixef"),
R = 10)
expect_is(out2, "data.frame")
expect_equal(ncol(out2), 6)
expect_equal(nrow(out2), 11)
})
merTools/tests/testthat/test-predict.R 0000644 0001762 0000144 00000115572 13466135366 017642 0 ustar ligges users
set.seed(51315)
#Prediction intervals cover for simulated problems----
context("Prediction intervals cover for simulated problems")
test_that("Prediction intervals work for simple linear example", {
skip_on_travis()
skip_on_cran()
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:100)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)), seed = 4548)[[1]]
subD <- d[sample(row.names(d), 1000),]
g1 <- lmer(y~fac1+(1|grp), data=subD)
d$fitted <- predict(g1, d)
#This suppresses the warning about no parallel backend registered
outs <- suppressWarnings(
predictInterval(g1, newdata = d, level = 0.9, n.sims = 1000,
seed = 468,
stat = 'mean', include.resid.var = TRUE)
)
outs <- cbind(d, outs); outs$coverage <- FALSE
outs$coverage <- outs$fitted <= outs$upr & outs$fitted >= outs$lwr
expect_true(all(outs$coverage))
expect_lt(abs(mean(outs$fit - outs$fitted)), .0005)
expect_lt(abs(mean(outs$fit - outs$y)), .01)
rm(outs)
})
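# Coverage here means every predict() value falls inside its simulated
# interval; with include.resid.var = TRUE the intervals are wide enough that
# all 5000 rows should be covered at level = 0.9.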
test_that("Prediction intervals work for simple GLMM example", {
skip_on_travis()
skip_on_cran()
set.seed(101)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:50)
d$y <- simulate(~fac1+(1|grp),family = binomial,
newdata=d,
newparams=list(beta=c(2,-1,3,-2,1.2), theta=c(.33)),
seed =634)[[1]]
subD <- d[sample(row.names(d), 1200),]
g1 <- glmer(y~fac1+(1|grp), data=subD, family = 'binomial')
d$fitted <- predict(g1, d)
outs <- predictInterval(g1, newdata = d, level = 0.9, n.sims = 500,
stat = 'mean', include.resid.var = TRUE,
type = 'linear.prediction', seed = 4563)
outs <- cbind(d, outs); outs$coverage <- FALSE
outs$coverage <- outs$fitted <= outs$upr & outs$fitted >= outs$lwr
expect_true(all(outs$coverage))
expect_lt(abs(mean(outs$fit - outs$fitted)), .1)
expect_lt(abs(mean(outs$fit - outs$y)), 2)
outs2 <- predictInterval(g1, newdata = d, level = 0.95, n.sims = 500,
stat = 'mean', include.resid.var = FALSE,
type = 'probability')
expect_false(identical(outs, outs2))
expect_true(max(outs2$fit) <= 1)
expect_true(min(outs2$fit) >= 0)
expect_true(max(outs2$lwr) <= 1)
expect_true(min(outs2$lwr) >= 0)
expect_true(max(outs2$upr) <= 1)
expect_true(min(outs2$upr) >= 0)
expect_false(max(outs$fit) <= 1)
# expect_true(min(outs$fit) < 0)
expect_false(max(outs$lwr) <= 1)
expect_false(min(outs$lwr) >= 0)
expect_false(max(outs$upr) <= 1)
rm(outs)
})
test_that("Prediction interval respects user input", {
skip_on_travis()
skip_on_cran()
set.seed(101)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:25)
d$y <- simulate(~fac1+(1|grp),family = gaussian,
newdata=d,
newparams=list(beta=c(2,1,3,4,7), theta=c(.25),
sigma = c(.23)), seed =463)[[1]]
subD <- d[sample(row.names(d), 1000),]
g1 <- lmer(y~fac1+(1|grp), data=subD)
d$fitted <- predict(g1, d)
outs1 <- predictInterval(g1, newdata = d, level = 0.8, n.sims = 500,
stat = 'mean', include.resid.var = TRUE, seed=643)
outs2 <- predictInterval(g1, newdata = d, level = 0.95, n.sims = 500,
stat = 'mean', include.resid.var = TRUE, seed=643)
outs1a <- predictInterval(g1, newdata = d, level = 0.8, n.sims = 1500,
stat = 'mean', include.resid.var = TRUE, seed=643)
outs2a <- predictInterval(g1, newdata = d, level = 0.95, n.sims = 1500,
stat = 'mean', include.resid.var = TRUE, seed=643)
outs3 <- predictInterval(g1, newdata = d, level = 0.8, n.sims = 500,
stat = 'mean', include.resid.var = FALSE, seed=643)
outs3b <- predictInterval(g1, newdata = d, level = 0.8, n.sims = 500,
stat = 'median', include.resid.var = FALSE, seed=643)
outs3c <- predictInterval(g1, newdata = d[1, ], level = 0.8, n.sims = 500,
stat = 'median', include.resid.var = FALSE, seed=643)
expect_gt(median(outs2$upr - outs1$upr), 0.1)
expect_gt(median(outs2a$upr - outs1a$upr), 0.1)
expect_lt(median(outs3$upr - outs1$upr), -.2)
expect_lt(median(outs3b$upr - outs1a$upr), -.2)
expect_lt(mean(outs1$upr - outs1$lwr), mean(outs2$upr - outs2$lwr))
expect_lt(mean(outs1$upr - outs1$lwr), mean(outs1a$upr - outs1a$lwr))
expect_lt(mean(outs2$upr - outs2$lwr), mean(outs2a$upr - outs2a$lwr))
expect_false(median(outs3$fit) == median(outs3b$fit))
expect_equal(nrow(outs3c), 1)
})
# Prediction works for all combinations of slopes and intercepts----
context("Prediction works for all combinations of slopes and intercepts")
test_that("Predict handles unused and subset of factor levels", {
skip_on_cran()
skip_on_travis()
set.seed(101)
moddf <- InstEval[sample(rownames(InstEval), 10000), ]
g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=moddf)
d1 <- InstEval[1:100, ]
outs1 <- suppressWarnings(predictInterval(g1, newdata = d1, level = 0.8, n.sims = 500,
stat = 'mean', include.resid.var = TRUE, seed = 4632))
d2 <- rbind(d1, InstEval[670:900,])
outs1a <- suppressWarnings(predictInterval(g1, newdata = d2, level = 0.8, n.sims = 500,
stat = 'mean', include.resid.var=TRUE, seed = 4632)[1:100,])
expect_is(outs1, "data.frame")
expect_is(outs1a, "data.frame")
expect_equal(nrow(outs1), 100)
expect_equal(nrow(outs1a), 100)
g2 <- lmer(y ~ lectage + studage + (1+lectage|d) + (1|dept), data=moddf)
d2 <- InstEval[670:900,]
outs1a <- suppressWarnings(predictInterval(g2, newdata = d2, level = 0.8, n.sims = 500,
stat = 'mean', include.resid.var=TRUE, seed = 4632))
expect_is(outs1a, "data.frame")
expect_equal(nrow(outs1a), 231)
})
rm(list = ls())
test_that("Prediction intervals work for multiple parameters per level", {
skip_on_travis()
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
outs1 <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = grouseticks[1:10,]))
expect_is(outs1, "data.frame")
})
test_that("Prediction works for random slopes not in fixed", {
skip_on_travis()
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + (1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
zNew <- grouseticks[1:10,]
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
outs1 <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = zNew))
expect_is(outs1, "data.frame")
# Message may not be necessary any more
# expect_message(predictInterval(glmer3LevSlope, newdata = zNew))
})
# Test for new factor levels----
context("Test for new factor levels")
test_that("Prediction intervals work with new factor levels added", {
skip_on_travis()
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
zNew <- grouseticks[1:10,]
zNew$BROOD <- as.character(zNew$BROOD)
zNew$BROOD[1:9] <- "100"
zNew$BROOD[10] <- "101"
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
outs1 <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = zNew))
expect_is(outs1, "data.frame")
expect_warning(predictInterval(glmer3LevSlope, newdata = zNew))
})
test_that("Prediction works for factor as a random slope not in fixed", {
skip_on_travis()
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ HEIGHT +(1 + YEAR|BROOD) + (1|LOCATION)
#Suppressing warning for known degenerate model below
glmer3LevSlope <- suppressWarnings(glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5))))
zNew <- grouseticks[1:10,]
zNew$BROOD <- as.character(zNew$BROOD)
zNew$BROOD[1:9] <- "100"
zNew$BROOD[10] <- "101"
expect_warning(predictInterval(glmer3LevSlope, newdata = zNew),
"Currently, predictions for these values are based only on the")
outs1 <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = zNew))
zNew <- grouseticks[1:10,]
outs2 <- predictInterval(glmer3LevSlope, newdata = zNew)
expect_is(outs1, "data.frame")
expect_is(outs2, "data.frame")
expect_identical(dim(outs1), dim(outs2))
})
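# Per the warning text checked above, unobserved grouping-factor levels are
# handled by warning and predicting from the remaining fixed and observed
# random terms -- analogous to predict(..., allow.new.levels = TRUE).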
# Numeric accuracy----
context("Numeric accuracy")
# Cases
# new factor level for group term
test_that("Median of prediction interval is close to predict.lmer for single group models", {
set.seed(2311)
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
truPred <- predict(fm1, newdata = sleepstudy)
newPred <- predictInterval(fm1, newdata = sleepstudy, n.sims = 500,
level = 0.9, stat = c("median"),
include.resid.var = FALSE, seed = 4563)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/50)
fm1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
truPred <- predict(fm1, newdata = sleepstudy)
newPred <- predictInterval(fm1, newdata = sleepstudy, n.sims = 1500,
level = 0.9, stat = c("median"),
include.resid.var = FALSE, seed = 9598)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/100)
})
test_that("Median of PI is close to predict.lmer for complex group models", {
skip_on_cran()
skip_on_travis()
set.seed(101)
moddf <- InstEval[sample(rownames(InstEval), 10000), ]
g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=moddf)
d1 <- moddf[1:200, ]
newPred <- predictInterval(g1, newdata = d1, level = 0.8, n.sims = 500,
stat = 'median', include.resid.var = FALSE,
seed = 4563)
truPred <- predict(g1, newdata = d1)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/100)
rm(list=ls())
})
test_that("Median of PI is close to predict.glmer for basic and complex grouping", {
skip_on_cran()
skip_on_travis()
set.seed(8496)
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:8), fac2 = LETTERS[12:19],
obs=1:20)
d$x <- runif(nrow(d))
d$y <- simulate(~ x + fac1 + fac2 + (1 + fac1|grp) + (1|obs), family = binomial,
newdata=d,
newparams=list(beta = rnorm(13),
theta = rnorm(16, 5, 1)), seed = 4563)[[1]]
subD <- d[sample(row.names(d), 1500),]
# TOO SLOW
g1 <- glmer(y ~ x + fac1 + fac2 + (1+fac1|grp) + (1|obs), data = subD,
family = 'binomial',
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
truPred <- predict(g1, subD)
newPred <- suppressWarnings(predictInterval(g1, newdata = subD, level = 0.95, n.sims = 2000,
stat = 'median', include.resid.var = FALSE,
type = 'linear.prediction', seed = 3252))
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/20)
# This test fails currently
# g1 <- glmer(y ~ x + fac2 + (1 + fac1|grp) + (1|obs), data = subD, family = 'binomial')
# truPred <- predict(g1, subD, type = "response")
# newPred <- predictInterval(g1, newdata = subD, level = 0.8, n.sims = 500,
# stat = 'median', include.resid.var = FALSE,
# type = 'probability')
# expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/20)
rm(list = ls())
})
test_that("Prediction intervals work with new factor levels added", {
skip_on_cran()
skip_on_travis()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
zNew <- grouseticks
zNew$BROOD <- as.character(zNew$BROOD)
zNew$BROOD[1:99] <- "100"
zNew$BROOD[100] <- "101"
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
newPred <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = zNew, level = 0.95,
n.sims = 500, stat = 'median',
include.resid.var = TRUE, seed = 4563))
truPred <- predict(glmer3LevSlope, newdata = zNew, allow.new.levels = TRUE)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/40)
})
test_that("Prediction intervals work with slope not in fixed effects and data reordered", {
skip_on_travis()
skip_on_cran()
data(grouseticks)
grouseticks$HEIGHT <- scale(grouseticks$HEIGHT)
grouseticks <- merge(grouseticks, grouseticks_agg[, 1:3], by = "BROOD")
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
# GLMER 3 level + slope
form <- TICKS_BIN ~ YEAR + (1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks,
control = glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun = 1e5)))
zNew <- grouseticks
zNew$BROOD <- as.character(zNew$BROOD)
zNew$BROOD[1:99] <- "100"
zNew$BROOD[100] <- "101"
zNew <- zNew[, c(10, 9, 8, 7, 1, 2, 3, 4, 5, 6, 10)]
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
newPred <- suppressWarnings(predictInterval(glmer3LevSlope, newdata = zNew, level = 0.95,
n.sims = 500, stat = 'median',
include.resid.var = TRUE, seed = 4563))
truPred <- predict(glmer3LevSlope, newdata = zNew, allow.new.levels = TRUE)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/20)
})
# Special cases - rank deficiency----
context("Special cases - rank deficiency")
test_that("Prediction intervals are accurate with interaction terms and rank deficiency", {
skip_on_travis()
skip_on_cran()
set.seed(54656)
n <- 20
x <- y <- rnorm(n)
z <- rnorm(n)
r <- sample(1:5, size=n, replace=TRUE)
d <- data.frame(x,y,z,r)
d2 <- expand.grid(a=factor(1:4),b=factor(1:4),rep=1:10)
n <- nrow(d2)
d2 <- transform(d2,r=sample(1:5, size=n, replace=TRUE),
z=rnorm(n))
d2 <- subset(d2,!(a=="4" & b=="4"))
fm <- lmer( z ~ a*b + (1|r), data=d2)
expect_is(predictInterval(fm, newdata = d2[1:10, ]), "data.frame")
newPred <- predictInterval(fm, newdata = d2, level = 0.8, n.sims = 1500,
stat = 'median', include.resid.var = FALSE,
seed = 2342)
truPred <- predict(fm, newdata = d2)
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/15)
fm2 <- lmer( z ~ a*b + (1+b|r), data=d2)
#In the call below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
newPred <- suppressWarnings(predictInterval(fm2, newdata = d2, level = 0.8,
n.sims = 1000,
stat = 'median', include.resid.var = FALSE))
truPred <- predict(fm2, newdata = d2)
expect_is(newPred, "data.frame")
expect_equal(mean(newPred$fit - truPred), 0, tolerance = sd(truPred)/10)
})
# Test the simResults----
context("Test the simResults")
test_that("simResults option behaves", {
skip_on_cran()
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
preds1 <- predictInterval(m1, newdata = sleepstudy[1:5, ])
preds2 <- predictInterval(m1, newdata = sleepstudy[1:5, ],
returnSims = TRUE)
expect_null(attr(preds1, "sim.results"))
expect_is(attr(preds2, "sim.results"), "matrix")
out <- attr(preds2, "sim.results")
expect_equal(ncol(out), 1000)
expect_equal(nrow(out), 5)
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
preds1 <- predictInterval(m1, newdata = sleepstudy[1:5, ],
returnSims = TRUE,
which = "random", seed = 23151)
preds2 <- predictInterval(m1, newdata = sleepstudy[1:5, ], which = "fixed",
returnSims = TRUE, seed = 23151)
preds3 <- predictInterval(m1, newdata = sleepstudy[1:5, ], which = "all",
returnSims = TRUE, seed = 23151)
preds4 <- predictInterval(m1, newdata = sleepstudy[1:5, ],
returnSims = TRUE, seed = 23151)
preds1b <- predictInterval(m1, newdata = sleepstudy[1:5, ],
returnSims = TRUE,
which = "random", seed = 23151,
include.resid.var = FALSE)
preds2b <- predictInterval(m1, newdata = sleepstudy[1:5, ], which = "fixed",
returnSims = TRUE, seed = 23151,
include.resid.var = FALSE)
preds3b <- predictInterval(m1, newdata = sleepstudy[1:5, ], which = "all",
returnSims = TRUE, seed = 23151,
include.resid.var = FALSE)
preds4b <- predictInterval(m1, newdata = sleepstudy[1:5, ],
returnSims = TRUE, seed = 23151,
include.resid.var = FALSE)
expect_is(attr(preds1, "sim.results"), "matrix")
expect_gt(abs(mean(attr(preds1, "sim.results") - attr(preds2, "sim.results"))),
200)
expect_is(attr(preds2, "sim.results"), "matrix")
expect_gt(abs(mean(attr(preds4, "sim.results") - attr(preds1, "sim.results"))),
200)
expect_gt(abs(mean(attr(preds4, "sim.results") - attr(preds1, "sim.results"))),
abs(mean(attr(preds1, "sim.results") - attr(preds2, "sim.results"))))
expect_is(attr(preds3, "sim.results"), "list")
expect_is(attr(preds4, "sim.results"), "matrix")
expect_is(attr(preds1b, "sim.results"), "matrix")
expect_is(attr(preds2b, "sim.results"), "matrix")
expect_is(attr(preds3b, "sim.results"), "list")
expect_is(attr(preds4b, "sim.results"), "matrix")
# Check that samples are wider for include.resid.var = TRUE
expect_gt(quantile(attr(preds1, "sim.results"), probs = 0.9) - quantile(attr(preds1b, "sim.results"), probs = 0.9),
20)
expect_lt(quantile(attr(preds1, "sim.results"), probs = 0.1) - quantile(attr(preds1b, "sim.results"), probs = 0.1),
-20)
expect_gt(quantile(attr(preds2, "sim.results"), probs = 0.9) - quantile(attr(preds2b, "sim.results"), probs = 0.9),
20)
expect_lt(quantile(attr(preds2, "sim.results"), probs = 0.1) - quantile(attr(preds2b, "sim.results"), probs = 0.1),
-20)
expect_gt(quantile(attr(preds4, "sim.results"), probs = 0.9) - quantile(attr(preds4b, "sim.results"), probs = 0.9),
15)
expect_lt(quantile(attr(preds4, "sim.results"), probs = 0.1) - quantile(attr(preds4b, "sim.results"), probs = 0.1),
-15)
})
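# returnSims = TRUE attaches the raw simulation draws as attr(., "sim.results"):
# an nrow(newdata) x n.sims matrix (5 x 1000 above for the defaults), or a list
# of such matrices when which = "all".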
# Test out of sample predictions----
context("Test out of sample predictions")
test_that("predictInterval makes predictions without observed outcome", {
skip_on_travis()
skip_on_cran()
possNames <- expand.grid(letters,LETTERS)
possNames <- paste(possNames[, 1], possNames[, 2])
newFac <- sample(possNames, 32)
modData <- data.frame(
y = rnorm(500),
x = rnorm(500),
team_name = sample(newFac, 500, replace = TRUE)
)
modData$y[251:500] <- rep(NA, 250)
m0 <- lmer(y ~ x + (1|team_name), data = modData[1:250,])
#In the calls below we are getting warnings because our call to mvtnorm::rmvnorm
#throws a warning when the mean and sigma of the multivariate distribution are
#zero using method = "chol"
testPreds1 <- suppressWarnings(predictInterval(m0, newdata = modData[, c(3, 2, 1)]))
testPreds2 <- suppressWarnings(predictInterval(m0, newdata = modData[1:250, c(2, 3, 1)]))
testPreds3 <- suppressWarnings(predictInterval(m0, newdata = modData[251:500,]))
expect_is(testPreds1, "data.frame")
expect_is(testPreds2, "data.frame")
expect_is(testPreds3, "data.frame")
})
# Input validation checks----
context("Input validation checks")
test_that("dplyr objects are successfully coerced", {
skip_on_cran()
set.seed(101)
data(sleepstudy)
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
predData <- sleepstudy %>% group_by(Subject) %>% dplyr::summarise(Days = mean(Days))
expect_warning(predictInterval(m1, newdata = predData),
regexp = "newdata is tbl_df or tbl object from dplyr package", all=FALSE)
#Suppress the warning that we tested for above
preds2 <- suppressWarnings(predictInterval(m1, newdata = predData, n.sims=2000))
expect_is(preds2, "data.frame")
predData2 <- as.data.frame(predData)
preds1 <- predictInterval(m1, newdata = predData2, n.sims=2000)
expect_true(sum(preds1$fit - preds2$fit) > -50 & sum(preds1$fit - preds2$fit) < 50)
})
# Model type warnings for non-binomial GLMM----
context("Model type warnings for non-binomial GLMM")
test_that("Warnings issued", {
skip_on_cran()
d <- expand.grid(fac1=LETTERS[1:5], grp=factor(1:10),
obs=1:50)
d$y <- simulate(~fac1+(1|grp),family = poisson,
newdata=d,
newparams=list(beta=c(2,-1,3,-2,1.2), theta=c(.33)),
seed = 5636)[[1]]
g1 <- glmer(y~fac1+(1|grp), data=d, family = 'poisson')
expect_warning(predictInterval(g1, newdata = d[1:100,]))
rm(list = ls())
})
# Test Parallel----
context("Test Parallel")
test_that("parallelization does not throw errors and generates good results", {
skip_on_cran()
skip_on_travis()
library(foreach)
set.seed(1241)
#TODO rein in memory usage and cpu time here
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
predA <- predictInterval(m1, newdata = m1@frame, n.sims = 2200, seed = 54,
include.resid.var = FALSE, stat = "median")
predB <- predictInterval(m1, newdata = m1@frame, n.sims = 1750, seed = 54,
include.resid.var = FALSE, stat = "median")
expect_equal(mean(predA$fit - predB$fit), 0 , tolerance = .2)
predA <- predictInterval(m1, newdata = m1@frame, n.sims = 2500, seed = 2141,
include.resid.var = FALSE)
predB <- predictInterval(m1, newdata = m1@frame, n.sims = 1500, seed = 2141,
include.resid.var = FALSE)
expect_equal(mean(predA$fit - predB$fit), 0 , tolerance = .01)
moddf <- InstEval[sample(rownames(InstEval), 5000),]
g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=moddf)
predA <- predictInterval(g1, newdata = g1@frame, n.sims = 2500, seed = 2141,
include.resid.var = FALSE)
predB <- predictInterval(g1, newdata = g1@frame, n.sims = 1500, seed = 2141,
include.resid.var = FALSE)
expect_equal(mean(predA$fit - predB$fit), 0 , tolerance = .01)
predA <- predictInterval(g1, newdata = g1@frame[1:499,], n.sims = 2500, seed = 2141,
include.resid.var = TRUE)
predB <- predictInterval(g1, newdata = g1@frame[1:501,], n.sims = 2500, seed = 2141,
include.resid.var = TRUE)
expect_equal(mean(predA$fit[1:499] - predB$fit[1:499]), 0 , tolerance = .002)
detach("package:foreach", character.only=TRUE)
})
context("Test returning predict interval components")
# Test the option to return different predictInterval components
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
m2 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
form <- TICKS_BIN ~ YEAR + HEIGHT +(1 + HEIGHT|BROOD) + (1|LOCATION) + (1|INDEX)
data(grouseticks)
grouseticks$TICKS_BIN <- ifelse(grouseticks$TICKS >=1, 1, 0)
grouseticks$YEAR <- as.numeric(grouseticks$YEAR)
grouseticks$HEIGHT <- grouseticks$HEIGHT - 462.2
glmer3LevSlope <- glmer(form, family="binomial",data=grouseticks)
test_that("Output is correct dimensions", {
pred1 <- predictInterval(m1, which = "random")
pred2 <- predictInterval(m2, which = "fixed")
pred3 <- predictInterval(m2, which = "all")
expect_equal(nrow(pred1), nrow(pred2))
expect_equal(ncol(pred1), 3)
expect_equal(ncol(pred2), 3)
expect_equal(ncol(pred3), 5)
expect_equal(nrow(pred3), 180*3)
expect_equal(nrow(pred1), 180)
expect_false(nrow(pred3) == nrow(pred2))
expect_true(nrow(pred3) > nrow(pred2))
pred1 <- predictInterval(m1, which = "random", stat = "mean")
pred2 <- predictInterval(m2, which = "fixed", stat = "mean")
pred3 <- predictInterval(m2, which = "all", stat = "mean")
expect_equal(nrow(pred1), nrow(pred2))
expect_equal(ncol(pred1), 3)
expect_equal(ncol(pred2), 3)
expect_equal(ncol(pred3), 5)
expect_equal(nrow(pred3), 180*3)
expect_equal(nrow(pred1), 180)
expect_false(nrow(pred3) == nrow(pred2))
expect_true(nrow(pred3) > nrow(pred2))
pred1 <- suppressWarnings(predictInterval(glmer3LevSlope, which = "random",
type = "linear.prediction"))
pred2 <- suppressWarnings(predictInterval(glmer3LevSlope, which = "random",
type = "probability"))
pred3 <- suppressWarnings(predictInterval(glmer3LevSlope, which = "all",
type = "linear.prediction"))
pred4 <- suppressWarnings(predictInterval(glmer3LevSlope, which = "all",
type = "probability"))
expect_equal(nrow(pred1), nrow(pred2))
expect_equal(nrow(pred3), nrow(pred4))
expect_equal(ncol(pred1), 3)
expect_equal(ncol(pred2), 3)
expect_equal(ncol(pred3), 5)
expect_equal(ncol(pred3), 5)
expect_true(mean(pred2$fit) > mean(pred1$fit))
expect_equal(mean(pred2$fit), 0.5, tol = 0.01)
expect_equal(mean(pred1$fit), 0.00, tol = 0.05)
expect_equal(mean(pred4$fit), mean(grouseticks$TICKS_BIN), tol = 0.15)
expect_false(mean(pred4$fit) == mean(pred3$fit))
expect_equal(nrow(pred1), 403)
expect_equal(nrow(pred2), 403)
expect_equal(nrow(pred3), 403*5)
expect_equal(nrow(pred4), 403*5)
})
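# With which = "all" the rows stack one block per component: combined, fixed,
# and one per grouping factor. Hence 180 * 3 for m2 (one grouping factor) and
# 403 * 5 for the grouseticks model (three grouping factors), with two extra
# bookkeeping columns raising ncol from 3 to 5.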
test_that("Compare random, fixed, include-resid", {
predInt1 <- predictInterval(m1)
predInt1$width <- predInt1[, 2] - predInt1[, 3]
predInt2 <- predictInterval(m1, which = "random")
predInt2$width <- predInt2[, 2] - predInt2[, 3]
predInt3 <- predictInterval(m1, which = "fixed")
predInt3$width <- predInt3[, 2] - predInt3[, 3]
predInt4 <- predictInterval(m1, which = "all")
predInt4$width <- predInt4[, 3] - predInt4[, 4]
predInt1b <- predictInterval(m1, include.resid.var = FALSE)
predInt1b$width <- predInt1b[, 2] - predInt1b[, 3]
predInt2b <- predictInterval(m1, which = "random", include.resid.var = FALSE)
predInt2b$width <- predInt2b[, 2] - predInt2b[, 3]
predInt3b <- predictInterval(m1, which = "fixed", include.resid.var = FALSE)
predInt3b$width <- predInt3b[, 2] - predInt3b[, 3]
predInt4b <- predictInterval(m1, which = "all", include.resid.var = FALSE)
predInt4b$width <- predInt4b[, 3] - predInt4b[, 4]
# These should be fairly close now since residual variance is the biggest variance component
expect_true(!all(predInt1$width > predInt2$width))
expect_true(!all(predInt3$width > predInt2$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt1$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt2$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt3$width))
#
expect_true(all(predInt1b$width > predInt2b$width))
expect_true(all(predInt3b$width != predInt2b$width))
expect_true(all(predInt4b$width[predInt4b$effect == "combined"] > predInt2b$width))
expect_true(!all(predInt4b$width[predInt4b$effect == "combined"] > predInt1b$width))
# Fits
expect_true(all(predInt1$fit > predInt2$fit))
expect_true(all(predInt3$fit > predInt2$fit))
expect_true(all(predInt1$upr > predInt1b$upr))
expect_true(all(predInt1$lwr < predInt1b$lwr))
expect_true(all(predInt2$upr > predInt2b$upr))
expect_true(all(predInt2$lwr < predInt2b$lwr))
expect_true(all(predInt2$upr > predInt2b$upr))
expect_true(all(predInt3$lwr < predInt3b$lwr))
expect_true(all(predInt4$upr > predInt4b$upr))
expect_true(all(predInt4$lwr < predInt4b$lwr))
predInt1 <- predictInterval(m2)
predInt1$width <- predInt1[, 2] - predInt1[, 3]
predInt2 <- predictInterval(m2, which = "random")
predInt2$width <- predInt2[, 2] - predInt2[, 3]
predInt3 <- predictInterval(m2, which = "fixed")
predInt3$width <- predInt3[, 2] - predInt3[, 3]
predInt4 <- predictInterval(m2, which = "all")
predInt4$width <- predInt4[, 3] - predInt4[, 4]
predInt1b <- predictInterval(m2, include.resid.var = FALSE)
predInt1b$width <- predInt1b[, 2] - predInt1b[, 3]
predInt2b <- predictInterval(m2, which = "random", include.resid.var = FALSE)
predInt2b$width <- predInt2b[, 2] - predInt2b[, 3]
predInt3b <- predictInterval(m2, which = "fixed", include.resid.var = FALSE)
predInt3b$width <- predInt3b[, 2] - predInt3b[, 3]
predInt4b <- predictInterval(m2, which = "all", include.resid.var = FALSE)
predInt4b$width <- predInt4b[, 3] - predInt4b[, 4]
# These should be fairly close now since residual variance is the biggest variance
expect_true(!all(predInt1$width > predInt2$width))
expect_true(!all(predInt3$width > predInt2$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt1$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt2$width))
expect_true(!all(predInt4$width[predInt4$effect == "combined"] > predInt3$width))
#
expect_true(all(predInt1b$width > predInt2b$width))
expect_true(all(predInt3b$width != predInt2b$width))
expect_true(all(predInt4b$width[predInt4b$effect == "combined"] > predInt2b$width))
expect_true(!all(predInt4b$width[predInt4b$effect == "combined"] > predInt1b$width))
# Fits
expect_true(all(predInt1$fit > predInt2$fit))
expect_true(all(predInt3$fit > predInt2$fit))
expect_true(all(predInt1$upr > predInt1b$upr))
expect_true(all(predInt1$lwr < predInt1b$lwr))
expect_true(all(predInt2$upr > predInt2b$upr))
expect_true(all(predInt2$lwr < predInt2b$lwr))
expect_true(all(predInt2$upr > predInt2b$upr))
expect_true(all(predInt3$lwr < predInt3b$lwr))
expect_true(all(predInt4$upr > predInt4b$upr))
expect_true(all(predInt4$lwr < predInt4b$lwr))
})
test_that("Default is set to all effects", {
predInt1 <- predictInterval(m1, seed = 8231)
predInt2 <- predictInterval(m1, which = "full", seed = 8231)
expect_identical(predInt1, predInt2)
predInt1 <- predictInterval(m1, seed = 8231, include.resid.var = TRUE)
predInt2 <- predictInterval(m1, which = "full", seed = 8231,
include.resid.var = TRUE)
expect_identical(predInt1, predInt2)
predInt1 <- predictInterval(m1, seed = 8231, include.resid.var = TRUE)
predInt2 <- predictInterval(m1, which = "all", seed = 8231,
include.resid.var = TRUE)
expect_equal(mean(predInt1$fit - predInt2$fit[predInt2$effect == "combined"]),
0, tol = 0.14)
expect_equal(mean(predInt1$lwr - predInt2$lwr[predInt2$effect == "combined"]),
0, tol = 0.1)
expect_equal(mean(predInt1$upr - predInt2$upr[predInt2$effect == "combined"]),
0, tol=0.1)
})
# Test nested effect specifications----
context("Test nested effect specifications")
test_that("Nested effects can work", {
skip_on_cran()
library(ggplot2)
mod1 <- lmer(sleep_total ~ bodywt + (1|vore/order), data=msleep)
msleep$combn <- paste(msleep$vore, msleep$order, sep = "__")
mod2 <- lmer(sleep_total ~ bodywt + (1|combn) + (1|vore), data=msleep)
#Suppressing warnings we already tested (coerce tbl and new levels)
predInt1 <- suppressWarnings(predictInterval(merMod=mod1, newdata=msleep, seed = 548,
n.sims = 2000, include.resid.var = FALSE,
stat = "median", level = 0.8))
predInt2 <- suppressWarnings(predictInterval(merMod=mod2, newdata=msleep, seed = 548,
n.sims = 2000, include.resid.var = FALSE,
stat = "median", level = 0.8))
expect_is(predInt1, "data.frame")
expect_is(predInt2, "data.frame")
expect_equal(mean(predInt1[,1] - predInt2[,1]), 0, tol = sd(predInt1[,1])/20)
expect_equal(mean(predInt1[,2] - predInt2[,2]), 0, tol = sd(predInt1[,2])/10)
expect_equal(mean(predInt1[,3] - predInt2[,3]), 0, tol = sd(predInt1[,3])/20)
})
context("Interactions without intercepts")
sleepstudy$Test <- rep(sample(c(TRUE, FALSE), length(unique(sleepstudy$Subject)),
replace = TRUE), each = 10)
m1 <- lmer(Reaction ~ Days:Test + (0 + Days | Subject), data = sleepstudy)
sleepstudy$cars <- sleepstudy$Days*3
m2 <- lmer(Reaction ~ cars:Test + (0 + Days | Subject), data = sleepstudy)
m3 <- lmer(Reaction ~ cars:Test + (1 | Subject), data = sleepstudy)
m4 <- lmer(Reaction ~ cars:Test + (0 + cars | Subject), data = sleepstudy)
test_that("Models with cross-level interaction and no random intercept work", {
preds1 <- predictInterval(m1)
expect_equal(nrow(preds1), 180)
expect_equal(ncol(preds1), 3)
expect_message(predictInterval(m1))
preds1 <- predictInterval(m1, newdata = sleepstudy[1:10, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
fix.intercept.variance = TRUE)
expect_equal(nrow(preds1), 10)
expect_equal(ncol(preds1), 3)
preds1 <- predictInterval(m1, newdata = sleepstudy[1:10, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
ignore.fixed.terms = TRUE)
expect_equal(nrow(preds1), 10)
expect_equal(ncol(preds1), 3)
preds2 <- predictInterval(m1, newdata = sleepstudy[1:10, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
ignore.fixed.terms = FALSE)
expect_equal(nrow(preds2), 10)
expect_equal(ncol(preds2), 3)
expect_false(any(preds1$fit == preds2$fit))
rm(preds1, preds2)
preds1 <- predictInterval(m2)
expect_equal(nrow(preds1), 180)
expect_equal(ncol(preds1), 3)
truPred <- predict(m2)
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/100)
#
preds1 <- predictInterval(m3)
expect_equal(nrow(preds1), 180)
expect_equal(ncol(preds1), 3)
truPred <- predict(m3)
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/100)
#
preds1 <- predictInterval(m4)
expect_equal(nrow(preds1), 180)
expect_equal(ncol(preds1), 3)
truPred <- predict(m4)
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/100)
})
m1 <- lmer(Reaction ~ 0 + Days + Days:Subject + (1 | Days), data = sleepstudy)
test_that("Models with cross-level interaction and no random intercept work", {
preds1 <- predictInterval(m1)
expect_equal(nrow(preds1), 180)
expect_equal(ncol(preds1), 3)
expect_warning(predictInterval(m1, fix.intercept.variance = TRUE))
preds1 <- predictInterval(m1, newdata = sleepstudy[1:10, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
ignore.fixed.terms = TRUE)
expect_equal(nrow(preds1), 10)
expect_equal(ncol(preds1), 3)
truPred <- predict(m1, newdata = sleepstudy[1:10,])
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/100)
# This is less close
preds1 <- predictInterval(m1, newdata = sleepstudy[1:50, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
ignore.fixed.terms = FALSE)
expect_equal(nrow(preds1), 50)
expect_equal(ncol(preds1), 3)
truPred <- predict(m1, newdata = sleepstudy[1:50,])
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/25)
expect_warning({
preds1 <- predictInterval(m1, newdata = sleepstudy[1:50, ],
level = 0.9, n.sims = 500, include.resid.var = FALSE,
fix.intercept.variance = TRUE)
})
expect_failure({
expect_equal(mean(preds1$fit - truPred), 0, tolerance = sd(truPred)/100)
})
})
merTools/tests/testthat/test-seeds.R 0000644 0001762 0000144 00000002661 13466135366 017305 0 ustar ligges users # 'seed' options in draw, REsim, FEsim, predictInterval and subBoot----
context("'seed' options in draw, REsim, FEsim, predictInterval and subBoot")
test_that("Equivalent seeds return equivalent results", {
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
d1a <- draw(fm1, type="random", seed=1234)
d2 <- draw(fm1, type="random", seed=456)
d1b <- draw(fm1, type="random", seed=1234)
r1a <- REsim(fm1, 25, seed=1234)
r2 <- REsim(fm1, 25, seed=456)
r1b <- REsim(fm1, 25, seed=1234)
f1a <- FEsim(fm1, 25, seed=1234)
f2 <- FEsim(fm1, 25, seed=456)
f1b <- FEsim(fm1, 25, seed=1234)
# TODO - subBoot now returns warnings and needs to be checked
p1a <- predictInterval(fm1, newdata=sleepstudy[1:10,], seed=1234)
p2 <- predictInterval(fm1, newdata=sleepstudy[1:10,], seed=456)
p1b <- predictInterval(fm1, newdata=sleepstudy[1:10,], seed=1234)
s1a <- subBoot(fm1, n = 160, FUN = thetaExtract, R = 20, seed=1234)
s2 <- subBoot(fm1, n = 160, FUN = thetaExtract, R = 20, seed=456)
s1b <- subBoot(fm1, n = 160, FUN = thetaExtract, R = 20, seed=1234)
expect_identical(d1a, d1b)
expect_identical(r1a, r1b)
expect_identical(f1a, f1b)
expect_identical(p1a, p1b)
expect_identical(s1a, s1b)
expect_false(identical(d1a, d2))
expect_false(identical(r1a, r2))
expect_false(identical(f1a, f2))
expect_false(identical(p1a, p2))
expect_false(identical(s1a, s2))
})
merTools/NAMESPACE 0000644 0001762 0000144 00000005334 13466135366 013317 0 ustar ligges users # Generated by roxygen2: do not edit by hand
S3method(VarCorr,merModList)
S3method(draw,merMod)
S3method(fastdisp,merMod)
S3method(fastdisp,merModList)
S3method(fixef,merModList)
S3method(print,merModList)
S3method(print,summary.merModList)
S3method(ranef,merModList)
S3method(summary,merModList)
export(FEsim)
export(ICC)
export(REcorrExtract)
export(REextract)
export(REimpact)
export(REmargins)
export(REquantile)
export(REsdExtract)
export(REsim)
export(RMSE.merMod)
export(averageObs)
export(bglmerModList)
export(blmerModList)
export(draw)
export(expectedRank)
export(fastdisp)
export(findFormFuns)
export(glmerModList)
export(lmerModList)
export(modelFixedEff)
export(modelInfo)
export(modelRandEffStats)
export(plotFEsim)
export(plotREsim)
export(predictInterval)
export(randomObs)
export(shinyMer)
export(subBoot)
export(superFactor)
export(thetaExtract)
export(wiggle)
import(arm)
import(dplyr)
import(ggplot2)
import(lme4)
importFrom(abind,abind)
importFrom(arm,sim)
importFrom(blme,bglmer)
importFrom(blme,blmer)
importFrom(broom,tidy)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(methods,as)
importFrom(methods,is)
importFrom(mvtnorm,rmvnorm)
importFrom(shiny,actionButton)
importFrom(shiny,checkboxInput)
importFrom(shiny,downloadButton)
importFrom(shiny,downloadHandler)
importFrom(shiny,em)
importFrom(shiny,eventReactive)
importFrom(shiny,fluidPage)
importFrom(shiny,h3)
importFrom(shiny,isolate)
importFrom(shiny,mainPanel)
importFrom(shiny,numericInput)
importFrom(shiny,observeEvent)
importFrom(shiny,plotOutput)
importFrom(shiny,radioButtons)
importFrom(shiny,reactive)
importFrom(shiny,reactiveValues)
importFrom(shiny,renderPlot)
importFrom(shiny,renderPrint)
importFrom(shiny,runApp)
importFrom(shiny,shinyApp)
importFrom(shiny,sidebarLayout)
importFrom(shiny,sidebarPanel)
importFrom(shiny,strong)
importFrom(shiny,tabPanel)
importFrom(shiny,tabsetPanel)
importFrom(shiny,textOutput)
importFrom(shiny,titlePanel)
importFrom(stats,AIC)
importFrom(stats,as.formula)
importFrom(stats,delete.response)
importFrom(stats,df)
importFrom(stats,formula)
importFrom(stats,getCall)
importFrom(stats,logLik)
importFrom(stats,median)
importFrom(stats,model.frame)
importFrom(stats,model.matrix)
importFrom(stats,na.omit)
importFrom(stats,na.pass)
importFrom(stats,pnorm)
importFrom(stats,pt)
importFrom(stats,qnorm)
importFrom(stats,quantile)
importFrom(stats,reformulate)
importFrom(stats,reshape)
importFrom(stats,residuals)
importFrom(stats,rgamma)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,terms)
importFrom(stats,vcov)
importFrom(stats,weighted.mean)
importFrom(utils,packageVersion)
merTools/NEWS.md 0000644 0001762 0000144 00000017152 13466135366 013177 0 ustar ligges users # NEWS
## merTools 0.5.0
### New Features
- `subBoot` now works with `glmerMod` objects as well
- `REmargins` a new function that allows the user to marginalize the prediction over breaks in the
distribution of the random effects, see `?REmargins` and the new `REmargins` vignette (closes #73)
### Bug fixes
- Fixed an issue where known convergence errors were issuing warnings and causing the test suite
to not work
- Fixed an issue where models with a random slope, no intercept, and no fixed term were unable
to be predicted (#101)
- Fixed an issue with shinyMer not working with substantive fixed effects (#93)
## merTools 0.4.2
### New Features
- Parallel fitting of `merModLists` is now optionally supported, using the
`future_lapply` function from the `future.apply` package
- Reduced package installation surface by eliminating unnecessary packages
in the `Suggests` field
### Bug fixes
- Fixed a bug (#94) where `predictInterval()` would return a data.frame of the
wrong dimensions when predicting a single row of observations for a `glm`
- Fixed a bug (#96) related to `rstanarm` dependencies in the package vignette
- Switched from `dontrun` to `donttest` for long-running examples (CRAN compliance)
- Fixed and made more clear the generics applying to `merModList` objects (#92)
## merTools 0.4.1
### New Features
- Standard errors reported by `merModList` functions now apply the Rubin
correction for multiple imputation
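  The correction pools the `m` imputed fits with Rubin's rules: the pooled
  estimate is the mean of the per-fit estimates, and its total variance is
  `T = W + (1 + 1/m) * B`, where `W` is the mean within-imputation variance and
  `B` is the between-imputation variance of the estimates. A minimal sketch
  (`pool_rubin` is a hypothetical helper, not the package code):

  ```r
  pool_rubin <- function(est, se) {
    m <- length(est)
    W <- mean(se^2) # mean within-imputation variance
    B <- var(est)   # between-imputation variance of the estimates
    c(estimate = mean(est), std.error = sqrt(W + (1 + 1/m) * B))
  }
  ```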
### Bug fixes
- Contribution by Alex Whitworth (@alexWhitworth) adding error checking to plotting
functions
- The vignettes have been shortened and unit tests reorganized to facilitate
Travis-CI builds and reduce CRAN build burden
## merTools 0.4.0
### New Features
- Added vignette on using multilevel models with multiply imputed data
- Added `fixef` and `ranef` generics for `merModList` objects
- Added `fastdisp` generic for `merModList`
- Added `summary` generic for `merModList`
- Added `print` generic for `merModList`
- Documented all generics for `merModList` including examples and a new
imputation vignette
- Added `modelInfo` generic for `merMod` objects that provides simple summary
stats about a whole model
### Bug Fixes
- Fix bug that returned NaN for `std.error` of a multiply imputed `merModList`
when calling `modelRandEffStats`
- Fixed bug in `REimpact` where some column names in `newdata` would prevent the
prediction intervals from being computed correctly. Users will now be warned.
- Fixed bug in `wiggle` where the documentation incorrectly stated the arguments
to the function and did not describe the function correctly
## merTools 0.3.1
- Update the `readme.rmd` to package graphics with the R package, per CRAN
## merTools 0.3.0
- Improve handling of formulas. If the original `merMod` has functions specified
in the formula, the `draw` and `wiggle` functions will check for this and attempt
to respect these variable transformations. Where this is not possible a warning
will be issued. Most common transformations are respected as long as the
original variable is passed untransformed to the model.
- Change the calculations of the residual variance. Previously residual variance
was used to inflate both the variance around the fixed parameters and around the
predicted values themselves. This was incorrect and resulted in overly conservative
estimates. Now the residual variance is appropriately only used around the
final predictions
- Rebuilt the readme.md to include new information about new features
- New option for `predictInterval` that allows the user to return the full
interval, the fixed component, the random component, or the fixed and each random
component separately for each observation
- Fixed a bug with slope+intercept random terms that caused a miscalculation of
the random component
- Add comparison to `rstanarm` to the Vignette
- Make `expectedRank` output more `tidy`-like and allow the function to calculate
expected rank for all terms at once
- Note, this breaks the API by changing the names of the columns in the output
of this function
- Remove tests that test for timing to avoid issues with R-devel JIT compiler
- Remove `plyr` and replace with `dplyr`
- Fix issue #62 `varList` will now throw an error if `==` is used instead of `=`
- Fix issue #54 `predictInterval` did not include random effects in calculations
when `newdata` had more than 1000 rows and/or user specified `parallel=TRUE`.
Note: fix was to disable the `.paropts` option for `predictInterval` ... user
can still specify for *temporary* backward compatibility but this should be
either removed or fixed in the permanent solution.
- Fix issue #53 about problems with `predictInterval` when only specific levels
of a grouping factor are in `newdata` with the colon specification of
interactions
- Fix issue #52 ICC wrong calculations ... we just needed to square the standard
deviations that we pulled
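  For reference, the corrected quantity is the standard intraclass correlation
  for a random-intercept model, `var_group / (var_group + var_residual)`,
  computed from variances rather than standard deviations. A sketch, assuming a
  model with a single random-intercept term (`icc_sketch` is a hypothetical
  helper, not the package code):

  ```r
  icc_sketch <- function(fit) {
    vc <- as.data.frame(lme4::VarCorr(fit)) # the vcov column holds variances
    vc$vcov[1] / sum(vc$vcov)
  }
  ```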
## merTools 0.2.1
- Fix dependency on `lme4` to ensure compatibility with latest changes.
## merTools 0.2
### Bug fixes
- Coerce `dplyr` `tbl` and `tbl_df` objects to data.frames when they are passed
to `predictInterval` and issue a warning
- Try to coerce other data types passed to `newdata` in `predictInterval` before
failing if coercion is unsuccessful
- Numeric stabilization of unit tests by including seed values for random tests
- Fix handling of models with nested random effect terms (GitHub #47)
- Fix vignette images
### New Functionality
- Substantial performance enhancement for `predictInterval` which includes better
handling of large numbers of parameters and simulations, performance
tweaks for added speed (~10x), and parallel backend support (currently not optimized)
- Add support for `probit` models and limited support for other `glmm` link
functions, with a warning (we still do not know how to handle the sigma
parameter for these)
- Add ability for user-specified seed for reproducibility
- Add support for `blmer` objects from the `blme` package
- Add a `merModList` object for lists of `merMod` objects fitted to subsets
of a dataset, useful for imputation or for working with extremely large datasets
- Add a `print` method for `merModList` to mimic output of `summary.merMod`
- Add a `VarCorr` method for `merModList`
- Add new package data to demonstrate replication from selected published texts
on multilevel modeling using different software (1982 High School and Beyond Survey data)
### Other changes
- Changed the default `n.sims` for the `predictInterval` function from 100 to 1,000
to give better coverage and reflect performance increase
- Changed the default for `level` in `predictInterval` to be 0.8 instead of 0.95
to reflect that 0.95 prediction intervals are more conservative than most users
need
### Future changes
- For the next release (1.0) we are considering a permanent switch to
C++ RMVN sampler courtesy of Giri Gopalan's excellent FastGP
## merTools 0.1
- Initial release
### New Functions
- Provides `predictInterval` to allow prediction intervals from `glmer` and `lmer`
objects
- Provides `FEsim` and `REsim` to extract distributions of model parameters
- Provides `shinyMer` an interactive `shiny` application for exploring `lmer`
and `glmer` models
- Provides `expectedRank` function to interpret the ordering of effects
- Provides `REimpact` to simulate the impact of grouping factors on the outcome
- Provides `draw` function to allow user to explore a specific observation
- Provides `wiggle` function for user to build a simulated set of counterfactual
cases to explore
merTools/data/ 0000755 0001762 0000144 00000000000 13402510754 012770 5 ustar ligges users merTools/data/hsb.rda 0000644 0001762 0000144 00000106161 13402510754 014241 0 ustar ligges users [binary bzip2-compressed .rda payload omitted]