estimatr/0000755000176200001440000000000014760407272012113 5ustar liggesusersestimatr/tests/0000755000176200001440000000000014747205231013251 5ustar liggesusersestimatr/tests/sleep.R0000644000176200001440000000006014747205231014500 0ustar liggesuserslibrary(estimatr) lm_robust(extra~group, sleep) estimatr/tests/sleep.Rout.save0000644000176200001440000000172314747205231016174 0ustar liggesusers R version 3.5.0 (2018-04-23) -- "Joy in Playing" Copyright (C) 2018 The R Foundation for Statistical Computing Platform: x86_64-apple-darwin15.6.0 (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > library(estimatr) > lm_robust(extra~group, sleep) Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF (Intercept) 0.75 0.5657345 1.325710 0.20151554 -0.4385641 1.938564 18 group2 1.58 0.8490910 1.860813 0.07918671 -0.2038740 3.363874 18 > > proc.time() user system elapsed 0.782 0.085 0.851 estimatr/tests/testthat/0000755000176200001440000000000014760407272015115 5ustar liggesusersestimatr/tests/testthat/test-replicate-HT-middleton.R0000644000176200001440000001142314747205231022450 0ustar liggesuserscontext("Verification - HT matches Joel Middleton code") test_that("We match Joel's estimator", { # Code from Joel Middleton n <- 400 # simple random assignment d.00 <- diag(rep(1, n)) dmat <- rbind( cbind(d.00, -d.00) , cbind(-d.00, d.00) ) d.tilde <- diag(rep(2, 2 * n)) d.00 <- matrix(rep(-0.001251564, n ^ 2), ncol = n) + diag(rep(1 + 0.001251564, n)) pmat <- diag(rep(.5, 2 * n)) %*% (dmat + 1) %*% diag(rep(.5, 2 * n)) pmat.true <- pmat pmat[pmat == 0] <- 1 d.tilde.wt <- d.tilde / pmat # complete random assignment dmat.CR <- round(rbind( cbind(d.00, -d.00) , cbind(-d.00, d.00) ), 10) pmat.CR <- diag(rep(.5, 2 * n)) %*% (dmat.CR + 1) %*% diag(rep(.5, 2 * n)) pmat.CR.true <- pmat.CR pmat.CR[pmat.CR == 0] <- 1 d.tilde.CR <- dmat.CR + (dmat.CR == -1) + diag(rep(1, 2 * n)) d.tilde.wt.CR <- d.tilde.CR / pmat.CR # ourpmat <- declaration_to_condition_pr_mat(randomizr::declare_ra(N = 400, prob = 0.5, simple = F)) # pmat.CR.true[1:5, 1:5] # ourpmat[1:5, 1:5] # pmat.CR.true[401:410, 401:410] # ourpmat[401:410, 401:410] # pmat.CR.true[401:410, 1:10] # ourpmat[401:410,1:10] ## DGP with truly random 0.5 chance for each unit being treated dat <- data.frame( p = 0.5, z = rbinom(n, 1, 0.5), y0 = rnorm(n, sd = 3) ) # Constant treatment effects, SRS dat$y1 <- dat$y0 + 3 Y <- c(-dat$y0, dat$y1) R <- c(1 - dat$z, dat$z) pi.inv <- (c(rep(1 / (1 - dat$p)), rep(1 / dat$p))) ht_est <- sum(R * pi.inv * Y) / n y1.hat <- dat$y0 + ht_est y0.hat <- dat$y1 - ht_est # true_ses_ht <- sqrt(t(Y)%*%dmat%*%Y)/n Y.hat <- R * Y + (1 - R) * c(-y0.hat, y1.hat) se_ht <- sqrt(t(Y * R) %*% d.tilde.wt %*% (Y * R)) / n se_constant_ht <- sqrt(t(Y.hat) %*% dmat %*% Y.hat) / n dat$y <- ifelse(dat$z == 1, dat$y1, dat$y0) # Simple random assignment ht_decl_o <- horvitz_thompson( y ~ z, data = dat, ra_declaration = randomizr::declare_ra( N = nrow(dat), prob = dat$p[1], simple = TRUE ) ) # Second way to do same estimator, since it's SRS ht_prob_o <- horvitz_thompson( y ~ z, data = dat, condition_prs = p ) expect_equal( tidy(ht_decl_o)[, c("estimate", "std.error")], tidy(ht_prob_o)[, c("estimate", "std.error")] ) expect_equivalent( as.numeric(tidy(ht_decl_o)[, c("estimate", "std.error")]), c(ht_est, se_ht) ) # Now with constant effects assumption ht_const_o <- horvitz_thompson( y ~ z, data = dat, ra_declaration = randomizr::declare_ra( N = nrow(dat), prob = dat$p[1], simple = TRUE ), se_type = "constant" ) expect_equivalent( as.numeric(tidy(ht_const_o)[, c("estimate", "std.error")]), c(ht_est, se_constant_ht) ) ## Constant treatment effects, CRS dat$z <- sample(rep(0:1, each = n / 2)) dat$y <- ifelse(dat$z == 1, dat$y1, dat$y0) R <- c(1 - dat$z, dat$z) pi.inv <- (c(rep(1 / (1 - dat$p)), rep(1 / dat$p))) ht_comp_est <- sum(R * pi.inv * Y) / n Y.hat <- R * Y + (1 - R) * c(-y0.hat, y1.hat) se_comp_ht <- sqrt(t(Y * R) %*% d.tilde.wt.CR %*% (Y * R)) / n se_comp_constant_ht <- sqrt(t(Y.hat) %*% dmat.CR %*% Y.hat) / n # complete random assignment ht_comp_decl_o <- horvitz_thompson( y ~ z, data = dat, ra_declaration = randomizr::declare_ra( N = nrow(dat), prob = dat$p[1], simple = FALSE ), return_condition_pr_mat = T ) # ht_comp_decl_o$condition_pr_mat[1:5, 1:5] # pmat.CR.true[1:5, 1:5] # Don't match right now because pmats are diff # expect_equal( # tidy(ht_comp_decl_o)[, c("estimate", "std.error")], # c(ht_comp_est, se_comp_ht) # ) # Does match if I use JM's pmat ht_comp_decl_o <- horvitz_thompson( y ~ z, data = dat, condition_pr_mat = pmat.CR.true ) expect_equivalent( as.numeric(tidy(ht_comp_decl_o)[, c("estimate", "std.error")]), c(ht_comp_est, se_comp_ht) ) # Now with constant effects assumption # ht_comp_const_o <- horvitz_thompson( # y ~ z, # data = dat, # ra_declaration = randomizr::declare_ra( # N = nrow(dat), # prob = dat$p[1], # simple = FALSE # ), # se_type = "constant" # ) # # expect_equivalent( # tidy(ht_comp_const_o)[, c("estimate", "std.error")], # c(ht_comp_est, se_comp_constant_ht) # ) # ht_comp_const_o <- horvitz_thompson( # y ~ z, # data = dat, # condition_pr_mat = pmat.CR.true, # se_type = "constant" # ) # expect_equivalent( # tidy(ht_comp_const_o)[, c("estimate", "std.error")], # c(ht_comp_est, se_comp_constant_ht) # ) # Not matching so we error expect_error( horvitz_thompson( y ~ z, data = dat, condition_pr_mat = pmat.CR.true, se_type = "constant" ), "`se_type` = 'constant' only supported for simple" ) }) estimatr/tests/testthat/test-lh-robust.R0000644000176200001440000001232214760370352020132 0ustar liggesuserscontext("Estimator - lh_robust") set.seed(40) N <- 40 dat <- data.frame( Y = rnorm(N), Y2 = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), B = factor(rep(1:2, times = c(8, 12))), cl = sample(1:4, size = N, replace = T), w = runif(N) ) # se tests test_that("lh_robust works with all se types", { skip_if_not_installed("car") for (se_type in se_types) { lhro <- tidy( lh_robust( mpg ~ cyl + disp, data = mtcars, linear_hypothesis = "cyl + disp = 0", se_type = se_type ) ) lmro <- lm_robust(mpg ~ cyl + disp, data = mtcars, se_type = se_type) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "cyl + disp = 0") expect_equal(lhro$std.error[lhro$term == "cyl + disp = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) } }) test_that("lh_robust with clusters works for all se_types (except CR2)", { skip_if_not_installed("car") for (se_type in cr_se_types_lh) { lhro <- tidy( lh_robust( Y ~ Z * X, data = dat, clusters = cl, linear_hypothesis = "Z + Z:X = 0", se_type = se_type ) ) lmro <- lm_robust(Y ~ Z * X, data = dat, se_type = se_type, clusters = cl) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "Z + Z:X = 0") expect_equal(lhro$std.error[lhro$term == "Z + Z:X = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) } }) test_that("lh_robust matches lm_robust with fixed effects", { skip_if_not_installed("car") lhro <- lh_robust( Y ~ Z * X, data = dat, fixed_effects = ~ B, linear_hypothesis = c("Z + Z:X = 0") ) lmro <- lm_robust(Y ~ Z * X, data = dat, fixed_effects = ~ B) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "Z + Z:X = 0") tidy_lhro <- tidy(lhro) expect_equal(tidy_lhro$std.error[tidy_lhro$term == "Z + Z:X = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) }) test_that("lh_robust matches lm_robust with weights", { skip_if_not_installed("car") lhro <- lh_robust( Y ~ Z * X, data = dat, weights = w, linear_hypothesis = c("Z + Z:X = 0") ) tidy_lhro <- tidy(lhro) lmro <- lm_robust(Y ~ Z * X, data = dat, weights = w) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "Z + Z:X = 0") expect_equal(tidy_lhro$std.error[tidy_lhro$term == "Z + Z:X = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) }) test_that("lh_robust matches lm_robust with subsetted data.frame", { skip_if_not_installed("car") lhro <- lh_robust(Y ~ Z * X, data = dat, subset = B == 1, linear_hypothesis = c("Z + Z:X = 0")) tidy_lhro <- tidy(lhro) lmro <- lm_robust(Y ~ Z * X, data = dat, subset = B == 1) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "Z + Z:X = 0") expect_equal(tidy_lhro$std.error[tidy_lhro$term == "Z + Z:X = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) }) test_that("lh_robust matches lm_robust with subsetted data.frame", { skip_if_not_installed("car") lhro <- lh_robust(Y ~ Z * X, data = dat, subset = B == 1, linear_hypothesis = c("Z + Z:X = 0")) tidy_lhro <- tidy(lhro) lmro <- lm_robust(Y ~ Z * X, data = dat, subset = B == 1) linHyp <- car::linearHypothesis(lmro, hypothesis.matrix = "Z + Z:X = 0") expect_equal(tidy_lhro$std.error[tidy_lhro$term == "Z + Z:X = 0"], sqrt(as.numeric(attr(linHyp , "vcov")))) }) # Consistency of cluster results on single coefficient test_that("lh single coefficient consistency", { skip_if_not_installed("car") for (se_type in cr_se_types_lh) { lhro <- tidy( lh_robust( Y ~ Z * X, data = dat, clusters = cl, linear_hypothesis = "X = 0", se_type = se_type ) ) expect_equal(lhro[3,5], lhro[5,5]) } }) # lh test test_that("returns error when no linear hypothesis is specified", { expect_error(lh_robust(Y ~ Z * X, data = dat)) }) # lh test test_that("returns error when joint hypothesis is specified", { expect_error(lh_robust(Y ~ Z * X, linear_hypothesis = c("Z", "X"), data = dat)) }) # lh test test_that("returns error when CR2 is specified", { expect_error(lh_robust(Y ~ Z * X, se_type = "CR2", linear_hypothesis = c("Z"), data = dat)) }) # lh test test_that("returns error when default CR2 are called", { expect_error(lh_robust(Y ~ Z * X, linear_hypothesis = c("Z"), data = dat, clusters = cl)) }) test_that("issue #405 fixed", { library(estimatr) nSize = 12 dat = data.frame( x = rnorm(nSize), e = rnorm(nSize), # Irrelevant clusters for errors eg = sample(2, nSize, replace=T) ) dat$z = dat$x + dat$e # already worked fit <- lh_robust(z~x, data=dat, se_type='HC2', linear_hypothesis='x=0') expect_equivalent(fit$lm_robust$conf.low[2], fit$lh$conf.low[1]) # was broken fit_2 <- lh_robust(z~x, data=dat, clusters=eg, se_type='stata', linear_hypothesis='x=0') expect_equivalent(fit_2$lm_robust$conf.low[2], fit_2$lh$conf.low[1]) }) estimatr/tests/testthat/stata-ests.txt0000644000176200001440000000060614747205231017744 0ustar liggesusers classical .0001024 2.669698 30 45.459797 HC1 .00018388 4.3123302 30 25.315348 HC2 .00021652 4.8093019 30 21.499286 HC3 .00027562 5.8084222 30 16.889294 stata_cl .00034882 14.922475 2 13.345142 classicalw .00009984 2.4918295 30 48.196413 HC1w .00017632 4.0902569 30 27.29069 HC2w .00020717 4.5180983 30 23.226908 HC3w .00026334 5.4069309 30 18.2733 stata_clw .00031337 13.155473 2 15.355597estimatr/tests/testthat/helper-se-types.R0000644000176200001440000000024114760370352020261 0ustar liggesusers# se_types for various files se_types <- c("classical", "HC0", "HC1", "HC2", "HC3") cr_se_types <- c("CR0", "stata", "CR2") cr_se_types_lh <- c("CR0", "stata") estimatr/tests/testthat/test-lm-cluster.R0000644000176200001440000002174514760370352020313 0ustar liggesuserscontext("Estimator - lm_robust, clustered") test_that("lm cluster se", { N <- 100 set.seed(5) dat <- data.frame( Y = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), J = sample(1:10, N, replace = T), W = runif(N) ) ## Test functionality lm_robust(Y ~ Z, clusters = J, data = dat) lm_robust(Y ~ Z + X, clusters = J, data = dat) lm_robust( Y ~ Z + X, clusters = J, data = dat ) lm_robust( Y ~ Z + X, clusters = J, se_type = "stata", data = dat, ci = T ) expect_equivalent( as.matrix( tidy( lm_robust( Y ~ X + Z, clusters = J, ci = FALSE, data = dat ) )[, c("p.value", "conf.low", "conf.high")] ), matrix(NA, nrow = 3, ncol = 3) ) ## Test equality lm_interact <- lm_robust( Y ~ Z * X, clusters = J, data = dat ) lm_interact_stata <- lm_robust( Y ~ Z * X, clusters = J, se_type = "stata", data = dat ) lm_interact_simple <- lm(Y ~ Z * X, data = dat) bm_interact <- BMlmSE( lm_interact_simple, clustervar = as.factor(dat$J), IK = FALSE ) bm_interact bm_interact_interval <- coef(lm_interact_simple)["Z:X"] + qt(0.975, df = bm_interact$dof["Z:X"]) * bm_interact$se["Z:X"] * c(-1, 1) bm_interact_stata_interval <- coef(lm_interact_simple)["Z:X"] + qt(0.975, df = length(unique(dat$J)) - 1) * bm_interact$se.Stata["Z:X"] * c(-1, 1) expect_equivalent( as.numeric(tidy(lm_interact)[4, c("std.error", "conf.low", "conf.high")]), c(bm_interact$se["Z:X"], bm_interact_interval) ) expect_equivalent( as.numeric(tidy(lm_interact_stata)[4, c("std.error", "conf.low", "conf.high")]), c(bm_interact$se.Stata["Z:X"], bm_interact_stata_interval) ) lm_full <- lm_robust( Y ~ Z + X, clusters = J, data = dat ) lm_full_simple <- lm(Y ~ Z + X, data = dat) bm_full <- BMlmSE( lm_full_simple, clustervar = as.factor(dat$J), IK = FALSE ) bm_full_moe <- qt(0.975, df = bm_full$dof) * bm_full$se bm_full_lower <- coef(lm_full_simple) - bm_full_moe bm_full_upper <- coef(lm_full_simple) + bm_full_moe expect_equivalent( as.matrix(tidy(lm_full)[, c("std.error", "conf.low", "conf.high")]), cbind(bm_full$se, bm_full_lower, bm_full_upper) ) ## Works with rank deficient case dat$X2 <- dat$X for (se_type in cr_se_types) { lmr_rd <- lm_robust(Y ~ X + Z + X2, data = dat, clusters = J, se_type = se_type) lmr_full <- lm_robust(Y ~ X + Z, data = dat, clusters = J, se_type = se_type) expect_equal( tidy(lmr_rd)[1:3, ], tidy(lmr_full) ) } ## Test error handling expect_error( lm_robust( Y ~ Z, clusters = J, se_type = "HC2", data = dat ), "CR2" ) expect_error( lm_robust( Y ~ Z, se_type = "CR2", data = dat ), "CR2" ) # To easily do with and without weights test_lm_cluster_variance <- function(w) { # Test other estimators lm_cr0 <- lm_robust(Y ~ Z + X, data = dat, weights = w, clusters = J, se_type = "CR0") lm_stata <- lm_robust(Y ~ Z + X, data = dat, weights = w, clusters = J, se_type = "stata") lm_cr2 <- lm_robust(Y ~ Z + X, data = dat, weights = w, clusters = J, se_type = "CR2") # Stata is the same as CR0 but with finite sample expect_equivalent( lm_cr0$std.error ^ 2, lm_stata$std.error ^ 2 * (N - length(coef(lm_stata))) * (length(unique(dat$J)) - 1) / ((N - 1) * length(unique(dat$J))) ) expect_false(all(lm_cr0$std.error == lm_stata$std.error)) expect_false(all(lm_cr0$std.error == lm_cr2$std.error)) expect_false(all(lm_stata$std.error == lm_cr2$std.error)) expect_false(all(lm_stata$df == lm_cr2$df)) expect_equivalent( lm_cr0$df, lm_stata$df ) } # No weights first test_lm_cluster_variance(NULL) test_lm_cluster_variance(dat$W) }) test_that("Clustered SEs match clubSandwich", { skip_if_not_installed("clubSandwich") skip_on_cran() lm_o <- lm(mpg ~ hp, data = mtcars) lm_ow <- lm(mpg ~ hp, data = mtcars, weights = wt) for (se_type in cr_se_types) { lm_r <- lm_robust(mpg ~ hp, data = mtcars, clusters = cyl, se_type = se_type) lm_rw <- lm_robust(mpg ~ hp, data = mtcars, weights = wt, clusters = cyl, se_type = se_type) expect_equivalent( vcov(lm_r), as.matrix(clubSandwich::vcovCR( lm_o, cluster = mtcars$cyl, type = ifelse(se_type == "stata", "CR1S", se_type) )) ) expect_equivalent( vcov(lm_rw), as.matrix(clubSandwich::vcovCR( lm_ow, cluster = mtcars$cyl, type = ifelse(se_type == "stata", "CR1S", se_type) )) ) } }) test_that("multiple outcomes", { skip_if_not_installed("clubSandwich") skip_on_cran() for (se_type in cr_se_types) { lmo <- lm(cbind(mpg, hp) ~ wt, data = mtcars) lmow <- lm(cbind(mpg, hp) ~ wt, weights = qsec, data = mtcars) lmro <- lm_robust(cbind(mpg, hp) ~ wt, data = mtcars, clusters = cyl, se_type = se_type) lmrow <- lm_robust(cbind(mpg, hp) ~ wt, weights = qsec, data = mtcars, clusters = cyl, se_type = se_type) if (se_type == "stata") { # Have to manually do correction for CR1stata # because clubSandwich uses n*ny and r*ny in place of n and r # in stata correction J <- length(unique(mtcars$cyl)) n <- nrow(mtcars) r <- 2 cs_vcov <- as.matrix(clubSandwich::vcovCR(lmo, cluster = mtcars$cyl, type = "CR0")) * ((J * (n - 1)) / ((J - 1) * (n - r))) cs_vcov_w <- as.matrix(clubSandwich::vcovCR(lmow, cluster = mtcars$cyl, type = "CR0")) * ((J * (n - 1)) / ((J - 1) * (n - r))) } else { cs_vcov <- as.matrix(clubSandwich::vcovCR(lmo, cluster = mtcars$cyl, type = se_type)) cs_vcov_w <- as.matrix(clubSandwich::vcovCR(lmow, cluster = mtcars$cyl, type = se_type)) } expect_equivalent( vcov(lmro), cs_vcov ) expect_equivalent( vcov(lmrow), cs_vcov_w ) } # Test same as individual models lmro <- lm_robust(cbind(mpg, hp) ~ wt, data = mtcars, clusters = cyl) lmmpg <- lm_robust(mpg ~ wt, data = mtcars, clusters = cyl) lmhp <- lm_robust(hp ~ wt, data = mtcars, clusters = cyl) expect_equivalent( tidy(lmro)$df[1:2], lmmpg$df ) expect_equivalent( tidy(lmro)$df[3:4], lmhp$df ) expect_equivalent(lmro$r.squared[1], lmmpg$r.squared) expect_equivalent(lmro$r.squared[2], lmhp$r.squared) }) test_that("lm cluster se with missingness", { dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100), J = sample(1:10, 100, replace = T), W = runif(100) ) dat$X[23] <- NA dat$J[63] <- NA expect_warning( estimatr_cluster_out <- lm_robust( Y ~ Z + X, clusters = J, data = dat ), "missingness in the cluster" ) estimatr_cluster_sub <- lm_robust( Y ~ Z + X, clusters = J, data = dat[-c(23, 63), ] ) estimatr_cluster_out[["call"]] <- NULL estimatr_cluster_sub[["call"]] <- NULL expect_equal( estimatr_cluster_out, estimatr_cluster_sub ) }) test_that("lm works with quoted or unquoted vars and withor without factor clusters", { dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100), J = sample(1:10, 100, replace = T), W = runif(100) ) lmr <- lm_robust(Y~Z, data = dat, weights = W) lmrq <- lm_robust(Y~Z, data = dat, weights = W) expect_equal( rmcall(lmr), rmcall(lmrq) ) # works with char dat$J <- as.character(dat$J) lmrc <- lm_robust(Y~Z, data = dat, clusters = J) lmrcq <- lm_robust(Y~Z, data = dat, clusters = J) expect_equal( rmcall(lmrc), rmcall(lmrcq) ) # works with num dat$J_num <- as.numeric(dat$J) lmrc_qnum <- lm_robust(Y~Z, data = dat, clusters = J_num) expect_equal( rmcall(lmrc), rmcall(lmrc_qnum) ) # works with factor dat$J_fac <- as.factor(dat$J) expect_equivalent( rmcall(lm_robust(Y~Z, data = dat, clusters = J_fac)), rmcall(lm_robust(Y~Z, data = dat, clusters = J)) ) # works with being cast in the call lm_robust(Y~Z, data = dat, clusters = as.factor(J)) }) test_that("Clustered SEs work with clusters of size 1", { dat <- data.frame( Y = rnorm(100), X = rnorm(100), J = 1:100 ) lm_cr2 <- lm_robust(Y ~ X, data = dat, clusters = J) lm_stata <- lm_robust(Y ~ X, data = dat, clusters = J, se_type = "stata") lmo <- lm(Y ~ X, data = dat) bmo <- BMlmSE( lmo, clustervar = as.factor(dat$J), IK = FALSE ) expect_equivalent( as.matrix(tidy(lm_cr2)[, c("estimate", "std.error", "df")]), cbind(coef(lmo), bmo$se, bmo$dof) ) expect_equivalent( as.matrix(tidy(lm_stata)[, c("estimate", "std.error")]), cbind(coef(lmo), bmo$se.Stata) ) }) estimatr/tests/testthat/run-stata-iv-models.do0000644000176200001440000000544414747205231021257 0ustar liggesusers// This file fits many models in stata and outputs the estimates for comparison with estimatr clear all import delimited mtcars.csv gen w = drat / 5 file open outf using stata-iv-ests.txt, write r ivregress 2sls mpg (hp am = wt gear), small mat V=e(V) file write outf _n "classical" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) ivregress 2sls mpg (hp am = wt gear), small rob mat V=e(V) file write outf _n "rob" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) ivregress 2sls mpg (hp am = wt gear), small vce(cluster cyl) mat V=e(V) file write outf _n "cl" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) ivregress 2sls mpg (hp am = wt gear) [aweight = w], small mat V=e(V) file write outf _n "classical_w" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) ivregress 2sls mpg (hp am = wt gear) [aweight = w], small rob mat V=e(V) file write outf _n "rob_w" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) ivregress 2sls mpg (hp am = wt gear) [aweight = w], small vce(cluster cyl) mat V=e(V) file write outf _n "cl_w" _tab (V[1,1]) _tab (V[2,2]) _tab (V[3,3]) _tab (e(F)) _tab (e(r2)) _tab (e(r2_a)) _tab (e(rmse)) file close outf cap file close outfdiag file open outfdiag using stata-iv-diagnostics.txt, write r #delimit ; local formulae = `" "(hp = wt)" "(hp am = wt gear)" "gear (hp = wt)" "gear (hp = wt am)" "' ; local options = `" "small" "rob" "cluster(cyl)" "small noconstant" "rob noconstant" "cluster(cyl) noconstant" "' ; local weights = `" "" "[aweight = w]" "' ; #delimit cr foreach f in `formulae' { display "`f'" foreach opt in `options' { foreach w in `weights' { ivregress 2sls mpg `f' `w', `opt' estat firststage, all mat singleresults=r(singleresults) local rows = rowsof(singleresults) forvalues i=1/`rows' { cap file write outfdiag "`f';`w';`opt';" "weak`i'" ";" (singleresults[`i',5]) ";" (singleresults[`i',6]) ";" (singleresults[`i',4]) ";" (singleresults[`i',7]) _n } estat endogenous, forceweights if strpos("`opt'", "rob") > 0 | strpos("`opt'", "cluster") > 0 { file write outfdiag "`f';`w';`opt';" "endog" ";" (r(regFdf_n)) ";" (r(regFdf_d)) ";" (r(regF)) ";" (r(p_regF)) _n cap estat overid, forceweights file write outfdiag "`f';`w';`opt';" "overid" ";" (r(df)) ";.;" (r(score)) ";" (r(p_score)) _n } else { file write outfdiag "`f';`w';`opt';" "endog" ";" (r(df)) ";" (r(wudf_r)) ";" (r(wu)) ";" (r(p_wu)) _n cap estat overid, forceweights file write outfdiag "`f';`w';`opt';" "overid" ";" (r(df)) ";.;" (r(sargan)) ";" (r(p_sargan)) _n } } } } file close outfdiag estimatr/tests/testthat/test-starprep.R0000644000176200001440000001620714747205231020057 0ustar liggesuserscontext("Helper - commarobust + starprep") test_that("starprep works", { skip_if_not_installed("stargazer") fit_1 <- lm(mpg ~ hp, data = mtcars) fit_2 <- lm(mpg ~ hp, data = mtcars) fit_1_r <- lm_robust(mpg ~ hp, data = mtcars) fit_2_r <- lm_robust(mpg ~ hp, data = mtcars) expect_output( stargazer::stargazer(fit_1, fit_2, type = "text", se = starprep(fit_1_r, fit_2), p = starprep(fit_1_r, fit_2, stat = "p.value")), "\\(0\\.015\\)\\s+\\(0\\.015\\)" ) expect_output( stargazer::stargazer(fit_1, fit_2, type = "text", ci.custom = starprep(fit_1_r, fit_2, stat = "ci")), "\\(25\\.620\\, 34\\.578\\)\\s+\\(25\\.620\\, 34\\.578\\)" ) }) set.seed(43) N <- 480 dat <- data.frame( Z = rbinom(N, 1, .5), X = rnorm(N), B = factor(rep(1:2, times = c(8, 12))), cl = sample(1:(N/4), size = N, replace = T), w = runif(N) ) dat$Y <- dat$Z + dat$X + rnorm(N) dat$Y2 = dat$Z + rnorm(N) dat$Xdup <- dat$X dat$Bdup <- dat$B # In outcome datmiss <- dat datmiss$Y[5] <- NA datmiss$B[1] <- NA test_that("commarobust works with regular lm", { # expect cluster length error lo <- lm(Y ~ Z + X + factor(B), data = datmiss) expect_error( clo <- commarobust(lo, clusters = datmiss$cl, se_type = "CR0"), "`clusters` must be the same length as the model data." ) ## Test unclustered SEs for (se_type in se_types) { ro <- lm_robust(Y ~ Z + X + factor(B), data = datmiss, se_type = se_type) lo <- lm(Y ~ Z + X + factor(B), data = datmiss) clo <- commarobust(lo, se_type = se_type) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } ## Test clustered SEs for (se_type in cr_se_types) { ro <- lm_robust(Y ~ Z + X + factor(B), clusters = cl, data = datmiss, se_type = se_type) lo <- lm(Y ~ Z + X + factor(B), data = datmiss) clo <- commarobust(lo, clusters = datmiss$cl[complete.cases(datmiss)], se_type = se_type) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } # Works with character, factor, and numeric clusters datmiss$cl_char <- sample(letters, size = nrow(datmiss), replace = TRUE) datmiss$cl_num <- sample(rnorm(3), size = nrow(datmiss), replace = TRUE) datmiss$cl_fac <- as.factor(datmiss$cl_char) ro <- lm_robust(Y ~ Z + X + factor(B), clusters = cl_char, data = datmiss, se_type = "CR2") lo <- lm(Y ~ Z + X + factor(B), data = datmiss) clo <- commarobust(lo, clusters = datmiss$cl_char[complete.cases(datmiss)], se_type = "CR2") expect_equal( tidy(ro), tidy(clo) ) ro <- lm_robust(Y ~ Z + X + factor(B), clusters = cl_num, data = datmiss, se_type = "CR2") lo <- lm(Y ~ Z + X + factor(B), data = datmiss) clo <- commarobust(lo, clusters = datmiss$cl_num[complete.cases(datmiss)], se_type = "CR2") expect_equal( tidy(ro), tidy(clo) ) ro <- lm_robust(Y ~ Z + X + factor(B), clusters = cl_fac, data = datmiss, se_type = "CR2") lo <- lm(Y ~ Z + X + factor(B), data = datmiss) clo <- commarobust(lo, clusters = datmiss$cl_fac[complete.cases(datmiss)], se_type = "CR2") expect_equal( tidy(ro), tidy(clo) ) }) test_that("commarobust works with weighted lm", { # Test unclustered SEs for (se_type in se_types) { ro <- lm_robust(Y ~ Z + X + factor(B), data = datmiss, weights = w, se_type = se_type) lo <- lm(Y ~ Z + X + factor(B), data = datmiss, weights = w) clo <- commarobust(lo, se_type = se_type) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } ## Test clustered SEs for (se_type in cr_se_types) { ro <- lm_robust(Y ~ Z + X + factor(B), clusters = cl, data = datmiss, weights = w, se_type = se_type) lo <- lm(Y ~ Z + X + factor(B), data = datmiss, weights = w) clo <- commarobust(lo, clusters = datmiss$cl[complete.cases(datmiss)], se_type = se_type) max(abs(clo$vcov - ro$vcov)) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } }) test_that("commarobust works with dependency, weighted lm", { check_obj <- function(ro, clo, x) { if (x != "call") { print(x) expect_equal(ro[[x]], clo[[x]]) } } for (se_type in se_types) { ro <- lm_robust(Y ~ Z + X + Xdup + factor(B), data = datmiss, weights = w, se_type = se_type) lo <- lm(Y ~ Z + X + Xdup + factor(B), data = datmiss, weights = w) clo <- commarobust(lo, se_type = se_type) capture_output(sapply(names(ro), check_obj, ro = ro, clo = clo)) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } for (se_type in cr_se_types) { ro <- lm_robust(Y ~ Z + X + Xdup + factor(B), clusters = cl, data = datmiss, weights = w, se_type = se_type) lo <- lm(Y ~ Z + X + Xdup + factor(B), data = datmiss, weights = w) clo <- commarobust(lo, clusters = datmiss$cl[complete.cases(datmiss)], se_type = se_type) capture_output(sapply(names(ro), check_obj, ro = ro, clo = clo)) expect_equal( tidy(ro), tidy(clo) ) expect_equal( ro$fstatistic, clo$fstatistic ) expect_equal( ro[c("r.squared", "adj.r.squared")], clo[c("r.squared", "adj.r.squared")] ) } }) test_that("Only works with lm, not mlm or glm", { expect_error( commarobust(glm(vs ~ hp, mtcars, family = binomial), "HC2"), "`model` must be an lm object" ) expect_error( commarobust(lm(cbind(vs, mpg) ~ hp, data = mtcars), "HC2"), "`model` must be an lm object" ) }) test_that("starprep takes lists of fits", { a <- lm(mpg ~ cyl, mtcars) b <- lm(mpg ~ cyl + disp, mtcars) c <- lm(mpg ~ cyl + disp + hp, mtcars) abc <- list(a, b, c) ab <- list(a, b) expect_equal(starprep(a,b,c), starprep(abc)) expect_equal(starprep(a,b), starprep(ab)) expect_is(starprep(a), "list") a <- lm_robust(mpg ~ cyl, mtcars) b <- lm_robust(mpg ~ cyl + disp, mtcars) c <- lm_robust(mpg ~ cyl + disp + hp, mtcars) abc <- list(a, b, c) ab <- list(a, b) expect_equal(starprep(a,b,c), starprep(abc)) expect_equal(starprep(a,b), starprep(ab)) expect_is(starprep(a), "list") # Also check errors expect_error( starprep(ab, c), "`...` must be one list of model fits or several comma separated model fits" ) expect_error( starprep(list(a, "should_fail")), "must contain only `lm` or `lm_robust` objects." ) expect_error( starprep(a, "should_fail"), "must contain only `lm` or `lm_robust` objects." ) }) estimatr/tests/testthat/test-lm-robust.R0000644000176200001440000004320614747205231020142 0ustar liggesuserscontext("Estimator - lm_robust, non-clustered") test_that("lm robust se", { set.seed(42) N <- 40 dat <- data.frame(Y = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), W = runif(N)) tidy(lm_robust(Y ~ Z, data = dat)) lm_robust(Y ~ Z, se_type = "none", data = dat) lm_robust(Y ~ Z + X, data = dat) lm_robust(Y ~ Z * X, data = dat) expect_equivalent( coef(lm_robust(Y ~ 1, data = dat))[1], mean(dat$Y) ) expect_error( lm_robust(Y ~ Z + X, data = dat, se_type = "not_a_real_one"), "`se_type` must be either 'HC0', 'HC1', 'stata', 'HC2', 'HC3'," ) # Works with subset lmsub <- lm_robust(Y ~ Z + X, data = dat, subset = W > 0.5) lmbool <- lm_robust(Y ~ Z + X, data = dat[dat$W > 0.5, ]) expect_equal( rmcall(lmsub), rmcall(lmbool) ) lm_robust(Y ~ Z, weights = W, data = dat) # matches. # commarobust::commarobust(lm(Y ~ Z, weights = W, data = dat)) # To easily do with and without weights test_lm_robust_variance <- function(w) { # Test other estimators lm_hc0 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC0") lm_hc1 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC1") lm_hc2 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC2") lm_hc3 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC3") lm_stata <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "stata") # Stata is the same as HC1 expect_equal( rmcall(lm_hc1), rmcall(lm_stata) ) expect_false(all(lm_hc0$std.error == lm_hc1$std.error)) expect_false(all(lm_hc0$std.error == lm_hc2$std.error)) expect_false(all(lm_hc0$std.error == lm_hc3$std.error)) expect_false(all(lm_hc1$std.error == lm_hc2$std.error)) expect_false(all(lm_hc1$std.error == lm_hc3$std.error)) expect_false(all(lm_hc2$std.error == lm_hc3$std.error)) expect_equivalent(lm_hc0$df,lm_hc1$df) expect_equivalent(lm_hc0$df,lm_hc2$df) expect_equivalent(lm_hc0$df,lm_hc3$df) expect_equivalent(lm_hc0$df,lm_stata$df) expect_equivalent( lm_hc0$std.error ^ 2, lm_hc1$std.error ^ 2 * ((N - length(coef(lm_hc1))) / N) ) } # No weights first test_lm_robust_variance(NULL) test_lm_robust_variance(dat$W) # works with formula in a variable (always worked) form <- Y ~ Z lm_form <- lm_robust(form, data = dat) # works with formula inside a function (didn't work before 0.4.0) f <- function(data) { form2 <- Y ~ Z return(lm_robust(form2, data = data)) } lm_f_form <- f(dat) expect_equal( rmcall(lm_form), rmcall(lm_f_form) ) # Drops unused levels appropriately dat$Z <- as.factor(sample(LETTERS[1:3], nrow(dat), replace = TRUE)) lmall <- lm_robust(Y ~ Z, data = dat) lm1 <- lm_robust(Y ~ Z, data = dat[dat$Z %in% c("A", "B"), ]) lm2 <- lm_robust(Y ~ Z, data = dat, subset = Z %in% c("A", "B")) expect_equal( rmcall(lm1), rmcall(lm2) ) # pvals and cis diff because dof are diff expect_equal( tidy(lmall)[1:2, 1:3], tidy(lm1)[, 1:3] ) # rlang works my_w_vec <- rlang::sym("W") expect_equal( tidy(lm_robust(Y ~ Z + X, data = dat, weights = !!my_w_vec, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_dat <- rlang::sym("dat") expect_equal( tidy(lm_robust(Y ~ Z + X, data = !!my_dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_y <- rlang::sym("Y") expect_equal( tidy(lm_robust(!!my_y ~ Z + X, data = dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_formula <- Y ~ Z + X expect_equal( tidy(lm_robust(!!my_formula, data = dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) }) test_that("lm robust F-tests are correct", { skip_if_not_installed("car") skip_if_not_installed("clubSandwich") co <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = "classical") caro <- car::linearHypothesis(co, c("hp = 0", "am = 0"), test = "F") carolm <- car::linearHypothesis(lm(mpg ~ hp + am, data = mtcars), c("hp = 0", "am = 0"), test = "F") expect_equivalent( co$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) expect_equivalent( co$fstatistic, c(carolm$F[2], carolm$Df[2], carolm$Res.Df[2]) ) cow <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = "classical") caro <- car::linearHypothesis(cow, c("hp = 0", "am = 0"), test = "F") expect_equivalent( cow$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) for (se_type in setdiff(se_types, "classical")) { lmr <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = se_type) caro <- car::linearHypothesis(lmr, c("hp = 0", "am = 0"), test = "F") carolm <- car::linearHypothesis(lm(mpg ~ hp + am, data = mtcars), c("hp = 0", "am = 0"), test = "F", white.adjust = tolower(se_type)) expect_equivalent( lmr$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) expect_equivalent( lmr$fstatistic, c(carolm$F[2], carolm$Df[2], carolm$Res.Df[2]) ) lmrw <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = se_type) carow <- car::linearHypothesis(lmrw, c("hp = 0", "am = 0"), test = "F") carolmw <- car::linearHypothesis(lm(mpg ~ hp + am, weights = wt, data = mtcars), c("hp = 0", "am = 0"), test = "F", white.adjust = tolower(se_type)) expect_equivalent( lmrw$fstatistic, c(carow$F[2], carow$Df[2], carow$Res.Df[2]) ) expect_equivalent( lmrw$fstatistic, c(carolmw$F[2], carolmw$Df[2], carolmw$Res.Df[2]) ) } for (se_type in cr_se_types) { lmcr <- lm_robust(mpg ~ hp + am, data = mtcars, clusters = carb, se_type = se_type) caro <- clubSandwich::Wald_test(lm(mpg ~ hp + am, data = mtcars), cluster = mtcars$carb, constraints = clubSandwich::constrain_zero(2:3), vcov = ifelse(se_type == "stata", "CR1S", se_type), test = "Naive-F") lmcrw <- lm_robust(mpg ~ hp + am, data = mtcars, clusters = carb, weights = wt, se_type = se_type) carow <- clubSandwich::Wald_test(lm(mpg ~ hp + am, weights = wt, data = mtcars), cluster = mtcars$carb, constraints = clubSandwich::constrain_zero(2:3), vcov = ifelse(se_type == "stata", "CR1S", se_type), test = "Naive-F") expect_equivalent( lmcr$fstatistic[c(1, 3)], c(caro$Fstat, caro$df_denom) ) expect_equivalent( lmcrw$fstatistic[c(1, 3)], c(carow$Fstat, carow$df_denom) ) } }) test_that("lm robust mlm gets right fstats", { for (se_type in se_types) { lmcyl <- lm_robust(cyl ~ hp + am, data = mtcars, se_type = se_type) lmmpg <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = se_type) lm2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, se_type = se_type) expect_equivalent( lm2$fstatistic[1:2], c(lmcyl$fstatistic[1], lmmpg$fstatistic[1]) ) lmwcyl <- lm_robust(cyl ~ hp + am, data = mtcars, weights = wt, se_type = se_type) lmwmpg <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = se_type) lmw2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, weights = wt, se_type = se_type) expect_equivalent( lmw2$fstatistic[1:2], c(lmwcyl$fstatistic[1], lmwmpg$fstatistic[1]) ) } for (se_type in cr_se_types) { lmccyl <- lm_robust(cyl ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) lmcmpg <- lm_robust(mpg ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) lmc2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) expect_equivalent( lmc2$fstatistic[1:2], c(lmccyl$fstatistic[1], lmcmpg$fstatistic[1]) ) lmcwcyl <- lm_robust(cyl ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) lmcwmpg <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) lmcw2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) expect_equivalent( lmcw2$fstatistic[1:2], c(lmcwcyl$fstatistic[1], lmcwmpg$fstatistic[1]) ) } }) test_that("lm robust works with missingness", { skip_if_not_installed("sandwich") dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100), W = runif(100) ) dat$X[23] <- NA expect_equal( rmcall(lm_robust(Y ~ Z + X, data = dat)), rmcall(lm_robust(Y ~ Z + X, data = dat[-23, ])) ) lm_robust(Y ~ Z + X, data = dat) lm_robust(Y ~ Z * X, data = dat) ## Outcome missingness dat$Y[35] <- NA estimatr_missout_out <- lm_robust(Y ~ Z + X, data = dat) lm_missout_out <- lm(Y ~ Z + X, data = dat) lm_missout_hc2 <- cbind( coef(lm_missout_out), sqrt(diag(sandwich::vcovHC(lm_missout_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_missout_out)[, c("estimate", "std.error")]), lm_missout_hc2 ) # nested DFs dat$Y2 <- matrix(dat$Y) expect_equivalent( tidy(lm_robust(Y ~ Z + X, data = dat))[, 1:6], tidy(lm_robust(Y2 ~ Z + X, data = dat))[, 1:6] ) }) test_that("lm_robust doesn't include aux variables when . is used", { n <- 10 dat <- data.frame(y = rnorm(n), x = rnorm(n)) # not in data.frame clust <- rep(1:5, each = 2) expect_equal( rmcall(lm_robust(y ~ ., clusters = clust, data = dat)), rmcall(lm_robust(y ~ x, clusters = clust, data = dat)) ) }) test_that("lm robust works with weights", { skip_if_not_installed("sandwich") N <- 100 dat <- data.frame( Y = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), W = runif(N) ) ## Make sure weighting works expect_error( estimatr_out <- lm_robust(Y ~ Z * X, weights = W, data = dat), NA ) expect_true( any(grepl("Weighted", capture.output(summary(estimatr_out)))) ) # Compare to lm output lm_out <- lm(Y ~ Z * X, weights = W, data = dat) lmo_hc2 <- cbind( coef(lm_out), sqrt(diag(sandwich::vcovHC(lm_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_out)[, c("estimate", "std.error")]), lmo_hc2 ) ## Make sure weighting works with missingness dat$W[39] <- NA expect_warning( estimatr_miss_out <- lm_robust(Y ~ Z * X, weights = W, data = dat), "missing" ) expect_equal( rmcall(estimatr_miss_out), rmcall(lm_robust(Y ~ Z * X, weights = W, data = dat[-39, ])) ) # Compare to lm output lm_miss_out <- lm(Y ~ Z * X, weights = W, data = dat) lmo_miss_hc2 <- cbind( coef(lm_miss_out), sqrt(diag(sandwich::vcovHC(lm_miss_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_miss_out)[, c("estimate", "std.error")]), lmo_miss_hc2 ) expect_error( lm_robust(Y ~ Z, data = dat, weights = c(-0.5, runif(N - 1))), "`weights` must not be negative" ) }) test_that("lm_robust_fit adds column names", { n <- 10 y <- rnorm(n) X <- matrix(rnorm(n * 3), ncol = 3) lm_o <- lm_robust_fit( y = y, X = X, weights = NULL, cluster = NULL, ci = TRUE, se_type = "classical", alpha = 0.05, return_vcov = TRUE, try_cholesky = TRUE, has_int = FALSE, iv_stage = list(0) ) expect_equal( lm_o$term, c("X1", "X2", "X3") ) }) test_that("lm robust works with large data", { N <- 75000 dat <- data.frame( Y = rbinom(N, 1, .5), X1 = rnorm(N), X2 = rnorm(N), X3 = rnorm(N) ) expect_error( lm_robust(Y ~ X1 + X2 + X3, data = dat, se_type = "none"), NA ) }) set.seed(42) N <- 100 dat <- data.frame( Y = rbinom(N, 1, .5), X1 = rnorm(N), X2 = rnorm(N), X3 = rnorm(N) ) test_that("lm robust works with rank-deficient X", { dat$Z1 <- dat$X1 sum_lm <- summary(lm(Y ~ X1 + X2 + Z1 + X3, data = dat)) ## manually build vector of coefficients, can't extract from summary.lm out_sumlm <- matrix(NA, nrow = length(sum_lm$aliased), ncol = 2) j <- 1 for (i in seq_along(sum_lm$aliased)) { if (!sum_lm$aliased[i]) { out_sumlm[i, ] <- coef(sum_lm)[j, 1:2] j <- j + 1 } } ## order sometimes is different! Not stable order! # expect_equivalent( # as.matrix(tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, se_type = 'classical'))[, c('estimate', 'std.error')]), # out_sumlm # ) dat$Z1 <- dat$X1 + 5 ## Not the same as LM! Different QR decompositions when dependency isn't just equivalency expect_equivalent( as.matrix(tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, se_type = "classical"))[, c("estimate", "std.error")]), as.matrix(RcppEigen:::summary.fastLm(RcppEigen::fastLm(Y ~ X1 + X2 + Z1 + X3, data = dat))$coefficients[, 1:2]) ) # trigger cascade to QR from try_chol; set seed above because try_cholesky # sometimes will work! expect_equivalent( tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat)), tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, try_cholesky = TRUE)) ) # Weighted rank deficient dat$w <- 1 expect_equivalent( tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat)), tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, weights = w)) ) expect_true( any(grepl( "not defined because the design matrix is rank deficient", capture.output(summary(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat))) )) ) }) test_that("r squared is right", { lmo <- summary(lm(mpg ~ hp, mtcars)) lmow <- summary(lm(mpg ~ hp, mtcars, weights = wt)) lmon <- summary(lm(mpg ~ hp - 1, mtcars)) lmown <- summary(lm(mpg ~ hp - 1, mtcars, weights = wt)) lmro <- lm_robust(mpg ~ hp, mtcars) lmrow <- lm_robust(mpg ~ hp, mtcars, weights = wt) lmroclust <- lm_robust(mpg ~ hp, mtcars, clusters = carb) lmrowclust <- lm_robust(mpg ~ hp, mtcars, weights = wt, clusters = carb) lmron <- lm_robust(mpg ~ hp - 1, mtcars) lmrown <- lm_robust(mpg ~ hp - 1, mtcars, weights = wt) lmrclust <- lm_robust(mpg ~ hp - 1, mtcars, clusters = carb) lmrwclust <- lm_robust(mpg ~ hp - 1, mtcars, weights = wt, clusters = carb) # Use equivalent instead of equal because we change the name of the fstat value expect_equivalent( c(lmo$r.squared, lmo$adj.r.squared), c(lmro$r.squared, lmro$adj.r.squared) ) expect_equivalent( c(lmow$r.squared, lmow$adj.r.squared), c(lmrow$r.squared, lmrow$adj.r.squared) ) expect_equivalent( c(lmon$r.squared, lmon$adj.r.squared), c(lmron$r.squared, lmron$adj.r.squared) ) expect_equivalent( c(lmown$r.squared, lmown$adj.r.squared), c(lmrown$r.squared, lmrown$adj.r.squared) ) expect_equal( c(lmon$r.squared, lmon$adj.r.squared), c(lmrclust$r.squared, lmrclust$adj.r.squared) ) expect_equal( c(lmown$r.squared, lmown$adj.r.squared), c(lmrwclust$r.squared, lmrwclust$adj.r.squared) ) expect_equal( c(lmo$r.squared, lmo$adj.r.squared), c(lmroclust$r.squared, lmroclust$adj.r.squared) ) expect_equal( c(lmow$r.squared, lmow$adj.r.squared), c(lmrowclust$r.squared, lmrowclust$adj.r.squared) ) # multiple outcomes lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars) lmmpg <- lm_robust(mpg ~ cyl, data = mtcars) lmhp <- lm_robust(hp ~ cyl, data = mtcars) expect_equivalent(lmro$r.squared[1], lmmpg$r.squared) expect_equivalent(lmro$r.squared[2], lmhp$r.squared) }) test_that("multiple outcomes", { lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcars) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, se_type = "classical") mo <- tidy(lmro) expect_identical( mo$term, c("(Intercept)", "cyl", "(Intercept)", "cyl") ) expect_equal( coef(lmro), coef(lmo) ) expect_equal( vcov(lmo), vcov(lmro) ) for (se_type in setdiff(se_types, "classical")) { expect_equal( sandwich::vcovHC(lmo, type = se_type), vcov(lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, se_type = se_type)) ) } # with weights lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcars, weights = wt) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, weights = wt, se_type = "classical") mo <- tidy(lmro) expect_identical( mo$term, c("(Intercept)", "cyl", "(Intercept)", "cyl") ) expect_equal( coef(lmro), coef(lmo) ) expect_equivalent( sapply(summary(lmo)[[1]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1) ) expect_equivalent( sapply(summary(lmo)[[2]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 2) ) # with missingness mtcarsmiss <- mtcars mtcarsmiss$hp[10] <- NA lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcarsmiss) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcarsmiss, se_type = "classical") expect_equivalent( do.call(rbind, lapply(summary(lmo), function(x) x$coefficients[, 1:4])), as.matrix(tidy(lmro)[, c("estimate", "std.error", "statistic", "p.value")]) ) expect_equivalent( sapply(summary(lmo)[[1]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1) ) expect_equivalent( sapply(summary(lmo)[[2]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 2) ) }) estimatr/tests/testthat/test-iv-robust.R0000644000176200001440000004251214747205231020147 0ustar liggesuserscontext("Estimator - iv_robust") N <- 20 dat <- data.frame( y = rnorm(N), x = rnorm(N), x2 = rnorm(N), z2 = rnorm(N), w = runif(N), clust = sample(letters[1:3], size = N, replace = TRUE) ) dat$z <- dat$x * 0.5 + rnorm(N) dat$x1_c <- dat$x test_that("iv_robust warnings and errors are correct", { expect_warning( ivro <- iv_robust(mpg ~ hp + cyl | am, data = mtcars, se_type = "HC0"), "More regressors than instruments" ) expect_error( iv_robust(mpg ~ hp + cyl, data = mtcars), "Must specify a `formula` with both regressors and instruments." ) }) test_that("iv_robust matches AER + ivpack", { skip_if_not_installed("AER") skip_if_not_installed("ivpack") skip("ivpack is not available") ivco <- iv_robust(y ~ x | z, data = dat, se_type = "classical") ivfit <- AER::ivreg(y ~ x | z, data = dat) ivo <- summary(ivfit) expect_equivalent( as.matrix(tidy(ivco)[, c("estimate", "std.error", "statistic", "p.value")]), coef(ivo)[, 1:4] ) # Same as stata if you specify `small` as a stata option # which applies the N / N-k finite sample correction expect_equivalent( ivfit$fitted.values, ivco$fitted.values ) # Stata defaults to HC0 as well, but does HC1 with `small` ivro <- iv_robust(y ~ x | z, data = dat, se_type = "HC0") capture_output(ivpackrob <- ivpack::robust.se(ivfit)) expect_equivalent( as.matrix(tidy(ivro)[, c("estimate", "std.error", "statistic", "p.value")]), ivpackrob[, 1:4] ) expect_equivalent( ivfit$fitted.values, ivro$fitted.values ) # "Stata" clustered SEs are CR0, but they are the same as below with `small` ivclusto <- iv_robust(y ~ x | z, data = dat, se_type = "stata", clusters = clust) capture_output(ivpackclust <- ivpack::cluster.robust.se(ivfit, dat$clust)) # Our p-values are bigger (ivpack is be using less conservative DF, we use J - 1 which # is what stata uses for clusters w/ `small` and in OLS) expect_equivalent( as.matrix(tidy(ivclusto)[, c("estimate", "std.error")]), ivpackclust[, c(1, 2)] ) expect_equivalent( ivfit$fitted.values, ivclusto$fitted.values ) # Weighting classical ivw <- AER::ivreg(y ~ x | z, data = dat, weights = w) ivcw <- iv_robust(y ~ x | z, data = dat, weights = w, se_type = "classical") ivregsum <- summary(ivcw) expect_equivalent( as.matrix(tidy(ivcw)[, c("estimate", "std.error", "statistic", "p.value")]), coef(ivregsum)[, 1:4] ) expect_equivalent( ivw$fitted.values, ivcw$fitted.values ) # HC0 weighted ivrw <- iv_robust(y ~ x | z, data = dat, weights = w, se_type = "HC0") capture_output(ivpackrobw <- ivpack::robust.se(ivw)) expect_equivalent( as.matrix(tidy(ivrw)[, c("estimate", "std.error", "statistic", "p.value")]), ivpackrobw[, 1:4] ) expect_equivalent( ivrw$fitted.values, ivcw$fitted.values ) # Stata weighted ivclrw <- iv_robust(y ~ x | z, data = dat, clusters = clust, weights = w, se_type = "stata") ivclw <- AER::ivreg(y ~ x | z, data = dat, weights = w) capture_output(ivclwse <- ivpack::cluster.robust.se(ivclw, clusterid = dat$clust)) expect_equivalent( as.matrix(tidy(ivclrw)[1:2, c("estimate", "std.error")]), ivclwse[, c(1, 2)] ) expect_equivalent( ivclrw$fitted.values, ivclw$fitted.values ) # Rank-deficiency # HC0 ivdefr <- iv_robust(y ~ x + x1_c| z + z2, data = dat, se_type = "HC0") ivdef <- AER::ivreg(y ~ x + x1_c| z + z2, data = dat) capture_output(ivdefse <- ivpack::robust.se(ivdef)) expect_equal( coef(ivdefr), coef(ivdef) ) expect_equivalent( as.matrix(tidy(ivdefr)[1:2, c("estimate", "std.error", "statistic", "p.value")]), ivdefse[, 1:4] ) expect_equivalent( ivdefr$fitted.values, ivdef$fitted.values ) # # Does not work if instrument is collinear with other instrument # ivdefri <- iv_robust(y ~ z + z2| x + x1_c, data = dat, se_type = "HC0") # ivdefi <- AER::ivreg(y ~ z + z2| x + x1_c, data = dat) # ivdefsei <- ivpack::robust.se(ivdefi) # # # No longer equal! # expect_equal( # coef(ivdefri), # coef(ivdefi) # ) # expect_equivalent( # as.matrix(tidy(ivdefri)[1:2, c("estimate", "std.error", "statistic", "p.value")]), # ivdefsei[, 1:4] # ) # Stata ivdefclr <- iv_robust(y ~ x + x1_c | z + z2, data = dat, clusters = clust, se_type = "stata") ivdefcl <- AER::ivreg(y ~ x + x1_c | z + z2, data = dat) capture_output(ivdefclse <- ivpack::cluster.robust.se(ivdefcl, clusterid = dat$clust)) expect_equal( coef(ivdefclr), coef(ivdefcl) ) expect_equivalent( as.matrix(tidy(ivdefclr)[1:2, c("estimate", "std.error")]), ivdefclse[, c(1, 2)] ) expect_equivalent( ivdefclr$fitted.values, ivdefcl$fitted.values ) # HC0 Weighted ivdefrw <- iv_robust(y ~ x + x1_c| z + z2, weights = w, data = dat, se_type = "HC0") ivdefw <- AER::ivreg(y ~ x + x1_c| z + z2, weights = w, data = dat) capture_output(ivdefsew <- ivpack::robust.se(ivdefw)) expect_equal( coef(ivdefrw), coef(ivdefw) ) expect_equivalent( as.matrix(tidy(ivdefrw)[1:2, c("estimate", "std.error", "statistic", "p.value")]), ivdefsew[, 1:4] ) expect_equivalent( ivdefrw$fitted.values, ivdefw$fitted.values ) # Stata weighted ivdefclr <- iv_robust(y ~ x + x1_c | z + z2, data = dat, weights = w, clusters = clust, se_type = "stata") ivdefcl <- AER::ivreg(y ~ x + x1_c | z + z2, data = dat, weights = w) capture_output(ivdefclse <- ivpack::cluster.robust.se(ivdefcl, clusterid = dat$clust)) expect_equal( coef(ivdefclr), coef(ivdefcl) ) expect_equivalent( as.matrix(tidy(ivdefclr)[1:2, c("estimate", "std.error")]), ivdefclse[, c(1, 2)] ) expect_equivalent( ivdefclr$fitted.values, ivdefcl$fitted.values ) # F-stat fails properly with blocks of size 1 set.seed(42) N <- 20 dat <- data.frame(y = rnorm(N), x = rnorm(N), z = rnorm(N), bl = sample(letters, size = N, replace = T)) ivr <- iv_robust(y ~ bl + x | bl + z, data = dat, se_type = "stata") expect_equivalent( ivr$fstatistic[1], NA_integer_ ) }) test_that("iv_robust matches AER + clubSandwich", { skip_if_not_installed("AER") skip_if_not_installed("clubSandwich") skip_on_cran() # ClubSandwich IV tests for (se_type in cr_se_types) { ivfit <- AER::ivreg(y ~ x | z, data = dat) ivfitw <- AER::ivreg(y ~ x | z, weights = w, data = dat) # Standard IV models ivcr <- iv_robust(y ~ x | z, data = dat, clusters = clust, se_type = se_type) clubsand <- clubSandwich::coef_test(ivfit, vcov = ifelse(se_type == "stata", "CR1S", se_type), cluster = dat$clust, test = ifelse(se_type == "CR2", "Satterthwaite", "naive-t")) clubsand <- as.data.frame(clubsand) cols <- c("estimate", "std.error", "statistic", "df", "p.value") expect_equivalent( as.matrix(tidy(ivcr)[, cols]), as.matrix(clubsand[,-1]) ) expect_equivalent( ivfit$fitted.values, ivcr$fitted.values ) # Weighted IV models ivcrw <- iv_robust(y ~ x | z, data = dat, clusters = clust, weights = w, se_type = se_type) clubsandw <- clubSandwich::coef_test(ivfitw, vcov = ifelse(se_type == "stata", "CR1S", se_type), cluster = dat$clust, test = ifelse(se_type == "CR2", "Satterthwaite", "naive-t")) clubsandw <- as.data.frame(clubsandw) expect_equivalent( as.matrix(tidy(ivcrw)[, cols]), as.matrix(clubsandw[,-1]) ) expect_equivalent( ivfitw$fitted.values, ivcrw$fitted.values ) # Rank-deficiency ivfit_rd <- AER::ivreg(y ~ x + x1_c | z + z2, data = dat) ivcr_rd <- iv_robust(y ~ x + x1_c | z + z2, data = dat, clusters = clust, se_type = se_type) clubsand_rd <- clubSandwich::coef_test(ivfit_rd, vcov = ifelse(se_type == "stata", "CR1S", se_type), cluster = dat$clust, test = ifelse(se_type == "CR2", "Satterthwaite", "naive-t")) clubsand_rd <- as.data.frame(clubsand_rd) expect_equivalent( na.omit(as.matrix(tidy(ivcr_rd)[, cols])), na.omit(as.matrix(clubsand_rd[,-1])) ) expect_equivalent( ivfit_rd$fitted.values, ivcr_rd$fitted.values ) # Rank-deficient, weighted ivfitw_rd <- AER::ivreg(y ~ x + x1_c | z + z2, weights = w, data = dat) ivcrw_rd <- iv_robust(y ~ x + x1_c | z + z2, data = dat, weights = w, clusters = clust, se_type = se_type) clubsandw_rd <- clubSandwich::coef_test(ivfitw_rd, vcov = ifelse(se_type == "stata", "CR1S", se_type), cluster = dat$clust, test = ifelse(se_type == "CR2", "Satterthwaite", "naive-t")) clubsandw_rd <- as.data.frame(clubsandw_rd) expect_equivalent( na.omit(as.matrix(tidy(ivcrw_rd)[, cols])), na.omit(as.matrix(clubsandw_rd[,-1])) ) expect_equivalent( ivfitw_rd$fitted.values, ivcrw_rd$fitted.values ) } }) test_that("iv_robust different specifications work", { skip_if_not_installed("AER") skip_if_not_installed("ivpack") skip("ivpack not available") # More instruments than endog. regressors ivro <- iv_robust(mpg ~ wt | hp + cyl, data = mtcars, se_type = "HC0") ivo <- AER::ivreg(mpg ~ wt | hp + cyl, data = mtcars) capture_output(ivpo <- ivpack::robust.se(ivo)) expect_equivalent( as.matrix(tidy(ivro)[, c("estimate", "std.error", "statistic", "p.value")]), ivpo[, 1:4] ) # . notation for multiple exog, doesnt work! # ivro <- iv_robust(mpg ~ wt + hp + vs | . - vs + cyl, data = mtcars, se_type = "HC0") # ivo <- AER::ivreg(mpg ~ wt + hp + vs | . - vs + cyl, data = mtcars) # ivpo <- ivpack::robust.se(ivo) # expect_equivalent( # as.matrix(tidy(ivro)[, c("estimate", "std.error", "statistic", "p.value")]), # ivpo[, 1:4] # ) # . notation in general ivro <- iv_robust(mpg ~ .| ., data = mtcars, se_type = "HC0") ivo <- AER::ivreg(mpg ~ . | ., data = mtcars) capture_output(ivpo <- ivpack::robust.se(ivo)) expect_equivalent( as.matrix(tidy(ivro)[, c("estimate", "std.error", "statistic", "p.value")]), ivpo[, 1:4] ) }) test_that("S3 methods", { skip_if_not_installed("AER") ivo <- AER::ivreg(mpg ~ hp + cyl | wt + gear, data = mtcars) ivro <- iv_robust(mpg ~ hp + cyl | wt + gear, data = mtcars, se_type = "classical") expect_equal( vcov(ivro), vcov(ivo) ) expect_is( tidy(ivro), "data.frame" ) expect_equal( nrow(tidy(ivro)), 3 ) summary(ivro) siv <- capture_output( summary(ivro), print = TRUE ) expect_true( grepl( "iv\\_robust\\(formula = mpg \\~ hp \\+ cyl \\| wt \\+ gear, data = mtcars,", siv ) ) expect_true( grepl( "F\\-statistic\\: 33\\.73 on 2 and 29 DF, p\\-value\\: 2\\.706e\\-08", siv ) ) capture_output( expect_equivalent( coef(summary(ivro)), print(ivro) ) ) expect_equivalent( ivro$fstatistic, summary(ivo)$waldtest[-2] ) expect_equal( predict(ivro, newdata = mtcars), predict(ivo) ) glance_ivro <- glance(ivro) expect_equal(nrow(glance_ivro), 1) expect_equal( colnames(glance(ivro)), c("r.squared", "adj.r.squared", "df.residual", "nobs", "se_type", "statistic", "p.value", "statistic.weakinst", "p.value.weakinst", "statistic.endogeneity", "p.value.endogeneity", "statistic.overid", "p.value.overid") ) # no intercept ivo <- AER::ivreg(mpg ~ hp + cyl + 0 | wt + gear, data = mtcars) ivro <- iv_robust(mpg ~ hp + cyl + 0 | wt + gear, data = mtcars, se_type = "classical") expect_equivalent( ivro$fstatistic, summary(ivo)$waldtest[-2] ) }) test_that("IV diagnostics", { skip_if_not_installed("AER") # Load stata diagnostics stata_diags <- read.table( "stata-iv-diagnostics.txt", col.names = c("formula", "weights", "options", "diag", "df1", "df2", "val", "pval"), sep = ";", na = ".", stringsAsFactors = FALSE ) formulae <- c( "(hp = wt)" = mpg ~ hp | wt, # mpg ~ 0 + hp | wt, "(hp = wt)0" = mpg ~ 0 + hp | 0 + wt, "(hp am = wt gear)" = mpg ~ hp + am | wt + gear, # mpg ~ 0 + hp + am | wt + gear, "(hp am = wt gear)0" = mpg ~ 0 + hp + am | 0 + wt + gear, "gear (hp = wt)" = mpg ~ hp + gear | wt + gear, # mpg ~ 0 + hp + gear | wt + gear, "gear (hp = wt)0" = mpg ~ 0 + hp + gear | 0 + wt + gear, "gear (hp = wt am)" = mpg ~ hp + gear | wt + gear + am, # mpg ~ 0 + hp + gear | wt + gear + am, # mpg ~ hp + gear | 0 + wt + gear + am, "gear (hp = wt am)0" = mpg ~ 0 + hp + gear | 0 + wt + gear + am ) for (f_n in names(formulae)) { f <- formulae[[f_n]] ivro <- iv_robust(f, data = mtcars, se_type = "classical", diagnostics = TRUE) aer_ivro <- summary(AER::ivreg(f, data = mtcars), diagnostics = TRUE) # Sargan stat seems to be wrong for AER for this model (-ve critical value) if (f_n == "gear (hp = wt am)0") { expect_equivalent( build_ivreg_diagnostics_mat(ivro)[1:2, ], aer_ivro[["diagnostics"]][1:2, ] ) } else { expect_equivalent( build_ivreg_diagnostics_mat(ivro), aer_ivro[["diagnostics"]] ) } stata_diag <- subset( stata_diags, formula == gsub("0", "", f_n) & (grepl("noconstant", options) == grepl("0", f_n)) ) expect_equivalent( build_ivreg_diagnostics_mat(ivro, stata = TRUE), as.matrix(stata_diag[ grepl("small", stata_diag$options) & nchar(stata_diag$weights) == 0, c("df1", "df2", "val", "pval") ]), tolerance = 1e-6 ) # With weights, don't match `overid` test, as we don't report it ivrow <- iv_robust(f, data = mtcars, se_type = "classical", weights = drat, diagnostics = TRUE) ivrow_diag_mat <- build_ivreg_diagnostics_mat(ivrow, stata = TRUE) expect_equivalent(ivrow_diag_mat[nrow(ivrow_diag_mat), ], rep(NA_real_, 4)) expect_equivalent( ivrow_diag_mat[-nrow(ivrow_diag_mat), ], as.matrix(stata_diag[ grepl("small", stata_diag$options) & nchar(stata_diag$weights) > 0 & stata_diag$diag != "overid", c("df1", "df2", "val", "pval") ]), tolerance = 1e-6 ) ivro_hc1 <- iv_robust(f, data = mtcars, se_type = "HC1", diagnostics = TRUE) ivrow_hc1 <- iv_robust(f, data = mtcars, se_type = "HC1", weights = drat, diagnostics = TRUE) expect_equivalent( build_ivreg_diagnostics_mat(ivro_hc1, stata = TRUE), as.matrix(stata_diag[ grepl("rob", stata_diag$options) & nchar(stata_diag$weights) == 0, c("df1", "df2", "val", "pval") ]), tolerance = 1e-6 ) # Again, no overid test reported with weights ivrow_hc1_diag_mat <- build_ivreg_diagnostics_mat(ivrow_hc1, stata = TRUE) expect_equivalent(ivrow_hc1_diag_mat[nrow(ivrow_hc1_diag_mat), ], rep(NA_real_, 4)) expect_equivalent( ivrow_hc1_diag_mat[-nrow(ivrow_hc1_diag_mat), ], as.matrix(stata_diag[ grepl("rob", stata_diag$options) & nchar(stata_diag$weights) > 0 & stata_diag$diag != "overid", c("df1", "df2", "val", "pval") ]), tolerance = 1e-6 ) # tolerance higher here due to larger values in general ivro_crs <- iv_robust(f, data = mtcars, se_type = "stata", clusters = cyl, diagnostics = TRUE) ivro_crs_diag_mat <- build_ivreg_diagnostics_mat(ivro_crs, stata = TRUE) ivrow_crs <- iv_robust(f, data = mtcars, se_type = "stata", clusters = cyl, weights = drat, diagnostics = TRUE) ivrow_crs_diag_mat <- build_ivreg_diagnostics_mat(ivrow_crs, stata = TRUE) expect_equivalent(ivrow_crs_diag_mat[nrow(ivrow_crs_diag_mat), ], rep(NA_real_, 4)) expect_equivalent( ivro_crs_diag_mat[-nrow(ivro_crs_diag_mat), ], as.matrix(stata_diag[ grepl("cluster", stata_diag$options) & nchar(stata_diag$weights) == 0 & stata_diag$diag != "overid", c("df1", "df2", "val", "pval") ]), tolerance = 1e-3 ) # Stata doesn't report overid test with clusters expect_equivalent( ivrow_crs_diag_mat[-nrow(ivrow_crs_diag_mat), ], as.matrix(stata_diag[ grepl("cluster", stata_diag$options) & nchar(stata_diag$weights) > 0 & stata_diag$diag != "overid", c("df1", "df2", "val", "pval") ]), tolerance = 1e-3 ) # Sanity check unmatched diagnostics for (se_type in se_types) { ivro <- iv_robust(f, data = mtcars, se_type = se_type, diagnostics = TRUE) diagnostic_mat <- build_ivreg_diagnostics_mat(ivro) expect_true( all(diagnostic_mat[1:2, ] > 0) & all(diagnostic_mat[3, ] >= 0 | is.na(diagnostic_mat[3, ])) ) } # Test default se_type ivro <- iv_robust(f, data = mtcars, diagnostics = TRUE) diagnostic_mat <- build_ivreg_diagnostics_mat(ivro) expect_true( all(diagnostic_mat[1:2, ] > 0) & all(diagnostic_mat[3, ] >= 0 | is.na(diagnostic_mat[3, ])) ) for (cr_se_type in cr_se_types) { ivro <- iv_robust(f, data = mtcars, se_type = cr_se_type, clusters = cyl, diagnostics = TRUE) diagnostic_mat <- build_ivreg_diagnostics_mat(ivro) expect_true( all(diagnostic_mat[1:2, ] > 0) & all(diagnostic_mat[3, ] >= 0 | is.na(diagnostic_mat[3, ])) ) } } }) estimatr/tests/testthat/test-condition1-condition2.R0000644000176200001440000000760014747205231022331 0ustar liggesuserscontext("Helper - Condition parsing for difference estimators") test_that("Condition arguments behave as expected", { n <- 40 dat <- data.frame( y = rnorm(n), bl = rep(1:5, each = 8), z = 1:4, ps = runif(n) ) horvitz_thompson(y ~ z, data = dat, subset = z <= 2, condition_prs = ps) # Subsetting and just selecting two conditions expect_identical( horvitz_thompson(y ~ z, data = dat, subset = z <= 2, condition_prs = ps), horvitz_thompson(y ~ z, data = dat, condition1 = 1L, condition2 = 2L, condition_prs = ps) ) expect_identical( difference_in_means(y ~ z, data = dat, subset = z <= 2), difference_in_means(y ~ z, data = dat, condition1 = 1L, condition2 = 2L) ) expect_identical( difference_in_means(y ~ z, data = dat, subset = z <= 2, blocks = bl), difference_in_means(y ~ z, data = dat, condition1 = 1L, condition2 = 2L, blocks = bl) ) # Subsetting and just selecting two conditions expect_identical( tidy(horvitz_thompson( y ~ z, data = dat, condition1 = 3, condition2 = 4, condition_prs = rep(0.5, nrow(dat)) ))[c("estimate", "std.error")], tidy(horvitz_thompson( y ~ z, data = dat, condition1 = 4, condition2 = 3, condition_prs = rep(0.5, nrow(dat)) ))[c("estimate", "std.error")] * c(-1, 1) ) expect_identical( tidy(difference_in_means( y ~ z, data = dat, condition1 = 2, condition2 = 1 ))[c("estimate", "std.error")], tidy(difference_in_means( y ~ z, data = dat, condition1 = 1, condition2 = 2 ))[c("estimate", "std.error")] * c(-1, 1) ) # Errors if not specifying both expect_error( horvitz_thompson( y ~ z, data = dat, condition1 = 4, condition_prs = ps ), "condition1" ) expect_error( horvitz_thompson( y ~ z, data = dat, condition2 = 4, condition_prs = ps ), "condition1" ) expect_error( horvitz_thompson( y ~ z, data = dat, condition_prs = ps ), "condition1" ) expect_error( difference_in_means( y ~ z, data = dat, condition1 = 4 ), "condition1" ) expect_error( difference_in_means( y ~ z, data = dat, condition2 = 4 ), "condition1" ) expect_error( difference_in_means( y ~ z, data = dat ), "condition1" ) # Specifying only one works with binary treatment dat$z <- c("Treated", "Control") expect_identical( difference_in_means( y ~ z, data = dat, condition1 = "Treated" ), difference_in_means( y ~ z, data = dat, condition1 = "Treated", condition2 = "Control" ) ) expect_identical( horvitz_thompson( y ~ z, data = dat, condition1 = "Treated" ), horvitz_thompson( y ~ z, data = dat, condition1 = "Treated", condition2 = "Control" ) ) expect_identical( difference_in_means( y ~ z, data = dat, condition2 = "Treated" ), difference_in_means( y ~ z, data = dat, condition2 = "Treated", condition1 = "Control" ) ) expect_identical( horvitz_thompson( y ~ z, data = dat, condition2 = "Treated" ), horvitz_thompson( y ~ z, data = dat, condition2 = "Treated", condition1 = "Control" ) ) # Works with factor dat$z <- factor(c("T", "C")) # Must pass string! difference_in_means(y ~ z, condition2 = "T", data = dat) # Errors if not found expect_error( difference_in_means( y ~ z, condition2 = 1, data = dat ), "`condition1` and `condition2` must be values found in the treatment" ) dat$z <- 1 expect_error( difference_in_means(y ~ z, data = dat), "Must have more than one value in treatment unless using Horvitz" ) }) estimatr/tests/testthat/stata-iv-ests.txt0000644000176200001440000000071214747205231020356 0ustar liggesusers classical .0001602 2.2793852 5.2396632 44.808289 .75605596 .73923223 3.0776867 rob .00015781 2.2003453 4.7873499 41.366981 .75605596 .73923223 3.0776867 cl .00004096 .7950331 .31612954 108.37591 .75605596 .73923223 3.0776867 classical_w .00015649 2.3424049 5.2400158 46.736059 .76415188 .74788649 3.0971208 rob_w .00014704 2.0834808 4.5677404 45.410829 .76415188 .74788649 3.0971208 cl_w .00006651 .64892154 .54670656 81.595912 .76415188 .74788649 3.0971208estimatr/tests/testthat/test-lm-robust-helper.R0000644000176200001440000000124014747205231021407 0ustar liggesusers# N <- 10000 # dat <- data.frame(y = rnorm(N), x1 = rnorm(N), x2 = rnorm(N)) # X <- model.matrix.default(~ x1 + x2, data = dat) # y <- dat$y # fit <- lm(y ~ x1 + x2, data = dat) # # # Speed Tests ------------------------------------------------------------- # # system.time(replicate(500, lm_robust_helper(X = X, y = dat$y, type = "classical"))) # # system.time(replicate(500, lm_robust_helper(X = X, y = dat$y, type = "HC0"))) # system.time(replicate(500, lm_robust_helper(X = X, y = dat$y, type = "HC1"))) # system.time(replicate(500, lm_robust_helper(X = X, y = dat$y, type = "HC2"))) # system.time(replicate(500, lm_robust_helper(X = X, y = dat$y, type = "HC3"))) # # estimatr/tests/testthat/test-s3-methods.R0000644000176200001440000006710414760370352020211 0ustar liggesuserscontext("S3") set.seed(5) n <- 10 dat <- data.frame( x = rep(0:1, times = 5), p = 0.5, z = rnorm(n), y = rnorm(n) ) lmbo <- lm_robust(y ~ z + as.factor(x), data = dat) lmfo <- lm_robust(y ~ z, data = dat, fixed_effects = ~ x) test_that("tidy, glance, summary, and print work", { ## lm_robust lmo <- lm_robust(y ~ x, data = dat, se_type = "classical") capture_output( summary(lmo) ) capture_output( summary(lmfo) ) expect_output( print(summary(lmfo)), "proj\\. model" ) expect_is( tidy(lmo), "data.frame" ) expect_is( tidy(lmfo), "data.frame" ) expect_equal( nrow(tidy(lm_robust(y ~ x, data = dat, se_type = "classical"))), 2 ) expect_equal( nrow(tidy(lmfo)), 1 ) glance_lmo <- glance(lmo) expect_is(glance_lmo , "data.frame") expect_equal(nrow(glance_lmo), 1) expect_equal( colnames(glance(lmo)), c("r.squared", "adj.r.squared", "statistic", "p.value", "df.residual", "nobs", "se_type") ) expect_equal( colnames(coef(summary(lmo))), c("Estimate", "Std. Error", "t value", "Pr(>|t|)", "CI Lower", "CI Upper", "DF") ) capture_output( expect_equivalent( coef(summary(lmo)), print(lmo) ) ) capture_output( expect_equivalent( coef(summary(lmfo)), print(lmfo) ) ) # works with multiple outcomes lmrmo <- lm_robust(cbind(y, x) ~ z, data = dat, se_type = "classical") lmmo <- lm(cbind(y, x) ~ z, data = dat) slmmo <- summary(lmmo) expect_equivalent( as.matrix(tidy(lmrmo)[, c("term", "outcome")]), cbind( rep(c("(Intercept)", "z"), times = 2), rep(c("y", "x"), each = 2) ) ) expect_equal( dimnames(vcov(lmrmo)), list( c("y:(Intercept)", "y:z", "x:(Intercept)", "x:z"), c("y:(Intercept)", "y:z", "x:(Intercept)", "x:z") ) ) expect_equal( coef(lmrmo), coef(lmmo) ) capture_output( expect_equal( rownames(print(lmrmo)), rownames(vcov(lmrmo)) ) ) expect_equal( predict(lmrmo, newdata = dat), predict(lmmo) ) expect_error( predict(lmrmo, newdata = dat, se.fit = TRUE), "Can't set `se.fit` == TRUE with multivariate outcome" ) expect_error( slmrmo <- summary(lmrmo), NA ) expect_error( glance(lmrmo), "Cannot use `glance` on linear models with multiple responses." ) lmroy <- lm_robust(y ~ z, data = dat, se_type = "classical") lmrox <- lm_robust(x ~ z, data = dat, se_type = "classical") # Only difference is name on fstatistic! expect_equivalent( slmrmo$`Response y`, summary(lmroy) ) expect_equivalent( slmrmo$`Response x`, summary(lmrox) ) expect_equivalent( lapply(slmrmo, function(x) coef(x)[, 1:4]), lapply(slmmo, function(x) coef(x)[, 1:4]) ) expect_equivalent( confint(lmrmo)[1:2,], confint(lmroy) ) expect_equivalent( confint(lmrmo)[3:4,], confint(lmrox) ) ## lm_lin lmlo <- lm_lin(y ~ x, ~ z, data = dat) expect_is( tidy(lmlo), "data.frame" ) capture_output( expect_equivalent( coef(summary(lmlo)), print(lmlo) ) ) glance_lmlo <- glance(lmlo) expect_equal(nrow(glance_lmlo), 1) expect_equal( colnames(glance(lmlo)), c("r.squared", "adj.r.squared", "statistic", "p.value", "df.residual", "nobs", "se_type") ) ## lh_robust lho <- lh_robust( mpg ~ cyl + am, data = mtcars, se_type = "classical", linear_hypothesis = "cyl = am" ) tlho <- tidy(lho) slho <- summary(lho) glho <- glance(lho) ## iv_robust 1st-stage stats. ivro <- iv_robust(y ~ x | z, data = dat, se_type = "classical", diagnostics = T) expect_equivalent( as.numeric(glance(ivro)[c("statistic.weakinst", "p.value.weakinst")]), summary(ivro)$diagnostic_first_stage_fstatistic[c(1, 4)] ) expect_equivalent( as.numeric(glance(ivro)[c("statistic.endogeneity", "p.value.endogeneity")]), summary(ivro)$diagnostic_endogeneity_test[c(1, 4)] ) expect_equivalent( as.logical(is.na(glance(ivro)[c("statistic.overid", "p.value.overid")])), c(T, T) ) ## iv-robust over-identification test ivro_oid <- iv_robust(y ~ x | z + p, data = dat, se_type = "classical", diagnostics = T) expect_equivalent( as.numeric(glance(ivro_oid)[c("statistic.weakinst", "p.value.weakinst")]), summary(ivro_oid)$diagnostic_first_stage_fstatistic[c(1, 4)] ) expect_equivalent( as.numeric(glance(ivro_oid)[c("statistic.endogeneity", "p.value.endogeneity")]), summary(ivro_oid)$diagnostic_endogeneity_test[c(1, 4)] ) expect_equivalent( as.numeric(glance(ivro_oid)[c("statistic.overid", "p.value.overid")]), summary(ivro_oid)$diagnostic_overid_test[c(1, 3)] ) # tidy adds rows for each LH expect_equal( tlho$term, c("(Intercept)", "cyl", "am", "cyl = am") ) # glance only glances lm_robust object lro <- lm_robust(mpg ~ cyl + am, data = mtcars, se_type = "classical") expect_identical( glance(lho), glance(lro) ) printsummary_lho <- capture.output(summary(lho)) printsummary_lro <- capture.output(summary(lro)) expect_identical( printsummary_lho[7:16], printsummary_lro[5:14] ) expect_identical( printsummary_lho[19:20], capture.output(summary(lho$lh)) ) expect_output( print(lho$lh), "Estimate.*cyl = am" ) # print also gets right number of rows expect_equal( length(capture.output(print(lho$lh))), 2 ) ## horvitz_thompson ht <- horvitz_thompson(y ~ x, condition_prs = p, data = dat) expect_is( tidy(ht), "data.frame" ) expect_equivalent( as.matrix(tidy(ht)[, c("estimate", "std.error", "statistic", "p.value", "conf.low", "conf.high", "df")]), coef(summary(ht)) ) expect_equal( colnames(coef(summary(ht))), c("Estimate", "Std. Error", "z value", "Pr(>|z|)", "CI Lower", "CI Upper", "DF") ) capture_output( expect_equivalent( coef(summary(ht)), print(ht) ) ) glance_ht <- glance(ht) expect_equal(nrow(glance_ht), 1) expect_equal( colnames(glance(ht)), c("nobs", "se_type", "condition2", "condition1") ) ## difference_in_means dim <- difference_in_means(y ~ x, data = dat) expect_is( tidy(dim), "data.frame" ) expect_equal( colnames(coef(summary(dim))), c("Estimate", "Std. Error", "t value", "Pr(>|t|)", "CI Lower", "CI Upper", "DF") ) capture_output( expect_equivalent( coef(summary(dim)), print(dim) ) ) glance_dim <- glance(dim) expect_equal(nrow(glance_dim), 1) expect_equal( colnames(glance(dim)), c("design", "df", "nobs", "nblocks", "nclusters", "condition2", "condition1") ) # rank deficient dat$z2 <- dat$z lmro <- lm_robust(y ~ z + z2 + x, data = dat) tidy(lmro) # instrumental variables S3 methods are in the IV test, owing to # the AER dependency # iv_robust }) test_that("vcov works", { skip_if_not_installed("AER") # not identical due to < 1e-15 diffs expect_equal( vcov(lm_robust(y ~ x, data = dat, se_type = "classical")), vcov(lm(y ~ x, data = dat)) ) # support complete with dependencies dat$xdup <- dat$x # save test for 3.5 # expect_equal( # vcov(lm_robust(y ~ x + xdup, data = dat, se_type = "classical")), # vcov(lm(y ~ x + xdup, data = dat)) # ) expect_equal( coef(lm_robust(y ~ x + xdup, data = dat, se_type = "classical")), coef(lm(y ~ x + xdup, data = dat)) ) expect_equal( coef(lm_robust(y ~ x + xdup, data = dat, se_type = "classical"), complete = FALSE), coef(lm(y ~ x + xdup, data = dat), complete = FALSE) ) expect_equal( vcov(lmbo)["z", "z"], vcov(lmfo)["z", "z"] ) expect_error( vcov(lm_lin(y ~ x, ~ z, data = dat)), NA ) expect_error( vcov(lm_robust(y ~ x, data = dat, return_vcov = FALSE)), "return_vcov = TRUE" ) hto <- horvitz_thompson(y ~ x, condition_prs = p, data = dat) expect_equal( vcov(hto), matrix(hto$std.error ^ 2, dimnames = list(hto$term, hto$term)) ) dimo <- difference_in_means(y ~ x, data = dat) expect_equal( vcov(dimo), matrix(dimo$std.error ^ 2, dimnames = list(dimo$term, dimo$term)) ) # Instrumental variables ivo <- AER::ivreg(y ~ x | z, data = dat) ivro <- iv_robust(y ~ x | z, data = dat, se_type = "classical") expect_equal( AER:::vcov.ivreg(ivo), ivro$vcov ) }) test_that("coef and confint work", { lmo <- lm_robust(y ~ x, data = dat) expect_equivalent( coef(lmo), lmo$coefficients ) expect_equivalent( coef(lmfo), lmfo$coefficients ) expect_equivalent( coef(lmbo)["z"], coef(lmfo)["z"] ) expect_equivalent( confint(lmo), cbind(lmo$conf.low, lmo$conf.high) ) expect_equivalent( confint(lmfo), cbind(lmfo$conf.low, lmfo$conf.high) ) expect_equal( confint(lmbo, parm = "z"), confint(lmfo, parm = "z") ) expect_equal( confint(lmbo, parm = "z", level = 0.8), confint(lmfo, parm = "z", level = 0.8) ) lm2o <- lm_robust(y ~ x + z, data = dat) expect_equivalent( coef(lm2o)[2], lm2o$coefficients["x"] ) expect_equivalent( confint(lm2o)[2, , drop = FALSE], confint(lm2o, parm = "x") ) expect_equivalent( confint(lmo, parm = "x", level = 0.90), with( lm_robust(y ~ x, data = dat, alpha = 0.10), cbind(conf.low[2], conf.high[2]) ) ) lmlo <- lm_lin(y ~ x, ~ z, data = dat, se_type = "HC3") expect_equivalent( confint(lmlo), cbind(lmlo$conf.low, lmlo$conf.high) ) dim <- difference_in_means(y ~ x, data = dat) expect_equivalent( coef(dim), dim$coefficients ) expect_equivalent( confint(dim), cbind(dim$conf.low, dim$conf.high) ) ht <- horvitz_thompson(y ~ x, condition_prs = p, data = dat) expect_equivalent( coef(ht), ht$coefficients ) expect_equivalent( confint(ht), cbind(ht$conf.low, ht$conf.high) ) # rank deficient dat$z2 <- dat$z lmro <- lm_robust(y ~ z + z2 + x, data = dat) confint(lmro) coef(lmro) capture.output( expect_equal( nobs(lmro), nobs(summary(lmro)) ) ) }) test_that("predict works", { set.seed(42) n <- 20 dat <- data.frame( x = rep(0:1, times = 10), w = runif(n), z = rnorm(n), cl = as.factor(sample(letters[1:3], size = n, replace = T)), y = rnorm(n) ) lm_out <- lm(y ~ z * x + cl, data = dat) lmr_out <- lm_robust(y ~ z * x + cl, data = dat, se_type = "classical") expect_equivalent( predict(lm_out, dat), predict(lmr_out, dat) ) # various specifications expect_equivalent( predict(lm_out, dat, se.fit = T, interval = "confidence")[c(1, 2)], predict(lmr_out, dat, se.fit = T, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lm_out, dat, se.fit = T, interval = "prediction")[c(1, 2)], predict(lmr_out, dat, se.fit = T, interval = "prediction")[c(1, 2)] ) # missingness n <- 21 new_dat <- data.frame( x = rep(0:1, times = c(10, 11)), w = runif(n), z = rnorm(n), cl = as.factor(sample(letters[1:3], size = n, replace = T)), y = rnorm(n) ) # remove one level to make sure works with missing levels new_dat <- new_dat[new_dat$cl == "a", ] new_dat[1, "x"] <- NA expect_equivalent( predict(lm_out, new_dat), predict(lmr_out, new_dat) ) # various specifications expect_equivalent( predict(lm_out, new_dat, se.fit = T, interval = "confidence")[c(1, 2)], predict(lmr_out, new_dat, se.fit = T, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lm_out, new_dat, se.fit = T, interval = "prediction")[c(1, 2)], predict(lmr_out, new_dat, se.fit = T, interval = "prediction")[c(1, 2)] ) # weights lm_out <- lm(y ~ z * x + cl, data = dat, weights = w) lmr_out <- lm_robust(y ~ z * x + cl, data = dat, se_type = "classical", weights = w) expect_equivalent( predict(lm_out, dat), predict(lmr_out, dat) ) expect_equivalent( predict(lm_out, dat, se.fit = T, interval = "confidence")[c(1, 2)], predict(lmr_out, dat, se.fit = T, interval = "confidence")[c(1, 2)] ) expect_warning( plmo <- predict(lm_out, dat, se.fit = T, interval = "prediction")[c(1, 2)], "Assuming constant prediction variance" ) expect_warning( plmro <- predict(lmr_out, dat, se.fit = T, interval = "prediction")[c(1, 2)], "Assuming constant prediction variance" ) expect_equivalent( plmo, plmro ) # Now with missingness and newdat expect_equivalent( predict(lm_out, new_dat), predict(lmr_out, new_dat) ) # mimic lm behavior with missing weights (can't get prediction intervals) new_dat$w[3] <- NA # various specifications expect_equivalent( predict(lm_out, new_dat, se.fit = T, interval = "confidence", weights = ~ w)[c(1, 2)], predict(lmr_out, new_dat, se.fit = T, interval = "confidence", weights = w)[c(1, 2)] ) expect_equivalent( predict(lm_out, new_dat, se.fit = T, interval = "prediction", weights = ~w)[c(1, 2)], predict(lmr_out, new_dat, se.fit = T, interval = "prediction", weights = w)[c(1, 2)] ) # other arguments expect_equivalent( predict(lm_out, new_dat, se.fit = T, interval = "prediction", pred.var = 2.3)[c(1, 2)], predict(lmr_out, new_dat, se.fit = T, interval = "prediction", pred.var = 2.3)[c(1, 2)] ) # lm_lin n <- 21 new_dat <- data.frame( x = rep(0:1, times = c(10, 11)), z = rnorm(n), cl = as.factor(sample(letters[1:3], size = n, replace = TRUE)), y = rnorm(n) ) lml_out <- lm_lin(y ~ x, covariates = ~ z + cl, data = dat, se_type = "classical") dat$z_bar <- dat$z - mean(dat$z) dat$clb <- as.numeric(dat$cl == "b") dat$clc <- as.numeric(dat$cl == "c") dat$clb_bar <- dat$clb - mean(dat$clb) dat$clc_bar <- dat$clc - mean(dat$clc) lm_int_out <- lm(y ~ x + x * z_bar + x * clb_bar + x * clc_bar, data = dat) # have to scale new data by old mean values! # now predict does this for you! ok.emoji new_dat$z_bar <- new_dat$z - mean(dat$z) new_dat$clb <- as.numeric(new_dat$cl == "b") new_dat$clc <- as.numeric(new_dat$cl == "c") new_dat$clb_bar <- new_dat$clb - mean(dat$clb) new_dat$clc_bar <- new_dat$clc - mean(dat$clc) # not identical due to some numerical difference, presumably due to the way I save the means from lm_lin expect_equivalent( predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lm_int_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) # With and without factorial treatment, intercept, predictions should be the same lml_out <- lm_lin(y ~ x, covariates = ~ z + I(z^2) +cl, data = dat, se_type = "classical") lml_0_out <- lm_lin(y ~ x + 0, covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") lml_f_out <- lm_lin(y ~ factor(x), covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") lm_f0_out <- lm_lin(y ~ factor(x) + 0, covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") expect_equivalent( predict(lml_0_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lm_f0_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) # For multi-level treatments, with and without factorial treatment, intercept, predictions should be the same dat$x_multi <- rep(0:2, times = 7)[1:20] new_dat$x_multi <- rep(0:2, times = 7) lml_out <- lm_lin(y ~ x_multi, covariates = ~ z + I(z^2) +cl, data = dat, se_type = "classical") lml_0_out <- lm_lin(y ~ x_multi + 0, covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") lml_f_out <- lm_lin(y ~ factor(x_multi), covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") lm_f0_out <- lm_lin(y ~ factor(x_multi) + 0, covariates = ~ z + I(z^2) + cl, data = dat, se_type = "classical") expect_equivalent( predict(lml_0_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lml_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lml_f_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)], predict(lm_f0_out, new_dat, se.fit = TRUE, interval = "confidence")[c(1, 2)] ) # working with rank deficient X head(dat) dat$z2 <- dat$z lm_out <- lm(y ~ z * x + z2 + cl, data = dat) lmr_out <- lm_robust(y ~ z * x + z2 + cl + z, data = dat, se_type = "classical") suppressWarnings({ expect_equivalent( predict(lm_out, dat), predict(lmr_out, dat) ) # various specifications expect_equivalent( predict(lm_out, dat, se.fit = T, interval = "confidence")[c(1, 2)], predict(lmr_out, dat, se.fit = T, interval = "confidence")[c(1, 2)] ) expect_equivalent( predict(lm_out, dat, se.fit = T, interval = "prediction")[c(1, 2)], predict(lmr_out, dat, se.fit = T, interval = "prediction")[c(1, 2)] ) }) }) test_that("predict works with fixed effects", { ro <- lm_robust(mpg ~ hp + vs + factor(cyl), data = mtcars) rfo <- lm_robust(mpg ~ hp + vs, fixed_effects = ~ cyl, data = mtcars) lo <- lm(mpg ~ hp + vs + factor(cyl), data = mtcars) plo <- predict(lo, newdata = mtcars) expect_equal( predict(ro, newdata = mtcars), predict(rfo, newdata = mtcars) ) expect_error( predict(rfo, newdata = mtcars, se.fit = TRUE), "Can't set `se.fit`|TRUE with `fixed_effects`" ) mtcars2 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 2, 4) ) expect_error( predict(ro, newdata = mtcars2), "factor factor\\(cyl\\) has new levels 2" ) expect_error( predict(rfo, newdata = mtcars2), "Can't have new levels in `newdata` `fixed_effects` variable, such as: cyl2" ) mtcars3 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 6, 4) ) expect_equal( predict(ro, newdata = mtcars3), predict(rfo, newdata = mtcars3) ) ## Weights row <- lm_robust(mpg ~ 0 + hp + vs + factor(cyl), weights = wt, data = mtcars) rfow <- lm_robust(mpg ~ hp + vs, fixed_effects = ~ cyl, weights = wt, data = mtcars) low <- lm(mpg ~ hp + vs + factor(cyl), weights = wt, data = mtcars) plow <- predict(low, newdata = mtcars) prow <- predict(row, newdata = mtcars) prfow <- predict(rfow, newdata = mtcars) expect_error( predict(rfow, newdata = mtcars, se.fit = TRUE), "Can't set `se.fit`|TRUE with `fixed_effects`" ) expect_equal( prow, prfow ) expect_equal( prow, plow ) expect_equivalent( row$fitted.values, low$fitted.values ) expect_equal( row$fitted.values, rfow$fitted.values ) mtcars2 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 2, 4) ) expect_error( predict(row, newdata = mtcars2), "factor factor\\(cyl\\) has new levels 2" ) expect_error( predict(rfow, newdata = mtcars2), "Can't have new levels in `newdata` `fixed_effects` variable, such as: cyl2" ) mtcars3 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 6, 4) ) expect_equal( predict(row, newdata = mtcars3), predict(rfow, newdata = mtcars3) ) ## Clustered roc <- lm_robust(mpg ~ 0 + hp + vs + factor(cyl), clusters = carb, data = mtcars) rfoc <- lm_robust(mpg ~ hp + vs, fixed_effects = ~ cyl, clusters = carb, data = mtcars) proc <- predict(roc, newdata = mtcars) prfoc <- predict(rfoc, newdata = mtcars) expect_equal( roc$fitted.values, rfoc$fitted.values ) expect_equivalent( roc$fitted.values, lo$fitted.values # not weighted, just lm predictions ) expect_equal( proc, prfoc ) expect_equal( prfoc, plo ) ## Clustered, weights rocw <- lm_robust(mpg ~ 0 + hp + vs + factor(cyl), weights = wt, clusters = carb, data = mtcars, se_type = "stata") rfocw <- lm_robust(mpg ~ hp + vs, fixed_effects = ~ cyl, weights = wt, clusters = carb, data = mtcars, se_type = "stata") procw <- predict(rocw, newdata = mtcars) prfocw <- predict(rfocw, newdata = mtcars) expect_equal( rocw$fitted.values, rfocw$fitted.values ) expect_equivalent( rocw$fitted.values, low$fitted.values # not weighted, just lm predictions ) expect_equal( procw, prfocw ) expect_equal( prfocw, plow ) ## Fails with two fixed effects rfocw <- lm_robust(mpg ~ hp + vs, fixed_effects = ~ cyl + carb, data = mtcars) expect_error( predict(rfocw, newdata = mtcars), "Can't use `predict.lm_robust` with more than one set of `fixed_effects`" ) }) test_that("predict.iv_robust works with fixed effects", { skip_if_not_installed("AER") ro <- iv_robust(mpg ~ hp + factor(cyl) | vs + factor(cyl), data = mtcars) rfo <- iv_robust(mpg ~ hp | vs, fixed_effects = ~ cyl, data = mtcars) io <- AER::ivreg(mpg ~ hp + factor(cyl) | vs + factor(cyl), data = mtcars) pio <- predict(io, newdata = mtcars) expect_equal( ro$fitted.values, rfo$fitted.values ) expect_equivalent( rfo$fitted.values, io$fitted.values ) expect_equal( predict(ro, newdata = mtcars), predict(rfo, newdata = mtcars) ) expect_equal( pio, predict(rfo, newdata = mtcars) ) mtcars2 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 2, 4) ) expect_error( predict(ro, newdata = mtcars2), "factor factor\\(cyl\\) has new levels 2" ) expect_error( predict(rfo, newdata = mtcars2), "Can't have new levels in `newdata` `fixed_effects` variable, such as\\: cyl2" ) mtcars3 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 6, 4) ) expect_equal( predict(ro, newdata = mtcars3), predict(rfo, newdata = mtcars3) ) expect_equal( predict(io, newdata = mtcars3), predict(rfo, newdata = mtcars3) ) ## Weights row <- iv_robust(mpg ~ hp + factor(cyl) | vs + factor(cyl), weights = wt, data = mtcars) rfow <- iv_robust(mpg ~ hp | vs, fixed_effects = ~ cyl, weights = wt, data = mtcars) iow <- AER::ivreg(mpg ~ hp + factor(cyl) | vs + factor(cyl), weights = wt, data = mtcars) piow <- predict(iow, newdata = mtcars) prow <- predict(row, newdata = mtcars) prfow <- predict(rfow, newdata = mtcars) expect_equal( prow, prfow ) expect_equal( prow, piow ) expect_equivalent( rfow$fitted.values, iow$fitted.values ) expect_equal( row$fitted.values, rfow$fitted.values ) mtcars2 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 2, 4) ) expect_error( predict(row, newdata = mtcars2), "factor factor\\(cyl\\) has new levels 2" ) expect_error( predict(rfow, newdata = mtcars2), "Can't have new levels in `newdata` `fixed_effects` variable, such as: cyl2" ) mtcars3 <- data.frame( mpg = 1:3, hp = rnorm(3), vs = rbinom(3, 1, 0.5), cyl = c(4, 6, 4) ) expect_equal( predict(row, newdata = mtcars3), predict(rfow, newdata = mtcars3) ) ## Clustered roc <- iv_robust(mpg ~ hp + factor(cyl) | vs + factor(cyl), clusters = carb, data = mtcars) rfoc <- iv_robust(mpg ~ hp | vs, fixed_effects = ~ cyl, clusters = carb, data = mtcars) proc <- predict(roc, newdata = mtcars) prfoc <- predict(rfoc, newdata = mtcars) expect_equal( roc$fitted.values, rfoc$fitted.values ) expect_equivalent( rfoc$fitted.values, io$fitted.values # not weighted, just lm predictions ) expect_equal( proc, prfoc ) expect_equal( prfoc, pio ) ## Clustered, weights rocw <- iv_robust(mpg ~ hp + factor(cyl) | vs + factor(cyl), clusters = carb, weights = wt, data = mtcars, se_type = "stata") rfocw <- iv_robust(mpg ~ hp | vs, fixed_effects = ~ cyl, clusters = carb, weights = wt, data = mtcars, se_type = "stata") procw <- predict(rocw, newdata = mtcars) prfocw <- predict(rfocw, newdata = mtcars) expect_equal( rocw$fitted.values, rfocw$fitted.values ) expect_equivalent( rocw$fitted.values, iow$fitted.values # not weighted, just lm predictions ) expect_equal( procw, prfocw ) expect_equal( prfocw, piow ) ## Fails with two fixed effects rfocw <- iv_robust(mpg ~ hp | vs, fixed_effects = ~ cyl + carb, data = mtcars) expect_error( predict(rfocw, newdata = mtcars), "Can't use `predict.lm_robust` with more than one set of `fixed_effects`" ) }) test_that("tidy conf_level", { mod <- lm_robust(mpg ~ hp + factor(cyl) + gear, mtcars) expect_equal(unname(confint(mod, level = .95)[, 1]), tidy(mod)$conf.low) expect_equal(unname(confint(mod, level = .999)[, 1]), tidy(mod, conf.int = TRUE, conf.level = .999)$conf.low) dimmod <- difference_in_means(mpg ~ am, mtcars) expect_equal( unname(confint(dimmod, level = .999)[, 1]), tidy(dimmod, conf.int = TRUE, conf.level = .999)$conf.low ) expect_false( tidy(dimmod, conf.int = TRUE, conf.level = .999)$conf.low == tidy(dimmod)$conf.low ) htmod <- horvitz_thompson(mpg ~ am, mtcars, condition_prs = 0.5) expect_equal( unname(confint(htmod, level = .999)[, 1]), tidy(htmod, conf.int = TRUE, conf.level = .999)$conf.low ) expect_false( tidy(htmod, conf.int = TRUE, conf.level = .999)$conf.low == tidy(htmod)$conf.low ) }) test_that("update works", { l1 <- lm_robust(mpg ~ hp, mtcars) l2 <- lm_robust(mpg ~ cyl, mtcars) expect_equal( tidy(l2), tidy(update(l1, . ~ cyl)) ) iv1 <- iv_robust(mpg ~ hp | am, mtcars) iv2 <- iv_robust(mpg ~ cyl | am, mtcars) expect_equal( tidy(iv2), tidy(update(iv1, . ~ cyl | .)) ) }) test_that("setting different alpha in lm_robust call leads to different CIs in tidy", { set.seed(15) library(fabricatr) dat <- fabricate( N = 40, y = rpois(N, lambda = 4), x = rnorm(N), z = rbinom(N, 1, prob = 0.4) ) # Default variance estimator is HC2 robust standard errors lmro05 <- lm_robust(y ~ x + z, data = dat) td1 <- tidy(lmro05) lmro01 <- lm_robust(y ~ x + z, alpha = 0.01, data = dat) td2 <- tidy(lmro01) td3 <- tidy(lmro01, conf.int = TRUE, conf.level = 0.95) expect_false(identical(round(td1$conf.low, 2), round(td2$conf.low, 2))) expect_true(identical(round(td1$conf.low, 2), round(td3$conf.low, 2))) }) test_that("conf int for lh_robust works", { set.seed(15) library(fabricatr) dat <- fabricate( N = 40, y = rpois(N, lambda = 4), x = rnorm(N), z = rbinom(N, 1, prob = 0.4) ) # Default variance estimator is HC2 robust standard errors lhro05 <- lh_robust(y ~ x + z, linear_hypothesis = "z - 0.05 = 0", data = dat) td1 <- tidy(lhro05) lhro01 <- lh_robust(y ~ x + z, linear_hypothesis = "z - 0.05 = 0", alpha = 0.01, data = dat) td2 <- tidy(lhro01) td3 <- tidy(lhro05, conf.int = TRUE, conf.level = 0.95) expect_false(identical(round(td1$conf.low, 2), round(td2$conf.low, 2))) expect_true(identical(round(td1$conf.low, 2), round(td3$conf.low, 2))) }) estimatr/tests/testthat/run-stata-areg-models.do0000644000176200001440000000163214747205231021552 0ustar liggesusers// This file fits many models in stata and outputs the estimates for comparison with estimatr clear all import delimited mtcars.csv gen w = drat / 5 file open outf using stata-fe-ests.txt, write r // xtset carb // xtreg mpg hp, fe areg mpg hp, absorb(carb) mat V=e(V) file write outf _n "classical" _tab (V[1,1]) _tab (e(F)) areg mpg hp, absorb(carb) vce(robust) mat V=e(V) file write outf _n "HC1" _tab (V[1,1]) _tab (e(F)) areg mpg hp, absorb(carb) vce(cluster cyl) mat V=e(V) file write outf _n "stata_cl" _tab (V[1,1]) _tab (e(F)) areg mpg hp [aweight=w], absorb(carb) predict hii, hat mat V=e(V) file write outf _n "classicalw" _tab (V[1,1]) _tab (e(F)) areg mpg hp [aweight=w], absorb(carb) vce(robust) mat V=e(V) file write outf _n "HC1w" _tab (V[1,1]) _tab (e(F)) areg mpg hp [aweight=w], absorb(carb) vce(cluster cyl) mat V=e(V) file write outf _n "stata_clw" _tab (V[1,1]) _tab (e(F)) file close outf estimatr/tests/testthat/stata-fe-ests.txt0000644000176200001440000000024614747205231020334 0ustar liggesusers classical .00024844 20.439199 HC1 .00018026 28.169159 stata_cl .00021845 23.24499 classicalw .00023978 20.31818 HC1w .00019399 25.11446 stata_clw .00018777 25.946624estimatr/tests/testthat/test-replicate-lin2013.R0000644000176200001440000000731614747205231021256 0ustar liggesuserscontext("Verification - lm_lin replicates Lin 2013") # Lin paper available here: www.stat.berkeley.edu/~winston/agnostic.pdf # Citation: # Lin, Winston. 2013. "Agnostic notes on regression adjustments to experimental # data: Reexamining Freedman’s critique." The Annals of Applied Statistics. # Stat. 7(1): 295-318. doi:10.1214/12-AOAS583. # https://projecteuclid.org/euclid.aoas/1365527200. test_that("lm_lin recreates Lin 2013 Table 2", { data("alo_star_men") ## Table 2 # Lin uses "classic sandwich," or in our package, HC0 # unadjusted, Lin est = -0.036, se = 0.158 expect_equivalent( round( tidy( lm_robust( GPA_year1 ~ sfsp, data = alo_star_men, se_type = "HC0" ) )[2, c("estimate", "std.error")], 3 ), c(-0.036, 0.158) ) # usual adjusted for HS gpa, Lin est = -0.083, se = 0.146 expect_equivalent( unlist(round( tidy( lm_robust( GPA_year1 ~ sfsp + gpa0, data = alo_star_men, se_type = "HC0" ) )[2, c("estimate", "std.error")], 3 )), c(-0.083, 0.146) ) # interaction adjusted, Lin est = -0.081, se = 0.146 expect_equivalent( unlist(round( tidy( lm_lin( GPA_year1 ~ sfsp, covariates = ~ gpa0, data = alo_star_men, se_type = "HC0" ) )[2, c("estimate", "std.error")], 3 )), c(-0.081, 0.146) ) }) ## Table 3 too long to run rep_table_3 <- FALSE if (rep_table_3) { data("alo_star_men") ## Table 3 samp_dat <- alo_star_men its <- 250000 set.seed(161235) check_cover <- function(obj, point = 0) { return(obj$conf.low[2] < point & obj$conf.high[2] > point) } ci_dist <- function(obj) { return(obj$conf.high[2] - obj$conf.low[2]) } ci_custom <- function(obj) { return(list( conf.high = coef(obj)[2] + obj$std.error[2] * 1.96, conf.low = coef(obj)[2] - obj$std.error[2] * 1.96 )) } ses <- c("HC0", "HC1", "HC2", "HC3") ests <- matrix( NA, nrow = its, ncol = 3 ) sd_mats <- cover_mats <- width_mats <- array( NA, dim = c(its, length(ses), 3) ) for (i in 1:its) { samp_dat$sfsp <- sample(samp_dat$sfsp) sd_mat <- cover_mat <- width_mat <- matrix( NA, nrow = length(ses), ncol = 3 ) for (j in seq_along(ses)) { unadj <- lm_robust( GPA_year1 ~ sfsp, data = samp_dat, se_type = ses[j] ) tradadj <- lm_robust( GPA_year1 ~ sfsp + gpa0, data = samp_dat, se_type = ses[j] ) intadj <- lm_lin( GPA_year1 ~ sfsp, covariates = ~ gpa0, data = samp_dat, se_type = ses[j] ) sd_mat[j, ] <- c(unadj$std.error[2], tradadj$std.error[2], intadj$std.error[2]) cover_mat[j, ] <- c( check_cover(ci_custom(unadj)), check_cover(ci_custom(tradadj)), check_cover(ci_custom(intadj)) ) width_mat[j, ] <- c( ci_dist(ci_custom(unadj)), ci_dist(ci_custom(tradadj)), ci_dist(ci_custom(intadj)) ) } ests[i, ] <- c(coef(unadj)[2], coef(tradadj)[2], coef(intadj)[2]) sd_mats[i, , ] <- sd_mat cover_mats[i, , ] <- cover_mat width_mats[i, , ] <- width_mat if (i %% 1000 == 0) print(i) } # Panel A colMeans(ests) # Panel B apply(sd_mats, c(2, 3), mean) - apply(ests, 2, sd) # Panel C apply(sd_mats, c(2, 3), sd) # Panel D, not replicated because he uses normal dist. while we use t dist, all slightly larger apply(cover_mats, c(2, 3), mean) # Panel E, not replicated because he uses normal dist. while we use t dist, all slightly larger apply(width_mats, c(2, 3), mean) } estimatr/tests/testthat/test-lm-lin.R0000644000176200001440000002360314760370122017402 0ustar liggesuserscontext("Estimator - lm_lin") test_that("Test LM Lin", { dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), X1 = rnorm(100), X2 = rbinom(100, 1, .5), cluster = sample(1:10, size = 100, replace = T) ) lm_lin_out <- lm_lin(Y ~ Z, covariates = ~ X1 + X2, data = dat) expect_error( lm_lin_out <- lm_lin(Y ~ Z, data = dat, covariates = ~ X1 + X2), NA ) expect_error( lm_lin( Y ~ Z + X1, covariates = ~ X2, data = dat ), "must only have the treatment variable on the right-hand side of the formula" ) dat2 <- dat dat2$Z <- rnorm(100) # For now we allow huge multi-valued treatments, but output is wonky # expect_error( # lm_lin(Y ~ Z, # covariates = ~ X1, # data = dat2), # 'binary' # ) expect_error( lm_lin( Y ~ Z, covariates = Y ~ X1, data = dat ), "right-sided formula" ) # Now allows multi-valued treatments expect_error( lm_lin( Y ~ factor(cluster), ~ X1, data = dat ), NA ) expect_error( lm_lin(Y ~ treat, dat$X1, data = dat), "must be specified as a formula" ) expect_error( lm_lin(Y ~ treat, ~ 1, data = dat), "variable on the right-hand side" ) # works with one covar expect_error( lm_lin( Y ~ Z, covariates = ~ X1, data = dat ), NA ) dat$X1_c <- dat$X1 - mean(dat$X1) dat$X2_c <- dat$X2 - mean(dat$X2) lm_rob_out <- lm_robust(Y ~ Z + Z * X1_c + Z * X2_c, data = dat) expect_equal( tidy(lm_lin_out), tidy(lm_rob_out) ) expect_equivalent( coef(lm_lin_out), coef(lm(Y ~ Z + Z * X1_c + Z * X2_c, data = dat)) ) ## Works with multi-valued treatments dat$Z_mult <- rep_len(1:3, length.out = 100) test_mult <- function(treatment_type, dat) { dat$Z_mult <- switch(treatment_type, int = rep_len(1:3, length.out = 100), num = rep_len(c(1.5, 2.5, 5), length.out = 100), char = rep_len(letters[1:4], length.out = 100), bin = rbinom(100, 1, 0.5) ) mult_out <- lm_lin( Y ~ Z_mult, covariates = ~ X1 + X2, data = dat ) fact_mult_out <- lm_lin( Y ~ factor(Z_mult), covariates = ~ X1 + X2, data = dat ) noint_mult_out <- lm_lin( Y ~ Z_mult + 0, covariates = ~ X1 + X2, data = dat ) noint_fact_mult_out <- lm_lin( Y ~ factor(Z_mult) + 0, covariates = ~ X1 + X2, data = dat ) expect_equal( tidy(mult_out)[, -1], tidy(fact_mult_out)[, -1] ) rob_fact_mult_out <- lm_robust(Y ~ factor(Z_mult) * X1_c + factor(Z_mult) * X2_c, data = dat) expect_equal( tidy(fact_mult_out), tidy(rob_fact_mult_out) ) # Also works with no intercept! expect_equal( tidy(noint_mult_out)[, -1], tidy(noint_fact_mult_out)[, -1] ) } test_mult("int", dat) test_mult("num", dat) test_mult("char", dat) # test_mult("bin", dat) this gives weird answers because it isn't really valid when not a factor! ## Works with missing data in treatment dat$Z[23] <- NA dat$X1_c <- dat$X1 - mean(dat$X1[-23]) dat$X2_c <- dat$X2 - mean(dat$X2[-23]) expect_equal( tidy(lm_lin( Y ~ Z, covariates = ~ X1 + X2, data = dat )), tidy(lm_robust( Y ~ Z + Z * X1_c + Z * X2_c, data = dat )) ) ## Test cluster passes through expect_equal( tidy(lm_lin( Y ~ Z, covariates = ~ X1 + X2, data = dat, clusters = cluster )), tidy(lm_robust( Y ~ Z + Z * X1_c + Z * X2_c, data = dat, clusters = cluster )) ) ## Test that it works with subset keep <- setdiff(which(dat$Y > 0), 23) dat$X1_c <- dat$X1 - mean(dat$X1[keep]) dat$X2_c <- dat$X2 - mean(dat$X2[keep]) expect_equal( tidy(lm_lin( Y ~ Z, covariates = ~ X1 + X2, data = dat, clusters = cluster, subset = Y > 0 )), tidy(lm_robust( Y ~ Z + Z * X1_c + Z * X2_c, data = dat, clusters = cluster, subset = Y > 0 )) ) # Works with factors dat <- data.frame( Y = rnorm(100), treat = as.factor(rbinom(100, 1, .5)), X1 = rnorm(100), X2 = as.factor(rbinom(100, 1, .5)), cluster = sample(1:10, size = 100, replace = T) ) dat$X1_c <- dat$X1 - mean(dat$X1) dat$X21_c <- as.numeric(dat$X2 == 1) - mean(dat$X2 == 1) expect_equivalent( tidy(lm_lin( Y ~ treat, covariates = ~ X1 + X2, data = dat, clusters = cluster )), tidy(lm_robust( Y ~ treat + treat * X1_c + treat * X21_c, data = dat, clusters = cluster )) ) ## Works with a factor with spaces in the name (often the case for clusters) dat$X2 <- as.factor(sample( c("This is a level", "Get on my level"), size = 100, replace = T )) ## for lm_robust dat$X2_c <- as.numeric(dat$X2 == "This is a level") - mean(dat$X2 == "This is a level") ## Names will differ expect_equivalent( tidy( lm_lin( Y ~ treat, covariates = ~ X1 + X2, data = dat, clusters = cluster ) )[, -1], tidy( lm_robust( Y ~ treat + treat * X1_c + treat * X2_c, data = dat, clusters = cluster ) )[, -1] ) ## Works with missingness on cluster dat$cluster[40] <- NA dat$X1_c <- dat$X1 - mean(dat$X1[-40]) dat$X2_c <- as.numeric(dat$X2 == "This is a level") - mean(dat$X2[-40] == "This is a level") expect_warning( lin_out <- lm_lin( Y ~ treat, covariates = ~ X1 + X2, data = dat, clusters = cluster ), "missingness in the cluster" ) expect_warning( rob_out <- lm_robust( Y ~ treat + treat * X1_c + treat * X2_c, data = dat, clusters = cluster ), "missingness in the cluster" ) # drop coefficient name because names will differ expect_equivalent( tidy(lin_out)[, -1], tidy(rob_out)[, -1] ) # rank deficient cases dat$treat2 <- dat$treat dat$X1_2 <- dat$X1 lm_lin(Y ~ treat, ~ treat2 + X1, data = dat) # somewhat odd behavior expect_equivalent( is.na(coef(lm_lin(Y ~ treat, ~ X1_2 + X1, data = dat))), c(FALSE, FALSE, FALSE, TRUE, FALSE, TRUE) ) # Does lm_lin parse covariate names correctly? # Previously it dropped the factor(), now properly builds factor lmlo <- lm_lin(Y ~ treat, ~factor(cluster) + X1, data = dat) expect_equal( nrow(tidy(lmlo)), 22 ) # works with a binary with no intercept (bit odd, though!) dat$z <- rbinom(nrow(dat), 1, 0.5) lmlo <- lm_lin(Y ~ z + 0, ~X1, data = dat) expect_equal( lmlo$term, c("z0", "z1", "z0:X1_c", "z1:X1_c") ) # works with a multi-value treatment with no intercept dat$z <- factor(rbinom(nrow(dat), 2, 0.5)) lmlo <- lm_lin(Y ~ z + 0, ~X1, data = dat) expect_equal( lmlo$term, c("z0", "z1", "z2", "z0:X1_c", "z1:X1_c", "z2:X1_c") ) # behaves correctly with "odd" covariates (see issue #283) dat$z <- rbinom(nrow(dat), 1, 0.5) lmlo <- lm_lin(Y ~ z, ~ is.na(X1), data = dat) expect_equal( lmlo$term, c("(Intercept)", "z", "(is.na(X1)TRUE)_c", "z:(is.na(X1)TRUE)_c") ) }) test_that("lm_lin same as sampling perspective", { # Unweighted matches sampling view lmo <- lm_lin(mpg ~ am, ~ hp, data = mtcars) m_hp <- mean(mtcars$hp) areg <- lm(mpg ~ hp, data = mtcars, subset = am == 1) breg <- lm(mpg ~ hp, data = mtcars, subset = am == 0) ate <- with(mtcars[mtcars$am == 1, ], mean(mpg) + (m_hp - mean(hp)) * coef(areg)[2]) - with(mtcars[mtcars$am == 0, ], mean(mpg) + (m_hp - mean(hp)) * coef(breg)[2]) expect_equivalent( ate, coef(lmo)["am"] ) }) test_that("weighted lm_lin same as with one covar sampling view", { # Weighted matches (one covar) lmwo <- lm_lin(mpg ~ am, ~ hp, weights = wt, data = mtcars) hp_wmean <- weighted.mean(mtcars$hp, mtcars$wt) wareg <- lm(mpg ~ hp, data = mtcars, subset = am == 1, weights = wt) wbreg <- lm(mpg ~ hp, data = mtcars, subset = am == 0, weights = wt) wate <- with(mtcars[mtcars$am == 1, ], weighted.mean(mpg, wt) + (hp_wmean - weighted.mean(hp, wt)) * coef(wareg)[2]) - with(mtcars[mtcars$am == 0, ], weighted.mean(mpg, wt) + (hp_wmean - weighted.mean(hp, wt)) * coef(wbreg)[2]) expect_equivalent( wate, coef(lmwo)["am"] ) }) test_that("weighted lm_lin same as with two covar sampling view", { # Weighted matches (two covars) lmw2o <- lm_lin(mpg ~ am, ~ hp + cyl, weights = wt, data = mtcars) hpcyl_wmean <- apply(mtcars[, c("hp", "cyl")], 2, weighted.mean, mtcars$wt) w2areg <- lm(mpg ~ hp + cyl, data = mtcars, subset = am == 1, weights = wt) w2breg <- lm(mpg ~ hp + cyl, data = mtcars, subset = am == 0, weights = wt) w2ate <- with(mtcars[mtcars$am == 1, ], weighted.mean(mpg, wt) + (hpcyl_wmean - apply(cbind(hp, cyl), 2, weighted.mean, wt)) %*% coef(w2areg)[2:3]) - with(mtcars[mtcars$am == 0, ], weighted.mean(mpg, wt) + (hpcyl_wmean - apply(cbind(hp, cyl), 2, weighted.mean, wt)) %*% coef(w2breg)[2:3]) expect_equivalent( w2ate, coef(lmw2o)["am"] ) }) test_that("lm_lin properly renames trickily named variables", { # lm_lin should add parentheses around variables that have colons in their name # or that have parentheses in the name that are not in the first position lo <- lm_lin(mpg ~ am, ~ wt*cyl + log(wt), mtcars) expect_equal( lo$term, c("(Intercept)", "am", "wt_c", "cyl_c", "(log(wt))_c", "(wt:cyl)_c", "am:wt_c", "am:cyl_c", "am:(log(wt))_c", "am:(wt:cyl)_c") ) }) test_that("lm_lin works with multiple outcomes", { lmpg <- lm_lin(mpg ~ am, ~ cyl, mtcars) lwt <- lm_lin(wt ~ am, ~ cyl, mtcars) lboth <- lm_lin(cbind(mpg, wt) ~ am, ~ cyl, mtcars) expect_equivalent( tidy(lmpg), tidy(lboth)[1:4, ] ) expect_equivalent( tidy(lwt), tidy(lboth)[5:8, ] ) expect_equivalent( lboth$fstatistic[1:2], c(lmpg$fstatistic[1], lwt$fstatistic[1]) ) }) estimatr/tests/testthat/test-sig-testing.R0000644000176200001440000000064114747205231020447 0ustar liggesuserscontext("Helper - significance testing") test_that("Errors properly", { dat <- data.frame( mps = rep(1:4, each = 2), y = rnorm(8), z = c(0, 1) ) expect_warning( lm_lin(y ~ z, ~ factor(mps), data = dat), "Some degrees of freedom have been estimated as negative or zero" ) expect_error( lm_robust(y ~ z, data = dat, alpha = 10), "`alpha` must be numeric between 0 and 1" ) }) estimatr/tests/testthat/stata-iv-diagnostics.txt0000644000176200001440000002306014747205231021710 0ustar liggesusers(hp = wt);;small;weak1;1;30;22.998737;.00004146 (hp = wt);;small;endog;1;29;37.5609;1.120e-06 (hp = wt);;small;overid;.;.;.;. (hp = wt);[aweight = w];small;weak1;1;30;24.131227;.00002984 (hp = wt);[aweight = w];small;endog;1;29;37.410397;1.158e-06 (hp = wt);[aweight = w];small;overid;.;.;.;. (hp = wt);;rob;weak1;1;30;60.763169;1.068e-08 (hp = wt);;rob;endog;1;29;35.460434;1.803e-06 (hp = wt);;rob;overid;.;.;.;. (hp = wt);[aweight = w];rob;weak1;1;30;56.623663;2.175e-08 (hp = wt);[aweight = w];rob;endog;1;29;35.955804;1.609e-06 (hp = wt);[aweight = w];rob;overid;.;.;.;. (hp = wt);;cluster(cyl);weak1;1;2;111.06094;.00888425 (hp = wt);;cluster(cyl);endog;1;2;30.699387;.0310641 (hp = wt);;cluster(cyl);overid;.;.;.;. (hp = wt);[aweight = w];cluster(cyl);weak1;1;2;193.07404;.00513947 (hp = wt);[aweight = w];cluster(cyl);endog;1;2;31.848856;.02999288 (hp = wt);[aweight = w];cluster(cyl);overid;.;.;.;. (hp = wt);;small noconstant;weak1;1;31;282.49539;3.967e-17 (hp = wt);;small noconstant;endog;1;30;13.040548;.00109784 (hp = wt);;small noconstant;overid;.;.;.;. (hp = wt);[aweight = w];small noconstant;weak1;1;31;264.18222;1.012e-16 (hp = wt);[aweight = w];small noconstant;endog;1;30;14.714903;.00059805 (hp = wt);[aweight = w];small noconstant;overid;.;.;.;. (hp = wt);;rob noconstant;weak1;1;31;271.63797;6.863e-17 (hp = wt);;rob noconstant;endog;1;30;21.814041;.00005897 (hp = wt);;rob noconstant;overid;.;.;.;. (hp = wt);[aweight = w];rob noconstant;weak1;1;31;241.31276;3.547e-16 (hp = wt);[aweight = w];rob noconstant;endog;1;30;24.632111;.00002586 (hp = wt);[aweight = w];rob noconstant;overid;.;.;.;. (hp = wt);;cluster(cyl) noconstant;weak1;1;2;88.701936;.01108658 (hp = wt);;cluster(cyl) noconstant;endog;1;2;3.9831298;.18407918 (hp = wt);;cluster(cyl) noconstant;overid;.;.;.;. (hp = wt);[aweight = w];cluster(cyl) noconstant;weak1;1;2;71.014042;.01379109 (hp = wt);[aweight = w];cluster(cyl) noconstant;endog;1;2;4.355972;.17215019 (hp = wt);[aweight = w];cluster(cyl) noconstant;overid;.;.;.;. (hp am = wt gear);;small;weak1;2;29;16.699987;.00001495 (hp am = wt gear);;small;weak2;2;29;35.541819;1.583e-08 (hp am = wt gear);;small;endog;2;27;4.9756537;.014468 (hp am = wt gear);;small;overid;.;.;.;. (hp am = wt gear);[aweight = w];small;weak1;2;29;17.879363;8.729e-06 (hp am = wt gear);[aweight = w];small;weak2;2;29;34.107361;2.414e-08 (hp am = wt gear);[aweight = w];small;endog;2;27;4.8550937;.0158049 (hp am = wt gear);[aweight = w];small;overid;.;.;.;. (hp am = wt gear);;rob;weak1;2;29;27.565656;1.963e-07 (hp am = wt gear);;rob;weak2;2;29;77.611179;2.277e-12 (hp am = wt gear);;rob;endog;2;27;5.2051983;.01224657 (hp am = wt gear);;rob;overid;.;.;.;. (hp am = wt gear);[aweight = w];rob;weak1;2;29;27.980359;1.703e-07 (hp am = wt gear);[aweight = w];rob;weak2;2;29;73.995899;4.069e-12 (hp am = wt gear);[aweight = w];rob;endog;2;27;4.9419293;.01482928 (hp am = wt gear);[aweight = w];rob;overid;.;.;.;. (hp am = wt gear);;cluster(cyl);weak1;2;2;960.39475;.00104016 (hp am = wt gear);;cluster(cyl);weak2;2;2;1878.568;.00053204 (hp am = wt gear);;cluster(cyl);endog;2;2;4.803949;.17229648 (hp am = wt gear);;cluster(cyl);overid;.;.;.;. (hp am = wt gear);[aweight = w];cluster(cyl);weak1;2;2;327.10182;.00304783 (hp am = wt gear);[aweight = w];cluster(cyl);weak2;2;2;977.17346;.00102231 (hp am = wt gear);[aweight = w];cluster(cyl);endog;2;2;3.8882561;.20457193 (hp am = wt gear);[aweight = w];cluster(cyl);overid;.;.;.;. (hp am = wt gear);;small noconstant;weak1;2;30;140.11773;6.046e-16 (hp am = wt gear);;small noconstant;weak2;2;30;68.76829;6.240e-12 (hp am = wt gear);;small noconstant;endog;2;28;74.999626;5.680e-12 (hp am = wt gear);;small noconstant;overid;.;.;.;. (hp am = wt gear);[aweight = w];small noconstant;weak1;2;30;129.98801;1.665e-15 (hp am = wt gear);[aweight = w];small noconstant;weak2;2;30;75.861218;1.844e-12 (hp am = wt gear);[aweight = w];small noconstant;endog;2;28;74.133263;6.514e-12 (hp am = wt gear);[aweight = w];small noconstant;overid;.;.;.;. (hp am = wt gear);;rob noconstant;weak1;2;30;198.51053;5.013e-18 (hp am = wt gear);;rob noconstant;weak2;2;30;76.755646;1.592e-12 (hp am = wt gear);;rob noconstant;endog;2;28;90.63959;5.889e-13 (hp am = wt gear);;rob noconstant;overid;.;.;.;. (hp am = wt gear);[aweight = w];rob noconstant;weak1;2;30;169.2255;4.583e-17 (hp am = wt gear);[aweight = w];rob noconstant;weak2;2;30;86.85906;3.322e-13 (hp am = wt gear);[aweight = w];rob noconstant;endog;2;28;91.067515;5.562e-13 (hp am = wt gear);[aweight = w];rob noconstant;overid;.;.;.;. (hp am = wt gear);;cluster(cyl) noconstant;weak1;2;2;125.30453;.00791737 (hp am = wt gear);;cluster(cyl) noconstant;weak2;2;2;13504.401;.00007404 (hp am = wt gear);;cluster(cyl) noconstant;endog;2;2;260.8136;.00381951 (hp am = wt gear);;cluster(cyl) noconstant;overid;.;.;.;. (hp am = wt gear);[aweight = w];cluster(cyl) noconstant;weak1;2;2;104.47309;.00948109 (hp am = wt gear);[aweight = w];cluster(cyl) noconstant;weak2;2;2;8223.5428;.00012159 (hp am = wt gear);[aweight = w];cluster(cyl) noconstant;endog;2;2;249.68439;.00398908 (hp am = wt gear);[aweight = w];cluster(cyl) noconstant;overid;.;.;.;. gear (hp = wt);;small;weak1;1;29;32.413956;3.706e-06 gear (hp = wt);;small;endog;1;28;14.269359;.00076059 gear (hp = wt);;small;overid;.;.;.;. gear (hp = wt);[aweight = w];small;weak1;1;29;34.922532;2.042e-06 gear (hp = wt);[aweight = w];small;endog;1;28;15.079776;.00057414 gear (hp = wt);[aweight = w];small;overid;.;.;.;. gear (hp = wt);;rob;weak1;1;29;28.881051;8.971e-06 gear (hp = wt);;rob;endog;1;28;16.492556;.00035663 gear (hp = wt);;rob;overid;.;.;.;. gear (hp = wt);[aweight = w];rob;weak1;1;29;31.149152;5.054e-06 gear (hp = wt);[aweight = w];rob;endog;1;28;16.768105;.00032565 gear (hp = wt);[aweight = w];rob;overid;.;.;.;. gear (hp = wt);;cluster(cyl);weak1;1;2;675.0204;.00147815 gear (hp = wt);;cluster(cyl);endog;1;2;8.4945694;.10031937 gear (hp = wt);;cluster(cyl);overid;.;.;.;. gear (hp = wt);[aweight = w];cluster(cyl);weak1;1;2;131.09104;.0075421 gear (hp = wt);[aweight = w];cluster(cyl);endog;1;2;8.5046145;.10021809 gear (hp = wt);[aweight = w];cluster(cyl);overid;.;.;.;. gear (hp = wt);;small noconstant;weak1;1;30;39.331604;6.544e-07 gear (hp = wt);;small noconstant;endog;1;29;3.9155489;.05740796 gear (hp = wt);;small noconstant;overid;.;.;.;. gear (hp = wt);[aweight = w];small noconstant;weak1;1;30;37.965391;8.883e-07 gear (hp = wt);[aweight = w];small noconstant;endog;1;29;3.812533;.06058323 gear (hp = wt);[aweight = w];small noconstant;overid;.;.;.;. gear (hp = wt);;rob noconstant;weak1;1;30;94.273346;9.051e-11 gear (hp = wt);;rob noconstant;endog;1;29;4.9821876;.0334979 gear (hp = wt);;rob noconstant;overid;.;.;.;. gear (hp = wt);[aweight = w];rob noconstant;weak1;1;30;94.145441;9.193e-11 gear (hp = wt);[aweight = w];rob noconstant;endog;1;29;5.2815305;.02895546 gear (hp = wt);[aweight = w];rob noconstant;overid;.;.;.;. gear (hp = wt);;cluster(cyl) noconstant;weak1;1;2;78.227076;.01254329 gear (hp = wt);;cluster(cyl) noconstant;endog;1;2;12.722345;.0704022 gear (hp = wt);;cluster(cyl) noconstant;overid;.;.;.;. gear (hp = wt);[aweight = w];cluster(cyl) noconstant;weak1;1;2;97.948106;.01005575 gear (hp = wt);[aweight = w];cluster(cyl) noconstant;endog;1;2;9.4946134;.09115158 gear (hp = wt);[aweight = w];cluster(cyl) noconstant;overid;.;.;.;. gear (hp = wt am);;small;weak1;2;28;16.465278;.00001873 gear (hp = wt am);;small;endog;1;28;11.127253;.00240949 gear (hp = wt am);;small;overid;1;.;1.9409554;.16356483 gear (hp = wt am);[aweight = w];small;weak1;2;28;17.974405;9.518e-06 gear (hp = wt am);[aweight = w];small;endog;1;28;11.349395;.00221295 gear (hp = wt am);[aweight = w];small;overid;1;.;2.1540878;.14219087 gear (hp = wt am);;rob;weak1;2;28;12.963381;.0001035 gear (hp = wt am);;rob;endog;1;28;12.930604;.00122733 gear (hp = wt am);;rob;overid;1;.;1.6113311;.20430479 gear (hp = wt am);[aweight = w];rob;weak1;2;28;14.174797;.00005594 gear (hp = wt am);[aweight = w];rob;endog;1;28;12.433246;.00147299 gear (hp = wt am);[aweight = w];rob;overid;1;.;1.8448118;.17438803 gear (hp = wt am);;cluster(cyl);weak1;2;2;2299.2858;.00043473 gear (hp = wt am);;cluster(cyl);endog;1;2;5.9125743;.13557085 gear (hp = wt am);;cluster(cyl);overid;.;.;.;. gear (hp = wt am);[aweight = w];cluster(cyl);weak1;2;2;12866.727;.00007771 gear (hp = wt am);[aweight = w];cluster(cyl);endog;1;2;5.4014526;.14572676 gear (hp = wt am);[aweight = w];cluster(cyl);overid;.;.;.;. gear (hp = wt am);;small noconstant;weak1;2;29;20.821094;2.474e-06 gear (hp = wt am);;small noconstant;endog;1;29;3.6331882;.06659292 gear (hp = wt am);;small noconstant;overid;1;.;.23214555;.62993748 gear (hp = wt am);[aweight = w];small noconstant;weak1;2;29;20.134738;3.288e-06 gear (hp = wt am);[aweight = w];small noconstant;endog;1;29;3.9025848;.05779711 gear (hp = wt am);[aweight = w];small noconstant;overid;1;.;.04860186;.82551425 gear (hp = wt am);;rob noconstant;weak1;2;29;42.220045;2.574e-09 gear (hp = wt am);;rob noconstant;endog;1;29;4.1280347;.05142994 gear (hp = wt am);;rob noconstant;overid;1;.;.29829419;.58495379 gear (hp = wt am);[aweight = w];rob noconstant;weak1;2;29;39.812501;4.828e-09 gear (hp = wt am);[aweight = w];rob noconstant;endog;1;29;5.0398417;.03256522 gear (hp = wt am);[aweight = w];rob noconstant;overid;1;.;.06945184;.79213657 gear (hp = wt am);;cluster(cyl) noconstant;weak1;2;2;739.99192;.00134954 gear (hp = wt am);;cluster(cyl) noconstant;endog;1;2;30.775677;.03099064 gear (hp = wt am);;cluster(cyl) noconstant;overid;.;.;.;. gear (hp = wt am);[aweight = w];cluster(cyl) noconstant;weak1;2;2;233.38758;.00426644 gear (hp = wt am);[aweight = w];cluster(cyl) noconstant;endog;1;2;52.241925;.01860907 gear (hp = wt am);[aweight = w];cluster(cyl) noconstant;overid;.;.;.;. estimatr/tests/testthat/test-return.R0000644000176200001440000000373214747205231017535 0ustar liggesuserscontext("Output - test similiarity across estimators") test_that("Structure of output is the same", { n <- 40 dat <- data.frame( y = rnorm(n), z = c(0, 0, rep(0:1, times = 9)), x = rnorm(n), bl = rep(1:4, each = 10), cl = rep(1:20, each = 2) ) # Should be in all estimator returns in_return <- c( "coefficients", "std.error", "df", "p.value", "conf.low", "conf.high", "outcome", "alpha", "nobs" ) lmr_o <- lm_robust(y ~ z, data = dat) lmr_cl_o <- lm_robust(y ~ z, data = dat, clusters = cl) lml_o <- lm_lin(y ~ z, ~ x, data = dat) lml_cl_o <- lm_lin(y ~ z, ~ x, data = dat) # Major branching for diff estimators is for blocks ht_o <- horvitz_thompson(y ~ z, data = dat) ht_bl_o <- horvitz_thompson(y ~ z, blocks = bl, data = dat) dim_o <- difference_in_means(y ~ z, data = dat) dim_bl_o <- difference_in_means(y ~ z, blocks = bl, data = dat) expect_true(all(in_return %in% names(lmr_o))) expect_true(all(in_return %in% names(lmr_cl_o))) expect_true(all(in_return %in% names(lml_o))) expect_true(all(in_return %in% names(lml_cl_o))) expect_true(all(in_return %in% names(dim_o))) expect_true(all(in_return %in% names(dim_bl_o))) expect_true(all(in_return %in% names(ht_o))) expect_true(all(in_return %in% names(ht_bl_o))) expect_equal( colnames(tidy(lmr_o)), colnames(tidy(lmr_cl_o)), colnames(tidy(lml_o)), colnames(tidy(lml_cl_o)), colnames(tidy(ht_o)), colnames(tidy(ht_bl_o)), colnames(tidy(dim_o)), colnames(tidy(dim_bl_o)) ) expect_equal( difference_in_means(y ~ z - 1, data = dat)$term, "z" ) }) # test_that("Warns properly if df is negative or 0", { # dat = data.frame(y = 1, z = 1, p = .5) # I can't come up with a meaningful test for this now # This should never happen I don't think # expect_warning( # (y ~ z, data = dat, condition_prs = p), # "Estimated negative or zero degrees of freedom" # ) # }) estimatr/tests/testthat/test-difference-in-means.R0000644000176200001440000004553114747205231022020 0ustar liggesusers context("Estimator - difference_in_means") test_that("DIM", { dat <- data.frame(Y = rnorm(100), Z = sample(1:3, 100, replace = TRUE), X = rnorm(100)) difference_in_means(Y ~ Z, condition1 = 1, condition2 = 2, data = dat) difference_in_means(Y ~ Z, condition1 = 2, condition2 = 1, data = dat) difference_in_means(Y ~ Z, condition1 = 3, condition2 = 1, data = dat) dimo <- difference_in_means(Y ~ Z, condition1 = 3, condition2 = 2, data = dat) expect_equal( dimo$design, "Standard" ) }) test_that("DIM arguments parsed correctly", { dat <- data.frame(Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100)) expect_equivalent( as.matrix(tidy(difference_in_means( Y ~ Z, data = dat, ci = FALSE ))[, c("p.value", "conf.low", "conf.high")]), matrix(NA, nrow = 1, ncol = 3) ) expect_error( difference_in_means(Y ~ Z + X, data = dat), "must have only one variable on the right-hand side" ) dat$bl <- rep(1:10, each = 10) dat$bad_cl <- rep(1:10, times = 10) expect_error( difference_in_means(Y ~ Z, blocks = bl, clusters = bad_cl, data = dat), "All `clusters` must be contained within `blocks`" ) dat$bad_bl <- c(1, rep(2:10, length.out = 99)) expect_error( difference_in_means(Y ~ Z, blocks = bad_bl, data = dat), "All `blocks` must have multiple units" ) dat$bad_mp <- rep(1:50, each = 2) dat$bad_mp[dat$bad_mp == 50] <- 49 dat$Z <- 0:1 expect_warning( difference_in_means(Y ~ Z, blocks = bad_mp, data = dat), "Some `blocks` have two units/`clusters` while other blocks have more units/`clusters`" ) expect_error( difference_in_means(Y ~ Z + X, data = dat), "must have only one variable on the right-hand side" ) # not matched pair but has some blocks with only 1 treated bl <- rep(1:2, each = 4) z <- c(1, 0, 0, 0, 1, 1, 0, 0) y <- rnorm(8) expect_error( difference_in_means(y ~ z, blocks = bl), "If design is not pair\\-matched\\, every block must" ) }) test_that("DIM Blocked", { dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), block = sample(c("A", "B", "C"), 100, replace = TRUE) ) difference_in_means(Y ~ Z, blocks = block, data = dat) dim_normal <- difference_in_means(Y ~ Z, condition1 = 0, condition2 = 1, blocks = block, data = dat) dim_reverse <- difference_in_means(Y ~ Z, condition1 = 1, condition2 = 0, blocks = block, data = dat) expect_equal( tidy(dim_normal)[c("estimate", "std.error")], tidy(dim_reverse)[c("estimate", "std.error")] * c(-1, 1) ) difference_in_means(Y ~ Z, alpha = .05, blocks = block, data = dat) difference_in_means(Y ~ Z, alpha = .10, blocks = block, data = dat) expect_equal( dim_normal$design, "Blocked" ) # Blocks and conditions works dat <- data.frame( y = 1:12, z = rep(0:2, each = 4), bl = rep(1:2, times = 6) ) dim_01 <- difference_in_means(y ~ z, blocks = bl, data = dat, condition1 = "0", condition2 = "1") dim_01_num <- difference_in_means(y ~ z, blocks = bl, data = dat, condition1 = 0, condition2 = 1) expect_equal( condchr(rmcall(dim_01)), condchr(rmcall(dim_01_num)) ) expect_equivalent(dim_01$coefficients, 4) expect_equivalent( tidy(dim_01), tidy( difference_in_means(y ~ z, blocks = bl, data = dat, subset = z < 2) ) ) dim_02 <- difference_in_means(y ~ z, blocks = bl, data = dat, condition1 = 0, condition2 = 2) expect_equivalent(dim_02$coefficients, 8) expect_equivalent( tidy(dim_02), tidy( difference_in_means(y ~ z, blocks = bl, data = dat, subset = z != 1) ) ) }) test_that("DIM same as t.test", { # test df correction dat <- data.frame(Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100)) expect_equal( unlist( difference_in_means(Y ~ Z, data = dat)[c("p.value", "conf.low", "conf.high", "df")], F, F ), unlist( with(dat, t.test(Y[Z == 1], Y[Z == 0]))[c("p.value", "conf.int", "parameter")], F, F ) ) }) test_that("DIM Weighted", { n <- 100 dat <- data.frame(y = rnorm(n), z = 0:1, w = 1, bl = rep(1:10, each = 10)) dimw <- difference_in_means(y ~ z, weights = w, data = dat) difference_in_means(y ~ z, data = dat) dimbw <- difference_in_means(y ~ z, weights = w, blocks = bl, data = dat) difference_in_means(y ~ z, blocks = bl, data = dat) expect_equal( dimw$design, "Standard (weighted)" ) expect_equal( dimbw$design, "Blocked (weighted)" ) # Weights and conditions works dat <- data.frame( y = 1:12, z = rep(0:2, each = 4), w = rep(1:6, each = 2) ) dim_01 <- difference_in_means(y ~ z, weights = w, data = dat, condition1 = "0", condition2 = "1") expect_equivalent( tidy(dim_01), tidy( difference_in_means(y ~ z, weights = w, data = dat, subset = z < 2) ) ) dim_02 <- difference_in_means(y ~ z, weights = w, data = dat, condition1 = 0, condition2 = 2) expect_equivalent( tidy(dim_02), tidy( difference_in_means(y ~ z, weights = w, data = dat, subset = z != 1) ) ) }) test_that("DIM Clustered", { dat <- data.frame( weights = runif(100), weights2 = 1, J = rep(1:4, each = 25) ) dat$Y <- rnorm(100, mean = rep(rnorm(4, sd = sqrt(0.1)), each = 25), sd = sqrt(0.9)) dat$Z <- as.numeric(dat$J %in% 1:2) difference_in_means(Y ~ Z, alpha = .05, data = dat) dim_05 <- difference_in_means(Y ~ Z, alpha = .05, clusters = J, data = dat) dim_10 <- difference_in_means(Y ~ Z, alpha = .10, clusters = J, data = dat) expect_true(dim_05$conf.low < dim_10$conf.low) expect_equal( dim_10$design, "Clustered" ) # Clusters and conditions works dat <- data.frame( y = 1:12, z = rep(0:2, each = 4), cl = rep(1:6, each = 2) ) dim_01 <- difference_in_means(y ~ z, clusters = cl, data = dat, condition1 = "0", condition2 = "1") dim_01_num <- difference_in_means(y ~ z, clusters = cl, data = dat, condition1 = 0, condition2 = 1) expect_equal( condchr(rmcall(dim_01)), condchr(rmcall(dim_01_num)) ) expect_equivalent( tidy(dim_01), tidy( difference_in_means(y ~ z, clusters = cl, data = dat, subset = z < 2) ) ) expect_equivalent(dim_01$coefficients, 4) dim_02 <- difference_in_means(y ~ z, clusters = cl, data = dat, condition1 = 0, condition2 = 2) expect_equivalent(dim_02$coefficients, 8) expect_equivalent( tidy(dim_02), tidy( difference_in_means(y ~ z, clusters = cl, data = dat, subset = z != 1) ) ) }) test_that("DIM Pair Matched", { dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), weights = runif(100), weights2 = 1, block = rep(1:50, each = 2) ) expect_error( difference_in_means(Y ~ Z, alpha = .05, blocks = block, data = dat), "both treatment" ) dat$Z <- rep(0:1, 50) dim_mp <- difference_in_means(Y ~ Z, alpha = .05, blocks = block, data = dat) expect_equal( dim_mp$design, "Matched-pair" ) }) test_that("DIM Matched Pair Cluster Randomization", { dat <- data.frame( Y = rnorm(100), block = rep(1:25, each = 4), cluster = as.character(rep(1:50, each = 2)), Z = rep(0:1, times = 50) ) expect_error( difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat ), "same treatment condition" ) dat$Z <- c(rep(rep(0:1, each = 4), 12), rep(0, 4)) expect_error( difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat ), "both treatment conditions" ) dat$Z <- rep(rep(0:1, each = 2), 25) dim_mpc <- difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat ) expect_equal( dim_mpc$design, "Matched-pair clustered" ) }) test_that("DIM Matched Pair Cluster Randomization = Matched Pair when cluster size is 1", { dat <- data.frame( Y = rnorm(100), block = rep(1:25, each = 4), cluster = 1:100, Z = rep(c(0, 0, 1, 1), times = 25) ) expect_equal( tidy(difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat )), tidy(difference_in_means( Y ~ Z, alpha = .05, blocks = block, data = dat )) ) }) test_that("DIM works with missingness", { dat <- data.frame( Y = rnorm(100), block = rep(1:2, each = 50), cluster = 1:100, Z = rep(c(0, 0, 1, 1), times = 25) ) ## Missingness on treatment dat$Z[23] <- NA expect_error( estimatr_dim_out <- difference_in_means( Y ~ Z, alpha = .05, blocks = block, data = dat ), NA ) expect_equal( estimatr_dim_out, difference_in_means( Y ~ Z, alpha = .05, blocks = block, data = dat[-23, ] ) ) ## Missingness on block dat$block[35] <- NA expect_warning( estimatr_missblock_dim <- difference_in_means( Y ~ Z, alpha = .05, blocks = block, data = dat ), "missingness in the block" ) expect_equal( estimatr_missblock_dim, difference_in_means( Y ~ Z, alpha = .05, blocks = block, data = dat[-c(23, 35), ] ) ) ## Missingness on cluster dat$cluster[1] <- NA expect_warning( estimatr_missclust_dim <- difference_in_means( Y ~ Z, alpha = .05, clusters = cluster, data = dat ), "missingness in the cluster" ) expect_equal( estimatr_missclust_dim, difference_in_means( Y ~ Z, alpha = .05, clusters = cluster, data = dat[-c(1, 23), ] ) ) }) test_that("DIM works with character args", { dat <- data.frame( Y = rnorm(100), block = rep(1:25, each = 4), cluster = 1:100, Z = rep(c(0, 0, 1, 1), times = 25) ) dim_unquote <- difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat ) dim_quote <- difference_in_means( Y ~ Z, alpha = .05, blocks = block, clusters = cluster, data = dat ) expect_equal( dim_unquote, dim_quote ) expect_equivalent( difference_in_means( Y ~ Z, alpha = .05, weights = cluster, data = dat ), difference_in_means( Y ~ Z, alpha = .05, weights = cluster, data = dat ) ) }) test_that("DIM unbiased", { dat <- data.frame( i = 1:10, Y0 = c( 2.1, 3.5, -131.2, -1.3, -4, 0.1, 8.1, -1.3, 1.1, 9.1 ), Y1 = c( 2.6, 3.0, -132, -0.7, -3.3, 0.5, 24.3, -1, 1.6, 0.3 ) ) # True SATE = 0.91 trueSATE <- mean(dat$Y1) - mean(dat$Y0) ## Complete Randomization # True se(SATE_hat) true_seSATE <- sqrt((var(dat$Y0) + var(dat$Y1) + 2 * cov(dat$Y0, dat$Y1)) / (10 - 1)) declaration <- randomizr::declare_ra(N = nrow(dat)) treatment_perms <- randomizr::obtain_permutation_matrix(declaration) ests <- apply( treatment_perms, 2, function(x) { dat$Z <- x dat$Y <- ifelse(dat$Z, dat$Y1, dat$Y0) dim <- difference_in_means(Y ~ Z, data = dat) coef(dim) } ) expect_equivalent( trueSATE, mean(ests) ) ## cluster randomized design, 5 blocks of 2 dat$cluster <- rep(1:5, each = 2) declaration <- randomizr::declare_ra( N = nrow(dat), clusters = dat$cluster ) treatment_perms <- randomizr::obtain_permutation_matrix(declaration) ests <- apply( treatment_perms, 2, function(x) { dat$Z <- x dat$Y <- ifelse(dat$Z, dat$Y1, dat$Y0) dim <- difference_in_means( Y ~ Z, clusters = cluster, data = dat ) coef(dim) } ) expect_equivalent( trueSATE, mean(ests) ) ## Matched pair design, 5 blocks of 2 dat$blocks <- rep(1:5, each = 2) declaration <- randomizr::declare_ra( N = nrow(dat), blocks = dat$blocks, block_m = rep(1, 5) ) treatment_perms <- randomizr::obtain_permutation_matrix(declaration) ests <- apply( treatment_perms, 2, function(x) { dat$Z <- x dat$Y <- ifelse(dat$Z, dat$Y1, dat$Y0) dim <- difference_in_means( Y ~ Z, blocks = blocks, data = dat ) coef(dim) } ) expect_equivalent( trueSATE, mean(ests) ) ## block randomized design, 2 blocks of 5 dat$blocks <- rep(1:2, each = 5) declaration <- randomizr::declare_ra( N = nrow(dat), blocks = dat$blocks, block_m = c(3, 3) ) treatment_perms <- randomizr::obtain_permutation_matrix(declaration) ests <- apply( treatment_perms, 2, function(x) { dat$Z <- x dat$Y <- ifelse(dat$Z, dat$Y1, dat$Y0) dim <- difference_in_means( Y ~ Z, blocks = blocks, data = dat ) coef(dim) } ) expect_equivalent( trueSATE, mean(ests) ) ## cluster matched pair, different sized blocks dat$blocks <- rep(1:3, times = c(4, 4, 2)) dat$clusters <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6) declaration <- randomizr::declare_ra( N = nrow(dat), blocks = dat$blocks, clusters = dat$clusters ) treatment_perms <- randomizr::obtain_permutation_matrix(declaration) ests <- apply( treatment_perms, 2, function(x) { dat$Z <- x dat$Y <- ifelse(dat$Z, dat$Y1, dat$Y0) dim <- difference_in_means( Y ~ Z, blocks = blocks, clusters = clusters, data = dat ) coef(dim) } ) expect_equivalent( trueSATE, mean(ests) ) }) test_that("DIM matches lm_robust under certain conditions", { n <- 400 dat <- data.frame(Y = rnorm(n)) ## DIM and lm_robust agree without clustering except for DoF because DIM uses Satterthwaite approx dat$z <- c(0, 1) lm_o <- lm_robust(Y ~ z, data = dat) dim_o <- difference_in_means(Y ~ z, data = dat) expect_equivalent( tidy(lm_o)[2, 1:3], tidy(dim_o)[, 1:3] ) ## DIM and lm_robust agree with clustering becuase DIM just uses lm_robust w/ CR2! dat$cl_diff_size <- sample(100, size = 400, replace = TRUE) dat$z_clustered <- as.numeric(dat$cl_diff_size <= 50) lm_cl_o <- lm_robust(Y ~ z_clustered, clusters = cl_diff_size, data = dat) dim_cl_o <- difference_in_means(Y ~ z_clustered, clusters = cl_diff_size, data = dat) expect_equivalent( tidy(lm_cl_o)[2, ], tidy(dim_cl_o) ) ## Blocked design is equivalent to lm_lin dat$bl <- rep(1:25, each = 16) dat$z_blocked <- rep(c(0, 1), each = 2) lm_bl_o <- lm_lin(Y ~ z_blocked, ~ factor(bl), data = dat) dim_bl_o <- difference_in_means(Y ~ z_blocked, blocks = bl, data = dat) # Not identical since row name of lm_bl_o is 2 due to the intercept expect_equivalent( tidy(lm_bl_o)[2, ], tidy(dim_bl_o) ) ## Block-clustered is equivalent to lm_lin ## (and indeed uses lm_robust machinery for the ests and ses, the DoF are equivalent with equal clusters ## by design dat$cl <- rep(1:200, each = 2) lm_blcl_o <- lm_lin(Y ~ z_blocked, ~ factor(bl), data = dat, clusters = cl) dim_blcl_o <- difference_in_means(Y ~ z_blocked, blocks = bl, clusters = cl, data = dat) # Not identical since row name of lm_bl_o is 2 due to the intercept expect_equivalent( tidy(lm_blcl_o)[2, ], tidy(dim_blcl_o) ) # With weights now, identical to lm_robust, HC2 by force! except for matched pairs which fails dat$w <- runif(nrow(dat)) # simple W expect_equivalent( tidy(lm_robust(Y ~ z, data = dat, weights = w))[2, ], tidy(difference_in_means(Y ~ z, data = dat, weights = w)) ) # blocked W expect_equivalent( tidy(lm_lin(Y ~ z_blocked, ~ factor(bl), weights = w, data = dat))[2, ], tidy(difference_in_means(Y ~ z_blocked, data = dat, blocks = bl, weights = w)) ) # blocked-clustered W (goes to CR2) # DF different in clustered case dim_bl_cl_w <- difference_in_means( Y ~ z_blocked, data = dat, clusters = cl, blocks = bl, weights = w ) expect_equivalent( tidy(lm_lin(Y ~ z_blocked, ~ factor(bl), clusters = cl, weights = w, data = dat))[2, 1:3], tidy(dim_bl_cl_w)[, 1:3] ) expect_equal( dim_bl_cl_w$design, "Block-clustered (weighted)" ) # Clustered W dim_cl_w <- difference_in_means( Y ~ z_clustered, data = dat, clusters = cl_diff_size, weights = w ) expect_equivalent( tidy(lm_robust(Y ~ z_clustered, clusters = cl_diff_size, weights = w, data = dat))[2, 1:3], tidy(dim_cl_w)[, 1:3] ) expect_equal( dim_cl_w$design, "Clustered (weighted)" ) # errors with matched pairs dat$mps <- rep(1:(n / 2), each = 2) dat$z_mps <- rep(0:1, times = (n / 2)) expect_error( difference_in_means(Y ~ z_mps, data = dat, weights = w, blocks = mps), "Cannot use `weights` with matched pairs design at the moment" ) }) test_that("se_type = none works", { # simple dim <- difference_in_means(mpg ~ am, data = mtcars) dim_no <- difference_in_means(mpg ~ am, data = mtcars, se_type = "none") expect_equal( dim$coefficients, dim_no$coefficients ) expect_equivalent( tidy(dim_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) # block dimb <- difference_in_means(mpg ~ am, blocks = cyl, data = mtcars) dimb_no <- difference_in_means(mpg ~ am, blocks = cyl, data = mtcars, se_type = "none") expect_equal( dimb$coefficients, dimb_no$coefficients ) expect_equivalent( tidy(dimb_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) # cluster mtcars$z <- as.numeric(mtcars$cyl < 5) dimc <- difference_in_means(mpg ~ z, clusters = cyl, data = mtcars) dimc_no <- difference_in_means(mpg ~ z, clusters = cyl, data = mtcars, se_type = "none") expect_equal( dimc$coefficients, dimc_no$coefficients ) expect_equivalent( tidy(dimc_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) # weight dimw <- difference_in_means(mpg ~ am, weights = wt, data = mtcars) dimw_no <- difference_in_means(mpg ~ am, weights = wt, data = mtcars, se_type = "none") expect_equal( dimw$coefficients, dimw_no$coefficients ) expect_equivalent( tidy(dimw_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) # cluster, weight dimcw <- difference_in_means(mpg ~ z, weights = wt, clusters = cyl, data = mtcars) dimcw_no <- difference_in_means(mpg ~ z, weights = wt, clusters = cyl, data = mtcars, se_type = "none") expect_equal( dimcw$coefficients, dimcw_no$coefficients ) expect_equivalent( tidy(dimcw_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) # matched-pair mtcars$mp <- rep(1:(nrow(mtcars) / 2), each = 2) mtcars$z <- rep(c(0, 1), times = nrow(mtcars) / 2) dimmp <- difference_in_means(mpg ~ z, blocks = mp, data = mtcars) dimmp_no <- difference_in_means(mpg ~ z, blocks = mp, data = mtcars, se_type = "none") expect_equal( dimmp$coefficients, dimmp_no$coefficients ) expect_equivalent( tidy(dimmp_no)[c("std.error", "p.value", "conf.low", "conf.high")], rep(NA_real_, 4) ) }) estimatr/tests/testthat/test-na-omit-details.R0000644000176200001440000000156314747205231021205 0ustar liggesusers context("Helper - na.omit_detailed") df <- expand.grid(Y = c(1:5, NA), Z = c(LETTERS, NA)) stock <- na.omit(df) detailed <- na.omit_detailed.data.frame(df) stock_action <- attr(stock, "na.action") detailed_action <- attr(detailed, "na.action") test_that("Omits are the same", { expect_equal( as.vector(stock_action), as.vector(detailed_action) ) }) test_that("Row names are set correctly", { expect_identical( names(stock_action), names(detailed_action) ) }) test_that("Logic for nested dfs and lists holds", { df$X <- list(x = c(NA, 2:nrow(df))) df$Xmat <- matrix(rep(c(1, NA, 3:nrow(df)), 2), nrow(df)) stock <- na.omit(df) detailed <- na.omit_detailed.data.frame(df) stock_action <- attr(stock, "na.action") detailed_action <- attr(detailed, "na.action") expect_identical( names(stock_action), names(detailed_action) ) }) estimatr/tests/testthat/test-stata-output.R0000644000176200001440000001136014747205231020664 0ustar liggesuserscontext("Verification - lm and iv match Stata") test_that("lm_robust matches stata", { # write.csv(mtcars, # file = 'tests/testthat/mtcars.csv', # row.names = F) stata_ests <- read.table( "stata-ests.txt", col.names = c("model", "se1", "se2", "df", "fstat"), stringsAsFactors = FALSE ) mtcars$w <- mtcars$drat / 5 estimatr_mat <- matrix(NA, 10, 4) lm_c <- lm_robust(mpg ~ hp, data = mtcars, se_type = "classical") estimatr_mat[1, ] <- c(lm_c$std.error ^ 2, lm_c$df[2], lm_c$fstatistic[1]) lm_hc1 <- lm_robust(mpg ~ hp, data = mtcars, se_type = "HC1") estimatr_mat[2, ] <- c(lm_hc1$std.error ^ 2, lm_hc1$df[2], lm_hc1$fstatistic[1]) lm_hc2 <- lm_robust(mpg ~ hp, data = mtcars, se_type = "HC2") estimatr_mat[3, ] <- c(lm_hc2$std.error ^ 2, lm_hc2$df[2], lm_hc2$fstatistic[1]) lm_hc3 <- lm_robust(mpg ~ hp, data = mtcars, se_type = "HC3") estimatr_mat[4, ] <- c(lm_hc3$std.error ^ 2, lm_hc3$df[2], lm_hc3$fstatistic[1]) lm_stata <- lm_robust(mpg ~ hp, clusters = cyl, data = mtcars, se_type = "stata") estimatr_mat[5, ] <- c(lm_stata$std.error ^ 2, lm_stata$df[2], lm_stata$fstatistic[1]) lm_c_w <- lm_robust(mpg ~ hp, data = mtcars, weights = w, se_type = "classical") estimatr_mat[6, ] <- c(lm_c_w$std.error ^ 2, lm_c_w$df[2], lm_c_w$fstatistic[1]) lm_hc1_w <- lm_robust(mpg ~ hp, data = mtcars, weights = w, se_type = "HC1") estimatr_mat[7, ] <- c(lm_hc1_w$std.error ^ 2, lm_hc1_w$df[2], lm_hc1_w$fstatistic[1]) lm_hc2_w <- lm_robust(mpg ~ hp, data = mtcars, weights = w, se_type = "HC2") estimatr_mat[8, ] <- c(lm_hc2_w$std.error ^ 2, lm_hc2_w$df[2], lm_hc2_w$fstatistic[1]) lm_hc3_w <- lm_robust(mpg ~ hp, data = mtcars, weights = w, se_type = "HC3") estimatr_mat[9, ] <- c(lm_hc3_w$std.error ^ 2, lm_hc3_w$df[2], lm_hc3_w$fstatistic[1]) lm_stata_w <- lm_robust(mpg ~ hp, clusters = cyl, weights = w, data = mtcars, se_type = "stata") estimatr_mat[10, ] <- c(lm_stata_w$std.error ^ 2, lm_stata_w$df[2], lm_stata_w$fstatistic[1]) # All look numerically identical except for HC2 and HC3 with weights which # have non-negligible difference. This is due to differences in how the hat # matrix is built that are still unresolved # Therefore rows 8 and 9 will have larger differences expect_true( max(abs(estimatr_mat[c(1:7, 10), 1:4] - apply(stata_ests[c(1:7, 10), c(3, 2, 4, 5)], 2, as.numeric))) < 1e-5 ) }) test_that("iv_robust matches stata", { skip_if_not_installed("AER") # write.csv(mtcars, # file = 'tests/testthat/mtcars.csv', # row.names = F) stata_ests <- read.table( "stata-iv-ests.txt", col.names = c("model", "v1", "v2", "v3", "fstat", "r2", "r2_a", "rmse"), stringsAsFactors = FALSE ) mtcars$w <- mtcars$drat / 5 estimatr_mat <- matrix(NA, 6, 7) iv_c <- iv_robust(mpg ~ hp + am | wt + gear, data = mtcars, se_type = "classical") estimatr_mat[1, ] <- c(iv_c$std.error ^ 2, iv_c$fstatistic[1], iv_c$r.squared, iv_c$adj.r.squared, sqrt(iv_c$res_var)) iv_hc1 <- iv_robust(mpg ~ hp + am | wt + gear, data = mtcars, se_type = "HC1") estimatr_mat[2, ] <- c(iv_hc1$std.error ^ 2, iv_hc1$fstatistic[1], iv_hc1$r.squared, iv_hc1$adj.r.squared, sqrt(iv_hc1$res_var)) iv_stata <- iv_robust(mpg ~ hp + am | wt + gear, clusters = cyl, data = mtcars, se_type = "stata") estimatr_mat[3, ] <- c(iv_stata$std.error ^ 2, iv_stata$fstatistic[1], iv_stata$r.squared, iv_stata$adj.r.squared, sqrt(iv_stata$res_var)) iv_c_w <- iv_robust(mpg ~ hp + am | wt + gear, data = mtcars, weights = w, se_type = "classical") estimatr_mat[4, ] <- c(iv_c_w$std.error ^ 2, iv_c_w$fstatistic[1], iv_c_w$r.squared, iv_c_w$adj.r.squared, sqrt(iv_c_w$res_var)) iv_hc1_w <- iv_robust(mpg ~ hp + am | wt + gear, data = mtcars, weights = w, se_type = "HC1") estimatr_mat[5, ] <- c(iv_hc1_w$std.error ^ 2, iv_hc1_w$fstatistic[1], iv_hc1_w$r.squared, iv_hc1_w$adj.r.squared, sqrt(iv_hc1_w$res_var)) iv_stata_w <- iv_robust(mpg ~ hp + am | wt + gear, clusters = cyl, weights = w, data = mtcars, se_type = "stata") estimatr_mat[6, ] <- c(iv_stata_w$std.error ^ 2, iv_stata_w$fstatistic[1], iv_stata_w$r.squared, iv_stata_w$adj.r.squared, sqrt(iv_stata_w$res_var)) expect_true( max(abs(estimatr_mat[, 1] - as.numeric(stata_ests[, 4]))) < 2e-05 ) expect_true( max(abs(estimatr_mat[, 4] - as.numeric(stata_ests[, 5]))) < 3e-05 ) # Note, RMSE is different for stata with weights than ivreg or iv_robust expect_true( max(abs(estimatr_mat[, 5:6] - stata_ests[, 6:7])) < 4e-08 ) ivrego_w <- AER::ivreg(mpg ~ hp + am | wt + gear, data = mtcars, weights = w) expect_equal( ivrego_w$sigma, sqrt(iv_c_w$res_var) ) expect_equal( ivrego_w$sigma, sqrt(iv_hc1_w$res_var) ) expect_equal( ivrego_w$sigma, sqrt(iv_stata_w$res_var) ) }) estimatr/tests/testthat/test-lm-robust-fes.R0000644000176200001440000004371514747205231020722 0ustar liggesuserscontext("Estimator - lm_robust, fixed effects") set.seed(43) N <- 20 dat <- data.frame( Y = rnorm(N), Y2 = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), B = factor(rep(1:2, times = c(8, 12))), B2 = factor(rep(1:4, times = c(3, 3, 4, 10))), cl = sample(1:4, size = N, replace = T), w = runif(N) ) dat$Xdup <- dat$X dat$Bdup <- dat$B test_that("FE matches lm_robust with dummies", { ## One FE, one covar for (se_type in se_types) { ro <- tidy(lm_robust(Y ~ Z + factor(B), data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z, fixed_effects = ~ B, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z"), ], rfo[rfo$term %in% c("Z"), ] ) } for (se_type in cr_se_types) { ro <- tidy(lm_robust(Y ~ Z + factor(B), clusters = cl, data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z, fixed_effects = ~ B, clusters = cl, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z"), ], rfo[rfo$term %in% c("Z"), ] ) } }) test_that("FE matches with multiple FEs and covars", { ## Multiple FEs, multiple covars for (se_type in se_types) { ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z"), ], rfo[rfo$term %in% c("Z"), ] ) # Weights ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = dat, weights = w, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = dat, weights = w, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z", "X"), ], rfo[rfo$term %in% c("Z", "X"), ] ) } for (se_type in cr_se_types) { ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z"), ], rfo[rfo$term %in% c("Z"), ] ) # Weights if (se_type %in% c("CR2", "CR3")) { expect_error( rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type)), "Cannot use `fixed_effects` with weighted" ) # ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, weights = w, se_type = "CR2")) # # expect_equivalent( # ro[ro$term %in% c("Z", "X"), ], # rfo[rfo$term %in% c("Z", "X"), ] # ) } else { ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, weights = w, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z", "X"), ], rfo[rfo$term %in% c("Z", "X"), ] ) } } ## HC3 # Uncomment for perfect fits which reveal problems for our estimators # set.seed(41) # N <- 10 # dat <- data.frame( # Y = rnorm(N), # Z = rbinom(N, 1, .5), # X = rnorm(N), # B = factor(rep(1:2, times = c(4, 6))), # B2 = factor(rep(1:3, times = c(3, 3, 4))), # cl = sample(1:4, size = N, replace = T) # ) # lmo <- lm(Y ~ Z + X + factor(B) + factor(B2), data = dat) # summary(lmo) # sandwich::vcovHC(lmo, "HC3") }) test_that("FEs work with multiple outcomes", { ## Multiple Outcomes for (se_type in se_types) { ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, se_type = se_type) rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, se_type = se_type) tro <- tidy(ro) trfo <- tidy(rfo) expect_equivalent( tro[tro$term %in% c("Z", "X"), ], trfo[trfo$term %in% c("Z", "X"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) # Weights ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, weights = w, se_type = se_type) rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, weights = w, se_type = se_type) tro <- tidy(ro) trfo <- tidy(rfo) expect_equivalent( tro[tro$term %in% c("Z", "X"), ], trfo[trfo$term %in% c("Z", "X"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } # clusters for (se_type in cr_se_types) { ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type) rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type) tro <- tidy(ro) trfo <- tidy(rfo) expect_equivalent( tro[tro$term %in% c("Z", "X"), ], trfo[trfo$term %in% c("Z", "X"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) # Weights if (se_type %in% c("CR2", "CR3")) { expect_error( rfo <- tidy(lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type)), "Cannot use `fixed_effects` with weighted" ) } else { ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, clusters = cl, weights = w, se_type = se_type) rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, clusters = cl, weights = w, se_type = se_type) tro <- tidy(ro) trfo <- tidy(rfo) expect_equivalent( tro[tro$term %in% c("Z", "X"), ], trfo[trfo$term %in% c("Z", "X"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } } }) test_that("FEs work with missingness", { skip_if_not_installed("sandwich") # In outcome datmiss <- dat datmiss$Y[5] <- NA datmiss$B[1] <- NA for (se_type in se_types) { expected_warning <- "Some observations have missingness in the fixed_effects variable(s) but not in the outcome or covariates. These observations have been dropped." ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = datmiss, se_type = se_type) expect_warning( rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = datmiss, se_type = se_type), expected_warning, fixed = TRUE ) expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } # Check to make sure with only one FE when missingness works expect_warning( lm_robust(Y ~ Z + X, fixed_effects = ~ B, data = datmiss, se_type = "HC2"), expected_warning, fixed = TRUE ) expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## HC3 ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = datmiss, se_type = "HC3") expect_warning( rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = datmiss, se_type = "HC3"), expected_warning, fixed = TRUE ) lfo <- lm(Y ~ Z + X + factor(B) + factor(B2), data = datmiss) expect_equivalent( rfo$std.error[rfo$term %in% c("Z", "X")], sqrt(diag(sandwich::vcovHC(lfo, type = "HC3"))[2:3]) ) for (se_type in cr_se_types) { ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = datmiss, se_type = se_type) expect_warning( rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = datmiss, se_type = se_type), expected_warning, fixed = TRUE ) expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } }) test_that("FEs handle collinear FEs", { ## Collinear factors for (se_type in se_types) { ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(Bdup) + factor(B2), data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + Bdup + B2, data = dat, se_type = se_type)) expect_equivalent( ro$estimate[ro$term %in% c("Z", "X")], rfo$estimate[rfo$term %in% c("Z", "X")] ) if (se_type %in% c("HC2", "HC3")) { # HC2 or HC3 work because we can get the collinearity in the FEs for free as we have to invert # UtU anyways (where U is cbind(X, FE_dummy_mat)) expect_equivalent( ro[ro$term %in% c("Z", "X"), ], rfo[rfo$term %in% c("Z", "X"), ] ) } else { # DoF is wrong because we count the FEs incorrectly for the finite sample correction with collinearity expect_false( all( ro$df[ro$term %in% c("Z", "X")] == rfo$df[rfo$term %in% c("Z", "X")] ) ) if (se_type == "HC0") { # But std errors work here bc no DoF correction expect_equivalent( ro$std.error[ro$term %in% c("Z", "X")], rfo$std.error[rfo$term %in% c("Z", "X")] ) } else { expect_false( all( ro$std.error[ro$term %in% c("Z", "X")] == rfo$std.error[rfo$term %in% c("Z", "X")] ) ) } } } ## Collinear factors for (se_type in cr_se_types) { ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(Bdup) + factor(B2), clusters = cl, data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + Bdup + B2, clusters = cl, data = dat, se_type = se_type)) # DoF for CR0/CR3 works, unlike HC0, because our DoF for CR0/CR3 is N_clust - 1, not N - total_rank if (se_type %in% c("CR0", "CR2", "CR3")) { expect_equivalent( ro[ro$term %in% c("Z", "X"), ], rfo[rfo$term %in% c("Z", "X"), ] ) } else { expect_equivalent( ro$estimate[ro$term %in% c("Z", "X")], rfo$estimate[rfo$term %in% c("Z", "X")] ) expect_false( all( ro$std.error[ro$term %in% c("Z", "X")] == rfo$std.error[rfo$term %in% c("Z", "X")] ) ) } } }) test_that("FEs work with collinear covariates", { ## Classical sum(dat$X^2) * 1e-4 * 1e-8 sum(dat$Xdup^2) for (se_type in se_types) { ro <- tidy(lm_robust(Y ~ Z + X + Xdup + factor(B) + factor(B2), data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X + Xdup, fixed_effects = ~ B + B2, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z", "X", "Xdup"), ], rfo[rfo$term %in% c("Z", "X", "Xdup"), ] ) } # Clustered methods for (se_type in cr_se_types) { ro <- tidy(lm_robust(Y ~ Z + X + Xdup + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type)) rfo <- tidy(lm_robust(Y ~ Z + X + Xdup, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type)) expect_equivalent( ro[ro$term %in% c("Z", "X", "Xdup"), ], rfo[rfo$term %in% c("Z", "X", "Xdup"), ] ) } }) test_that("test matches stata absorb", { # write.csv(mtcars, # file = 'tests/testthat/mtcars.csv', # row.names = F) stata_ests <- read.table( "stata-fe-ests.txt", col.names = c("model", "var", "fstat"), stringsAsFactors = FALSE ) mtcars$w <- mtcars$drat / 5 estimatr_mat <- matrix(NA, 6, 1) rfo <- lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, se_type = "classical") # areg mpg hp, absorb(carb) estimatr_mat[1, ] <- c(rfo$std.error ^ 2) rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, se_type = "HC1")) # areg mpg hp, absorb(carb) rob estimatr_mat[2, ] <- c(rfo$std.error ^ 2) rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, clusters = cyl, se_type = "stata")) # areg mpg hp, absorb(carb) cl(cyl) estimatr_mat[3, ] <- c(rfo$std.error ^ 2) rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, se_type = "classical")) # areg mpg hp [aweight=w], absorb(carb) estimatr_mat[4, ] <- c(rfo$std.error ^ 2) rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, se_type = "HC1")) # areg mpg hp [aweight=w], absorb(carb) rob estimatr_mat[5, ] <- c(rfo$std.error ^ 2) rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, clusters = cyl, se_type = "stata")) # areg mpg hp [aweight=w], absorb(carb) cl(cyl) estimatr_mat[6, ] <- c(rfo$std.error ^ 2) expect_equal( estimatr_mat[, 1], stata_ests[, 2] ) }) test_that("FE matches lm_robust with one block", { skip_if_not_installed("sandwich") # In outcome datmiss <- dat datmiss$Y[5] <- NA datmiss$oneB <- as.factor("A") ## Classical ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "classical") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "classical") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## HC0 ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC0") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC0") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## HC1 ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC1") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC1") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## HC2 ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC2") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC2") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## HC3 ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC3") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC3") lfo <- lm(Y ~ Z + X, data = datmiss) expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equivalent( rfo$std.error[rfo$term %in% c("Z", "X")], sqrt(diag(sandwich::vcovHC(lfo, type = "HC3"))[2:3]) ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## CR0 ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "CR0") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "CR0") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## CR stata ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "stata") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "stata") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## CR2 ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "CR2") rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "CR2") expect_equivalent( tidy(ro)[ro$term %in% c("Z", "X"), ], tidy(rfo)[rfo$term %in% c("Z", "X"), ] ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) ## Error when combined with other blocks expect_error( lm_robust(Y ~ Z + X, fixed_effects = ~ oneB + B, data = datmiss), "Can't have a fixed effect with only one group AND multiple fixed effect variables" ) }) test_that("FEs handle collinear covariates", { mtcars2 <- mtcars mtcars2$cyl2 <- mtcars2$cyl for (se_type in se_types) { lmo <- lm_robust(mpg ~ cyl + cyl2 + factor(gear), data = mtcars2) lmfo <- lm_robust(mpg ~ cyl + cyl2, fixed_effects = ~ gear, data = mtcars2) expect_equivalent( tidy(lmo)[grepl("cyl", lmo$term), ], tidy(lmfo)[grepl("cyl", lmfo$term), ] ) } for (se_type in cr_se_types) { lmo <- lm_robust(mpg ~ cyl + cyl2 + factor(gear), clusters = am, data = mtcars2) lmfo <- lm_robust(mpg ~ cyl + cyl2, fixed_effects = ~ gear, clusters = am, data = mtcars2) expect_equivalent( tidy(lmo)[grepl("cyl", lmo$term), ], tidy(lmfo)[grepl("cyl", lmfo$term), ] ) } }) test_that("FEs handle large numbers", { df <- data.frame( i = rep(1:100,100), x = rnorm(10000) * 1000000 + 10000000, y = rnorm(10000) ) fefit <- tidy(lm_robust(y ~ x, data = df, fixed_effects = ~i, se_type = "HC1")) lmfit <- tidy(lm_robust(y ~ x - 1 + factor(i), data = df, se_type = "HC1")) expect_equal( fefit[fefit$term == "x", ], lmfit[lmfit$term == "x", ], ) }) test_that("Handle perfect fits appropriately", { skip_on_os("solaris") dat$Bsingle <- c(1, 2, rep(3:4, each = 9)) rfo <- lm_robust(Y ~ X, fixed_effects = ~ Bsingle, data = dat) ro <- lm_robust(Y ~ X + factor(Bsingle), data = dat) expect_equivalent( tidy(rfo)[rfo$term == "X", ], tidy(ro)[ro$term == "X", ] ) }) estimatr/tests/testthat/test-horvitz-thompson.R0000644000176200001440000004146114747205231021571 0ustar liggesuserscontext("Estimator - horvitz_thompson") test_that("Horvitz-Thompson matches d-i-m under certain conditions", { n <- 4 dat <- data.frame( y0 = rnorm(n), z = rep(0:1, each = n / 2), ps = rep(0.5, n) ) dat$y1 <- dat$y0 + 0.43 dat$y <- ifelse(dat$z, dat$y1, dat$y0) expect_equal( coef(horvitz_thompson( y ~ z, condition_prs = ps, data = dat )), coef(difference_in_means( y ~ z, data = dat )) ) }) test_that("Horvitz-Thompson works in simple case", { n <- 40 dat <- data.frame( y = rnorm(n) ) simp_decl <- randomizr::declare_ra(N = n, prob = 0.4, simple = T) dat$z <- randomizr::conduct_ra(simp_decl) ht_simp <- horvitz_thompson( y ~ z, data = dat, ra_declaration = simp_decl, return_condition_pr_mat = TRUE ) # Also with no SEs ht_simp_no <- horvitz_thompson( y ~ z, data = dat, ra_declaration = simp_decl, return_condition_pr_mat = TRUE, se_type = "none" ) expect_equal( ht_simp$coefficients, ht_simp_no$coefficients ) expect_equivalent( as.numeric(tidy(ht_simp_no)[c("std.error", "p.value", "conf.low", "conf.high")]), rep(NA_real_, 4) ) # Works with constant effects assumption ht_const <- horvitz_thompson( y ~ z, data = dat, ra_declaration = simp_decl, se_type = "constant" ) # picks out right declaration ht_rev <- horvitz_thompson( y ~ z, data = dat, condition1 = 1, condition2 = 0, ra_declaration = simp_decl, return_condition_pr_mat = TRUE ) # Fails properly if condition in treatment but not in declaration dat$z[1] <- 2 expect_error( horvitz_thompson( y ~ z, data = dat, condition1 = 0, condition2 = 2, ra_declaration = simp_decl ) ) expect_equal( tidy(ht_simp)[, c("estimate", "std.error")], tidy(ht_rev)[, c("estimate", "std.error")] * c(-1, 1) ) # Simple designs needn't use condition matrix as joint prs are product of marginals expect_equal( ht_simp$condition_pr_mat, NULL ) # complete randomization works as well comp_decl <- randomizr::declare_ra(N = n, prob = 0.4, simple = FALSE) dat$z_comp <- randomizr::conduct_ra(comp_decl) dat$pr_comp <- 0.4 # We can learn it! or you can tell us expect_equal( ht_comp <- horvitz_thompson(y ~ z_comp, data = dat, simple = FALSE), horvitz_thompson(y ~ z_comp, data = dat, ra_declaration = comp_decl) ) expect_equal( ht_comp, horvitz_thompson(y ~ z_comp, data = dat, simple = FALSE, condition_prs = pr_comp) ) # Also with no SEs ht_comp_no <- horvitz_thompson(y ~ z_comp, data = dat, simple = FALSE, se_type = "none") expect_equal( ht_comp$coefficients, ht_comp_no$coefficients ) expect_equivalent( as.numeric(tidy(ht_comp_no)[c("std.error", "p.value", "conf.low", "conf.high")]), rep(NA_real_, 4) ) # error if you pass wrong prs dat$pr_wrong <- dat$pr_comp dat$pr_wrong[1] <- 0.5 expect_error( horvitz_thompson(y ~ z_comp, data = dat, simple = FALSE, condition_prs = pr_wrong), "Treatment probabilities must be fixed for complete randomized designs" ) # Works without data frame! ht_with <- with( dat, horvitz_thompson(y ~ z_comp, simple = FALSE, condition_prs = pr_comp) ) pr_comp <- dat$pr_comp y <- dat$y z_comp <- dat$z_comp ht_glob <- horvitz_thompson(y ~ z_comp, simple = FALSE, condition_prs = pr_comp) ht_rec <- horvitz_thompson(y ~ z_comp, simple = FALSE, condition_prs = 0.4) expect_equal( ht_with, ht_glob ) expect_equal( ht_with, ht_rec ) # with declaration ht_nod <- horvitz_thompson(y ~ z_comp, ra_declaration = comp_decl) ht_d <- horvitz_thompson(y ~ z_comp, data = dat, ra_declaration = comp_decl) expect_equal( tidy(ht_nod), tidy(ht_d) ) }) test_that("Horvitz-Thompson works with clustered data", { n <- 8 dat <- data.frame( y = rnorm(n), cl = rep(1:4, each = 2) ) ## Complete random sample, clustered clust_crs_decl <- randomizr::declare_ra(N = nrow(dat), clusters = dat$cl, prob = 0.5) dat$z <- randomizr::conduct_ra(clust_crs_decl) # Regular SE using Young's inequality ht_crs_decl <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_crs_decl) expect_true( !is.na(ht_crs_decl$coefficients) ) expect_equivalent( ht_crs_decl$df, NA ) # Also with no SEs ht_crs_decl_no <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_crs_decl, se_type = "none") expect_equal( ht_crs_decl$coefficients, ht_crs_decl_no$coefficients ) expect_equivalent( as.numeric(tidy(ht_crs_decl_no)[c("std.error", "p.value", "conf.low", "conf.high")]), rep(NA_real_, 4) ) expect_message( horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = rep(0.5, nrow(dat))), "Assuming simple cluster randomization" ) expect_message( horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = rep(0.5, nrow(dat)), simple = FALSE), NA ) # Can infer probabilities as well expect_equal( ht_crs_decl, horvitz_thompson(y ~ z, data = dat, clusters = cl, simple = FALSE) ) # And constant effects error for non-simple designs expect_error( horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_crs_decl, se_type = "constant"), "`se_type` = 'constant' only supported for simple random" ) ## Simple random sample, clustered clust_srs_decl <- randomizr::declare_ra( N = nrow(dat), clusters = dat$cl, prob = 0.4, simple = TRUE ) # With declaration # Regular SE using Young's inequality ht_srs_decl <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_srs_decl) # Also with no SEs ht_srs_decl_no <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_srs_decl, se_type = "none") expect_equal( ht_srs_decl$coefficients, ht_srs_decl_no$coefficients ) expect_equivalent( as.numeric(tidy(ht_srs_decl_no)[c("std.error", "p.value", "conf.low", "conf.high")]), rep(NA_real_, 4) ) # Not the same because second doesn't know it's clustered! # Just passing mat clust_srs_mat <- declaration_to_condition_pr_mat(clust_srs_decl) ht_srs_nodecl <- horvitz_thompson(y ~ z, data = dat, condition_pr_mat = clust_srs_mat) # Also with no SEs ht_srs_nodecl_no <- horvitz_thompson(y ~ z, data = dat, condition_pr_mat = clust_srs_mat, se_type = "none") expect_equal( ht_srs_nodecl$coefficients, ht_srs_nodecl_no$coefficients ) # works if I also pass cluster expect_identical( ht_srs_decl, ht_srs_cl <- horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_pr_mat = clust_srs_mat) ) # Also with no SEs ht_srs_cl_no <- horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_pr_mat = clust_srs_mat, se_type = "none") expect_equal( ht_srs_cl$coefficients, ht_srs_cl_no$coefficients ) # Can infer from number of treated clusters per block the treatment pr clbl_dat <- data.frame( cl_new = cl_new <- c(1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8), bl = rep(1:3, each = 4), y = rnorm(12) ) # pr = 0.25 in first, 0.5 in second blcl_ra <- randomizr::declare_ra(blocks = clbl_dat$bl, clusters = clbl_dat$cl_new, block_m = c(1, 2, 1)) clbl_dat$z_clbl <- randomizr::conduct_ra(blcl_ra) expect_equivalent( horvitz_thompson(y ~ z_clbl, data = clbl_dat, ra_declaration = blcl_ra), horvitz_thompson(y ~ z_clbl, data = clbl_dat, blocks = bl, clusters = cl_new) ) # should work with just a column if SRS! dat$ps <- 0.4 expect_identical( ht_srs_decl, ht_srs_prs <- horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = ps) ) # Also with no SEs ht_srs_prs_no <- horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = ps, se_type = "none") expect_equal( ht_srs_prs$coefficients, ht_srs_prs_no$coefficients ) # And constant effects # Only work for simple for now expect_error( horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_srs_decl, se_type = "constant"), "`se_type` = 'constant' only supported for simple random designs at the moment" ) # Fails with condition_pr varying within cluster dat$p_wrong <- dat$ps dat$p_wrong[1] <- 0.545 expect_error( horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = p_wrong), "`condition_prs` must be constant within `cluster`" ) # Or pr outside of 0 1 dat$p_wrong[1] <- 1.5 expect_error( horvitz_thompson(y ~ z, data = dat, clusters = cl, condition_prs = p_wrong), "`condition_prs` must be a vector of positive values no greater than 1" ) # or treatment varying within a cluster dat$z_wrong <- dat$z dat$z_wrong[1:2] <- c(0, 1) table(dat$z_wrong, dat$cl) expect_error( horvitz_thompson(y ~ z_wrong, data = dat, clusters = cl, condition_prs = ps), "Treatment condition must be constant within `clusters`" ) }) test_that("Horvitz-Thompson works with missingness", { n <- 40 dat <- data.frame( y = rnorm(n), bl = rep(1:10, each = 4), ps = 0.35 ) decl <- randomizr::declare_ra(n, prob = 0.35) dat$z <- randomizr::conduct_ra(decl) missing_dat <- dat missing_dat$y[1] <- NA expect_error( ht_miss <- horvitz_thompson(y ~ z, data = missing_dat, ra_declaration = decl), NA ) expect_error( ht_miss_pr <- horvitz_thompson(y ~ z, data = missing_dat, condition_prs = 0.35, simple = FALSE), NA ) expect_equal(ht_miss, ht_miss_pr) # Test that we didn't edit the declaration in the users env # Should work a second time expect_error( horvitz_thompson(y ~ z, data = missing_dat, ra_declaration = decl), NA ) missing_dat$ps[2] <- NA dat$drop_these <- c(1, 1, rep(0, times = n - 2)) expect_warning( ht_miss <- horvitz_thompson(y ~ z, data = missing_dat, condition_prs = ps), "missingness in the condition_pr" ) expect_equal( horvitz_thompson(y ~ z, data = dat, condition_prs = ps, subset = drop_these == 0), ht_miss ) }) # test blocks in the data test_that("Estimating Horvitz-Thompson can be done two ways with blocks", { n <- 40 dat <- data.frame( y = rnorm(n), bl = rep(1:10, each = 4) ) bl_ra <- randomizr::declare_ra(blocks = dat$bl) dat$z <- randomizr::conduct_ra(bl_ra) bl_pr_mat <- declaration_to_condition_pr_mat(bl_ra) # This creates estimates within blocks and then joins them together using the common # formula ht_declare_bl <- horvitz_thompson(y ~ z, data = dat, ra_declaration = bl_ra) # This estimates the treatment effect at once using only condition_pr_mat ht_condmat_bl <- horvitz_thompson(y ~ z, data = dat, condition_pr_mat = bl_pr_mat) expect_equivalent( tidy(ht_declare_bl), tidy(ht_condmat_bl) ) # Also with no SEs ht_declare_bl_no <- horvitz_thompson(y ~ z, data = dat, ra_declaration = bl_ra, se_type = "none") ht_condmat_bl_no <- horvitz_thompson(y ~ z, data = dat, condition_pr_mat = bl_pr_mat, se_type = "none") expect_equal( ht_declare_bl$coefficients, ht_declare_bl_no$coefficients ) expect_equal( ht_condmat_bl$coefficients, ht_condmat_bl_no$coefficients ) dat$mps <- rep(1:20, each = 2) mp_ra <- randomizr::declare_ra(blocks = dat$mps) dat$z <- randomizr::conduct_ra(mp_ra) mp_pr_mat <- declaration_to_condition_pr_mat(mp_ra) ht_declare_mp <- horvitz_thompson(y ~ z, data = dat, ra_declaration = mp_ra) # This estimates the treatment effect at once using only condition_pr_mat ht_condmat_mp <- horvitz_thompson(y ~ z, data = dat, condition_pr_mat = mp_pr_mat) expect_equivalent( tidy(ht_declare_mp), tidy(ht_condmat_mp) ) # block messages when passing with simple = TRUE flag, not otherwise dat$p <- tapply(dat$z, dat$bl, mean)[dat$bl] expect_message( ht_declare_mp <- horvitz_thompson(y ~ z, data = dat, blocks = bl, condition_prs = p, simple = TRUE), "Assuming complete random assignment of clusters within blocks." ) expect_message( ht_declare_mp <- horvitz_thompson(y ~ z, data = dat, blocks = bl, condition_prs = p, simple = FALSE), NA ) }) # errors when arguments are passed that shouldn't be together test_that("Horvitz-Thompson properly checks arguments and data", { n <- 8 dat <- data.frame( y = rnorm(n), ps = 0.4, z = sample(rep(0:1, each = n / 2)), x = runif(n), cl = rep(1:4, each = 2), bl = rep(1:2, each = 4) ) decl <- randomizr::declare_ra(N = n, prob = 0.4, simple = FALSE) # default is mean(ps) expect_identical( horvitz_thompson(y ~ z, data = dat), horvitz_thompson(y ~ z, data = dat, condition_prs = rep(0.5, times = nrow(dat))) ) expect_error( horvitz_thompson(y ~ z, data = dat, condition_prs = ps, ra_declaration = decl), "Cannot use `ra_declaration` with any of" ) expect_error( horvitz_thompson(y ~ z, data = dat, condition_pr_mat = declaration_to_condition_pr_mat(decl), ra_declaration = decl), "Cannot use `ra_declaration` with any of" ) expect_error( horvitz_thompson(y ~ z + x, data = dat, ra_declaration = decl), "must have only one variable on the right-hand side" ) expect_error( horvitz_thompson(y ~ z, data = dat, ra_declaration = randomizr::declare_ra(N = n + 1, prob = 0.4)), "variable lengths differ" ) ht_o <- horvitz_thompson(y ~ z, data = dat, ci = FALSE) expect_equivalent( as.matrix(tidy(horvitz_thompson(y ~ z, data = dat, ci = FALSE))[, c("p.value", "conf.low", "conf.high")]), matrix(NA, nrow = 1, ncol = 3) ) # condition pr mat is the wrong size expect_error( horvitz_thompson( y ~ z, data = dat, condition_pr_mat = matrix(rnorm(4), 2, 2) ), "cleaning the data" ) }) test_that("Works without variation in treatment", { set.seed(1) dat <- data.frame( y = rnorm(20), bl = 1:5, ps = 0.4 ) # Simple case dat$z_const <- 1 ht_const_1 <- horvitz_thompson( y ~ z_const, data = dat ) ht_const_cond1 <- horvitz_thompson( y ~ z_const, data = dat, condition2 = 1 ) expect_equivalent( ht_const_1, ht_const_cond1 ) expect_equivalent(coef(ht_const_1), mean(dat$y)) expect_equivalent(ht_const_1$std.error, 1 / (nrow(dat)) * sqrt(sum(dat$y ^ 2))) expect_equivalent( ht_const_1$df, NA ) ht_const <- horvitz_thompson( y ~ z_const, data = dat, condition_prs = ps ) expect_equivalent(coef(ht_const), mean(dat$y / dat$ps)) expect_equivalent(ht_const$std.error, 1 / (nrow(dat)) * sqrt(sum((dat$y / dat$ps) ^ 2))) ## Blocks and all are treated ht_block <- horvitz_thompson( y ~ z_const, data = dat, blocks = bl, condition_prs = ps, return_condition_pr_mat = TRUE ) # with blocks SE is different because not simple any more expect_equivalent(coef(ht_block), mean(dat$y / dat$ps)) # expect_equal(ht_block$std.error, 1/(nrow(dat)) * sqrt(sum((dat$y / dat$ps)^2))) ## Blocks and some are treated! dat$z_diff <- as.numeric(dat$bl <= 2) ht_block <- horvitz_thompson( y ~ z_diff, data = dat, blocks = bl, condition_prs = rep(0.4, nrow(dat)) ) ht_block # With only one treatment, but value is 0, still put it as treatment!! # But note we leave a hint in the coefficient name dat$z <- 0 ht_zero <- horvitz_thompson( y ~ z, data = dat, blocks = bl, condition_prs = rep(0.5, nrow(dat)) ) expect_identical(ht_zero$term, "z0") # Drop name if they specify the only treatment as condition1 ht_rev <- horvitz_thompson( y ~ z, data = dat, blocks = bl, condition1 = 0, condition_prs = rep(0.5, nrow(dat)) ) expect_identical(ht_rev$term, "z") # This is only true because condition prs are 0.5 expect_identical( tidy(ht_zero)[c("estimate", "std.error")], tidy(ht_rev)[c("estimate", "std.error")] * c(-1, 1) ) # Some weird specifications that hit unusual parts of the variance cpm <- diag(0.5, nrow = 4, ncol = 4) y <- rnorm(2) t <- c(0, 1) expect_error( horvitz_thompson(y ~ t, condition_pr_mat = cpm), NA ) t <- c(1, 1) expect_error( horvitz_thompson(y ~ t, condition_pr_mat = cpm), NA ) }) test_that("multi-valued treatments not allowed in ra_declaration", { dat <- data.frame( y = rnorm(20), ps = 0.4 ) decl_multi <- randomizr::declare_ra(N = 20, prob_each = c(0.4, 0.4, 0.2)) dat$z <- randomizr::conduct_ra(decl_multi) expect_error( horvitz_thompson(y ~ z, data = dat, ra_declaration = decl_multi), "Cannot use horvitz_thompson\\(\\) with a `ra_declaration` with" ) # will work but you have to get the PRs right! ht_condition <- horvitz_thompson( y ~ z, data = dat, condition_prs = ps, condition1 = "T1", condition2 = "T2" ) subdat <- dat[dat$z != "T3", ] ht_subdat <- horvitz_thompson( y ~ z, data = subdat, condition_prs = ps ) ht_subset <- horvitz_thompson( y ~ z, data = dat, subset = z != "T3", condition_prs = ps ) expect_equal( ht_condition, ht_subdat ) expect_equal( ht_condition, ht_subset ) }) estimatr/tests/testthat/test-lm-robust_emmeans.R0000644000176200001440000000243014747205231021641 0ustar liggesuserscontext("S3 - emmeans") test_that("emmeans can work with lm_robust objects", { skip_if_not_installed("emmeans") library(emmeans) lmr <- lm_robust(mpg ~ factor(cyl) * hp + wt, data = mtcars) rg <- emmeans::ref_grid(lmr) expect_equal(class(rg)[1], "emmGrid") grid <- rg@grid expect_equal(nrow(grid), 3) expect_equal(sum(grid$.wgt.), 32) expect_equal(predict(rg)[1], 17.424, tolerance = .01) }) test_that("lm_robust multivariate model works with emmeans", { skip_if_not_installed("emmeans") library(emmeans) lmr <- lm_robust(yield ~ Block + Variety, data = emmeans::MOats) emm <- emmeans(lmr, "rep.meas") expect_equal(summary(emm)$emmean[4], 123.4, tolerance = 0.1) }) test_that("lm_robust model with rank deficiency works with emmeans", { skip_if_not_installed("emmeans") library(emmeans) lmr <- lm_robust(log(breaks) ~ wool * tension, data = warpbreaks, subset = -(19:30)) pred <- predict(ref_grid(lmr)) expect_true(is.na(pred[5])) expect_equal(length(pred), 6) expect_equal(sum(is.na(pred)), 1) }) # Not testing emmeans package capabilities themselves. If we can construct the # reference grid correctly, we are basically OK. # Pretty much anything else that could fail would happen in the emmeans package, # not in the support methods in this package. estimatr/tests/testthat/test-condition-pr-matrix.R0000644000176200001440000002031514747205231022121 0ustar liggesuserscontext("Helper - HT condition_pr_matrix") n <- 5 test_that("Checks class", { # Errors appropriately expect_error( declaration_to_condition_pr_mat(rbinom(5, 1, 0.5)), "`ra_declaration` must be an object of class 'ra_declaration'" ) }) test_that("Complete randomization", { skip_if_not_installed("randomizr") prs <- rep(0.4, times = n) comp_ra <- randomizr::declare_ra(N = n, prob = prs[1]) perms <- randomizr::obtain_permutation_matrix(comp_ra) expect_equal( declaration_to_condition_pr_mat(comp_ra), permutations_to_condition_pr_mat(perms) ) }) test_that("declaration to condition_pr_mat errors", { expect_error( declaration_to_condition_pr_mat(randomizr::declare_ra(N = n), 1, NULL), "Cannot have `condition2 == NULL`" ) expect_error( declaration_to_condition_pr_mat(randomizr::declare_ra(N = n), NULL, 1), "Cannot have `condition1 == NULL`" ) expect_error( declaration_to_condition_pr_mat(rbinom(5, 1, 0.5)), "`ra_declaration` must be an object of class 'ra_declaration'" ) }) test_that("condition args work properly", { # Condition args work properly mat01 <- declaration_to_condition_pr_mat( randomizr::declare_ra(N = n, prob = 0.4), 0, 1 ) mat10 <- declaration_to_condition_pr_mat( randomizr::declare_ra(N = n, prob = 0.4), 1, 0 ) # Diagonals are just flipped, check the names! # colnames(mat01) # colnames(mat10) expect_equal(mat01, mat10[rownames(mat01), colnames(mat01)]) }) test_that("Complete randomization with number of treated units not fixed", { # comp_odd_ra <- randomizr::declare_ra(N = 3, prob = 0.5) perms <- randomizr::obtain_permutation_matrix(comp_odd_ra) decl_cond_pr_mat <- declaration_to_condition_pr_mat(comp_odd_ra) # following passes so just use perms instead of get_perms # get_perms <- replicate(40000, conduct_ra(comp_odd_ra)) # expect_true( # max(permutations_to_condition_pr_mat(perms) - # round(permutations_to_condition_pr_mat(get_perms), 3)) < 0.01 # ) expect_equal( decl_cond_pr_mat, permutations_to_condition_pr_mat(perms) ) }) test_that("Complete randomization with non 0.5 as remainder", { comp_odd_ra <- randomizr::declare_ra(N = 3, prob = 0.4) decl_cond_pr_mat <- declaration_to_condition_pr_mat(comp_odd_ra) set.seed(40) get_perms <- replicate(10000, randomizr::conduct_ra(comp_odd_ra)) expect_equal( decl_cond_pr_mat, permutations_to_condition_pr_mat(get_perms), tolerance = 0.01 ) }) test_that("Simple ra", { # Simple randomization prs <- rep(0.4, times = n) simp_ra <- randomizr::declare_ra(N = n, prob = prs[1], simple = TRUE) # perms <- randomizr::obtain_permutation_matrix(simp_ra) # Won't work because some permutations are more likely than others # So instead we just resample and set the tolerance perms <- replicate(10000, randomizr::conduct_ra(simp_ra)) # Won't be equal because some permutations are more likely than others in # this case expect_equal( declaration_to_condition_pr_mat(simp_ra), permutations_to_condition_pr_mat(perms), tolerance = 0.02 ) }) test_that("Blocked complete ra", { # Blocked case dat <- data.frame( bl = c("A", "B", "A", "B", "B", "B"), pr = c(0.5, 0.25, 0.5, 0.25, 0.25, 0.25) ) bl_ra <- randomizr::declare_ra(blocks = dat$bl, block_m = c(1, 1)) bl_perms <- randomizr::obtain_permutation_matrix(bl_ra) expect_equal( declaration_to_condition_pr_mat(bl_ra), permutations_to_condition_pr_mat(bl_perms) ) }) test_that("Blocked complete ra with remainder", { dat <- data.frame( bl = c("A", "B", "A", "B", "B", "B"), pr = c(0.5, 0.25, 0.5, 0.25, 0.25, 0.25) ) # with remainder bl <- c("A", "B", "A", "A", "B", "B") # Is this used anywhere? bl_ra <- randomizr::declare_ra(blocks = dat$bl, prob = 0.4) bl_perms <- replicate(5000, randomizr::conduct_ra(bl_ra)) expect_equal( declaration_to_condition_pr_mat(bl_ra), permutations_to_condition_pr_mat(bl_perms), tolerance = 0.02 ) }) test_that("Clustered complete ra", { # Cluster complete case dat <- data.frame( cl = c("A", "B", "A", "C", "A", "B") ) cl_ra <- randomizr::declare_ra(clusters = dat$cl, m = 1) cl_perms <- randomizr::obtain_permutation_matrix(cl_ra) expect_equal( declaration_to_condition_pr_mat(cl_ra), permutations_to_condition_pr_mat(cl_perms) ) # with remainder cl_ra <- randomizr::declare_ra(clusters = dat$cl, prob = 0.5) cl_perms <- randomizr::obtain_permutation_matrix(cl_ra) # lapply(1:ncol(cl_perms), function(x) table(dat$cl, cl_perms[, x])) expect_equal( declaration_to_condition_pr_mat(cl_ra), permutations_to_condition_pr_mat(cl_perms) ) }) test_that("Clustered ra", { # Cluster simple ? Should this be simple or no? --NJF dat <- data.frame( cl = c("A", "B", "A", "C", "A", "B") ) dat$prs <- 0.3 cl_simp_ra <- randomizr::declare_ra(clusters = dat$cl, prob = dat$prs[1]) cl_simp_perms <- randomizr::obtain_permutation_matrix(cl_simp_ra) cl_simp_cpm <- declaration_to_condition_pr_mat(cl_simp_ra) set.seed(42) expect_equal( cl_simp_cpm, permutations_to_condition_pr_mat(cl_simp_perms), tolerance = 0.1 ) set.seed(42) cl_simp_sim_perms <- replicate(10000, randomizr::conduct_ra(cl_simp_ra)) expect_equal( cl_simp_cpm, permutations_to_condition_pr_mat(cl_simp_sim_perms), tolerance = 0.01 ) }) test_that("Blocked and Clustered ra", { # Blocked and clustered dat <- data.frame( bl = c("A", "B", "B", "B", "A", "A", "B", "B"), cl = c(1, 2, 3, 3, 4, 4, 5, 5) ) bl_cl_ra <- randomizr::declare_ra(clusters = dat$cl, blocks = dat$bl, block_m = c(1, 2)) bl_cl_perms <- randomizr::obtain_permutation_matrix(bl_cl_ra) expect_equal( declaration_to_condition_pr_mat(bl_cl_ra), permutations_to_condition_pr_mat(bl_cl_perms) ) }) test_that("Blocked and clusted ra with remainder", { # with remainder dat <- data.frame( bl = c("A", "B", "B", "B", "A", "A", "B", "B"), cl = c(1, 2, 3, 3, 4, 4, 5, 5) ) bl_cl_ra <- randomizr::declare_ra(clusters = dat$cl, blocks = dat$bl, prob = 0.5) bl_cl_perms <- randomizr::obtain_permutation_matrix(bl_cl_ra) expect_equal( declaration_to_condition_pr_mat(bl_cl_ra), permutations_to_condition_pr_mat(bl_cl_perms) ) }) test_that("Custom ra", { cust_perms <- cbind(c(1, 0, 1, 0), c(1, 1, 0, 0)) cust_ra <- randomizr::declare_ra(permutation_matrix = cust_perms) expect_equal( declaration_to_condition_pr_mat(cust_ra), permutations_to_condition_pr_mat(cust_perms) ) }) test_that("Errors for things that we can't support", { # # multiple armed experiments mult_ra <- randomizr::declare_ra(N = 10, prob_each = c(0.2, 0.2, 0.6)) expect_error( declaration_to_condition_pr_mat(mult_ra), "`ra_declaration` must have only two arms when passed directly" ) # Permutation error expect_error( permutations_to_condition_pr_mat(matrix(c(1, 2, 2, 1), nrow = 2)), "Matrix of `permutations` must be comprised of only 0s and 1s" ) # Not unique treatment prob for all clusters when complete randomized expect_error( gen_pr_matrix_cluster( c(1, 1, 2, 2), treat_probs = runif(4), simple = FALSE ), "Treatment probabilities cannot vary within blocks" ) }) test_that("probability not fixed within blocks", { bl_small <- randomizr::declare_ra( blocks = c(1, 1, 2, 2), prob = 0.4 ) assign( "probabilities_matrix", matrix( c(0.4, 0.5, 0.6, 0.7, 0.6, 0.5, 0.4, 0.3), ncol = 2, dimnames = list(NULL, c("prob_0", "prob_1")) ), bl_small ) expect_error( declaration_to_condition_pr_mat(bl_small), "Treatment probabilities must be fixed within blocks for block randomized" ) }) test_that("N=2, m=1", { comp <- randomizr::declare_ra(N = 2, m = 1) assign( "probabilities_matrix", matrix( c(0.4, 0.5, 0.6, 0.5), ncol = 2, dimnames = list(NULL, c("prob_0", "prob_1")) ), comp ) expect_error( declaration_to_condition_pr_mat(comp), "Treatment probabilities must be fixed for complete randomized designs" ) # error in internal function expect_error( estimatr:::gen_pr_matrix_block(c(1, 2), c(1, 2)), "Must specify one of `t`, `p2`, or `p1`" ) }) estimatr/tests/testthat/helper-return-cleaners.R0000644000176200001440000000060714747205231021625 0ustar liggesusers# This fn removes calls from function returns to make testing easier rmcall <- function(obj) { if (!is.null(obj[["call"]])) { obj[["call"]] <- NULL } return(obj) } # Casts conditions as character objects for equality purposes condchr <- function(obj) { obj[["condition2"]] <- as.character(obj[["condition2"]]) obj[["condition1"]] <- as.character(obj[["condition1"]]) obj } estimatr/tests/testthat/test-lm-robust_margins.R0000644000176200001440000000661414760370122021661 0ustar liggesuserscontext("Helper - lm_robust margins") mv <- c("AME", "SE", "z", "p") test_that("lm robust can work with margins", { skip_if_not_installed("margins") x <- lm(mpg ~ cyl * hp + wt, data = mtcars) lmr <- lm_robust(mpg ~ cyl * hp + wt, data = mtcars) # Note old package vce defaults to delta # new margins on github defaults to none with our obj lm_sum_marg <- summary( margins::margins( x, vcov = sandwich::vcovHC(x, type = "HC2"), vce = "delta" ) ) lmr_sum_marg <- margins:::summary.margins(margins::margins(lmr, vce = "delta")) # Close enough with HC2? expect_equal( lm_sum_marg[, mv], lmr_sum_marg[, mv], tolerance = 0.01 ) # Close with classical lmr_class <- lm_robust(mpg ~ cyl * hp + wt, data = mtcars, se_type = "classical") lmrc <- margins:::summary.margins(margins::margins(lmr_class, vce = "delta")) lmc <- margins:::summary.margins(margins::margins(x, vce = "delta")) expect_equal( lmc[, mv], lmrc[, mv], tolerance = 0.01 ) # Works with other vce set.seed(42) lmrc <- margins:::summary.margins(margins::margins(lmr_class, vce = "bootstrap", iterations = 10L)) expect_true(!any(is.na(lmrc))) lmrc <- margins:::summary.margins(margins::margins(lmr_class, vce = "simulation", iterations = 10L)) expect_true(!any(is.na(lmrc))) lmrc <- margins:::summary.margins(margins::margins(lmr_class, vce = "simulation", iterations = 10L)) expect_true(!any(is.na(lmrc))) }) test_that("lm robust + weights can work with margins", { skip_if_not_installed("margins") x <- lm(mpg ~ cyl * hp, data = mtcars, weights = wt) x2 <- lm_robust(mpg ~ cyl * hp, data = mtcars, weights = wt, se_type = "classical") expect_equal(margins::marginal_effects(x), margins::marginal_effects(x2)) suppressWarnings( {lmc <- round(margins:::summary.margins(margins::margins(x, vce = "delta"))[, mv], 3)} ) suppressWarnings( {lmr <- round(margins:::summary.margins(margins::margins(x2, vce = "delta"))[, mv], 3)} ) expect_equal(lmc, lmr) }) test_that("lm robust + cluster can work with margins", { skip_if_not_installed("margins") # works but throws a lot of warnings x <- lm(mpg ~ cyl * hp + wt, data = mtcars) x2 <- lm_robust(mpg ~ cyl * hp + wt, data = mtcars, clusters = am) lmc <- round(margins:::summary.margins(margins::margins(x, vce = "delta"))[, mv], 8) expect_warning( lmr <- round(margins:::summary.margins(margins::margins(x2, vce = "delta"))[, mv], 8), NA ) # With rounding expect_equal(lmc[, 1], lmr[, 1]) expect_true( !any(lmc[, 2] == lmr[, 2]) ) # Works with character cluster (avoided terms(mod) "dataClasses" problem) mtcars$testc <- letters[1:4] expect_error( margins::margins(lm_robust(mpg ~ cyl * hp + wt, data = mtcars, clusters = testc)), NA ) }) test_that("lm lin can work with margins", { skip_if_not_installed("margins") data("alo_star_men") # instruct margins to treat treatment as a factor lml <- lm_lin(GPA_year1 ~ factor(ssp), ~ gpa0, data = alo_star_men, se_type = "classical") alo_star_men$gpa0_tilde <- alo_star_men$gpa0 - mean(alo_star_men$gpa0) lmo <- lm(GPA_year1 ~ factor(ssp) * gpa0_tilde, data = alo_star_men) lml_sum <- margins:::summary.margins(margins::margins(lml, vce = "delta")) lmo_sum <- margins:::summary.margins(margins::margins(lmo, vce = "delta")) expect_equal( lml_sum[, 4], lmo_sum[, 4], tolerance = 0.000001 ) }) estimatr/tests/testthat/test-arg-checking.R0000644000176200001440000000045714747205231020541 0ustar liggesuserscontext("Estimator - Arg checking fails as expected.") test_that("#349 Early fail when formula is a string", { expect_error( estimatr::lm_robust("mpg~hp", data = mtcars, cluster = wt), "formula" ) expect_length( estimatr::lm_robust(mpg~hp, data = mtcars, cluster = wt), 29 ) }) estimatr/tests/testthat/helper-lm-robust-se.R0000644000176200001440000001315714760370122021046 0ustar liggesusers## BMlmSE.R implements Bell-McCaffrey standard errors ## This code is taken from Michal Kolesar ## https://github.com/kolesarm/Robust-Small-Sample-Standard-Errors ## Only changed the name of one function, df #' Compute the inverse square root of a symmetric matrix #' @param A matrix MatSqrtInverse <- function(A) { ei <- eigen(A, symmetric = TRUE) if (min(ei$values) <= 0) { warning("Gram matrix doesn't appear to be positive definite") } d <- pmax(ei$values, 0) d2 <- 1 / sqrt(d) d2[d == 0] <- 0 ## diag(d2) is d2 x d2 identity if d2 is scalar, instead we want 1x1 matrix ei$vectors %*% (if (length(d2) == 1) d2 else diag(d2)) %*% t(ei$vectors) } #' Compute Bell-McCaffrey Standard Errors #' @param model Fitted model returned by the \code{lm} function #' @param clustervar Factor variable that defines clusters. If \code{NULL} (or #' not supplied), the command computes heteroscedasticity-robust standard #' errors, rather than cluster-robust standard errors. #' @param ell A vector of the same length as the dimension of covariates, #' specifying which linear combination \eqn{\ell'\beta} of coefficients #' \eqn{\beta} to compute. If \code{NULL}, compute standard errors for each #' regressor coefficient #' @param IK Logical flag only relevant if cluster-robust standard errors are #' being computed. Specifies whether to compute the degrees-of-freedom #' adjustment using the Imbens-Kolesár method (if \code{TRUE}), or the #' Bell-McCaffrey method (if \code{FALSE}) #' @return Returns a list with the following components \describe{ #' #' \item{vcov}{Variance-covariance matrix estimator. For the case without #' clustering, it corresponds to the HC2 estimator (see MacKinnon and White, #' 1985 and the reference manual for the \code{sandwich} package). For the case #' with clustering, it corresponds to a generalization of the HC2 estimator, #' called LZ2 in Imbens and Kolesár.} #' #' \item{dof}{Degrees-of-freedom adjustment} #' #' \item{se}{Standard error} #' #' \item{adj.se}{Adjusted standard errors. For \beta_j, they are defined as #' \code{adj.se[j]=sqrt(vcov[j,j]se*qt(0.975,df=dof)} so that the Bell-McCaffrey #' confidence intervals are given as \code{coefficients(fm)[j] +- 1.96* adj.se=} #' #' \item{se.Stata}{Square root of the cluster-robust variance estimator used in #' STATA} #' #' } #' @examples #' ## No clustering: #' set.seed(42) #' x <- sin(1:10) #' y <- rnorm(10) #' fm <- lm(y~x) #' BMlmSE(fm) #' ## Clustering, defining the first six observations to be in cluster 1, the #' #next two in cluster 2, and the last three in cluster three. #' clustervar <- as.factor(c(rep(1, 6), rep(2, 2), rep(3, 2))) #' BMlmSE(fm, clustervar) BMlmSE <- function(model, clustervar=NULL, ell=NULL, IK=TRUE) { X <- model.matrix(model) sum.model <- summary.lm(model) n <- sum(sum.model$df[1:2]) K <- model$rank XXinv <- sum.model$cov.unscaled # XX^{-1} u <- residuals(model) ## Compute DoF given G'*Omega*G without calling eigen as suggested by ## Winston Lin DoF <- function(GG) sum(diag(GG)) ^ 2 / sum(GG * GG) ## Previously: ## lam <- eigen(GG, only.values=TRUE)$values ## sum(lam)^2/sum(lam^2) ## no clustering if (is.null(clustervar)) { Vhat <- sandwich::vcovHC(model, type = "HC2") Vhat.Stata <- Vhat * NA M <- diag(n) - X %*% XXinv %*% t(X) # annihilator matrix ## G'*Omega*G GOG <- function(ell) { Xtilde <- drop(X %*% XXinv %*% ell / sqrt(diag(M))) crossprod(M * Xtilde) } } else { if (!is.factor(clustervar)) stop("'clustervar' must be a factor") ## Stata S <- length(levels(clustervar)) # number clusters uj <- apply(u * X, 2, function(x) tapply(x, clustervar, sum)) Vhat.Stata <- S / (S - 1) * (n - 1) / (n - K) * sandwich::sandwich(model, meat = crossprod(uj) / n) ## HC2 tXs <- function(s) { Xs <- X[clustervar == s, , drop = FALSE] MatSqrtInverse(diag(NROW(Xs)) - Xs %*% XXinv %*% t(Xs)) %*% Xs } tX <- lapply(levels(clustervar), tXs) # list of matrices tu <- split(u, clustervar) tutX <- sapply(seq_along(tu), function(i) crossprod(tu[[i]], tX[[i]])) Vhat <- sandwich::sandwich(model, meat = tcrossprod(tutX) / n) ## DOF adjustment tHs <- function(s) { Xs <- X[clustervar == s, , drop = FALSE] index <- which(clustervar == s) ss <- outer(rep(0, n), index) # n x ns matrix of 0 ss[cbind(index, 1:length(index))] <- 1 ss - X %*% XXinv %*% t(Xs) } tH <- lapply(levels(clustervar), tHs) # list of matrices Moulton <- function() { ## Moulton estimates ns <- tapply(u, clustervar, length) ssr <- sum(u ^ 2) rho <- max((sum(sapply(seq_along(tu), function(i) sum(tu[[i]] %o% tu[[i]]))) - ssr) / (sum(ns ^ 2) - n), 0) c(sig.eps = max(ssr / n - rho, 0), rho = rho) } GOG <- function(ell) { G <- sapply( seq_along(tX), function(i) tH[[i]] %*% tX[[i]] %*% XXinv %*% ell ) GG <- crossprod(G) ## IK method if (IK == TRUE) { Gsums <- apply( G, 2, function(x) tapply(x, clustervar, sum) ) # Z'*G GG <- Moulton()[1] * GG + Moulton()[2] * crossprod(Gsums) } GG } } if (!is.null(ell)) { se <- drop(sqrt(crossprod(ell, Vhat) %*% ell)) dof <- DoF(GOG(ell)) se.Stata <- drop(sqrt(crossprod(ell, Vhat.Stata) %*% ell)) } else { se <- sqrt(diag(Vhat)) dof <- sapply(seq(K), function(k) DoF(GOG(diag(K)[, k]))) se.Stata <- sqrt(diag(Vhat.Stata)) } names(dof) <- names(se) list( vcov = Vhat, dof = dof, adj.se = se * qt(0.975, df = dof) / qnorm(0.975), se = se, se.Stata = se.Stata ) } estimatr/tests/testthat/test-iv-robust-fes.R0000644000176200001440000000633414747205231020724 0ustar liggesuserscontext("Estimator - iv_robust, fixed effects") set.seed(43) N <- 20 dat <- data.frame( Y = rnorm(N), X1 = rnorm(N), X2 = rnorm(N), Z = rbinom(N, 1, .5), B = factor(rep(1:2, times = c(8, 12))), B2 = factor(rep(1:4, times = c(3, 3, 4, 10))), cl = sample(1:4, size = N, replace = T), w = runif(N) ) dat$Xdup <- dat$X dat$Bdup <- dat$B test_that("FE matches with multiple FEs and covars", { for (se_type in se_types) { ro <- iv_robust(Y ~ X1 + X2 + factor(B) + factor(B2) | Z + X2 + factor(B) + factor(B2), data = dat, se_type = se_type) rfo <- iv_robust(Y ~ X1 + X2 | Z + X2, fixed_effects = ~ B + B2, data = dat, se_type = se_type) expect_equivalent( tidy(ro)[ro$term %in% c("X1", "X2"), ], tidy(rfo)[rfo$term %in% c("X1", "X2"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) # weights ro <- iv_robust(Y ~ X1 + X2 + factor(B) + factor(B2) | Z + X2 + factor(B) + factor(B2), data = dat, weights = w, se_type = se_type) rfo <- iv_robust(Y ~ X1 + X2 | Z + X2, fixed_effects = ~ B + B2, data = dat, weights = w, se_type = se_type) expect_equivalent( tidy(ro)[ro$term %in% c("X1", "X2"), ], tidy(rfo)[rfo$term %in% c("X1", "X2"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } for (se_type in cr_se_types) { ro <- iv_robust(Y ~ X1 + X2 + factor(B) + factor(B2) | Z + X2 + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type) rfo <- iv_robust(Y ~ X1 + X2 | Z + X2, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type) expect_equivalent( tidy(ro)[ro$term %in% c("X1", "X2"), ], tidy(rfo)[rfo$term %in% c("X1", "X2"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) # weights if (se_type %in% c("CR2", "CR3")) { expect_error( rfo <- iv_robust(Y ~ X1 + X2 | Z + X2, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type), "Cannot use `fixed_effects` with weighted CR2" ) } else { ro <- iv_robust(Y ~ X1 + X2 + factor(B) + factor(B2) | Z + X2 + factor(B) + factor(B2), clusters = cl, data = dat, weights = w, se_type = se_type) rfo <- iv_robust(Y ~ X1 + X2 | Z + X2, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type) expect_equivalent( tidy(ro)[ro$term %in% c("X1", "X2"), ], tidy(rfo)[rfo$term %in% c("X1", "X2"), ] ) expect_equivalent( ro$fitted.values, rfo$fitted.values ) expect_equal( ro[c("r.squared", "adj.r.squared")], rfo[c("r.squared", "adj.r.squared")] ) } } }) test_that("IV FE warns about diagnostics", { expect_warning( iv_robust(mpg ~ hp | wt, data = mtcars, fixed_effects = cyl, diagnostics = TRUE), "Will not return `diagnostics` if `fixed_effects` are used." ) }) estimatr/tests/testthat/test-modelsummary.R0000644000176200001440000000273514760370122020733 0ustar liggesusers# This file is ignored by .Rbuildignore to keep from suggesting gt and modelsummary context("S3 - modelsummary works") test_that("modelsummary works with glance", { skip_if_not_installed("modelsummary") library(modelsummary) set.seed(5) model1 <- lm_robust(mpg ~ am, mtcars) model2 <- lm_robust(mpg ~ am, mtcars, clusters = cyl) model3 <- lm_lin(mpg ~ am, ~ cyl, mtcars) mso <- modelsummary(list(model1, model2, model3), output = "data.frame") expect_equal(colnames(mso), c("part", "term", "statistic", "(1)", "(2)", "(3)")) expect_equal(nrow(mso), 15L) expect_equal(ncol(mso), 6L) # iv_robust model1 <- iv_robust(mpg ~ am | gear, mtcars) model2 <- iv_robust(mpg ~ am | gear, mtcars, clusters = cyl, diagnostics = TRUE) mso <- modelsummary(list(model1, model2), gof_omit = c("N|[sS]tatistic|p.value|p{1}"), output = "data.frame") expect_equal(nrow(mso), 10) expect_equal(ncol(mso), 5) # difference_in_means model1 <- difference_in_means(mpg ~ am, mtcars) model2 <- difference_in_means(mpg ~ am, mtcars, blocks = vs) mso <- modelsummary(list(model1, model2), output = "data.frame") # horvitz_thompson model1 <- horvitz_thompson(mpg ~ am, mtcars) model2 <- horvitz_thompson(mpg ~ am, mtcars, blocks = vs) mso <-modelsummary(list(model1, model2), output = "data.frame") expect_equal(nrow(mso), 6) expect_equal(ncol(mso), 5) }) estimatr/tests/testthat.R0000644000176200001440000000007414747205231015235 0ustar liggesuserslibrary(testthat) library(estimatr) test_check("estimatr") estimatr/MD50000644000176200001440000001320714760407272012426 0ustar liggesusers5496c35bcff9d7919f5ccb114c00895d *DESCRIPTION 987155d343d711d5e9d9d382312efc4e *LICENSE 2779fe2842cd9c794d8b1b41a729c864 *NAMESPACE e37b4835d1c7f6b1d54f79d08ac2576c *NEWS.md e794789da71c6eaf8250c94aacc90b34 *R/RcppExports.R c66013917fb121a9dcbb21838526b355 *R/S3_confint.R 362a180401e1a9abe458a2b1161c940e *R/S3_emmeans.R d763342a367b06b2d266137db30c54d0 *R/S3_glance.R 214e5e79d9f90ea1523579bd1ea475d4 *R/S3_nobs.R 9e1eb0a89c3f741c579a9b331d7c4cbd *R/S3_predict.R fcca9cfcf68c25a72dff66be5e175060 *R/S3_print.R 78f30ce6937a09e60d1a4b63d9c3640c *R/S3_summary.R 1a781c2d7f021ca94654d9d2b7dffd72 *R/S3_tidy.R 3fe711da6b62b3356640798760011e2b *R/S3_update.R 44bccc6c96e6a943dc48d806a25ea744 *R/S3_vcov.R 1286cabe439e6b258b18d9c51b32dd71 *R/data.R 5fb8af4dcd394ec8dcbf582df6d1f7be *R/estimatr.R 62b1ce87dc62d51c0c4ee25c2f3fba4a *R/estimatr_difference_in_means.R a0c03bcddec0e5199746a75625932ee5 *R/estimatr_horvitz_thompson.R daa1a38131794202008a1b1d5d29d405 *R/estimatr_iv_robust.R e577454743a702669021366774ccd943 *R/estimatr_lh_robust.R 23c9cdf7d6a1484975302523f951f550 *R/estimatr_lm_lin.R f09423fda9d2984b73c4f2c84d0d7df5 *R/estimatr_lm_robust.R 82cae7328ff0f66fec821c3bf43f42da *R/helper_cis_pvals.R 707875f4c773f8a6a7629c86f3429f42 *R/helper_clean_model_data.R 39566c7434d2f16fc506d0ea1cf3ab6e *R/helper_condition_pr_matrix.R 0bbc50e9a3d89754274f519a38cea9f3 *R/helper_extract.R 1668dd410fb6cb11acd8399f71f8d794 *R/helper_lm_robust_fit.R 797460fa81d670dfc5b0d17b501a50a3 *R/helper_na_omit_detailed.R fe2c1bcbf4d4bbd54d32bde691ad492e *R/helper_parse_arguments.R 74c2cec12979b190a90adaf5a418aefe *R/helper_return.R 4300272a84b983b611993090b93aca4b *R/helper_starprep.R 6e46076834fb4633ec2dd44086f07d59 *R/zzz.R 76161b65639451c966be75488458b3c3 *build/partial.rdb 07161ed396beafe204615ad6fd046966 *data/alo_star_men.rda 319e1d78fa76f18ddface7e5e314229f *man/alo_star_men.Rd bd4f4763e5074aeb64e6c93218ded93d *man/commarobust.Rd 25bab70be9b6d8843e49f479f431bbda *man/declaration_to_condition_pr_mat.Rd 5567425ce88fdbf12940aec3cdc72d3d *man/difference_in_means.Rd a83b8df095b168653e7c531c866c4fda *man/estimatr.Rd ad42979dbeca3de71bb1f74d6385a1fd *man/estimatr_glancers.Rd 03c6616bb3467b3f03c8ccaa15687aa2 *man/estimatr_tidiers.Rd 65a1b3bcc3f00657799aa3e292d4efc7 *man/extract.lm_robust.Rd 6a07315ebd3fab1bf5d8aca081bea3e9 *man/gen_pr_matrix_cluster.Rd a847fb69284e1a559925d42c4afca013 *man/horvitz_thompson.Rd f0ef1c9bbcc16b7c1caa767aeb0f3a80 *man/iv_robust.Rd d07d77341c0048440ecb57bcddbf3e73 *man/lh_robust.Rd 652807c1150fdca17d3458865a33d362 *man/lm_lin.Rd 5aef274d38d5d39d28e7fdcf88293d5a *man/lm_robust.Rd 9356a83bb84aa3fbf1b841d171e94a25 *man/lm_robust_fit.Rd 01554f53863a38ffea869c78a26246d9 *man/na.omit_detailed.data.frame.Rd 52f17e8feef3de5371af7cea9ef1de5b *man/permutations_to_condition_pr_mat.Rd db9dfb270a3e6ac7f25db894bc5e84b7 *man/predict.lm_robust.Rd b99e1183bac8b6374bca0787d94c9959 *man/reexports.Rd 2698e585810b191d1d2683235e48ba3c *man/starprep.Rd 2a6f9e9e044a78154d3cfda5936d6f48 *src/Makevars 8d46d69896a1f1d31ed76035a0e49d67 *src/Makevars.win 2dee34f9779b5072df3d195be67dcdb2 *src/RcppExports.cpp 262f27dcdab298357c2296dbf5be58ba *src/horvitz_thompson_variance.cpp ed61257125572543b442f1b31257cd13 *src/lm_robust_helper.cpp 0ce9e5919ff02e443442a46a010ff445 *src/naomit.cpp c61554942399850c8d487fe5ab27f157 *tests/sleep.R eef5067ab9f80a91f3e11c1b04e354e0 *tests/sleep.Rout.save a5eaeaadf01147c82c674c019660ba69 *tests/testthat.R bf1f68bc951a34038257d029ce3d03b7 *tests/testthat/helper-lm-robust-se.R 31ae154b8e2b7679ea82877e007190fc *tests/testthat/helper-return-cleaners.R 220d87ab65609b4494ef958822557f8c *tests/testthat/helper-se-types.R c63d70d67972018ffa0f15c93a3d4a80 *tests/testthat/run-stata-areg-models.do 3ad64bc9595530e109c9919159814118 *tests/testthat/run-stata-iv-models.do 5b797afe70bfb5c639a9472083b04926 *tests/testthat/stata-ests.txt 5ff20b5c5763dd58c3ef30a6d517911e *tests/testthat/stata-fe-ests.txt d97511188b7dd08fe6d38805cd5653e6 *tests/testthat/stata-iv-diagnostics.txt f6c2a1c3f2f7bf025a8553a26967ff1f *tests/testthat/stata-iv-ests.txt 7cce18597ada14c1a2ffeeeb3af43aa4 *tests/testthat/test-arg-checking.R be641ef6baf18d63fb2c5b7a532e9089 *tests/testthat/test-condition-pr-matrix.R f932eb2be12806e4809f13db4bc0eed0 *tests/testthat/test-condition1-condition2.R 7fbd452f4bcf03c43069de04e5a12169 *tests/testthat/test-difference-in-means.R 26f66bdeec089c1d651b4103d3cd949c *tests/testthat/test-horvitz-thompson.R 1296bc7b52d4e3deb57c840fd62c79b2 *tests/testthat/test-iv-robust-fes.R 16745573d072227066fbbf2d13b1acef *tests/testthat/test-iv-robust.R 839e9ddc18ade28767134ca70c21835b *tests/testthat/test-lh-robust.R 90a53b7755c3c14339f2a20f6c3d74a8 *tests/testthat/test-lm-cluster.R 4fe023bfa15f5c11e14ffd8e64a96328 *tests/testthat/test-lm-lin.R a0ae244e84840b72239982d400dcae9f *tests/testthat/test-lm-robust-fes.R da369c3fa07476e1d4a2332ccd8d3a8e *tests/testthat/test-lm-robust-helper.R 4d8c0dd767fffe70b6506c88a44a8756 *tests/testthat/test-lm-robust.R 16c66af13ebc59f94b07639e681daec2 *tests/testthat/test-lm-robust_emmeans.R ac082b94b4d2316d52f34bacae5232a0 *tests/testthat/test-lm-robust_margins.R 1eabceb5f36e4cc53220f463287919cf *tests/testthat/test-modelsummary.R 3837983c42ad308f413a160ec9a4a76b *tests/testthat/test-na-omit-details.R 84bcbf1e97c21590d6c7bc65a26f6d44 *tests/testthat/test-replicate-HT-middleton.R 98b4614754d05c467e98c0b12011e8d6 *tests/testthat/test-replicate-lin2013.R b93ceb4270c7b899ed37aaa15be8774a *tests/testthat/test-return.R 3fffd90948f8e9b17ef92149534f59a2 *tests/testthat/test-s3-methods.R 069930d497740d73d9bdab0edf860d10 *tests/testthat/test-sig-testing.R bcc2fd1d23adf816a6d11153cf063ce1 *tests/testthat/test-starprep.R dec6327278372584a05f5862fcf6ae35 *tests/testthat/test-stata-output.R estimatr/R/0000755000176200001440000000000014760400636012311 5ustar liggesusersestimatr/R/helper_clean_model_data.R0000644000176200001440000001470014747205231017227 0ustar liggesusers# library(estimatr) # f <- function(w) { # dat <- data.frame(x = rnorm(10), y = rnorm(10)) # lm_robust(y ~ x, data = dat, w = w) # } # f(NULL) # f(1:10) # Internal method to process data #' @importFrom rlang f_rhs %||% clean_model_data <- function(data, datargs, estimator = "") { # if data exists, evaluate it data <- if (quo_is_missing(data)) NULL else eval_tidy(data) if (getOption("estimatr.debug.clean_model_data", FALSE)) browser() mfargs <- Filter(Negate(quo_is_missing), datargs) m_formula <- eval_tidy(mfargs[["formula"]]) m_formula_env <- environment(m_formula) stopifnot("`formula` argument must be a formula"=inherits(m_formula, "formula")) # From this point on we never use the environment of anything # in mfargs as we always evaluate in `data` explicitly # Therefore we can just change it to a list that can take # expressions without environments attached to them mfargs <- as.list(mfargs) args_ignored <- c("fixed_effects", "se_type") # For each ... that would go to model.fram .default, early eval, # save to formula env, and point to it # subset is also non-standard eval to_process <- setdiff( names(mfargs), c( setdiff(names(formals(stats::model.frame.default)), "subset"), args_ignored ) ) for (da in to_process) { name <- sprintf(".__%s%%%d__", da, sample.int(.Machine$integer.max, 1)) m_formula_env[[name]] <- eval_tidy(mfargs[[da]], data = data) mfargs[[da]] <- sym(name) } if ("fixed_effects" %in% names(mfargs)) { name <- sprintf(".__fixed_effects%%%d__", sample.int(.Machine$integer.max, 1)) m_formula_env[[name]] <- sapply( stats::model.frame.default( mfargs[["fixed_effects"]], data = data, na.action = NULL ), FUN = as.factor ) mfargs[["fixed_effects"]] <- sym(name) } condition_pr <- NULL if ("condition_pr" %in% names(mfargs) && length(eval(mfargs[["condition_pr"]], m_formula_env)) == 1) { condition_pr <- eval(mfargs[["condition_pr"]], m_formula_env) mfargs[["condition_pr"]] <- NULL } mfargs[["formula"]] <- Formula::as.Formula(m_formula) # Get model frame mf <- eval_tidy(quo((stats::model.frame)( !!!mfargs, data = data, na.action = na.omit_detailed.data.frame, drop.unused.levels = TRUE ))) local({ na.action <- attr(mf, "na.action") why_omit <- attr(na.action, "why_omit") # Warn if missingness in ancillary variables missing_warning <- c( "Some observations have missingness in the %s variable(s) but not in ", "the outcome or covariates. These observations have been dropped." ) to_check_if_missing <- c( "cluster", "condition_pr", "block", "weights", "fixed_effects" ) for (x in to_check_if_missing) { if (!is.null(why_omit[[sprintf("(%s)", x)]])) { warning(sprintf(missing_warning, x)) } } }) if (!is.null(attr(terms(mf), "Formula_without_dot"))) { formula <- attr(terms(mf), "Formula_without_dot") } else { formula <- eval_tidy(mfargs[["formula"]]) # unwrap quosure => a formula } ret <- list( outcome = model.response(mf, type = "numeric"), design_matrix = model.matrix(terms(formula, rhs = 1), data = mf), formula = formula ) if (estimator == "iv") { if (length(formula)[2] != 2) { stop( "Must specify a `formula` with both regressors and instruments. For ", "example, `formula = y ~ x1 + x2 | x1 + z2` where x1 and x2 are the ", "regressors and z1 and z2 are the instruments.\n\nSee ?iv_robust." ) } ret[["instrument_matrix"]] <- model.matrix(terms(formula, rhs = 2), data = mf) ret[["terms_regressors"]] <- terms(formula, rhs = 1) } else if (estimator %in% c("ht", "dim")) { ret[["original_treatment"]] <- mf[, colnames(mf) == all.vars(terms(mf)[[3]])[1]] } ret[["weights"]] <- model.extract(mf, "weights") if (any(ret[["weights"]] < 0)) { stop("`weights` must not be negative") } ret[["cluster"]] <- model.extract(mf, "cluster") if (!(class(ret[["cluster"]]) %in% c("factor", "integer")) && !is.null(ret[["cluster"]])) { ret[["cluster"]] <- as.factor(ret[["cluster"]]) } ret[["block"]] <- model.extract(mf, "block") ret[["condition_pr"]] <- if (is.numeric(condition_pr)) rep(condition_pr, nrow(ret[["design_matrix"]])) else model.extract(mf, "condition_pr") ret[["fixed_effects"]] <- model.extract(mf, "fixed_effects") # If there is NA in the blocks and only one block, returns vector not matrix # so coerce to matrix if (is.character(ret[["fixed_effects"]])) { ret[["fixed_effects"]] <- as.matrix(ret[["fixed_effects"]]) } if (any(ret[["condition_pr"]] <= 0 | ret[["condition_pr"]] > 1)) { stop( "`condition_prs` must be a vector of positive values no greater than 1" ) } ret[["terms"]] <- attr(mf, "terms") dcs <- attr(ret[["terms"]], "dataClasses") # Clobber auxiliary variables in dataClasses for margins drop_vars <- c("(fixed_effects)", "(condition_pr)", "(block)", "(cluster)") attr(ret[["terms"]], "dataClasses") <- dcs[setdiff(names(dcs), drop_vars)] ret[["xlevels"]] <- .getXlevels(ret[["terms"]], mf) if (is.character(ret[["fixed_effects"]])) { ret[["felevels"]] <- lapply(as.data.frame(ret[["fixed_effects"]]), unique) } return(ret) } demean_fes <- function(model_data) { fe.ints <- apply(model_data[["fixed_effects"]], 2, function(x) match(x, unique(x))) eps <- 1e-8 weights <- model_data[["weights"]] %||% rep(1, nrow(model_data[["design_matrix"]])) has_int <- attr(model_data$terms, "intercept") demeaned <- list() # save names demeaned[["outcome"]] <- demeanMat2(as.matrix(model_data[["outcome"]]), fe.ints, weights, 0, eps) dimnames(demeaned[["outcome"]]) <- dimnames(model_data[["outcome"]]) model_data[["outcome"]] <- demeaned[["outcome"]] demeaned[["design_matrix"]] <- demeanMat2(model_data[["design_matrix"]], fe.ints, weights, has_int, eps) new_names <- dimnames(model_data[["design_matrix"]]) new_names[[2]] <- new_names[[2]][new_names[[2]] != "(Intercept)"] dimnames(demeaned[["design_matrix"]]) <- new_names model_data[["design_matrix"]] <- demeaned[["design_matrix"]] if (is.numeric(model_data[["instrument_matrix"]])) { demeaned[["instrument_matrix"]] <- demeanMat2(model_data[["instrument_matrix"]], fe.ints, weights, has_int, eps) model_data[["instrument_matrix"]] <- demeaned[["instrument_matrix"]] } model_data[["fe_levels"]] <- apply(fe.ints, 2, max) - 1 return(model_data) } estimatr/R/estimatr_difference_in_means.R0000644000176200001440000004512314747205231020313 0ustar liggesusers#' Design-based difference-in-means estimator #' #' @description Difference-in-means estimators that selects the appropriate #' point estimate, standard errors, and degrees of freedom for a variety of #' designs: unit randomized, cluster randomized, block randomized, #' block-cluster randomized, matched-pairs, and matched-pair cluster #' randomized designs #' #' @param formula an object of class formula, as in \code{\link{lm}}, such as #' \code{Y ~ Z} with only one variable on the right-hand side, the treatment. #' @param data A \code{data.frame}. #' @param blocks An optional bare (unquoted) name of the block variable. Use #' for blocked designs only. #' @param clusters An optional bare (unquoted) name of the variable that #' corresponds to the clusters in the data; used for cluster randomized #' designs. For blocked designs, clusters must nest within blocks. #' @param weights the bare (unquoted) names of the weights variable in the #' supplied data. #' @param subset An optional bare (unquoted) expression specifying a subset of #' observations to be used. #' @param se_type An optional string that can be one of \code{c("default", "none")}. If "default" (the default), it will use the default standard error estimator for the design, and if "none" then standard errors will not be computed which may speed up run time if only the point estimate is required. #' @param condition1 value in the treatment vector of the condition #' to be the control. Effects are #' estimated with \code{condition1} as the control and \code{condition2} as the #' treatment. If unspecified, \code{condition1} is the "first" condition and #' \code{condition2} is the "second" according to levels if the treatment is a #' factor or according to a sortif it is a numeric or character variable (i.e #' if unspecified and the treatment is 0s and 1s, \code{condition1} will by #' default be 0 and \code{condition2} will be 1). See the examples for more. #' @param condition2 value in the treatment vector of the condition to be the #' treatment. See \code{condition1}. #' @param ci logical. Whether to compute and return p-values and #' confidence intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' #' @details This function implements a difference-in-means estimator, with #' support for blocked, clustered, matched-pairs, block-clustered, and #' matched-pair clustered designs. One specifies their design by passing #' the blocks and clusters in their data and this function chooses which #' estimator is most appropriate. #' #' If you pass only \code{blocks}, if all blocks are of size two, we will #' infer that the design is a matched-pairs design. If they are all size four #' or larger, we will infer that it is a regular blocked design. If you pass #' both \code{blocks} and \code{clusters}, we will similarly #' infer whether it is a matched-pairs clustered design or a block-clustered #' design the number of clusters per block. If the user passes only #' \code{clusters}, we will infer that the design was cluster-randomized. If #' the user specifies neither the \code{blocks} nor the \code{clusters}, #' a regular Welch's t-test will be performed. #' #' Importantly, if the user specifies weights, the estimation is handed off #' to \code{\link{lm_robust}} with the appropriate robust standard errors #' as weighted difference-in-means estimators are not implemented here. #' More details of the about each of the estimators can be found in the #' \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes}. #' #' @return Returns an object of class \code{"difference_in_means"}. #' #' The post-estimation commands functions \code{summary} and \code{\link{tidy}} #' return results in a \code{data.frame}. To get useful data out of the return, #' you can use these data frames, you can use the resulting list directly, or #' you can use the generic accessor functions \code{coef} and #' \code{confint}. #' #' An object of class \code{"difference_in_means"} is a list containing at #' least the following components: #' \item{coefficients}{the estimated difference in means} #' \item{std.error}{the estimated standard error} #' \item{statistic}{the t-statistic} #' \item{df}{the estimated degrees of freedom} #' \item{p.value}{the p-value from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} #' \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} #' \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} #' \item{term}{a character vector of coefficient names} #' \item{alpha}{the significance level specified by the user} #' \item{N}{the number of observations used} #' \item{outcome}{the name of the outcome variable} #' \item{design}{the name of the design learned from the arguments passed} #' #' @seealso \code{\link{lm_lin}} #' #' @references #' Gerber, Alan S, and Donald P Green. 2012. Field Experiments: Design, Analysis, and Interpretation. New York: W.W. Norton. #' #' Imai, Kosuke, Gary King, Clayton Nall. 2009. "The Essential Role of Pair Matching in Cluster-Randomized Experiments, with Application to the Mexican Universal Health Insurance Evaluation." Statistical Science 24 (1). Institute of Mathematical Statistics: 29-53. \doi{10.1214/08-STS274}. #' #' @examples #' #' library(fabricatr) #' library(randomizr) #' # Get appropriate standard errors for unit-randomized designs #' #' # ---------- #' # 1. Unit randomized #' # ---------- #' dat <- fabricate( #' N = 100, #' Y = rnorm(100), #' Z_comp = complete_ra(N, prob = 0.4), #' ) #' #' table(dat$Z_comp) #' difference_in_means(Y ~ Z_comp, data = dat) #' #' # ---------- #' # 2. Cluster randomized #' # ---------- #' # Accurates estimates and standard errors for clustered designs #' dat$clust <- sample(20, size = nrow(dat), replace = TRUE) #' dat$Z_clust <- cluster_ra(dat$clust, prob = 0.6) #' #' table(dat$Z_clust, dat$clust) #' summary(difference_in_means(Y ~ Z_clust, clusters = clust, data = dat)) #' #' # ---------- #' # 3. Block randomized #' # ---------- #' dat$block <- rep(1:10, each = 10) #' dat$Z_block <- block_ra(dat$block, prob = 0.5) #' #' table(dat$Z_block, dat$block) #' difference_in_means(Y ~ Z_block, blocks = block, data = dat) #' #' # ---------- #' # 4. Block cluster randomized #' # ---------- #' # Learns this design if there are two clusters per block #' dat$small_clust <- rep(1:50, each = 2) #' dat$big_blocks <- rep(1:5, each = 10) #' #' dat$Z_blcl <- block_and_cluster_ra( #' blocks = dat$big_blocks, #' clusters = dat$small_clust #' ) #' #' difference_in_means( #' Y ~ Z_blcl, #' blocks = big_blocks, #' clusters = small_clust, #' data = dat #' ) #' #' # ---------- #' # 5. Matched-pairs #' # ---------- #' # Matched-pair estimates and standard errors are also accurate #' # Specified same as blocked design, function learns that #' # it is matched pair from size of blocks! #' dat$pairs <- rep(1:50, each = 2) #' dat$Z_pairs <- block_ra(dat$pairs, prob = 0.5) #' #' table(dat$pairs, dat$Z_pairs) #' difference_in_means(Y ~ Z_pairs, blocks = pairs, data = dat) #' #' # ---------- #' # 6. Matched-pair cluster randomized #' # ---------- #' # Learns this design if there are two clusters per block #' dat$small_clust <- rep(1:50, each = 2) #' dat$cluster_pairs <- rep(1:25, each = 4) #' table(dat$cluster_pairs, dat$small_clust) #' #' dat$Z_mpcl <- block_and_cluster_ra( #' blocks = dat$cluster_pairs, #' clusters = dat$small_clust #' ) #' #' difference_in_means( #' Y ~ Z_mpcl, #' blocks = cluster_pairs, #' clusters = small_clust, #' data = dat #' ) #' #' # ---------- #' # Other examples #' # ---------- #' #' # Also works with multi-valued treatments if users specify #' # comparison of interest #' dat$Z_multi <- simple_ra( #' nrow(dat), #' conditions = c("Treatment 2", "Treatment 1", "Control"), #' prob_each = c(0.4, 0.4, 0.2) #' ) #' #' # Only need to specify which condition is treated `condition2` and #' # which is control `condition1` #' difference_in_means( #' Y ~ Z_multi, #' condition1 = "Treatment 2", #' condition2 = "Control", #' data = dat #' ) #' difference_in_means( #' Y ~ Z_multi, #' condition1 = "Treatment 1", #' condition2 = "Control", #' data = dat #' ) #' #' # Specifying weights will result in estimation via lm_robust() #' dat$w <- runif(nrow(dat)) #' difference_in_means(Y ~ Z_comp, weights = w, data = dat) #' lm_robust(Y ~ Z_comp, weights = w, data = dat) #' #' @export difference_in_means <- function(formula, data, blocks, clusters, weights, subset, se_type = c("default", "none"), condition1 = NULL, condition2 = NULL, ci = TRUE, alpha = .05) { if (length(all.vars(f_rhs(eval_tidy(formula)))) > 1) { stop( "'formula' must have only one variable on the right-hand side: the ", "treatment variable." ) } se_type <- match.arg(se_type) datargs <- enquos( formula = formula, weights = weights, subset = subset, block = blocks, cluster = clusters ) data <- enquo(data) model_data <- clean_model_data(data = data, datargs, estimator = "dim") data <- data.frame( y = model_data$outcome, t = model_data$original_treatment, stringsAsFactors = FALSE ) data$cluster <- model_data$cluster # rescale weights for convenience if (is.numeric(model_data$weights)) { data$weights <- model_data$weights / mean(model_data$weights) } data$block <- model_data$block if (!is.null(data$weights) && length(unique(data$weights)) == 1 && is.null(data$cluster) && is.null(data$block)) { message( "Constant `weights` passed to `difference_in_means` will ", "unnecessarily trigger `lm_robust()` and the Welch-Satterthwaite ", "approximation will not be used for the degrees of freedom." ) } rm(model_data) # parse condition names if (is.null(condition1) || is.null(condition2)) { condition_names <- parse_conditions( treatment = data$t, condition1 = condition1, condition2 = condition2, estimator = "difference_in_means" ) condition2 <- condition_names[[2]] condition1 <- condition_names[[1]] } # subset data data <- subset.data.frame(data, t %in% c(condition1, condition2)) nblocks <- NULL nclusters <- NULL if (is.null(data$block)) { return_frame <- difference_in_means_internal( condition1 = condition1, condition2 = condition2, data = data, alpha = alpha, se_type = se_type ) if (is.null(data$cluster)) { design <- "Standard" } else { nclusters <- return_frame[["nclusters"]] design <- "Clustered" } } else { pair_matched <- FALSE # When learning whether design is matched pairs, # should only use rows in relevant conditions data <- subset.data.frame(data, t %in% c(condition1, condition2)) clust_per_block <- check_clusters_blocks(data) # Check if design is pair matched if (any(clust_per_block == 1)) { stop("All `blocks` must have multiple units (or `clusters`)") } else if (all(clust_per_block == 2)) { pair_matched <- TRUE } else if (any(clust_per_block == 2) & any(clust_per_block > 2)) { pair_matched <- TRUE warning( "Some `blocks` have two units/`clusters` while other blocks ", "have more units/`clusters`. As standard variance estimates ", "cannot be computed within blocks with two units, we use the ", "matched pairs estimator of the variance." ) } block_dfs <- split(data, data$block) block_estimates <- lapply(block_dfs, function(x) { difference_in_means_internal( data = x, condition1 = condition1, condition2 = condition2, pair_matched = pair_matched, alpha = alpha, se_type = se_type ) }) block_estimates <- do.call(rbind, block_estimates) N_overall <- with(block_estimates, sum(nobs)) nclusters <- with(block_estimates, sum(nclusters)) # Blocked design, (Gerber Green 2012, p73, eq3.10) diff <- with(block_estimates, sum(coefficients * nobs / N_overall)) df <- NA std.error <- NA nblocks <- nrow(block_estimates) if (pair_matched) { if (is.null(data$cluster)) { design <- "Matched-pair" # Pair matched, unit randomized (Gerber Green 2012, p77, eq3.16) if (se_type != "none") { std.error <- with( block_estimates, sqrt( (1 / (nblocks * (nblocks - 1))) * sum((coefficients - diff)^2) ) ) } } else { design <- "Matched-pair clustered" # Pair matched, cluster randomized (Imai, King, Nall 2009, p36, eq6) if (se_type != "none") { std.error <- with( block_estimates, sqrt( (nblocks / ((nblocks - 1) * N_overall^2)) * sum((nobs * coefficients - (N_overall * diff) / nblocks)^2) ) ) } } # For pair matched, cluster randomized Imai et al. 2009 recommend (p. 37) df <- nblocks - 1 } else { # Block randomized (Gerber and Green 2012, p. 74, footnote 17) if (se_type != "none") { std.error <- with( block_estimates, sqrt(sum(std.error^2 * (nobs / N_overall)^2)) ) } ## we don't know if this is correct! ## matches lm_lin, two estimates per block if (is.null(data$cluster)) { design <- "Blocked" df <- nrow(data) - 2 * nblocks } else { design <- "Block-clustered" # Also matches lm_lin for even sized clusters, should be conservative df <- nclusters - 2 * nblocks } } return_frame <- data.frame( coefficients = diff, std.error = std.error, df = df, nobs = N_overall, stringsAsFactors = FALSE ) } if (!is.null(data$weights)) { design <- paste0(design, " (weighted)") } return_list <- add_cis_pvals(return_frame, alpha, ci) # print(c("Pair Matched? ", pair_matched)) return_list <- dim_like_return( return_list, alpha = alpha, formula = formula, conditions = list(condition1, condition2) ) return_list[["design"]] <- design if (is.numeric(nblocks)) { return_list[["nblocks"]] <- nblocks } if (is.numeric(nclusters)) { return_list[["nclusters"]] <- nclusters } attr(return_list, "class") <- "difference_in_means" return(return_list) } difference_in_means_internal <- function(condition1 = NULL, condition2 = NULL, data, pair_matched = FALSE, alpha = .05, se_type = "default") { # Check that treatment status is uniform within cluster, checked here # so that the treatment vector t doesn't have to be built anywhere else clustered <- !is.null(data$cluster) if (clustered) { if (is.factor(data$cluster)) { data$cluster <- droplevels(data$cluster) } if (any(!tapply(data$t, data$cluster, function(x) all(x == x[1])))) { stop( "All units within a cluster must have the same treatment condition." ) } } Y2 <- data$y[data$t == condition2] Y1 <- data$y[data$t == condition1] N2 <- length(Y2) N1 <- length(Y1) N <- N2 + N1 if ((N1 == 0) || (N2 == 0)) { stop("Must have units with both treatment conditions within each block.") } # Check to make sure multiple in each group if pair matched is false if (!pair_matched & (N2 == 1 | N1 == 1)) { stop( "If design is not pair-matched, every block must have at least two ", "treated and control units." ) } df <- NA nclusters <- NA if (clustered && !pair_matched) { # For now, all clustered cases go to lm_robust # CR2 nests Gerber and Green 2012, p. 83, eq. 3.23 when clusters are # equal sizes (we think) and is more appropriate when clusters are of # different sizes X <- cbind(1, t = as.numeric(data$t == condition2)) # TODO currently lm_robust_fit does too much, need to refactor it # if it will be used here in the long run cr2_out <- lm_robust_fit( y = data$y, X = cbind(1, t = as.numeric(data$t == condition2)), cluster = data$cluster, se_type = if(se_type == "none") "none" else "CR2", weights = data$weights, ci = TRUE, try_cholesky = TRUE, alpha = alpha, return_vcov = FALSE, has_int = TRUE, iv_stage = list(0) ) diff <- coef(cr2_out)[2] std.error <- cr2_out[["std.error"]][2] df <- cr2_out[["df"]][2] nclusters <- cr2_out[["nclusters"]] } else { if (is.null(data$weights)) { diff <- mean(Y2) - mean(Y1) if (pair_matched || se_type == "none") { if (clustered) { nclusters <- 2 } # Pair matched designs std.error <- NA } else { # Non-pair matched designs, unit level randomization var_Y2 <- var(Y2) var_Y1 <- var(Y1) std.error <- sqrt(var_Y2 / N2 + var_Y1 / N1) df <- std.error^4 / ( (var_Y2 / N2)^2 / (N2 - 1) + (var_Y1 / N1)^2 / (N1 - 1) ) } } else { if (pair_matched) { stop( "Cannot use `weights` with matched pairs design at the moment" ) } X <- cbind(1, t = as.numeric(data$t == condition2)) # print("Using lm_robust") # TODO currently lm_robust_fit does too much, need to refactor it # if it will be used here in the long run w_hc2_out <- lm_robust_fit( y = data$y, X = cbind(1, t = as.numeric(data$t == condition2)), se_type = ifelse(se_type == "none", "none", "HC2"), weights = data$weights, cluster = NULL, ci = TRUE, try_cholesky = TRUE, alpha = alpha, return_vcov = FALSE, has_int = TRUE, iv_stage = list(0) ) diff <- coef(w_hc2_out)[2] std.error <- w_hc2_out$std.error[2] df <- w_hc2_out$df[2] } } return_frame <- data.frame( coefficients = diff, std.error = std.error, df = df, stringsAsFactors = FALSE ) if (is.numeric(data$weights)) { return_frame$nobs <- sum(data$weights) } else { return_frame$nobs <- N } if (is.numeric(nclusters)) { return_frame$nclusters <- nclusters } return(return_frame) } estimatr/R/RcppExports.R0000644000176200001440000000176714760400636014740 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 ht_covar_partial <- function(y1, y0, p10, p1, p0) { .Call(`_estimatr_ht_covar_partial`, y1, y0, p10, p1, p0) } ht_var_partial <- function(y, p) { .Call(`_estimatr_ht_var_partial`, y, p) } demeanMat2 <- function(what, fes, weights, start_col, eps) { .Call(`_estimatr_demeanMat2`, what, fes, weights, start_col, eps) } AtA <- function(A) { .Call(`_estimatr_AtA`, A) } Kr <- function(A, B) { .Call(`_estimatr_Kr`, A, B) } lm_solver <- function(X, y, try_cholesky) { .Call(`_estimatr_lm_solver`, X, y, try_cholesky) } lm_variance <- function(X, Xunweighted, XtX_inv, ei, weight_mean, cluster, J, ci, se_type, which_covs, fe_rank) { .Call(`_estimatr_lm_variance`, X, Xunweighted, XtX_inv, ei, weight_mean, cluster, J, ci, se_type, which_covs, fe_rank) } naomitwhy <- function(df, recursive_subset) { .Call(`_estimatr_naomitwhy`, df, recursive_subset) } estimatr/R/estimatr.R0000644000176200001440000000223014747205231014260 0ustar liggesusers#' estimatr #' #' @description Fast procedures for small set of commonly-used, design-appropriate estimators with robust standard errors and confidence intervals. Includes estimators for linear regression, instrumental variables regression, difference-in-means, Horvitz-Thompson estimation, and regression improving precision of experimental estimates by interacting treatment with centered pre-treatment covariates introduced by Lin (2013) . #' #' @docType package #' @useDynLib estimatr, .registration = TRUE #' @importFrom Rcpp evalCpp #' @importFrom stats sd var model.matrix.default pt qt var weighted.mean lm #' vcov model.frame.default model.response complete.cases terms reformulate #' update model.extract setNames delete.response .checkMFClasses model.frame #' model.matrix na.pass nobs coef pf .getXlevels df.residual fitted.values #' formula model.matrix.lm resid weights lm.fit na.omit pchisq printCoefmat #' residuals #' @importFrom methods setGeneric setMethod isGeneric className #' @importFrom Formula as.Formula #' @importFrom rlang enquos enquo eval_tidy quo_get_expr quo_set_expr quo_is_missing sym quo #' @name estimatr "_PACKAGE" estimatr/R/S3_glance.R0000644000176200001440000001547414747205231014244 0ustar liggesusers# Helpers to retrieve values retrieve_value <- function(x, what) if(exists(what, x)) x[[what]] else NA_real_ retrieve_fstatistic <- function(x) { if (exists("fstatistic", x)) { data.frame( statistic = x[["fstatistic"]][1], p.value = pf( x[["fstatistic"]][1], x[["fstatistic"]][2], x[["fstatistic"]][3], lower.tail = FALSE ) ) } else { data.frame(statistic = NA_real_, p.value = NA_real_) } } #' @importFrom generics glance #' @export generics::glance #' Glance at an estimatr object #' @name estimatr_glancers #' @templateVar class lm_robust #' @return For \code{glance.lm_robust}, a data.frame with columns: #' \item{r.squared}{the \eqn{R^2}, #' \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} #' is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, #' and \eqn{e[i]} is the ith residual.} #' \item{adj.r.squared}{the \eqn{R^2} but penalized for having more parameters, \code{rank}} #' \item{se_type}{the standard error type specified by the user} #' \item{statistic}{the value of the F-statistic} #' \item{p.value}{p-value from the F test} #' \item{df.residual}{residual degrees of freedom} #' \item{nobs}{the number of observations used} #' #' @param x An object returned by one of the estimators #' @param ... extra arguments (not used) #' #' @export #' @family estimatr glancers #' @seealso [generics::glance()], [estimatr::lm_robust()], [estimatr::lm_lin()], [estimatr::iv_robust()], [estimatr::difference_in_means()], [estimatr::horvitz_thompson()] #' @md glance.lm_robust <- function(x, ...) { if (length(x[["outcome"]]) > 1) { stop("Cannot use `glance` on linear models with multiple responses.") } ret <- cbind( data.frame( r.squared = x[["r.squared"]], adj.r.squared = x[["adj.r.squared"]] ), retrieve_fstatistic(x), data.frame( df.residual = x[["df"]][1], nobs = as.integer(x[["nobs"]]), se_type = x[["se_type"]], stringsAsFactors = FALSE ) ) rownames(ret) <- NULL ret } #' @rdname estimatr_glancers #' @templateVar class lh_robust #' @return For \code{glance.lh_robust}, we glance the \code{lm_robust} component only. You can access the linear hypotheses as a data.frame directy from the \code{lh} component of the \code{lh_robust} object #' #' @export #' @family estimatr glancers glance.lh_robust <- function(x, ...) { glance(x[["lm_robust"]]) } #' @rdname estimatr_glancers #' @templateVar class iv_robust #' @return For \code{glance.iv_robust}, a data.frame with columns: #' \item{r.squared}{The \eqn{R^2} of the second stage regression} #' \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} #' \item{df.residual}{residual degrees of freedom} #' \item{N}{the number of observations used} #' \item{se_type}{the standard error type specified by the user} #' \item{statistic}{the value of the F-statistic} #' \item{p.value}{p-value from the F test} #' \item{statistic.weakinst}{the value of the first stage F-statistic, useful for the weak instruments test; only reported if there is only one endogenous variable} #' \item{p.value.weakinst}{p-value from the first-stage F test, a test of weak instruments; only reported if there is only one endogenous variable} #' \item{statistic.endogeneity}{the value of the F-statistic for the test of endogeneity; often called the Wu-Hausman statistic, with robust standard errors, we employ the regression based test} #' \item{p.value.endogeneity}{p-value from the F-test for endogeneity} #' \item{statistic.overid}{the value of the chi-squared statistic for the test of instrument correlation with the error term; only reported with overidentification} #' \item{p.value.overid}{p-value from the chi-squared test; only reported with overidentification} #' #' @export #' @family estimatr glancers glance.iv_robust <- function(x, ...) { if (length(x[["outcome"]]) > 1) { stop("Cannot use `glance` on linear models with multiple responses.") } ret <- cbind( data.frame( r.squared = x[["r.squared"]], adj.r.squared = x[["adj.r.squared"]], df.residual = x[["df.residual"]], nobs = as.integer(x[["nobs"]]), se_type = x[["se_type"]], stringsAsFactors = FALSE ), retrieve_fstatistic(x), if (exists("diagnostic_first_stage_fstatistic", x) && length(x[["diagnostic_first_stage_fstatistic"]] == 4)) { data.frame( statistic.weakinst = x[["diagnostic_first_stage_fstatistic"]]["value"], p.value.weakinst = x[["diagnostic_first_stage_fstatistic"]]["p.value"] ) } else { data.frame(statistic.weakinst = NA_real_, p.value.weakinst = NA_real_) }, if (exists("diagnostic_endogeneity_test", x)) { data.frame( statistic.endogeneity = x[["diagnostic_endogeneity_test"]]["value"], p.value.endogeneity = x[["diagnostic_endogeneity_test"]]["p.value"] ) } else { data.frame(statistic.endogeneity = NA_real_, p.value.endogeneity = NA_real_) }, if (exists("diagnostic_overid_test", x)) { data.frame( statistic.overid = x[["diagnostic_overid_test"]]["value"], p.value.overid = x[["diagnostic_overid_test"]]["p.value"] ) } else { data.frame(statistic.overid = NA_real_, p.value.overid = NA_real_) } ) ret } #' @rdname estimatr_glancers #' @templateVar class difference_in_means #' @return For \code{glance.difference_in_means}, a data.frame with columns: #' \item{design}{the design used, and therefore the estimator used} #' \item{df}{the degrees of freedom} #' \item{nobs}{the number of observations used} #' \item{nblocks}{the number of blocks, if used} #' \item{nclusters}{the number of clusters, if used} #' \item{condition2}{the second, "treatment", condition} #' \item{condition1}{the first, "control", condition} #' #' @export #' @family estimatr glancers glance.difference_in_means <- function(x, ...) { data.frame( design = x[["design"]], df = x[["df"]], nobs = as.integer(x[["nobs"]]), nblocks = retrieve_value(x, "nblocks"), nclusters = retrieve_value(x, "nclusters"), condition2 = x[["condition2"]], condition1 = x[["condition1"]], stringsAsFactors = FALSE ) } #' @rdname estimatr_glancers #' @templateVar class horvitz_thompson #' @return For \code{glance.horvitz_thompson}, a data.frame with columns: #' \item{nobs}{the number of observations used} #' \item{se_type}{the type of standard error estimator used} #' \item{condition2}{the second, "treatment", condition} #' \item{condition1}{the first, "control", condition} #' #' @export #' @family estimatr glancers glance.horvitz_thompson <- function(x, ...) { data.frame( nobs = as.integer(x[["nobs"]]), se_type = x[["se_type"]], condition2 = x[["condition2"]], condition1 = x[["condition1"]], stringsAsFactors = FALSE ) } estimatr/R/helper_extract.R0000644000176200001440000000613514747205231015451 0ustar liggesusers# This code modified from # https://github.com/leifeld/texreg/blob/master/R/extract.R (no LICENSE) #' Extract model data for \pkg{texreg} package #' @rdname extract.lm_robust #' #' @description Prepares a \code{"lm_robust"} or \code{"iv_robust"} object for the \pkg{texreg} #' package. This is largely a clone of the \code{extract.lm} #' method. #' #' @param model an object of class \code{\link{lm_robust}} or \code{"iv_robust"} #' @param include.ci logical. Defaults to TRUE #' @param include.rsquared logical. Defaults to TRUE #' @param include.adjrs logical. Defaults to TRUE #' @param include.nobs logical. Defaults to TRUE #' @param include.fstatistic logical. Defaults to TRUE #' @param include.rmse logical. Defaults to TRUE #' @param include.nclusts logical. Defaults to TRUE if clusters in \code{model} #' @param ... unused #' extract.robust_default <- function(model, include.ci = TRUE, include.rsquared = TRUE, include.adjrs = TRUE, include.nobs = TRUE, include.fstatistic = FALSE, include.rmse = TRUE, include.nclusts = TRUE, ...) { s <- tidy(model) names <- s[["term"]] co <- s[["estimate"]] se <- s[["std.error"]] pval <- s[["p.value"]] cilow <- numeric() ciupper <- numeric() if (include.ci) { cilow <- s[["conf.low"]] ciupper <- s[["conf.high"]] } rs <- model$r.squared # extract R-squared adj <- model$adj.r.squared # extract adjusted R-squared n <- nobs(model) # extract number of observations gof <- numeric() gof.names <- character() gof.decimal <- logical() if (include.rsquared) { gof <- c(gof, rs) gof.names <- c(gof.names, "R$^2$") gof.decimal <- c(gof.decimal, TRUE) } if (include.adjrs) { gof <- c(gof, adj) gof.names <- c(gof.names, "Adj.\ R$^2$") gof.decimal <- c(gof.decimal, TRUE) } if (include.nobs) { gof <- c(gof, n) gof.names <- c(gof.names, "Num.\ obs.") gof.decimal <- c(gof.decimal, FALSE) } if (include.fstatistic) { fstat <- model[["fstatistic"]][[1]] gof <- c(gof, fstat) gof.names <- c(gof.names, "F statistic") gof.decimal <- c(gof.decimal, TRUE) } if (include.rmse && !is.null(model[["res_var"]])) { rmse <- sqrt(model[["res_var"]]) gof <- c(gof, rmse) gof.names <- c(gof.names, "RMSE") gof.decimal <- c(gof.decimal, TRUE) } if (include.nclusts && model[["clustered"]]) { rmse <- sqrt(model[["res_var"]]) gof <- c(gof, model[["nclusters"]]) gof.names <- c(gof.names, "N Clusters") gof.decimal <- c(gof.decimal, FALSE) } tr <- texreg::createTexreg( coef.names = names, coef = co, se = se, pvalues = pval, ci.low = cilow, ci.up = ciupper, gof.names = gof.names, gof = gof, gof.decimal = gof.decimal ) return(tr) } #' @rdname extract.lm_robust #' #' @export extract.lm_robust <- extract.robust_default #' @rdname extract.lm_robust #' #' @export extract.iv_robust <- extract.robust_default estimatr/R/S3_tidy.R0000644000176200001440000000777414747205231013770 0ustar liggesusers#' @importFrom generics tidy #' @export generics::tidy tidy_data_frame <- function(x, conf.int = TRUE, conf.level = NULL, ...) { vec_cols <- c( "coefficients", "std.error", "statistic", "p.value", "conf.low", "conf.high", "df" ) if(!conf.int){ vec_cols <- c( "coefficients", "std.error", "statistic", "p.value", "df" ) } tidy_mat <- do.call("cbind", lapply(x[vec_cols], as.vector)) vec_cols[vec_cols == "coefficients"] <- "estimate" colnames(tidy_mat) <- vec_cols return_frame <- data.frame( term = x[["term"]], tidy_mat, outcome = rep(x[["outcome"]], each = length(x[["term"]])), stringsAsFactors = FALSE ) rownames(return_frame) <- NULL if(!is.null(conf.level) && conf.int){ ci <- stats::confint(x, level = conf.level, ...) if (all(row.names(ci) == return_frame$term)) { return_frame$conf.low <- ci[, 1] return_frame$conf.high <- ci[, 2] } } return(return_frame) } warn_singularities <- function(x) { if (x$rank < x$k) { singularities <- x$k - x$rank what <- ifelse(singularities > 1, " coefficients ", " coefficient ") message( singularities, what, " not defined because the design matrix is rank deficient\n" ) } } #' Tidy an estimatr object #' @name estimatr_tidiers #' @templateVar class lm_robust #' @return A data.frame with columns for coefficient names, estimates, standard #' errors, confidence intervals, p-values, degrees of freedom, and the #' name of the outcome variable #' #' @param x An object returned by one of the estimators #' @param conf.int Logical indicating whether or not to include a #' confidence interval in the tidied output. Defaults to ‘TRUE’. #' @param conf.level The confidence level to use for the confidence #' interval if ‘conf.int = TRUE’. Must be strictly greater than 0 and less #' than 1. Defaults to 0.95, which corresponds to a 95 percent confidence #' interval. #' @param ... extra arguments (not used) #' #' @export #' @family estimatr tidiers #' @seealso [generics::tidy()], [estimatr::lm_robust()], [estimatr::iv_robust()], [estimatr::difference_in_means()], [estimatr::horvitz_thompson()] #' @md tidy.lm_robust <- function(x, conf.int = TRUE, conf.level = NULL, ...) { warn_singularities(x) tidy_data_frame(x, conf.int = conf.int, conf.level = conf.level, ...) } #' @rdname estimatr_tidiers #' @templateVar class iv_robust #' #' @export #' @family estimatr tidiers tidy.iv_robust <- function(x, conf.int = TRUE, conf.level = NULL, ...) { warn_singularities(x) tidy_data_frame(x, conf.int = conf.int, conf.level = conf.level, ...) } #' @rdname estimatr_tidiers #' @templateVar class difference_in_means #' #' @export #' @family estimatr tidiers tidy.difference_in_means <- tidy_data_frame #' @rdname estimatr_tidiers #' @templateVar class horvitz_thompson #' #' @export #' @family estimatr tidiers tidy.horvitz_thompson <- tidy_data_frame #' @rdname estimatr_tidiers #' @templateVar class lh_robust #' #' @export #' @family estimatr tidiers tidy.lh_robust <- function(x, conf.int = TRUE, conf.level = NULL, ...) { rbind(tidy(x$lm_robust, conf.int = conf.int, conf.level = conf.level, ...), tidy(x$lh, conf.int = conf.int, conf.level = conf.level, ...)) } #' @rdname estimatr_tidiers #' @templateVar class lh #' #' @export #' @family estimatr tidiers tidy.lh <- function(x, conf.int = TRUE, conf.level = NULL, ...) { tidy_data_frame(simplify_lh_outcome(x), conf.int = conf.int, conf.level = conf.level, ...) } # Simplifies the `lh` outcome column for tidy.lh and print.lh simplify_lh_outcome <- function(x) { x_list <- as.list(x) x_list[["outcome"]] <- unique(x_list[["outcome"]]) class(x_list) <- "lh" x_list } estimatr/R/estimatr_lm_robust.R0000644000176200001440000003146514747205231016362 0ustar liggesusers#' Ordinary Least Squares with Robust Standard Errors #' #' @description This formula fits a linear model, provides a variety of #' options for robust standard errors, and conducts coefficient tests #' #' @param formula an object of class formula, as in \code{\link{lm}} #' @param data A \code{data.frame} #' @param weights the bare (unquoted) names of the weights variable in the #' supplied data. #' @param subset An optional bare (unquoted) expression specifying a subset #' of observations to be used. #' @param clusters An optional bare (unquoted) name of the variable that #' corresponds to the clusters in the data. #' @param fixed_effects An optional right-sided formula containing the fixed #' effects that will be projected out of the data, such as \code{~ blockID}. Do not #' pass multiple-fixed effects with intersecting groups. Speed gains are greatest for #' variables with large numbers of groups and when using "HC1" or "stata" standard errors. #' See 'Details'. #' @param se_type The sort of standard error sought. If \code{clusters} is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or #' "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' @param return_vcov logical. Whether to return the variance-covariance #' matrix for later usage, TRUE by default. #' @param try_cholesky logical. Whether to try using a Cholesky #' decomposition to solve least squares instead of a QR decomposition, #' FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only #' be used if users are sure their model is full-rank (i.e., there is no #' perfect multi-collinearity) #' #' @details #' #' This function performs linear regression and provides a variety of standard #' errors. It takes a formula and data much in the same was as \code{\link{lm}} #' does, and all auxiliary variables, such as clusters and weights, can be #' passed either as quoted names of columns, as bare column names, or #' as a self-contained vector. Examples of usage can be seen below and in the #' \href{https://declaredesign.org/r/estimatr/articles/getting-started.html}{Getting Started vignette}. #' #' The mathematical notes in #' \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{this vignette} #' specify the exact estimators used by this function. #' The default variance estimators have been chosen largely in accordance with the #' procedures in #' \href{https://github.com/acoppock/Green-Lab-SOP/blob/master/Green_Lab_SOP.pdf}{this manual}. #' The default for the case #' without clusters is the HC2 estimator and the default with clusters is the #' analogous CR2 estimator. Users can easily replicate Stata standard errors in #' the clustered or non-clustered case by setting \code{`se_type` = "stata"}. #' #' The function estimates the coefficients and standard errors in C++, using #' the \code{RcppEigen} package. By default, we estimate the coefficients #' using Column-Pivoting QR decomposition from the Eigen C++ library, although #' users could get faster solutions by setting \code{`try_cholesky` = TRUE} to #' use a Cholesky decomposition instead. This will likely result in quicker #' solutions, but the algorithm does not reliably detect when there are linear #' dependencies in the model and may fail silently if they exist. #' #' If \code{`fixed_effects`} are specified, both the outcome and design matrix #' are centered using the method of alternating projections (Halperin 1962; Gaure 2013). Specifying #' fixed effects in this way will result in large speed gains with standard error #' estimators that do not need to invert the matrix of fixed effects. This means using #' "classical", "HC0", "HC1", "CR0", or "stata" standard errors will be faster than other #' standard error estimators. Be wary when specifying fixed effects that may result #' in perfect fits for some observations or if there are intersecting groups across #' multiple fixed effect variables (e.g. if you specify both "year" and "country" fixed effects #' with an unbalanced panel where one year you only have data for one country). #' #' As with \code{`lm()`}, multivariate regression (multiple outcomes) will only admit #' observations into the estimation that have no missingness on any outcome. #' #' @return An object of class \code{"lm_robust"}. #' #' The post-estimation commands functions \code{summary} and \code{\link{tidy}} #' return results in a \code{data.frame}. To get useful data out of the return, #' you can use these data frames, you can use the resulting list directly, or #' you can use the generic accessor functions \code{coef}, \code{vcov}, #' \code{confint}, and \code{predict}. Marginal effects and uncertainty about #' them can be gotten by passing this object to #' \code{\link[margins]{margins}} from the \pkg{margins}, #' or to \code{emmeans} in the \pkg{emmeans} package. #' #' Users who want to print the results in TeX of HTML can use the #' \code{\link[texreg]{extract}} function and the \pkg{texreg} package. #' #' If users specify a multivariate linear regression model (multiple outcomes), #' then some of the below components will be of higher dimension to accommodate #' the additional models. #' #' An object of class \code{"lm_robust"} is a list containing at least the #' following components: #' \item{coefficients}{the estimated coefficients} #' \item{std.error}{the estimated standard errors} #' \item{statistic}{the t-statistic} #' \item{df}{the estimated degrees of freedom} #' \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} #' \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} #' \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} #' \item{term}{a character vector of coefficient names} #' \item{alpha}{the significance level specified by the user} #' \item{se_type}{the standard error type specified by the user} #' \item{res_var}{the residual variance} #' \item{N}{the number of observations used} #' \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} #' \item{rank}{the rank of the fitted model} #' \item{vcov}{the fitted variance covariance matrix} #' \item{r.squared}{The \eqn{R^2}, #' \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} #' is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, #' and \eqn{e[i]} is the ith residual.} #' \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} #' \item{fstatistic}{a vector with the value of the F-statistic with the numerator and denominator degrees of freedom} #' \item{weighted}{whether or not weights were applied} #' \item{call}{the original function call} #' \item{fitted.values}{the matrix of predicted means} #' We also return \code{terms} and \code{contrasts}, used by \code{predict}. If \code{fixed_effects} are specified, then we return \code{proj_fstatistic}, \code{proj_r.squared}, and \code{proj_adj.r.squared}, which are model fit statistics that are computed on the projected model (after demeaning the fixed effects). #' #' @references #' Abadie, Alberto, Susan Athey, Guido W Imbens, and Jeffrey Wooldridge. 2017. "A Class of Unbiased Estimators of the Average Treatment Effect in Randomized Experiments." arXiv Pre-Print. \url{https://arxiv.org/abs/1710.02926v2}. #' #' Bell, Robert M, and Daniel F McCaffrey. 2002. "Bias Reduction in Standard Errors for Linear Regression with Multi-Stage Samples." Survey Methodology 28 (2): 169-82. #' #' Gaure, Simon. 2013. "OLS with multiple high dimensional category variables." Computational Statistics & Data Analysis 66: 8-1. \doi{10.1016/j.csda.2013.03.024} #' #' Halperin, I. 1962. "The product of projection operators." Acta Scientiarum Mathematicarum (Szeged) 23(1-2): 96-99. #' #' MacKinnon, James, and Halbert White. 1985. "Some Heteroskedasticity-Consistent Covariance Matrix Estimators with Improved Finite Sample Properties." Journal of Econometrics 29 (3): 305-25. \doi{10.1016/0304-4076(85)90158-7}. #' #' Pustejovsky, James E, and Elizabeth Tipton. 2016. "Small Sample Methods for Cluster-Robust Variance Estimation and Hypothesis Testing in Fixed Effects Models." Journal of Business & Economic Statistics. Taylor & Francis. \doi{10.1080/07350015.2016.1247004}. #' #' Samii, Cyrus, and Peter M Aronow. 2012. "On Equivalencies Between Design-Based and Regression-Based Variance Estimators for Randomized Experiments." Statistics and Probability Letters 82 (2). \doi{10.1016/j.spl.2011.10.024}. #' #' @examples #' set.seed(15) #' library(fabricatr) #' dat <- fabricate( #' N = 40, #' y = rpois(N, lambda = 4), #' x = rnorm(N), #' z = rbinom(N, 1, prob = 0.4) #' ) #' #' # Default variance estimator is HC2 robust standard errors #' lmro <- lm_robust(y ~ x + z, data = dat) #' #' # Can tidy() the data in to a data.frame #' tidy(lmro) #' # Can use summary() to get more statistics #' summary(lmro) #' # Can also get coefficients three ways #' lmro$coefficients #' coef(lmro) #' tidy(lmro)$estimate #' # Can also get confidence intervals from object or with new 1 - `alpha` #' lmro$conf.low #' confint(lmro, level = 0.8) #' #' # Can recover classical standard errors #' lmclassic <- lm_robust(y ~ x + z, data = dat, se_type = "classical") #' tidy(lmclassic) #' #' # Can easily match Stata's robust standard errors #' lmstata <- lm_robust(y ~ x + z, data = dat, se_type = "stata") #' tidy(lmstata) #' #' # Easy to specify clusters for cluster-robust inference #' dat$clusterID <- sample(1:10, size = 40, replace = TRUE) #' #' lmclust <- lm_robust(y ~ x + z, data = dat, clusters = clusterID) #' tidy(lmclust) #' #' # Can also match Stata's clustered standard errors #' lm_robust( #' y ~ x + z, #' data = dat, #' clusters = clusterID, #' se_type = "stata" #' ) #' #' # Works just as LM does with functions in the formula #' dat$blockID <- rep(c("A", "B", "C", "D"), each = 10) #' #' lm_robust(y ~ x + z + factor(blockID), data = dat) #' #' # Weights are also easily specified #' dat$w <- runif(40) #' #' lm_robust( #' y ~ x + z, #' data = dat, #' weights = w, #' clusters = clusterID #' ) #' #' # Subsetting works just as in `lm()` #' lm_robust(y ~ x, data = dat, subset = z == 1) #' #' # One can also choose to set the significance level for different CIs #' lm_robust(y ~ x + z, data = dat, alpha = 0.1) #' #' # We can also specify fixed effects #' # Speed gains with fixed effects are greatest with "stata" or "HC1" std.errors #' tidy(lm_robust(y ~ z, data = dat, fixed_effects = ~ blockID, se_type = "HC1")) #' #' \dontrun{ #' # Can also use 'margins' or 'emmeans' package if you have them installed #' # to get marginal effects #' library(margins) #' lmrout <- lm_robust(y ~ x + z, data = dat) #' summary(margins(lmrout)) #' #' # Can output results using 'texreg' #' library(texreg) #' texreg(lmrout) #' #' # Using emmeans to obtain covariate-adjusted means #' library(emmeans) #' fiber.rlm <- lm_robust(strength ~ diameter + machine, data = fiber) #' emmeans(fiber.rlm, "machine") #' } #' #' @export lm_robust <- function(formula, data, weights, subset, clusters, fixed_effects, se_type = NULL, ci = TRUE, alpha = .05, return_vcov = TRUE, try_cholesky = FALSE) { datargs <- enquos( formula = formula, weights = weights, subset = subset, cluster = clusters, fixed_effects = fixed_effects ) data <- enquo(data) model_data <- clean_model_data(data = data, datargs) fes <- is.character(model_data[["fixed_effects"]]) if (fes) { yoriginal <- model_data[["outcome"]] Xoriginal <- model_data[["design_matrix"]] model_data <- demean_fes(model_data) attr(model_data$fixed_effects, "fe_rank") <- sum(model_data[["fe_levels"]]) + 1 } else { Xoriginal <- NULL yoriginal <- NULL } return_list <- lm_robust_fit( y = model_data$outcome, X = model_data$design_matrix, yoriginal = yoriginal, Xoriginal = Xoriginal, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, ci = ci, se_type = se_type, alpha = alpha, return_vcov = return_vcov, try_cholesky = try_cholesky, has_int = attr(model_data$terms, "intercept"), iv_stage = list(0) ) return_list <- lm_return( return_list, model_data = model_data, formula = formula ) return_list[["call"]] <- match.call() return(return_list) } estimatr/R/S3_print.R0000644000176200001440000000761314747205231014143 0ustar liggesusers#' @export print.lm_robust <- function(x, ...) { print(summarize_tidy(x)) } #' @export print.iv_robust <- function(x, ...) { print(summarize_tidy(x)) } print_summary_lm_like <- function(x, digits, signif.stars = getOption("show.signif.stars"), ...) { cat( "\nCall:\n", paste(deparse(x$call, nlines = 5), sep = "\n", collapse = "\n"), "\n\n", sep = "" ) if (x$weighted) { cat("Weighted, ") } cat("Standard error type: ", x$se_type, "\n") if (x$rank < x$k) { singularities <- x$k - x$rank cat( "\nCoefficients: (", singularities, " not defined because the design matrix is rank deficient)\n", sep = "" ) } else { cat("\nCoefficients:\n") } print(coef(x), digits = digits) fstat <- if (is.numeric(x[["fstatistic"]])) { paste( "\nF-statistic:", formatC(x$fstatistic[1L], digits = digits), "on", x$fstatistic[2L], "and", x$fstatistic[3L], "DF, p-value:", format.pval(pf( x$fstatistic[1L], x$fstatistic[2L], x$fstatistic[3L], lower.tail = FALSE ), digits = digits) ) } else NULL cat( "\nMultiple R-squared: ", formatC(x$r.squared, digits = digits), ",\tAdjusted R-squared: ", formatC(x$adj.r.squared, digits = digits), fstat ) if (is.numeric(x[["proj_fstatistic"]])) { cat( "\nMultiple R-squared (proj. model): ", formatC(x$proj_r.squared, digits = digits), ",\tAdjusted R-squared (proj. model): ", formatC(x$proj_adj.r.squared, digits = digits), "\nF-statistic (proj. model):", formatC(x$proj_fstatistic[1L], digits = digits), "on", x$proj_fstatistic[2L], "and", x$proj_fstatistic[3L], "DF, p-value:", format.pval(pf( x$proj_fstatistic[1L], x$proj_fstatistic[2L], x$proj_fstatistic[3L], lower.tail = FALSE ), digits = digits) ) } cat("\n") if (is.numeric(x[["diagnostic_endogeneity_test"]])) { cat("\nDiagnostics:\n") printCoefmat( build_ivreg_diagnostics_mat(x), cs.ind = 1L:2L, tst.ind = 3L, has.Pvalue = TRUE, P.values = TRUE, digits = digits, signif.stars = signif.stars, na.print = "NA", ... ) } invisible(x) } #' @export print.summary.lm_robust <- function(x, digits = max(3L, getOption("digits") - 3L), signif.stars = getOption("show.signif.stars"), ...) { print_summary_lm_like(x, digits, ...) } #' @export print.summary.iv_robust <- function(x, digits = max(3L, getOption("digits") - 3L), signif.stars = getOption("show.signif.stars"), ...) { print_summary_lm_like(x, digits, signif.stars, ...) } #' @export print.difference_in_means <- function(x, ...) { cat("Design: ", x$design, "\n") print(summarize_tidy(x)) } #' @export print.horvitz_thompson <- function(x, ...) { print(summarize_tidy(x)) } #' @export print.lh <- function(x, ...) { print(summary(x)) } #' @export print.lh_robust <- function(x, ...) { lnames <- names(x) for (i in seq_along(x)) { cat("$", lnames[i], "\n", sep = "") print(x[[i]]) cat("\n") } invisible(x) } #' @export print.summary.lh_robust <- function(x, digits = max(3L, getOption("digits") - 3L), ...){ lnames <- names(x) for (i in seq_along(x)) { cat("$", lnames[i], "\n", sep = "") print(summary(x[[i]]), digits = digits) cat("\n") } } #' @export print.summary.lh <- function(x, digits = max(3L, getOption("digits") - 3L), ...){ class(x) <- NULL print(x, digits = digits) } estimatr/R/S3_emmeans.R0000644000176200001440000000304614747205231014430 0ustar liggesusers### Support for emmeans package # # Note: the recover_data and emm_basis methods are registered dynamically # (see zzz.R). So these functions are not exported #' @importFrom utils getS3method recover_data.lm_robust <- function(object, ...) { data <- getS3method("recover_data", "lm")(object, ...) if (object$rank < object$k) # rank-deficient. Need to pass dataset to emm_basis attr(data, "pass.it.on") <- TRUE data } emm_basis.lm_robust <- function(object, trms, xlev, grid, ...) { # coef() works right for lm but coef.aov tosses out NAs bhat <- coef(object) n.mult <- ifelse(is.matrix(bhat), ncol(bhat), 1) # columns in mult response m <- suppressWarnings(model.frame(trms, grid, na.action = na.pass, xlev = xlev)) X <- model.matrix(trms, m, contrasts.arg = object$contrasts) V <- emmeans::.my.vcov(object, ...) if (!anyNA(bhat)) nbasis <- estimability::all.estble else { desmat <- model.matrix(trms, data = attr(object, "data")) nbasis <- estimability::nonest.basis(desmat) } misc <- list() if (n.mult > 1) { # multivariate case. Need to expand some matrices eye <- diag(n.mult) X <- kronecker(eye, X) nbasis <- kronecker(eye, nbasis) if(is.null(colnames(bhat))) colnames(bhat) <- seq_len(n.mult) misc$ylevs <- list(rep.meas = colnames(bhat)) bhat <- as.numeric(bhat) # stretch coefs into a vector } dfargs <- list(df = object$df.residual) dffun <- function(k, dfargs) dfargs$df list(X = X, bhat = bhat, nbasis = nbasis, V = V, dffun = dffun, dfargs = dfargs, misc = misc) } estimatr/R/S3_summary.R0000644000176200001440000000653314747205231014504 0ustar liggesusers#' @export summary.lm_robust <- function(object, ...) { if (is.matrix(coef(object))) { ny <- ncol(coef(object)) ret <- setNames( vector("list", ny), paste("Response", object$outcome) ) mat_objs <- c( "coefficients", "std.error", "statistic", "df", "conf.low", "conf.high", "p.value" ) vec_objs <- c( "outcome", "r.squared", "adj.r.squared", "res_var" ) all_models <- object for (i in seq(ny)) { for (nm in names(object)) { if (nm %in% mat_objs) { object[[nm]] <- all_models[[nm]][, i, drop = TRUE] } else if (nm %in% vec_objs) { object[[nm]] <- all_models[[nm]][i] } else if (nm == "fstatistic") { object[[nm]] <- all_models[[nm]][c(i, ny + 1:2)] } } object$call$formula[[2L]] <- object$terms[[2L]] <- as.name(all_models$outcome[i]) ret[[i]] <- summary(object, ...) } class(ret) <- "listof" } else { ret <- summary_lm_model(object) } ret } #' @export summary.iv_robust <- function(object, ...) { summary_lm_model(object) } #' @export summary.lh_robust <- function(object,...){ class(object) <- "summary.lh_robust" object } #' @export summary.lh <- function(object,...){ summary_lh_object <- summarize_tidy(simplify_lh_outcome(object)) class(summary_lh_object) <- "summary.lh" summary_lh_object } summary_lm_model <- function(object) { out_values <- c( "call", "k", "rank", "df.residual", "res_var", "weighted", "se_type", "fes", "r.squared", "adj.r.squared", "fstatistic" ) # Different returns if fixed effects in the output if (object[["fes"]]) { out_values <- c( out_values, "proj_r.squared", "proj_adj.r.squared", "proj_fstatistic" ) } # Different returns if fixed effects in the output if (is.numeric(object[["diagnostic_endogeneity_test"]])) { out_values <- c( out_values, "diagnostic_first_stage_fstatistic", "diagnostic_endogeneity_test", "diagnostic_overid_test" ) } return_list <- object[out_values] # Split into two lists if multivariate linear model return_list[["coefficients"]] <- summarize_tidy(object) return_list[["nobs"]] <- nobs(object) class(return_list) <- "summary.lm_robust" return(return_list) } #' @export summary.difference_in_means <- function(object, ...) { return(list( coefficients = summarize_tidy(object), design = object$design )) } #' @export summary.horvitz_thompson <- function(object, ...) { return(list(coefficients = summarize_tidy(object, "z"))) } summarize_tidy <- function(object, test = "t", ...) { remove_cols <- c("term", "outcome") # Ugly so that summary(fit)$coefficients matches lm() tidy_out <- tidy(object, ...) colnames(tidy_out)[2:8] <- c( "Estimate", "Std. Error", paste0(test, " value"), paste0("Pr(>|", test, "|)"), "CI Lower", "CI Upper", "DF" ) tidy_mat <- as.matrix(tidy_out[, !(names(tidy_out) %in% remove_cols)]) ny <- length(object$outcome) p <- length(object$term) if (ny > 1) { rownames(tidy_mat) <- paste0( rep(object$outcome, each = p), ":", rep(object$term, times = ny) ) } else { rownames(tidy_mat) <- object$term } return(tidy_mat) } estimatr/R/estimatr_iv_robust.R0000644000176200001440000005263014747205231016365 0ustar liggesusers#' Two-Stage Least Squares Instrumental Variables Regression #' #' @description This formula estimates an instrumental variables regression #' using two-stage least squares with a variety of options for robust #' standard errors #' #' @param formula an object of class formula of the regression and the instruments. #' For example, the formula \code{y ~ x1 + x2 | z1 + z2} specifies \code{x1} and \code{x2} #' as endogenous regressors and \code{z1} and \code{z2} as their respective instruments. #' @param data A \code{data.frame} #' @param weights the bare (unquoted) names of the weights variable in the #' supplied data. #' @param subset An optional bare (unquoted) expression specifying a subset #' of observations to be used. #' @param clusters An optional bare (unquoted) name of the variable that #' corresponds to the clusters in the data. #' @param fixed_effects An optional right-sided formula containing the fixed #' effects that will be projected out of the data, such as \code{~ blockID}. Do not #' pass multiple-fixed effects with intersecting groups. Speed gains are greatest for #' variables with large numbers of groups and when using "HC1" or "stata" standard errors. #' See 'Details'. #' @param se_type The sort of standard error sought. If \code{clusters} is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or #' "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' @param diagnostics logical. Whether to compute and return instrumental variable diagnostic statistics and tests. #' @param return_vcov logical. Whether to return the variance-covariance #' matrix for later usage, TRUE by default. #' @param try_cholesky logical. Whether to try using a Cholesky #' decomposition to solve least squares instead of a QR decomposition, #' FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only #' be used if users are sure their model is full-rank (i.e., there is no #' perfect multi-collinearity) #' #' @details #' #' This function performs two-stage least squares estimation to fit #' instrumental variables regression. The syntax is similar to that in #' \code{ivreg} from the \code{AER} package. Regressors and instruments #' should be specified in a two-part formula, such as #' \code{y ~ x1 + x2 | z1 + z2 + z3}, where \code{x1} and \code{x2} are #' regressors and \code{z1}, \code{z2}, and \code{z3} are instruments. Unlike #' \code{ivreg}, you must explicitly specify all exogenous regressors on #' both sides of the bar. #' #' The default variance estimators are the same as in \code{\link{lm_robust}}. #' Without clusters, we default to \code{HC2} standard errors, and with clusters #' we default to \code{CR2} standard errors. 2SLS variance estimates are #' computed using the same estimators as in \code{\link{lm_robust}}, however the #' design matrix used are the second-stage regressors, which includes the estimated #' endogenous regressors, and the residuals used are the difference #' between the outcome and a fit produced by the second-stage coefficients and the #' first-stage (endogenous) regressors. More notes on this can be found at #' \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{the mathematical appendix}. #' #' If \code{fixed_effects} are specified, both the outcome, regressors, and instruments #' are centered using the method of alternating projections (Halperin 1962; Gaure 2013). Specifying #' fixed effects in this way will result in large speed gains with standard error #' estimators that do not need to invert the matrix of fixed effects. This means using #' "classical", "HC0", "HC1", "CR0", or "stata" standard errors will be faster than other #' standard error estimators. Be wary when specifying fixed effects that may result #' in perfect fits for some observations or if there are intersecting groups across #' multiple fixed effect variables (e.g. if you specify both "year" and "country" fixed effects #' with an unbalanced panel where one year you only have data for one country). #' #' If \code{diagnostics} are requested, we compute and return three sets of diagnostics. #' First, we return tests for weak instruments using first-stage F-statistics (\code{diagnostic_first_stage_fstatistic}). Specifically, #' the F-statistics reported compare the model regressing each endogeneous variable on both the #' included exogenous variables and the instruments to a model where each endogenous variable is #' regressed only on the included exogenous variables (without the instruments). A significant F-test #' for weak instruments provides evidence against the null hypothesis that the instruments are weak. #' Second, we return tests for the endogeneity of the endogenous variables, often called the Wu-Hausman #' test (\code{diagnostic_endogeneity_test}). We implement the regression test from Hausman (1978), which allows for robust variance estimation. #' A significant endogeneity test provides evidence against the null that all the variables are exogenous. #' Third, we return a test for the correlation between the instruments and the error term (\code{diagnostic_overid_test}). We implement #' the Wooldridge (1995) robust score test, which is identical to Sargan's (1958) test with classical #' standard errors. This test is only reported if the model is overidentified (i.e. the number of #' instruments is greater than the number of endogenous regressors), and if no weights are specified. #' #' @return An object of class \code{"iv_robust"}. #' #' The post-estimation commands functions \code{summary} and \code{\link{tidy}} #' return results in a \code{data.frame}. To get useful data out of the return, #' you can use these data frames, you can use the resulting list directly, or #' you can use the generic accessor functions \code{coef}, \code{vcov}, #' \code{confint}, and \code{predict}. #' #' An object of class \code{"iv_robust"} is a list containing at least the #' following components: #' \item{coefficients}{the estimated coefficients} #' \item{std.error}{the estimated standard errors} #' \item{statistic}{the t-statistic} #' \item{df}{the estimated degrees of freedom} #' \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} #' \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} #' \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} #' \item{term}{a character vector of coefficient names} #' \item{alpha}{the significance level specified by the user} #' \item{se_type}{the standard error type specified by the user} #' \item{res_var}{the residual variance} #' \item{nobs}{the number of observations used} #' \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} #' \item{rank}{the rank of the fitted model} #' \item{vcov}{the fitted variance covariance matrix} #' \item{r.squared}{the \eqn{R^2} of the second stage regression} #' \item{adj.r.squared}{the \eqn{R^2} of the second stage regression, but penalized for having more parameters, \code{rank}} #' \item{fstatistic}{a vector with the value of the second stage F-statistic with the numerator and denominator degrees of freedom} #' \item{firststage_fstatistic}{a vector with the value of the first stage F-statistic with the numerator and denominator degrees of freedom, useful for a test for weak instruments} #' \item{weighted}{whether or not weights were applied} #' \item{call}{the original function call} #' \item{fitted.values}{the matrix of predicted means} #' We also return \code{terms} with the second stage terms and \code{terms_regressors} with the first stage terms, both of which used by \code{predict}. If \code{fixed_effects} are specified, then we return \code{proj_fstatistic}, \code{proj_r.squared}, and \code{proj_adj.r.squared}, which are model fit statistics that are computed on the projected model (after demeaning the fixed effects). #' #' We also return various diagnostics when \code{`diagnostics` == TRUE}. These are stored in \code{diagnostic_first_stage_fstatistic}, \code{diagnostic_endogeneity_test}, and \code{diagnostic_overid_test}. They have the test statistic, relevant degrees of freedom, and p.value in a named vector. See 'Details' for more. These are printed in a formatted table when the model object is passed to \code{summary()}. #' #' @references #' #' Gaure, Simon. 2013. "OLS with multiple high dimensional category variables." Computational Statistics & Data Analysis 66: 8-1. \doi{10.1016/j.csda.2013.03.024} #' #' Halperin, I. 1962. "The product of projection operators." Acta Scientiarum Mathematicarum (Szeged) 23(1-2): 96-99. #' #' @examples #' library(fabricatr) #' dat <- fabricate( #' N = 40, #' Y = rpois(N, lambda = 4), #' Z = rbinom(N, 1, prob = 0.4), #' D = Z * rbinom(N, 1, prob = 0.8), #' X = rnorm(N), #' G = sample(letters[1:4], N, replace = TRUE) #' ) #' #' # Instrument for treatment `D` with encouragement `Z` #' tidy(iv_robust(Y ~ D + X | Z + X, data = dat)) #' #' # Instrument with Stata's `ivregress 2sls , small rob` HC1 variance #' tidy(iv_robust(Y ~ D | Z, data = dat, se_type = "stata")) #' #' # With clusters, we use CR2 errors by default #' dat$cl <- rep(letters[1:5], length.out = nrow(dat)) #' tidy(iv_robust(Y ~ D | Z, data = dat, clusters = cl)) #' #' # Again, easy to replicate Stata (again with `small` correction in Stata) #' tidy(iv_robust(Y ~ D | Z, data = dat, clusters = cl, se_type = "stata")) #' #' # We can also specify fixed effects, that will be taken as exogenous regressors #' # Speed gains with fixed effects are greatests with "stata" or "HC1" std.errors #' tidy(iv_robust(Y ~ D | Z, data = dat, fixed_effects = ~ G, se_type = "HC1")) #' #' @export iv_robust <- function(formula, data, weights, subset, clusters, fixed_effects, se_type = NULL, ci = TRUE, alpha = .05, diagnostics = FALSE, return_vcov = TRUE, try_cholesky = FALSE) { datargs <- enquos( formula = formula, weights = weights, subset = subset, cluster = clusters, fixed_effects = fixed_effects ) data <- enquo(data) model_data <- clean_model_data(data = data, datargs, estimator = "iv") if (ncol(model_data$instrument_matrix) < ncol(model_data$design_matrix)) { warning("More regressors than instruments") } fes <- is.character(model_data[["fixed_effects"]]) if (fes) { if (diagnostics) { warning("Will not return `diagnostics` if `fixed_effects` are used.") diagnostics <- FALSE } yoriginal <- model_data[["outcome"]] Xoriginal <- model_data[["design_matrix"]] model_data <- demean_fes(model_data) } else { yoriginal <- NULL Xoriginal <- NULL } # ----------- # First stage # ----------- has_int <- attr(model_data$terms, "intercept") first_stage <- lm_robust_fit( y = model_data$design_matrix, X = model_data$instrument_matrix, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, ci = FALSE, se_type = "none", has_int = has_int, alpha = alpha, return_fit = TRUE, return_vcov = FALSE, try_cholesky = try_cholesky, iv_stage = list(1) ) # ------ # Second stage # ------ colnames(first_stage$fitted.values) <- colnames(model_data$design_matrix) if (!is.null(model_data$fixed_effects)) { attr(model_data$fixed_effects, "fe_rank") <- sum(model_data[["fe_levels"]]) + 1 } second_stage <- lm_robust_fit( y = model_data$outcome, X = first_stage$fitted.values, yoriginal = yoriginal, Xoriginal = Xoriginal, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, ci = ci, se_type = se_type, has_int = attr(model_data$terms, "intercept"), alpha = alpha, return_vcov = return_vcov, try_cholesky = try_cholesky, iv_stage = list(2, model_data$design_matrix) ) return_list <- lm_return( second_stage, model_data = model_data, formula = model_data$formula ) se_type <- return_list[["se_type"]] # ------ # diagnostics # ------ if (diagnostics) { # values for use by multiple diagnostic tests instruments <- setdiff( colnames(model_data$instrument_matrix), colnames(model_data$design_matrix) ) endog <- setdiff( colnames(model_data$design_matrix), colnames(model_data$instrument_matrix) ) first_stage_fits <- first_stage[["fitted.values"]][, endog, drop = FALSE] colnames(first_stage_fits) <- paste0("fit_", colnames(first_stage_fits)) first_stage_residuals <- model_data$design_matrix - first_stage[["fitted.values"]] colnames(first_stage_residuals) <- paste0("resid_", colnames(first_stage_residuals)) # Wu-Hausman f-test for endogeneity wu_hausman_ftest_val <- wu_hausman_reg_ftest(model_data, first_stage_residuals, se_type) # Overidentification test (only computed if n(instruments) > n(endog regressors) extra_instruments <- length(instruments) - length(endog) if (extra_instruments && is.null(model_data$weights)) { ss_residuals <- model_data$outcome - second_stage[["fitted.values"]] if (se_type == "classical") { overid_chisq_val <- sargan_chisq(model_data, ss_residuals) } else { overid_chisq_val <- wooldridge_score_chisq( model_data = model_data, endog = endog, instruments = instruments, ss_residuals = ss_residuals, first_stage_fits = first_stage_fits, m = extra_instruments ) } overid_chisqtest_val <- c( overid_chisq_val, extra_instruments, pchisq(overid_chisq_val, extra_instruments, lower.tail = FALSE) ) } else { overid_chisqtest_val <- c(NA_real_, 0, NA_real_) } names(overid_chisqtest_val) <- c("value", "df", "p.value") # Weak instrument test (first stage f-test) first_stage_ftest_val <- first_stage_ftest(model_data, endog, instruments, se_type) return_list[["diagnostic_first_stage_fstatistic"]] <- first_stage_ftest_val return_list[["diagnostic_endogeneity_test"]] <- wu_hausman_ftest_val return_list[["diagnostic_overid_test"]] <- overid_chisqtest_val } return_list[["call"]] <- match.call() return_list[["terms_regressors"]] <- model_data[["terms_regressors"]] return_list[["formula"]] <- formula(formula) class(return_list) <- "iv_robust" return(return_list) } # IV diagnostic test functions # helper to get denominator degress of freedom get_dendf <- function(lm_fit) { if (is.numeric(lm_fit[["nclusters"]])) { lm_fit[["nclusters"]] - 1 } else { lm_fit[["df.residual"]] } } # Weak first-stage ftest first_stage_ftest <- function(model_data, endog, instruments, se_type) { lm_instruments <- lm_robust_fit( y = model_data$design_matrix[, endog, drop = FALSE], X = model_data$instrument_matrix, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = se_type, has_int = 0 %in% attr(model_data$instrument_matrix, "assign"), return_fit = TRUE, return_vcov = TRUE, ci = FALSE ) coef_inst <- as.matrix(lm_instruments[["coefficients"]]) if (all(colnames(model_data$instrument_matrix) %in% instruments)) { # if all instruments (including intercept!) are only instruments firststage_nomdf <- lm_instruments[["rank"]] firststage_fstat_value <- lm_instruments[["fstatistic"]][seq_len(length(endog))] } else { lm_noinstruments <- lm_robust_fit( y = model_data$design_matrix[, endog, drop = FALSE], X = model_data$instrument_matrix[ , !(colnames(model_data$instrument_matrix) %in% instruments), drop = FALSE ], weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = "none", has_int = FALSE, ci = FALSE, return_fit = TRUE, return_vcov = FALSE ) coef_noinst <- as.matrix(lm_noinstruments[["coefficients"]]) inst_indices <- which(!(rownames(coef_inst) %in% rownames(coef_noinst))) firststage_nomdf <- lm_instruments[["rank"]] - lm_noinstruments[["rank"]] firststage_fstat_value <- compute_fstat( coef_matrix = coef_inst, coef_indices = inst_indices, vcov_fit = lm_instruments[["vcov"]], rank = lm_instruments[["rank"]], nomdf = firststage_nomdf ) } fstat_names <- if (ncol(coef_inst) > 1) { paste0(colnames(coef_inst), ":value") } else { "value" } dendf <- get_dendf(lm_instruments) c( setNames(firststage_fstat_value, fstat_names), nomdf = firststage_nomdf, dendf = dendf, setNames( vapply( firststage_fstat_value, function(x) { pf(x, firststage_nomdf, dendf, lower.tail = FALSE) }, numeric(1) ), gsub("value", "p.value", fstat_names) ) ) } # Wu-Hausman f-test for endogeneity wu_hausman_reg_ftest <- function(model_data, first_stage_residuals, se_type) { lm_noresids <- lm_robust_fit( y = model_data$outcome, X = model_data$design_matrix, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = "none", has_int = 0 %in% attr(model_data$design_matrix, "assign"), ci = FALSE, return_fit = TRUE, return_vcov = FALSE ) lm_resids <- lm_robust_fit( y = model_data$outcome, X = cbind(model_data$design_matrix, first_stage_residuals), weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = se_type, has_int = 0 %in% attr(model_data$design_matrix, "assign"), ci = FALSE, return_fit = TRUE, return_vcov = TRUE ) coef_noresids <- na.omit(lm_noresids[["coefficients"]]) coef_resids <- na.omit(lm_resids[["coefficients"]]) ovar <- which(!(names(coef_resids) %in% names(coef_noresids))) wu_hausman_nomdf <- lm_resids[["rank"]] - lm_noresids[["rank"]] wu_hausman_fstat <- compute_fstat( coef_matrix = as.matrix(coef_resids), coef_indices = ovar, vcov_fit = lm_resids[["vcov"]], rank = lm_resids[["rank"]], nomdf = wu_hausman_nomdf ) dendf <- get_dendf(lm_resids) c( "value" = wu_hausman_fstat, "numdf" = wu_hausman_nomdf, "dendf" = dendf, "p.value" = pf(wu_hausman_fstat, wu_hausman_nomdf, dendf, lower.tail = FALSE) ) } # Overidentification tests # Sargan (classical ses) sargan_chisq <- function(model_data, ss_residuals) { lmr <- lm_robust_fit( y = as.matrix(ss_residuals), X = model_data$instrument_matrix, weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = "classical", has_int = 0 %in% attr(model_data, "assign"), ci = FALSE, return_fit = FALSE, return_vcov = FALSE ) lmr[["r.squared"]] * lmr[["nobs"]] } # wooldridge score test (robust SEs) wooldridge_score_chisq <- function(model_data, endog, instruments, ss_residuals, first_stage_fits, m) { # Using notation following stata ivregress postestimation help qhat_fit <- lm_robust_fit( y = model_data$instrument_matrix[, instruments[seq_len(m)], drop = FALSE], X = cbind( model_data$design_matrix[, -which(colnames(model_data$design_matrix) %in% endog), drop = FALSE], first_stage_fits ), weights = model_data$weights, cluster = model_data$cluster, fixed_effects = model_data$fixed_effects, se_type = "none", has_int = TRUE, ci = FALSE, return_fit = TRUE, return_vcov = FALSE ) kmat <- as.matrix( model_data$instrument_matrix[, instruments[seq_len(m)], drop = FALSE] - qhat_fit[["fitted.values"]] ) * as.vector(ss_residuals) if (!is.null(model_data[["weights"]])) { kmat_fit <- lm.fit(kmat * model_data[["weights"]], as.matrix(rep(1, times = length(ss_residuals)))) } else { kmat_fit <- lm.fit(kmat, as.matrix(rep(1, times = length(ss_residuals)))) } return(length(ss_residuals) - sum(residuals(kmat_fit))) } build_ivreg_diagnostics_mat <- function(iv_robust_out, stata = FALSE) { weakinst <- iv_robust_out[["diagnostic_first_stage_fstatistic"]] wu_hausman <- iv_robust_out[["diagnostic_endogeneity_test"]] overid <- iv_robust_out[["diagnostic_overid_test"]] n_weak_inst_fstats <- (length(weakinst) - 2) / 2 diag_mat <- rbind( matrix( c( weakinst[seq_len(n_weak_inst_fstats)], rep(weakinst["nomdf"], n_weak_inst_fstats), rep(weakinst["dendf"], n_weak_inst_fstats), weakinst[n_weak_inst_fstats + 2 + seq_len(n_weak_inst_fstats)] ), nrow = n_weak_inst_fstats ), wu_hausman, c(overid[1], if (stata & overid[2] == 0) NA else overid[2], NA, overid[3]) )[, c(2, 3, 1, 4)] weak_names <- "Weak instruments" if (n_weak_inst_fstats > 1) { weak_names <- paste0( weak_names, " (", gsub("\\:*value", "", names(weakinst[seq_len(n_weak_inst_fstats)])), ")" ) } rownames(diag_mat) <- c( weak_names, "Wu-Hausman", "Overidentifying" ) diag_mat } estimatr/R/helper_na_omit_detailed.R0000644000176200001440000000065214747205231017256 0ustar liggesusers#' Extra logging on na.omit handler #' #' @param object a data.frame #' #' @return a normal \code{omit} object, with the extra attribute \code{why_omit}, #' which contains the leftmost column containing an NA for each row that was dropped, by #' column name, if any were dropped. #' #' @seealso \code{\link{na.omit}} na.omit_detailed.data.frame <- function(object){ naomitwhy(object, function(x, w) x[w, , drop=FALSE]) } estimatr/R/zzz.R0000644000176200001440000000203414747205231013267 0ustar liggesusers# Some of this is code modified from # https://github.com/atahk/bucky/blob/master/R/zzz.R (GPL 3.0) .onLoad <- function(libname, pkgname) { if (suppressWarnings(requireNamespace("texreg", quietly = TRUE))) { setGeneric("extract", function(model, ...) standardGeneric("extract"), package = "texreg" ) setMethod("extract", signature = className("lm_robust", pkgname), definition = extract.lm_robust ) setMethod("extract", signature = className("iv_robust", pkgname), definition = extract.iv_robust ) } if(requireNamespace("emmeans", quietly = TRUE)) { emmeans::.emm_register("lm_robust", pkgname) } invisible() } #' @importFrom utils packageVersion .onAttach <- function(libname, pkgname) { if (isNamespaceLoaded("broom") && packageVersion("broom") <= "0.5.0") { packageStartupMessage( "Warning: the `broom` package version 0.5.0 or earlier is loaded.\n", "Please upgrade `broom` or `tidy` methods may not work as expected." ) } invisible() } estimatr/R/helper_condition_pr_matrix.R0000644000176200001440000004224414747205231020053 0ustar liggesusersobtain <- function(ra_declaration, condition) { if (requireNamespace("randomizr", quietly = TRUE)) { randomizr::obtain_condition_probabilities(ra_declaration, condition) } else { ra_declaration$probability_matrix[, paste0("prob_", condition)] } } #' Builds condition probability matrices for Horvitz-Thompson estimation from #' \pkg{randomizr} declaration #' #' @param ra_declaration An object of class \code{"ra_declaration"}, generated #' by the \code{\link[randomizr]{declare_ra}} function in \pkg{randomizr}. This #' object contains the experimental design that will be represented in a #' condition probability matrix #' @param condition1 The name of the first condition, often the control group. If \code{NULL}, #' defaults to first condition in randomizr declaration. Either both \code{condition1} #' and \code{condition2} have to be specified or both left as \code{NULL}. #' @param condition2 The name of the second condition, often the treatment group. If \code{NULL}, #' defaults to second condition in randomizr declaration. Either both \code{condition1} #' and \code{condition2} have to be specified or both left as \code{NULL}. #' @param prob_matrix An optional probability matrix to override the one in #' \code{ra_declaration} #' #' @details This function takes a \code{"ra_declaration"}, generated #' by the \code{\link[randomizr]{declare_ra}} function in \pkg{randomizr} and #' returns a 2n*2n matrix that can be used to fully specify the design for #' \code{\link{horvitz_thompson}} estimation. This is done by passing this #' matrix to the \code{condition_pr_mat} argument of #' \code{\link{horvitz_thompson}}. #' #' Currently, this function can learn the condition probability matrix for a #' wide variety of randomizations: simple, complete, simple clustered, complete #' clustered, blocked, block-clustered. #' #' A condition probability matrix is made up of four submatrices, each of which #' corresponds to the #' joint and marginal probability that each observation is in one of the two #' treatment conditions. #' #' The upper-left quadrant is an n*n matrix. On the diagonal is the marginal #' probability of being in condition 1, often control, for every unit #' (Pr(Z_i = Condition1) where Z represents the vector of treatment conditions). #' The off-diagonal elements are the joint probabilities of each unit being in #' condition 1 with each other unit, Pr(Z_i = Condition1, Z_j = Condition1) #' where i indexes the rows and j indexes the columns. #' #' The upper-right quadrant is also an n*n matrix. On the diagonal is the joint #' probability of a unit being in condition 1 and condition 2, often the #' treatment, and thus is always 0. The off-diagonal elements are the joint #' probability of unit i being in condition 1 and unit j being in condition 2, #' Pr(Z_i = Condition1, Z_j = Condition2). #' #' The lower-left quadrant is also an n*n matrix. On the diagonal is the joint #' probability of a unit being in condition 1 and condition 2, and thus is #' always 0. The off-diagonal elements are the joint probability of unit i #' being in condition 2 and unit j being in condition 1, #' Pr(Z_i = Condition2, Z_j = Condition1). #' #' The lower-right quadrant is an n*n matrix. On the diagonal is the marginal #' probability of being in condition 2, often treatment, for every unit #' (Pr(Z_i = Condition2)). The off-diagonal elements are the joint probability #' of each unit being in condition 2 together, #' Pr(Z_i = Condition2, Z_j = Condition2). #' #' @return a numeric 2n*2n matrix of marginal and joint condition treatment #' probabilities to be passed to the \code{condition_pr_mat} argument of #' \code{\link{horvitz_thompson}}. See details. #' #' @seealso \code{\link{permutations_to_condition_pr_mat}} #' #' @examples #' #' # Learn condition probability matrix from complete blocked design #' library(randomizr) #' n <- 100 #' dat <- data.frame( #' blocks = sample(letters[1:10], size = n, replace = TRUE), #' y = rnorm(n) #' ) #' #' # Declare complete blocked randomization #' bl_declaration <- declare_ra(blocks = dat$blocks, prob = 0.4, simple = FALSE) #' # Get probabilities #' block_pr_mat <- declaration_to_condition_pr_mat(bl_declaration, 0, 1) #' # Do randomiztion #' dat$z <- conduct_ra(bl_declaration) #' #' horvitz_thompson(y ~ z, data = dat, condition_pr_mat = block_pr_mat) #' #' # When you pass a declaration to horvitz_thompson, this function is called #' #' # Equivalent to above call #' horvitz_thompson(y ~ z, data = dat, ra_declaration = bl_declaration) #' #' @export declaration_to_condition_pr_mat <- function(ra_declaration, condition1 = NULL, condition2 = NULL, prob_matrix = NULL) { if (!(inherits(ra_declaration, "ra_declaration"))) { stop("`ra_declaration` must be an object of class 'ra_declaration'") } if (!is.numeric(prob_matrix)) { prob_matrix <- ra_declaration$probabilities_matrix } if (ncol(prob_matrix) > 2) { stop( "`ra_declaration` must have only two arms when passed directly to ", "declaration_to_condition_pr_mat()" ) } if (is.null(condition1) && is.null(condition2)) { condition1 <- ra_declaration$conditions[1] condition2 <- ra_declaration$conditions[2] } else if (is.null(condition1) && !is.null(condition2)) { stop( "Cannot have `condition1 == NULL` and `condition2 != NULL`" ) } else if (!is.null(condition1) && is.null(condition2)) { stop( "Cannot have `condition2 == NULL` and `condition1 != NULL`" ) } p1 <- obtain( ra_declaration, condition1 ) p2 <- obtain( ra_declaration, condition2 ) n <- nrow(prob_matrix) if (inherits(ra_declaration, "ra_simple")) { v <- c(p1, p2) condition_pr_matrix <- tcrossprod(v) diag(condition_pr_matrix) <- v condition_pr_matrix[cbind(n + 1:n, 1:n)] <- 0 condition_pr_matrix[cbind(1:n, n + 1:n)] <- 0 } else if (inherits(ra_declaration, "ra_complete")) { if (length(unique(p2)) > 1) { stop( "Treatment probabilities must be fixed for complete randomized designs" ) } condition_pr_matrix <- gen_pr_matrix_complete( pr = p2[1], n_total = n ) } else if (inherits(ra_declaration, "ra_clustered")) { condition_pr_matrix <- gen_pr_matrix_cluster( clusters = ra_declaration$clusters, treat_probs = p2, simple = ra_declaration$simple ) } else if (inherits(ra_declaration, "ra_blocked")) { condition_pr_matrix <- gen_pr_matrix_block( blocks = ra_declaration$blocks, clusters = NULL, p1 = p1, p2 = p2 ) } else if (inherits(ra_declaration, "ra_blocked_and_clustered")) { condition_pr_matrix <- gen_pr_matrix_block( blocks = ra_declaration$blocks, clusters = ra_declaration$clusters, p1 = p1, p2 = p2 ) } else if (inherits(ra_declaration, "ra_custom")) { # Use permutation matrix return(permutations_to_condition_pr_mat(ra_declaration$permutation_matrix)) } # Add names colnames(condition_pr_matrix) <- rownames(condition_pr_matrix) <- c(paste0(condition1, "_", 1:n), paste0(condition2, "_", 1:n)) return(condition_pr_matrix) } #' Builds condition probability matrices for Horvitz-Thompson estimation from #' permutation matrix #' #' @param permutations A matrix where the rows are units and the columns are #' different treatment permutations; treated units must be represented with a #' 1 and control units with a 0 #' #' @details This function takes a matrix of permutations, for example from #' the \code{\link[randomizr]{obtain_permutation_matrix}} function in #' \pkg{randomizr} or through simulation and returns a 2n*2n matrix that can #' be used to fully specify the design for \code{\link{horvitz_thompson}} #' estimation. You can read more about these matrices in the documentation for #' the \code{\link{declaration_to_condition_pr_mat}} function. #' #' This is done by passing this matrix to the \code{condition_pr_mat} argument #' of #' #' @seealso \code{\link[randomizr]{declare_ra}}, #' \code{\link{declaration_to_condition_pr_mat}} #' #' @return a numeric 2n*2n matrix of marginal and joint condition treatment #' probabilities to be passed to the \code{condition_pr_mat} argument of #' \code{\link{horvitz_thompson}}. #' #' @examples #' #' # Complete randomization #' perms <- replicate(1000, sample(rep(0:1, each = 50))) #' comp_pr_mat <- permutations_to_condition_pr_mat(perms) #' #' # Arbitrary randomization #' possible_treats <- cbind( #' c(1, 1, 0, 1, 0, 0, 0, 1, 1, 0), #' c(0, 1, 1, 0, 1, 1, 0, 1, 0, 1), #' c(1, 0, 1, 1, 1, 1, 1, 0, 0, 0) #' ) #' arb_pr_mat <- permutations_to_condition_pr_mat(possible_treats) #' # Simulating a column to be realized treatment #' z <- possible_treats[, sample(ncol(possible_treats), size = 1)] #' y <- rnorm(nrow(possible_treats)) #' horvitz_thompson(y ~ z, condition_pr_mat = arb_pr_mat) #' #' @export permutations_to_condition_pr_mat <- function(permutations) { N <- nrow(permutations) if (!all(permutations %in% c(0, 1))) { stop("Matrix of `permutations` must be comprised of only 0s and 1s") } condition_pr_matrix <- tcrossprod(rbind(1 - permutations, permutations)) / ncol(permutations) colnames(condition_pr_matrix) <- rownames(condition_pr_matrix) <- c(paste0("0_", 1:N), paste0("1_", 1:N)) return(condition_pr_matrix) } #' Generate condition probability matrix given clusters and probabilities #' #' @param clusters A vector of clusters #' @param treat_probs A vector of treatment (condition 2) probabilities #' @param simple A boolean for whether the assignment is a random sample #' assignment (TRUE, default) or complete random assignment (FALSE) #' #' @return a numeric 2n*2n matrix of marginal and joint condition treatment #' probabilities to be passed to the \code{condition_pr_mat} argument of #' \code{\link{horvitz_thompson}}. #' #' @seealso \code{\link{declaration_to_condition_pr_mat}} #' #' @export gen_pr_matrix_cluster <- function(clusters, treat_probs, simple) { n <- length(clusters) cluster_lists <- split(1:n, clusters, drop = TRUE) n_clust <- length(cluster_lists) unique_first_in_cl <- !duplicated(clusters) cluster_marginal_probs <- treat_probs[unique_first_in_cl] # Container mats # Get cluster condition_pr_matrices # Complete random sampling if (is.null(simple) || !simple) { if (length(unique(cluster_marginal_probs)) > 1) { stop( "Treatment probabilities cannot vary within blocks for ", "block-clustered randomized designs and cannot vary within the whole ", "sample for complete cluster randomized designs" ) } prs <- gen_joint_pr_complete(cluster_marginal_probs[1], n_clust) # This definitely could be optimized mat_00 <- matrix(prs[["00"]], n, n) mat_10 <- matrix(prs[["10"]], n, n) mat_11 <- matrix(prs[["11"]], n, n) for (i in 1:n_clust) { mat_11[cluster_lists[[i]], cluster_lists[[i]]] <- cluster_marginal_probs[i] mat_00[cluster_lists[[i]], cluster_lists[[i]]] <- 1 - cluster_marginal_probs[i] mat_10[cluster_lists[[i]], cluster_lists[[i]]] <- 0 } condition_pr_matrix <- rbind( cbind(mat_00, mat_10), cbind(mat_10, mat_11) ) } else if (simple) { # cluster, simple randomized # container mats mat_00 <- mat_01 <- mat_10 <- mat_11 <- matrix(NA, nrow = n, ncol = n) for (i in seq_along(cluster_lists)) { for (j in seq_along(cluster_lists)) { if (i == j) { mat_11[cluster_lists[[i]], cluster_lists[[j]]] <- cluster_marginal_probs[i] mat_00[cluster_lists[[i]], cluster_lists[[j]]] <- 1 - cluster_marginal_probs[i] mat_01[cluster_lists[[i]], cluster_lists[[j]]] <- 0 mat_10[cluster_lists[[i]], cluster_lists[[j]]] <- 0 } else { mat_11[cluster_lists[[i]], cluster_lists[[j]]] <- cluster_marginal_probs[i] * cluster_marginal_probs[j] mat_00[cluster_lists[[i]], cluster_lists[[j]]] <- (1 - cluster_marginal_probs[i]) * (1 - cluster_marginal_probs[j]) mat_01[cluster_lists[[i]], cluster_lists[[j]]] <- (1 - cluster_marginal_probs[i]) * cluster_marginal_probs[j] mat_10[cluster_lists[[i]], cluster_lists[[j]]] <- cluster_marginal_probs[i] * (1 - cluster_marginal_probs[j]) } } } condition_pr_matrix <- rbind( cbind(mat_00, mat_01), cbind(mat_10, mat_11) ) } return(condition_pr_matrix) } gen_pr_matrix_block <- function(blocks, clusters, p2 = NULL, p1 = NULL, t = NULL, condition2 = NULL) { n <- length(blocks) # Assume complete randomization condition_pr_matrix <- matrix(NA, nrow = 2 * n, ncol = 2 * n) # Split by block and get complete randomized values within each block id_dat <- data.frame(ids = 1:n, stringsAsFactors = FALSE) if (!is.null(p2)) { id_dat$p2 <- p2 } if (!is.null(p1)) { id_dat$p1 <- p1 } if (!is.null(t)) { id_dat$t <- t } if (is.null(t) && is.null(p2) && is.null(p1)) { stop("Must specify one of `t`, `p2`, or `p1`") } clustered <- !is.null(clusters) if (clustered) { id_dat$clusters <- clusters } block_dat <- split( id_dat, blocks, drop = TRUE ) n_blocks <- length(block_dat) for (i in seq_along(block_dat)) { ids <- c(block_dat[[i]]$ids, n + block_dat[[i]]$ids) if (clustered) { if (is.null(block_dat[[i]]$p2)) { # learn prs cluster_treats <- get_cluster_treats(block_dat[[i]], condition2) block_dat[[i]]$p2 <- mean(cluster_treats$treat_clust) } if (is.null(block_dat[[i]]$p1)) { block_dat[[i]]$p1 <- 1 - block_dat[[i]]$p2 } # Has to be complete randomization of clusters condition_pr_matrix[ids, ids] <- gen_pr_matrix_cluster( clusters = block_dat[[i]]$clusters, treat_probs = block_dat[[i]]$p2, simple = FALSE ) } else { if (length(unique(block_dat[[i]]$p2)) > 1) { stop( "Treatment probabilities must be fixed within blocks for block ", "randomized designs" ) } if (is.null(block_dat[[i]]$p2)) { # learn prs block_dat[[i]]$p2 <- mean(block_dat[[i]]$t) } if (is.null(block_dat[[i]]$p1)) { block_dat[[i]]$p1 <- 1 - block_dat[[i]]$p2 } condition_pr_matrix[ids, ids] <- gen_pr_matrix_complete( pr = block_dat[[i]]$p2[1], n_total = length(block_dat[[i]]$p2) ) } } for (i in seq_along(block_dat)) { ids <- c(block_dat[[i]]$ids, n + block_dat[[i]]$ids) for (j in seq_along(block_dat)) { if (i != j) { condition_pr_matrix[ ids, c(block_dat[[j]]$ids, n + block_dat[[j]]$ids) ] <- tcrossprod( c(block_dat[[i]]$p1, block_dat[[i]]$p2), c(block_dat[[j]]$p1, block_dat[[j]]$p2) ) } } } return(condition_pr_matrix) } gen_pr_matrix_complete <- function(pr, n_total) { prs <- gen_joint_pr_complete(pr, n_total) pr00_mat <- matrix(prs[["00"]], nrow = n_total, ncol = n_total) diag(pr00_mat) <- 1 - pr pr10_mat <- matrix(prs[["10"]], nrow = n_total, ncol = n_total) diag(pr10_mat) <- 0 pr11_mat <- matrix(prs[["11"]], nrow = n_total, ncol = n_total) diag(pr11_mat) <- pr pr_mat <- cbind( rbind(pr00_mat, pr10_mat), rbind(pr10_mat, pr11_mat) ) return(pr_mat) } gen_joint_pr_complete <- function(pr, n_total) { n_treated <- pr * n_total remainder <- n_treated %% 1 n_treated_floor <- floor(n_treated) n_control <- n_total - n_treated_floor prs <- list() prs[["11"]] <- remainder * # pr(M) ((n_treated_floor + 1) / n_total) * # pr(j = 1 | M) (n_treated_floor / (n_total - 1)) + # pr(i = 1 | j = 1, M) (1 - remainder) * # pr(M') (n_treated_floor / n_total) * # pr(j = 1 | M') ((n_treated_floor - 1) / (n_total - 1)) # pr(i = 1 | j = 1, M') prs[["10"]] <- remainder * # pr(M) ((n_control - 1) / n_total) * # pr(j = 0 | M) ((n_treated_floor + 1) / (n_total - 1)) + # pr(i = 1 | j = 0, M) (1 - remainder) * # pr(M') (n_control / n_total) * # pr(j = 0 | M') (n_treated_floor / (n_total - 1)) # pr(i = 1 | j = 0, M') prs[["00"]] <- remainder * # pr(M) ((n_control - 1) / n_total) * # pr(j = 0 | M) ((n_control - 2) / (n_total - 1)) + # pr(i = 0 | j = 0, M) (1 - remainder) * # pr(M') (n_control / n_total) * # pr(j = 0 | M') ((n_control - 1) / (n_total - 1)) # pr(i = 0 | j = 0, M') return(prs) } get_cluster_treats <- function(data, condition2) { cluster_dat <- split( data$t, data$clusters, drop = TRUE ) n_clust <- length(cluster_dat) treat_clust <- numeric(n_clust) for (i in seq_along(cluster_dat)) { if (length(unique(cluster_dat[[i]])) > 1) { stop("Treatment condition must be constant within `clusters`") } treat_clust[i] <- as.numeric(cluster_dat[[i]][1] == condition2) } return(list(n_clust = n_clust, treat_clust = treat_clust)) } estimatr/R/helper_parse_arguments.R0000644000176200001440000000463514747205231017201 0ustar liggesusers# This function parses condition names for HT and DiM estimators parse_conditions <- function(treatment, condition1, condition2, estimator) { if (is.factor(treatment)) { condition_names <- levels(droplevels(treatment)) } else { condition_names <- sort(unique(treatment)) } if (any(!(c(condition1, condition2) %in% condition_names))) { stop("`condition1` and `condition2` must be values found in the treatment") } n_conditions <- length(condition_names) conditions <- list(NULL, NULL) if (n_conditions > 2) { if (is.null(condition1) || is.null(condition2)) { stop( "Treatment has > 2 values; must specify both `condition1` and ", "`condition2` or use a treatment with only 2 values" ) } else { conditions[1:2] <- c(condition1, condition2) } } else if (n_conditions == 2) { if (is.null(condition1) && is.null(condition2)) { conditions[1:2] <- condition_names } else if (!is.null(condition2) && is.null(condition1)) { conditions[1:2] <- c(setdiff(condition_names, condition2), condition2) } else if (!is.null(condition1) && is.null(condition2)) { conditions[1:2] <- c(condition1, setdiff(condition_names, condition1)) } else { conditions[1:2] <- c(condition1, condition2) } } else if (n_conditions == 1) { # Allowable for HT estimator if (estimator != "horvitz_thompson") { stop( "Must have more than one value in treatment unless using Horvitz-", "Thompson estimator" ) } if (is.null(condition1) && is.null(condition2)) { conditions[2] <- condition_names } else if (!is.null(condition2)) { conditions[2] <- condition2 } else if (!is.null(condition1)) { conditions[1] <- condition1 } } return(conditions) } # This function ensures that blocks and clusters have been specified correctly check_clusters_blocks <- function(data) { if (!is.null(data$cluster)) { one_block_per_clust <- tapply(data$block, data$cluster, function(x) all(x == x[1])) # Check that clusters nest within blocks if (any(!one_block_per_clust)) { stop("All `clusters` must be contained within `blocks`") } # get number of clusters per block clust_per_block <- tapply( data$cluster, data$block, function(x) length(unique(x)) ) } else { clust_per_block <- tabulate(as.factor(data$block)) } return(clust_per_block) } estimatr/R/S3_predict.R0000644000176200001440000002323414760370122014433 0ustar liggesusers#' Predict method for \code{lm_robust} object #' #' @param object an object of class 'lm_robust' #' @param newdata a data frame in which to look for variables with which to predict #' @param se.fit logical. Whether standard errors are required, default = FALSE #' @param interval type of interval calculation. Can be abbreviated, default = none #' @param alpha numeric denoting the test size for confidence intervals #' @param na.action function determining what should be done with missing #' values in newdata. The default is to predict NA. #' @param pred.var the variance(s) for future observations to be assumed for #' prediction intervals. #' @param weights variance weights for prediction. This can be a numeric #' vector or a bare (unquoted) name of the weights variable in the supplied #' newdata. #' @param ... other arguments, unused #' #' @details Produces predicted values, obtained by evaluating the regression #' function in the frame \code{newdata} for fits from \code{lm_robust} and #' \code{lm_lin}. If the logical se.fit is TRUE, standard errors of the #' predictions are calculated. Setting intervals specifies computation of #' confidence or prediction (tolerance) intervals at the specified level, #' sometimes referred to as narrow vs. wide intervals. #' #' The equation used for the standard error of a prediction given a row of #' data \eqn{x} is: #' #' \eqn{\sqrt(x \Sigma x')}, #' #' where \eqn{\Sigma} is the estimated variance-covariance matrix from #' \code{lm_robust}. #' #' The prediction intervals are for a single observation at each case in #' \code{newdata} with error variance(s) \code{pred.var}. The the default is to assume #' that future observations have the same error variance as those used for #' fitting, which is gotten from the fit \code{\link{lm_robust}} object. If #' weights is supplied, the inverse of this is used as a scale factor. If the #' fit was weighted, the default is to assume constant prediction variance, #' with a warning. #' #' @examples #' #' # Set seed #' set.seed(42) #' #' # Simulate data #' n <- 10 #' dat <- data.frame(y = rnorm(n), x = rnorm(n)) #' #' # Fit lm #' lm_out <- lm_robust(y ~ x, data = dat) #' # Get predicted fits #' fits <- predict(lm_out, newdata = dat) #' # With standard errors and confidence intervals #' fits <- predict(lm_out, newdata = dat, se.fit = TRUE, interval = "confidence") #' #' # Use new data as well #' new_dat <- data.frame(x = runif(n, 5, 8)) #' predict(lm_out, newdata = new_dat) #' #' # You can also supply custom variance weights for prediction intervals #' new_dat$w <- runif(n) #' predict(lm_out, newdata = new_dat, weights = w, interval = "prediction") #' #' # Works for 'lm_lin' models as well #' dat$z <- sample(1:3, size = nrow(dat), replace = TRUE) #' lmlin_out1 <- lm_lin(y ~ z, covariates = ~ x, data = dat) #' predict(lmlin_out1, newdata = dat, interval = "prediction") #' #' # Predictions from Lin models are equivalent with and without an intercept #' # and for multi-level treatments entered as numeric or factor variables #' lmlin_out2 <- lm_lin(y ~ z - 1, covariates = ~ x, data = dat) #' lmlin_out3 <- lm_lin(y ~ factor(z), covariates = ~ x, data = dat) #' lmlin_out4 <- lm_lin(y ~ factor(z) - 1, covariates = ~ x, data = dat) #' #' predict(lmlin_out2, newdata = dat, interval = "prediction") #' predict(lmlin_out3, newdata = dat, interval = "prediction") #' predict(lmlin_out4, newdata = dat, interval = "prediction") #' #' # In Lin models, predict will stop with an error message if new #' # treatment levels are supplied in the new data #' new_dat$z <- sample(0:3, size = nrow(new_dat), replace = TRUE) #' # predict(lmlin_out, newdata = new_dat) #' #' #' @export predict.lm_robust <- function(object, newdata, se.fit = FALSE, interval = c("none", "confidence", "prediction"), alpha = 0.05, na.action = na.pass, pred.var = NULL, weights, ...) { X <- get_X(object, newdata, na.action) # Get coefs coefs <- as.matrix(coef(object)) # Get prediction beta_na <- is.na(coefs[, 1]) # Get predicted values preds <- X[, !beta_na, drop = FALSE] %*% coefs[!beta_na, ] if (object[["fes"]]) { preds <- add_fes(preds, object, newdata) } predictor <- drop(preds) df_resid <- object$df.residual interval <- match.arg(interval) if (se.fit || interval != "none") { if (ncol(coefs) > 1) { stop("Can't set `se.fit` == TRUE with multivariate outcome") } if (object[["fes"]]) { stop("Can't set `se.fit` == TRUE with `fixed_effects`") } ret <- list() var_fit <- apply( X[, !beta_na, drop = FALSE], 1, function(x) tcrossprod(crossprod(x, object$vcov), x) ) if (interval != "none") { tval <- qt(alpha / 2, df_resid, lower.tail = FALSE) if (interval == "prediction") { # Get weights if (missing(weights)) { if (object$weighted && is.null(pred.var)) { warning("Assuming constant prediction variance even though model fit is weighted\\n") } weights <- 1 } else { weights <- eval(substitute(weights), newdata) } if (is.null(pred.var)) { pred.var <- object$res_var / weights } hwid <- tval * sqrt(var_fit + pred.var) } else if (interval == "confidence") { hwid <- tval * sqrt(var_fit) } predictor <- matrix( c( predictor, predictor - hwid, predictor + hwid ), ncol = 3, dimnames = list(NULL, c("fit", "lwr", "upr")) ) } ret[["fit"]] <- predictor if (se.fit) { ret[["se.fit"]] <- sqrt(var_fit) } return(ret) } else { return(predictor) } } #' @export predict.iv_robust <- function(object, newdata, na.action = na.pass, ...) { X <- get_X(object, newdata, na.action) coefs <- as.matrix(coef(object)) beta_na <- is.na(coefs[, 1]) # Get predicted values preds <- X[, !beta_na, drop = FALSE] %*% coefs[!beta_na, ] if (object[["fes"]]) { preds <- add_fes(preds, object, newdata) } return(drop(preds)) } get_X <- function(object, newdata, na.action) { # Get model matrix if (is.null(object[["terms_regressors"]])) { rhs_terms <- delete.response(object[["terms"]]) } else { rhs_terms <- delete.response(object[["terms_regressors"]]) } mf <- model.frame( rhs_terms, newdata, na.action = na.action, xlev = object[["xlevels"]] ) # Check class of columns in newdata match those in model fit if (!is.null(cl <- attr(rhs_terms, "dataClasses"))) .checkMFClasses(cl, mf) if (object[["fes"]]) { attr(rhs_terms, "intercept") <- 0 } X <- model.matrix(rhs_terms, mf, contrasts.arg = object$contrasts) # lm_lin scaling (moved down from predict.lm_robust) if (!is.null(object$scaled_center)) { # Covariates demeaned_covars <- scale( X[ , names(object$scaled_center), drop = FALSE ], center = object$scaled_center, scale = FALSE ) # Handle treatment variable reconstruction treat_name <- attr(object$terms, "term.labels")[1] treatment <- mf[, treat_name] vals <- sort(unique(treatment)) old_vals <- object$treatment_levels # Ensure treatment levels in newdata are subset of those for model fit if (!all(as.character(vals) %in% as.character(old_vals))) { stop( "Levels of treatment variable in `newdata` must be a subset of those ", "in the model fit." ) } treatment <- model.matrix(~ factor(treatment, levels = old_vals) - 1) colnames(treatment) <- paste0(treat_name, "_", old_vals) # Drop out first group if there is an intercept if (attr(rhs_terms, "intercept") == 1) treatment <- treatment[, -1, drop = FALSE] # Interactions matching original fitting logic n_treat_cols <- ncol(treatment) n_covars <- ncol(demeaned_covars) interaction_matrix <- matrix(0, nrow = nrow(X), ncol = n_covars * n_treat_cols) for (i in 1:n_covars) { cols <- (i - 1) * n_treat_cols + (1:n_treat_cols) interaction_matrix[, cols] <- treatment * demeaned_covars[, i] } X <- cbind( if (attr(rhs_terms, "intercept") == 1) { matrix(1, nrow = nrow(X), ncol = 1, dimnames = list(NULL, "(Intercept)")) }, treatment, if (attr(rhs_terms, "intercept") == 1 || ncol(treatment) == 1) demeaned_covars, interaction_matrix ) } return(X) } add_fes <- function(preds, object, newdata) { # Add factors! args <- as.list(object[["call"]]) if (length(all.vars(rlang::f_rhs(args[["fixed_effects"]]))) > 1) { stop( "Can't use `predict.lm_robust` with more than one set of ", "`fixed_effects`. Can recover fits in `fitted.values` in the model ", "object." ) } # cast as factor femat <- model.matrix( ~ 0 + ., data = as.data.frame( lapply( stats::model.frame.default( args[["fixed_effects"]], data = newdata, na.action = NULL ), FUN = as.factor ) ) ) keep_facs <- intersect(names(object[["fixed_effects"]]), colnames(femat)) extra_facs <- setdiff(colnames(femat), names(object[["fixed_effects"]])) if (length(extra_facs)) { stop( "Can't have new levels in `newdata` `fixed_effects` variable, such as: ", paste0(extra_facs, collapse = ", ") ) } preds <- preds + femat[, keep_facs] %*% object[["fixed_effects"]][keep_facs] return(preds) } estimatr/R/S3_nobs.R0000644000176200001440000000060614747205231013743 0ustar liggesusers#' @export nobs.lm_robust <- function(object, ...) object$nobs #' @export nobs.lh_robust <- function(object, ...) object$nobs #' @export nobs.iv_robust <- function(object, ...) object$nobs #' @export nobs.summary.lm_robust <- nobs.lm_robust #' @export nobs.horvitz_thompson <- function(object, ...) object$nobs #' @export nobs.difference_in_means <- function(object, ...) object$nobs estimatr/R/S3_update.R0000644000176200001440000000127014747205231014262 0ustar liggesusers#' @importFrom Formula Formula #' @importFrom stats getCall #' @export update.iv_robust <- function(object, formula., ..., evaluate = TRUE) { if (is.null(call <- getCall(object))) stop("need an object with call component") extras <- match.call(expand.dots = FALSE)$... if (!missing(formula.)) call$formula <- formula(update(Formula(formula(object)), formula.)) if (length(extras)) { existing <- !is.na(match(names(extras), names(call))) for (a in names(extras)[existing]) call[[a]] <- extras[[a]] if (any(!existing)) { call <- c(as.list(call), extras[!existing]) call <- as.call(call) } } if (evaluate) eval(call, parent.frame()) else call } estimatr/R/helper_cis_pvals.R0000644000176200001440000000267214747205231015764 0ustar liggesusers# Internal method takes the results and adds p-values and confidence intervals add_cis_pvals <- function(return_frame, alpha, ci, ttest = TRUE) { if (ci) { if (alpha <= 0 || alpha >= 1) { stop("`alpha` must be numeric between 0 and 1") } return_frame$statistic <- with(return_frame, coefficients / std.error) if (ttest) { if (any(return_frame$df <= 0, na.rm = TRUE)) { warning( "Some degrees of freedom have been estimated as negative or zero\n", "p-values and confidence intervals may not be calculated" ) return_frame$df <- ifelse(return_frame$df <= 0, NA, return_frame$df) } return_frame$p.value <- with( return_frame, 2 * pt(abs(statistic), df = df, lower.tail = FALSE) ) crit_se <- with(return_frame, qt(1 - alpha / 2, df = df) * std.error) } else { return_frame$p.value <- with( return_frame, 2 * pnorm(abs(statistic), lower.tail = FALSE) ) crit_se <- with(return_frame, qnorm(1 - alpha / 2) * std.error) return_frame$df <- NA } return_frame$conf.low <- with(return_frame, coefficients - crit_se) return_frame$conf.high <- with(return_frame, coefficients + crit_se) return(as.list(return_frame)) } else { return_frame$p.value <- NA return_frame$statistic <- NA return_frame$conf.low <- NA return_frame$conf.high <- NA return(as.list(return_frame)) } } estimatr/R/estimatr_lh_robust.R0000644000176200001440000001234214760370352016350 0ustar liggesusers#' Linear Hypothesis for Ordinary Least Squares with Robust Standard Errors #' #' @description This function fits a linear model with robust standard errors and performs linear hypothesis test. #' @param ... Other arguments to be passed to \code{\link{lm_robust}} #' @param data A \code{data.frame} #' @param linear_hypothesis A length 1 character string or a matrix specifying combination, to be passed to the hypothesis.matrix argument of car::linearHypothesis. Joint hypotheses are currently not handled by lh_robust. #' See \code{\link[car]{linearHypothesis}} for more details. #' @details #' #' This function is a wrapper for \code{\link{lm_robust}} and for #' \code{\link[car]{linearHypothesis}}. It first runs \code{lm_robust} and #' next passes \code{"lm_robust"} object as an argument to \code{linearHypothesis}. #' Currently CR2 standard errors are not handled by lh_robust. #' #' @return An object of class \code{"lh_robust"} containing the two following components: #' #' \item{lm_robust}{an object as returned by \code{lm_robust}.} #' \item{lh}{A data frame with most of its columns pulled from \code{linearHypothesis}' output.} #' #' The only analysis directly performed by \code{lh_robust} is a \code{t-test} for the null hypothesis of no effects of the linear combination of coefficients as specified by the user. #' All other output components are either extracted from \code{linearHypothesis} or \code{lm_robust}. #' Note that the estimate returned is the value of the LHS of an equation of the form f(X) = 0. Hyptheses "x - z = 1", "x +1= z + 2" and "x-z-1=0" will all return the value for "x-y-1" #' #' The original output returned by \code{linearHypothesis} is added as an attribute under the \code{"linear_hypothesis"} attribute. #' #' @examples #' #' library(fabricatr) #' dat <- fabricate( #' N = 40, #' y = rpois(N, lambda = 4), #' x = rnorm(N), #' z = rbinom(N, 1, prob = 0.4), #' clusterID = sample(1:4, 40, replace = TRUE) #' ) #' #' # Default variance estimator is HC2 robust standard errors #' lhro <- lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") #' #' # The linear hypothesis argument can be specified equivalently as: #' lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z = 2x") #' lh_robust(y ~ x + z, data = dat, linear_hypothesis = "2*x +1*z") #' lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") #' #' # Also recovers other sorts of standard erorrs just as specified in \code{\link{lm_robust}} #' lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0", se_type = "classical") #' lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0", se_type = "HC1") #' #' # Can tidy() main output and subcomponents in to a data.frame #' lhro <- lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") #' tidy(lhro ) #' tidy(lhro$lm_robust) #' tidy(lhro$lh) #' #' # Can use summary() to get more statistics on the main output and subcomponents. #' summary(lhro) #' summary(lhro$lm_robust) #' summary(lhro$lh) #' #' @importFrom rlang quos eval_tidy #' #' @export #' lh_robust <- function(..., data, linear_hypothesis) { requireNamespace("car") # fit lm_robust model lm_robust_fit <- lm_robust(..., data = data) alpha <- lm_robust_fit$alpha # Checks # This stop could also be limited to hypotheses involving more than one coefficient if(!is.null(lm_robust_fit$se_type) && lm_robust_fit$se_type == "CR2") { stop("lh_robust not available for CR2 standard errors") } if(lm_robust_fit$clustered && is.null(lm_robust_fit$se_type)) { stop("lh_robust not available for CR2 standard errors; please specify CR0") } if(length(linear_hypothesis) > 1) { stop("lh_robust currently implements tests for hypotheses involving linear combinations of variables but not joint hypotheses (for instance X1 = X2, but not X1 = 0 and X2=0") } # calculate linear hypothesis car_lht <- car::linearHypothesis( lm_robust_fit, hypothesis.matrix = linear_hypothesis, level = 1 - alpha) estimate <- drop(attr(car_lht, "value")) std.error <- sqrt(diag(attr(car_lht, "vcov"))) if(length(estimate) > 1) { stop("lh_robust currently implements tests for hypotheses involving linear combinations of variables but not joint hypotheses (for instance X1 = X2, but not X1 = 0 and X2=0") } df <- lm_robust_fit$df[1] # appropriate when all elements of df are identical if(length(lm_robust_fit$df) >0 && var(lm_robust_fit$df > 0)) { warning("lh_robust inference may be inaccurate if degrees of freedom vary across coefficients") } statistic <- estimate / std.error p.value <- 2 * pt(abs(statistic), df, lower.tail = FALSE) ci <- estimate + std.error %o% qt(c(alpha / 2, 1 - alpha / 2), df) return_lh_robust <- data.frame( coefficients = estimate, std.error = std.error, statistic = statistic, p.value = p.value, alpha = alpha, conf.low = ci[, 1], conf.high = ci[, 2], df = df, term = linear_hypothesis, outcome = lm_robust_fit$outcome ) attr(return_lh_robust, "linear_hypothesis") <- car_lht class(return_lh_robust) <- c("lh", "data.frame") return_lm_robust <- lm_robust_fit return_lm_robust[["call"]] <- match.call() return(structure( list(lm_robust = return_lm_robust, lh = return_lh_robust), class = "lh_robust" )) } estimatr/R/S3_confint.R0000644000176200001440000000366014747205231014445 0ustar liggesusersconfint_lm_like <- function(object, parm = NULL, level = NULL, ...) { cis <- get_ci_mat(object, level) if (!is.null(parm)) { cis <- cis[parm, , drop = FALSE] } return(cis) } #' @export confint.lm_robust <- confint_lm_like #' @export confint.iv_robust <- confint_lm_like #' @importFrom stats confint #' @export confint.lh_robust <- function(object, parm = NULL, level = NULL, ...) { rbind(confint(object$lm_robust, parm = parm, level = level, ...), tidy(object$lh, ...)) } #' @export confint.lh <- confint_lm_like #' @export confint.difference_in_means <- function(object, parm = NULL, level = NULL, ...) { cis <- get_ci_mat(object, level) return(cis) } #' @export confint.horvitz_thompson <- function(object, parm = NULL, level = NULL, ...) { cis <- get_ci_mat(object, level, ttest = FALSE) return(cis) } ## internal method that builds confidence intervals and labels the matrix to be returned get_ci_mat <- function(object, level, ttest = TRUE) { if (!is.null(level)) { if (!is.null(object[["alpha"]])) { object[["alpha"]] <- NULL } object <- add_cis_pvals(object, alpha = 1 - level, ci = TRUE, ttest = ttest) } else { level <- 1 - object$alpha } cis <- cbind( as.vector(object$conf.low), as.vector(object$conf.high) ) if (is.matrix(object$conf.low)) { ny <- ncol(object$conf.low) p <- nrow(object$conf.low) rownames(cis) <- paste0( rep(object$outcome, each = p), ":", rep(object$term, times = ny) ) } else { rownames(cis) <- object$term } colnames(cis) <- paste((1 - level) / 2 * c(100, -100) + c(0, 100), "%") return(cis) } estimatr/R/S3_vcov.R0000644000176200001440000000157114747205231013761 0ustar liggesusers#' @export vcov.lm_robust <- function(object, complete = TRUE, ...) { vcov_simple(object, complete = complete) } #' @export vcov.iv_robust <- vcov.lm_robust #' @export vcov.difference_in_means <- function(object, ...) { return(object$vcov) } #' @export vcov.horvitz_thompson <- vcov.difference_in_means # Helper function for extracting vcov when it is just an element in the object list vcov_simple <- function(object, complete) { if (is.null(object$vcov)) { stop( "Object must have vcov matrix. Try setting `return_vcov = TRUE` in ", "the estimator function." ) } if (complete && (object$rank < object$k)) { vc <- matrix(NA_real_, object$k, object$k, dimnames = list(object$term, object$term)) j <- which(!is.na(coef(object, complete = TRUE))) vc[j, j] <- object$vcov return(vc) } else { return(object$vcov) } } estimatr/R/helper_starprep.R0000644000176200001440000002024114747205231015631 0ustar liggesusers#' Build lm_robust object from lm fit #' #' @param model an lm model object #' @param se_type The sort of standard error sought. If \code{clusters} is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the #' options are "CR0", "CR2" (default), or "stata". Can also specify "none", #' which may speed up estimation of the coefficients. #' @param clusters A vector corresponding to the clusters in the data. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' #' @return an \code{\link{lm_robust}} object. #' #' @examples #' lmo <- lm(mpg ~ hp, data = mtcars) #' #' # Default HC2 #' commarobust(lmo) #' #' commarobust(lmo, se_type = "HC3") #' #' commarobust(lmo, se_type = "stata", clusters = mtcars$carb) #' #' @export commarobust <- function(model, se_type = NULL, clusters = NULL, ci = TRUE, alpha = 0.05) { if (class(model)[1] != "lm") { stop("`model` must be an lm object") } coefs <- as.matrix(coef(model)) est_exists <- !is.na(coefs) covs_used <- which(est_exists) coefs <- coefs[covs_used, , drop = FALSE] Qr <- qr(model) p1 <- seq_len(model$rank) XtX_inv <- chol2inv(Qr$qr[p1, p1, drop = FALSE]) clustered <- !is.null(clusters) se_type <- check_se_type(se_type = se_type, clustered = clustered) X <- model.matrix.default(model) contrasts <- attr(X, "contrasts") N <- nrow(X) data <- list( y = as.matrix(model.response(model$model)), X = X, weights = weights(model) ) weighted <- is.numeric(data[["weights"]]) ## check clusters and add to data if (clustered) { if (is.matrix(clusters) && ncol(clusters) > 1) { stop("`clusters` must be a single vector or column denoting the clusters.") } if (length(clusters) != N) { stop("`clusters` must be the same length as the model data.") } data[["cluster"]] <- as.factor(clusters) } data <- prep_data( data = data, se_type = se_type, clustered = clustered, weighted = weighted, fes = FALSE, iv_stage = list(0) ) ei <- as.matrix(resid(model)) if (clustered) { ei <- ei[data[["cl_ord"]], , drop = FALSE] } if (any(!est_exists)) { data <- drop_collinear(data, covs_used, weighted, FALSE) } if (weighted) { ei <- data[["weights"]] * ei XtX_inv <- data[["weight_mean"]] * XtX_inv # Need unweighted resid and need to reweight X if (se_type == "CR2") { eiunweighted <- as.matrix(data[["yunweighted"]] - data[["Xunweighted"]] %*% coefs) data[["X"]] <- data[["weights"]] * data[["X"]] } } vcov_fit <- lm_variance( X = data[["X"]], Xunweighted = data[["Xunweighted"]], XtX_inv = XtX_inv, ei = if (se_type == "CR2" && weighted) eiunweighted else ei, weight_mean = data[["weight_mean"]], cluster = data[["cluster"]], J = data[["J"]], ci = ci, se_type = se_type, which_covs = rep(TRUE, model$rank), fe_rank = 0 ) ## Build return_list return_list <- list( coefficients = as.matrix(coef(model)), std.error = NA, df = NA, term = names(coef(model)), outcome = as.character(rlang::f_lhs(formula(model))), alpha = alpha, se_type = se_type, df.residual = df.residual(model), weighted = weighted, fes = FALSE, clustered = clustered, nobs = nobs(model), rank = model$rank, k = ncol(X), fitted.values = fitted.values(model), contrasts = contrasts, terms = model$terms, xlevels = model$xlevels, weights = weights(model) ) return_list[["std.error"]][est_exists] <- sqrt(diag(vcov_fit$Vcov_hat)) return_list[["df"]][est_exists] <- ifelse(vcov_fit$dof == -99, NA, vcov_fit$dof) if (clustered) { return_list[["nclusters"]] <- data[["J"]] } return_list[["res_var"]] <- get_resvar( data = data, ei = ei, df.residual = return_list[["df.residual"]], vcov_fit = vcov_fit, weighted = weighted ) return_list <- add_cis_pvals(return_list, alpha, ci && se_type != "none") ## Add F stat tss_r2s <- get_r2s( y = data[["y"]], return_list = return_list, yunweighted = data[["yunweighted"]], has_int = attr(model$terms, "intercept"), weights = data[["weights"]], weight_mean = data[["weight_mean"]] ) if (clustered) { dendf <- data[["J"]] - 1 } else { dendf <- return_list[["df.residual"]] } f <- get_fstat( tss_r2s = tss_r2s, return_list = return_list, iv_ei = NULL, nomdf = model$rank - attr(model$terms, "intercept"), dendf = dendf, vcov_fit = vcov_fit, has_int = attr(model$terms, "intercept"), iv_stage = list(0) ) return_list <- c(return_list, tss_r2s) return_list[["fstatistic"]] <- f return_list[["vcov"]] <- vcov_fit$Vcov_hat dimnames(return_list[["vcov"]]) <- list( return_list$term[est_exists], return_list$term[est_exists] ) return_list <- lm_return(return_list, model_data = NULL, formula = NULL) attr(return_list, "class") <- "lm_robust" return(return_list) } #' Prepare model fits for stargazer #' #' @param ... a list of lm_robust or lm objects #' @param stat either "std.error" (the default), "statistic" (the t-statistic), "p.value", "ci", or "df" #' @param se_type (optional) if any of the objects are lm objects, what standard #' errors should be used. Must only be one type and will be used for all lm #' objects passed to starprep. See \code{commarobust} for more. #' @param clusters (optional) if any of the objects are lm objects, what clusters #' should be used, if clusters should be used. Must only be one vector and will #' be used for all lm objects passed to starprep. See \code{commarobust} for more. #' @param alpha (optional) if any of the objects are lm objects, what significance level #' should be used for the p-values or confidence intervals #' #' @details Used to help extract statistics from lists of model fits for stargazer. #' Prefers lm_robust objects, but because \code{stargazer} does not work with \code{lm_robust} #' objects, \code{starprep} can also take \code{lm} objects and calls \code{commarobust} to get #' the preferred, robust statistics. #' #' @return a list of vectors of extracted statistics for stargazers #' #' @examples #' #' library(stargazer) #' #' lm1 <- lm(mpg ~ hp, data = mtcars) #' lm2 <- lm(mpg ~ hp + wt, data = mtcars) #' #' # Use default "HC2" standard errors #' stargazer(lm1, lm2, #' se = starprep(lm1, lm2), #' p = starprep(lm1, lm2, stat = "p.value"), #' omit.stat = "f") #' # NB: We remove the F-stat because stargazer only can use original F-stat #' # which uses classical SEs #' #' # Use default "CR2" standard errors with clusters #' stargazer(lm1, lm2, #' se = starprep(lm1, lm2, clusters = mtcars$carb), #' p = starprep(lm1, lm2, clusters = mtcars$carb, stat = "p.value"), #' omit.stat = "f") #' #' # Can also specify significance levels and different standard errors #' stargazer(lm1, lm2, #' ci.custom = starprep(lm1, lm2, se_type = "HC3", alpha = 0.1, stat = "ci"), #' omit.stat = "f") #' #' @export starprep <- function(..., stat = c("std.error", "statistic", "p.value", "ci", "df"), se_type = NULL, clusters = NULL, alpha = 0.05) { if (inherits(..1, "list")) { if (...length() > 1) { stop("`...` must be one list of model fits or several comma separated model fits") } fits <- ..1 } else { fits <- list(...) } is_list_of_lm <- vapply(fits, inherits, what = c("lm","lm_robust"), TRUE) if (any(!is_list_of_lm)) { stop("`...` must contain only `lm` or `lm_robust` objects.") } fitlist <- lapply( fits, function(x) { if (inherits(x, "lm")) commarobust(x, se_type = se_type, clusters = clusters, alpha = alpha) else x } ) stat <- match.arg(stat) if (stat == "ci") { out <- lapply(fitlist, function(x) cbind(x[["conf.low"]], x[["conf.high"]])) } else { out <- lapply(fitlist, `[[`, stat) } return(out) } estimatr/R/estimatr_horvitz_thompson.R0000644000176200001440000006707314747205231020014 0ustar liggesusers#' Horvitz-Thompson estimator for two-armed trials #' #' @description Horvitz-Thompson estimators that are unbiased for designs in #' which the randomization scheme is known #' #' @param formula an object of class formula, as in \code{\link{lm}}, such as #' \code{Y ~ Z} with only one variable on the right-hand side, the treatment. #' @param data A data.frame. #' @param condition_prs An optional bare (unquoted) name of the variable with #' the condition 2 (treatment) probabilities. See details. May also use a single #' number for the condition 2 probability if it is constant. #' @param blocks An optional bare (unquoted) name of the block variable. Use #' for blocked designs only. See details. #' @param clusters An optional bare (unquoted) name of the variable that #' corresponds to the clusters in the data; used for cluster randomized #' designs. For blocked designs, clusters must be within blocks. #' @param simple logical, optional. Whether the randomization is simple #' (TRUE) or complete (FALSE). This is ignored if \code{blocks} are specified, #' as all blocked designs use complete randomization, or either #' \code{ra_declaration} or \code{condition_pr_mat} are passed. Otherwise, it #' defaults to \code{TRUE}. #' @param condition_pr_mat An optional 2n * 2n matrix of marginal and joint #' probabilities of all units in condition1 and condition2. See details. #' @param ra_declaration An object of class \code{"ra_declaration"}, from #' the \code{\link[randomizr]{declare_ra}} function in the \pkg{randomizr} #' package. This is the third way that one can specify a design for this #' estimator. Cannot be used along with any of \code{condition_prs}, #' \code{blocks}, \code{clusters}, or \code{condition_pr_mat}. See details. #' @param subset An optional bare (unquoted) expression specifying a subset of #' observations to be used. #' @param se_type can be one of \code{c("youngs", "constant", "none")} and corresponds #' the estimator of the standard errors. Default estimator uses Young's #' inequality (and is conservative) while the other uses a constant treatment #' effects assumption and only works for simple randomized designs at the #' moment. If "none" then standard errors will not be computed which may speed up run time if only the point estimate is required. #' @param condition1 value in the treatment vector of the condition #' to be the control. Effects are #' estimated with \code{condition1} as the control and \code{condition2} as the #' treatment. If unspecified, \code{condition1} is the "first" condition and #' \code{condition2} is the "second" according to levels if the treatment is a #' factor or according to a sortif it is a numeric or character variable (i.e #' if unspecified and the treatment is 0s and 1s, \code{condition1} will by #' default be 0 and \code{condition2} will be 1). See the examples for more. #' @param condition2 value in the treatment vector of the condition to be the #' treatment. See \code{condition1}. #' @param ci logical. Whether to compute and return p-values and #' confidence intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' @param return_condition_pr_mat logical. Whether to return the condition #' probability matrix. Returns NULL if the design is simple randomization, #' FALSE by default. #' #' @details This function implements the Horvitz-Thompson estimator for #' treatment effects for two-armed trials. This estimator is useful for estimating unbiased #' treatment effects given any randomization scheme as long as the #' randomization scheme is known. #' #' In short, the Horvitz-Thompson estimator essentially reweights each unit #' by the probability of it being in its observed condition. Pivotal to the #' estimation of treatment effects using this estimator are the marginal #' condition probabilities (i.e., the probability that any one unit is in #' a particular treatment condition). Pivotal to estimating the variance #' whenever the design is more complicated than simple randomization are the #' joint condition probabilities (i.e., the probabilities that any two units #' have a particular set of treatment conditions, either the same or #' different). The estimator we provide here considers the case with two #' treatment conditions. #' #' Users interested in more details can see the #' \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes} #' for more information and references, or see the references below. #' #' There are three distinct ways that users can specify the design to the #' function. The preferred way is to use #' the \code{\link[randomizr]{declare_ra}} function in the \pkg{randomizr} #' package. This function takes several arguments, including blocks, clusters, #' treatment probabilities, whether randomization is simple or not, and more. #' Passing the outcome of that function, an object of class #' \code{"ra_declaration"} to the \code{ra_declaration} argument in this function, #' will lead to a call of the \code{\link{declaration_to_condition_pr_mat}} #' function which generates the condition probability matrix needed to #' estimate treatment effects and standard errors. We provide many examples #' below of how this could be done. #' #' The second way is to pass the names of vectors in your \code{data} to #' \code{condition_prs}, \code{blocks}, and \code{clusters}. You can further #' specify whether the randomization was simple or complete using the \code{simple} #' argument. Note that if \code{blocks} are specified the randomization is #' always treated as complete. From these vectors, the function learns how to #' build the condition probability matrix that is used in estimation. #' #' In the case #' where \code{condition_prs} is specified, this function assumes those #' probabilities are the marginal probability that each unit is in condition2 #' and then uses the other arguments (\code{blocks}, \code{clusters}, #' \code{simple}) to learn the rest of the design. If users do not pass #' \code{condition_prs}, this function learns the probability of being in #' condition2 from the data. That is, none of these arguments are specified, #' we assume that there was a simple randomization where the probability #' of each unit being in condition2 was the average of all units in condition2. #' Similarly, we learn the block-level probability of treatment within #' \code{blocks} by looking at the mean number of units in condition2 if #' \code{condition_prs} is not specified. #' #' The third way is to pass a \code{condition_pr_mat} directly. One can #' see more about this object in the documentation for #' \code{\link{declaration_to_condition_pr_mat}} and #' \code{\link{permutations_to_condition_pr_mat}}. Essentially, this 2n * 2n #' matrix allows users to specify marginal and joint marginal probabilities #' of units being in conditions 1 and 2 of arbitrary complexity. Users should #' only use this option if they are certain they know what they are doing. #' #' @return Returns an object of class \code{"horvitz_thompson"}. #' #' The post-estimation commands functions \code{summary} and \code{\link{tidy}} #' return results in a \code{data.frame}. To get useful data out of the return, #' you can use these data frames, you can use the resulting list directly, or #' you can use the generic accessor functions \code{coef} and #' \code{confint}. #' #' An object of class \code{"horvitz_thompson"} is a list containing at #' least the following components: #' #' \item{coefficients}{the estimated difference in totals} #' \item{std.error}{the estimated standard error} #' \item{statistic}{the z-statistic} #' \item{df}{the estimated degrees of freedom} #' \item{p.value}{the p-value from a two-sided z-test using \code{coefficients} and \code{std.error}} #' \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} #' \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} #' \item{term}{a character vector of coefficient names} #' \item{alpha}{the significance level specified by the user} #' \item{nobs}{the number of observations used} #' \item{outcome}{the name of the outcome variable} #' \item{condition_pr_mat}{the condition probability matrix if \code{return_condition_pr_mat} is TRUE} #' #' #' @seealso \code{\link[randomizr]{declare_ra}} #' #' @references #' Aronow, Peter M, and Joel A Middleton. 2013. "A Class of Unbiased Estimators of the Average Treatment Effect in Randomized Experiments." Journal of Causal Inference 1 (1): 135-54. \doi{10.1515/jci-2012-0009}. #' #' Aronow, Peter M, and Cyrus Samii. 2017. "Estimating Average Causal Effects Under Interference Between Units." Annals of Applied Statistics, forthcoming. \url{https://arxiv.org/abs/1305.6156v3}. #' #' Middleton, Joel A, and Peter M Aronow. 2015. "Unbiased Estimation of the Average Treatment Effect in Cluster-Randomized Experiments." Statistics, Politics and Policy 6 (1-2): 39-75. \doi{10.1515/spp-2013-0002}. #' #' @examples #' #' # Set seed #' set.seed(42) #' #' # Simulate data #' n <- 10 #' dat <- data.frame(y = rnorm(n)) #' #' library(randomizr) #' #' #---------- #' # 1. Simple random assignment #' #---------- #' dat$p <- 0.5 #' dat$z <- rbinom(n, size = 1, prob = dat$p) #' #' # If you only pass condition_prs, we assume simple random sampling #' horvitz_thompson(y ~ z, data = dat, condition_prs = p) #' # Assume constant effects instead #' horvitz_thompson(y ~ z, data = dat, condition_prs = p, se_type = "constant") #' #' # Also can use randomizr to pass a declaration #' srs_declaration <- declare_ra(N = nrow(dat), prob = 0.5, simple = TRUE) #' horvitz_thompson(y ~ z, data = dat, ra_declaration = srs_declaration) #' #' #---------- #' # 2. Complete random assignment #' #---------- #' #' dat$z <- sample(rep(0:1, each = n/2)) #' # Can use a declaration #' crs_declaration <- declare_ra(N = nrow(dat), prob = 0.5, simple = FALSE) #' horvitz_thompson(y ~ z, data = dat, ra_declaration = crs_declaration) #' # Can precompute condition_pr_mat and pass it #' # (faster for multiple runs with same condition probability matrix) #' crs_pr_mat <- declaration_to_condition_pr_mat(crs_declaration) #' horvitz_thompson(y ~ z, data = dat, condition_pr_mat = crs_pr_mat) #' #' #---------- #' # 3. Clustered treatment, complete random assigment #' #----------- #' # Simulating data #' dat$cl <- rep(1:4, times = c(2, 2, 3, 3)) #' dat$prob <- 0.5 #' clust_crs_decl <- declare_ra(N = nrow(dat), clusters = dat$cl, prob = 0.5) #' dat$z <- conduct_ra(clust_crs_decl) #' # Easiest to specify using declaration #' ht_cl <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_crs_decl) #' # Also can pass the condition probability and the clusters #' ht_cl_manual <- horvitz_thompson( #' y ~ z, #' data = dat, #' clusters = cl, #' condition_prs = prob, #' simple = FALSE #' ) #' ht_cl #' ht_cl_manual #' #' # Blocked estimators specified similarly #' #' #---------- #' # More complicated assignment #' #---------- #' #' # arbitrary permutation matrix #' possible_treats <- cbind( #' c(1, 1, 0, 1, 0, 0, 0, 1, 1, 0), #' c(0, 1, 1, 0, 1, 1, 0, 1, 0, 1), #' c(1, 0, 1, 1, 1, 1, 1, 0, 0, 0) #' ) #' arb_pr_mat <- permutations_to_condition_pr_mat(possible_treats) #' # Simulating a column to be realized treatment #' dat$z <- possible_treats[, sample(ncol(possible_treats), size = 1)] #' horvitz_thompson(y ~ z, data = dat, condition_pr_mat = arb_pr_mat) #' #' @export horvitz_thompson <- function(formula, data, blocks, clusters, simple = NULL, condition_prs, condition_pr_mat = NULL, ra_declaration = NULL, subset, condition1 = NULL, condition2 = NULL, se_type = c("youngs", "constant", "none"), ci = TRUE, alpha = .05, return_condition_pr_mat = FALSE) { # ----- # Check arguments # ----- if (length(all.vars(f_rhs(formula))) > 1) { stop( "'formula' must have only one variable on the right-hand side: the ", "treatment variable" ) } simple_specified <- is.logical(simple) se_type <- match.arg(se_type) # ----- # Parse arguments, clean data # ----- # User can either use declaration or the arguments, not both! if (!is.null(ra_declaration)) { if (ncol(ra_declaration$probabilities_matrix) > 2) { stop( "Cannot use horvitz_thompson() with a `ra_declaration` with more than ", "two treatment arms for now" ) } if (!missing(clusters) | !missing(condition_prs) | !missing(blocks) | !is.null(condition_pr_mat)) { stop( "Cannot use `ra_declaration` with any of `clusters`, `condition_prs`, ", "`blocks`, `condition_pr_mat`" ) } # Add clusters, blocks, and treatment probabilities to data so they # can be cleaned with clean_model_data if (!is.null(ra_declaration$clusters)) { .clusters_ddinternal <- ra_declaration$clusters clusters <- quo(.clusters_ddinternal) } if (!is.null(ra_declaration$blocks)) { .blocks_ddinternal <- ra_declaration$blocks blocks <- quo(.blocks_ddinternal) } if (!is.null(condition2)) { treatnum <- match(condition2, ra_declaration$conditions) if (is.na(treatnum)) { stop( "If `condition2` and `ra_declaration` are both specified, ", "`condition2` must match the condition_names in `ra_declaration`.", "\n`condition2`: ", condition2, "\n`condition_names`: ", paste0( ra_declaration$conditions, collapse = ", " ) ) } treatment_prob <- obtain(ra_declaration,condition2) } else { # assuming treatment is second column treatment_prob <- ra_declaration$probabilities_matrix[, 2] } .treatment_prob_ddinternal <- treatment_prob condition_prs <- quo(.treatment_prob_ddinternal) } ## Clean data datargs <- enquos( formula = formula, subset = subset, block = blocks, cluster = clusters, condition_pr = condition_prs ) data <- enquo(data) model_data <- clean_model_data(data = data, datargs, estimator = "ht") ## condition_pr_mat, if supplied, must be same length if (!is.null(condition_pr_mat) && (2 * length(model_data$outcome) != nrow(condition_pr_mat))) { stop( "After cleaning the data, it has ", length(model_data$outcome), " ", "while `condition_pr_mat` has ", nrow(condition_pr_mat), ". ", "`condition_pr_mat` should have twice the rows" ) } data <- data.frame( y = model_data$outcome, t = model_data$original_treatment, stringsAsFactors = FALSE ) # Parse conditions condition_names <- parse_conditions( treatment = data$t, condition1 = condition1, condition2 = condition2, estimator = "horvitz_thompson" ) condition2 <- condition_names[[2]] condition1 <- condition_names[[1]] data$clusters <- model_data$cluster data$blocks <- model_data$block if (!is.null(model_data$condition_pr)) { data$condition_probabilities <- model_data$condition_pr } # ---------- # Learn design # ---------- # Declaration is passed if (!is.null(ra_declaration)) { prob_matrix <- NULL # Use output from clean_model_data to rebuild declaration if (nrow(ra_declaration$probabilities_matrix) != length(data$y)) { prob_matrix <- cbind( 1 - data$condition_probabilities, data$condition_probabilities ) } # If simple, just use condition probabilities shortcut # Same if se not needed if (inherits(ra_declaration, "ra_simple") || se_type == "none") { condition_pr_mat <- NULL } else { # TODO to allow for declaration with multiple arms, get probability matrix # and build it like decl$pr_mat <- cbind(decl$pr_mat[, c(cond1, cond2)]) condition_pr_mat <- declaration_to_condition_pr_mat( ra_declaration, condition1, condition2, prob_matrix ) } } else if (is.null(condition_pr_mat)) { # need to learn it if no declaration and not passed # check simple arg simple <- ifelse(is.null(simple), TRUE, simple) if (is.null(data$blocks) && is.null(data$clusters)) { # no blocks or clusters if (simple) { # don't need condition_pr_mat, just the condition_prs # if the user passed it, we're fine and can use just the # marginal probabilities # if user didn't pass, we have to guess if (is.null(data$condition_probabilities)) { data$condition_probabilities <- mean(data$t == condition2) message( "Assuming simple random assignment with probability of treatment ", "equal to the mean number of obs in `condition2`, which is roughly ", round(data$condition_probabilities[1], 3) ) } } else { # If we don't know the prob, learn it if (is.null(data$condition_probabilities)) { data$condition_probabilities <- mean(data$t == condition2) message( "Learning probability of complete random assignment from data ", "with prob = ", round(data$condition_probabilities[1], 3) ) if (se_type != "none") { condition_pr_mat <- gen_pr_matrix_complete( pr = data$condition_probabilities[1], n_total = length(data$y) ) } } else { if (length(unique(data$condition_probabilities)) > 1) { stop( "Treatment probabilities must be fixed for complete randomized designs" ) } if (se_type != "none") { condition_pr_mat <- gen_pr_matrix_complete( pr = data$condition_probabilities[1], n_total = length(data$y) ) } } } } else if (is.null(data$blocks)) { # clustered case if (!simple_specified) { message( "Assuming ", ifelse(simple, "simple", "complete"), " cluster randomization" ) } if (is.null(data$condition_probabilities)) { # Split by cluster and get complete randomized values # within each cluster cluster_treats <- get_cluster_treats(data, condition2) data$condition_probabilities <- mean(cluster_treats$treat_clust) message( "`condition_prs` not found, estimating probability of treatment ", "to be constant at mean of clusters in `condition2` at prob =", data$condition_probabilities[1] ) if (se_type != "none") { # Some redundancy in following fn condition_pr_mat <- gen_pr_matrix_cluster( clusters = data$clusters, treat_probs = data$condition_probabilities, simple = simple ) } } else if (se_type != "none") { # Just to check if cluster has same treatment within get_cluster_treats(data, condition2) condition_pr_mat <- gen_pr_matrix_cluster( clusters = data$clusters, treat_probs = data$condition_probabilities, simple = simple ) } } else { # blocked case if (simple) { message( "Assuming complete random assignment of clusters within blocks. ", "User can use `ra_declaration` or `condition_pr_mat` to have full ", "control over the design." ) } if (is.null(data$condition_probabilities)) { message( "`condition_prs` not found, estimating probability of treatment ", "to be proportion of units or clusters in condition2 in each block" ) condition_pr_mat <- gen_pr_matrix_block( blocks = data$blocks, clusters = data$clusters, t = data$t, condition2 = condition2 ) } else { condition_pr_mat <- gen_pr_matrix_block( blocks = data$blocks, clusters = data$clusters, p2 = data$condition_probabilities ) } } } # Need the marginal condition prs for later in rare cases where not yet set if (is.null(data$condition_probabilities)) { data$condition_probabilities <- diag(condition_pr_mat)[(length(data$y) + 1):(2 * length(data$y))] } # Check some things that must be true, could do this earlier # but don't have condition_probabilities then, and unfortunatey # this loops over data clusters a second time if (!is.null(data$clusters)) { if (any(!tapply( data$condition_probabilities, data$clusters, function(x) all(x == x[1]) ))) { stop("`condition_prs` must be constant within `cluster`") } } rm(model_data) # ----- # Estimation # ----- if (is.null(data$blocks)) { return_frame <- horvitz_thompson_internal( condition_pr_mat = condition_pr_mat, condition1 = condition1, condition2 = condition2, data = data, se_type = se_type, alpha = alpha ) } else { clust_per_block <- check_clusters_blocks(data) N <- nrow(data) data$index <- 1:N block_dfs <- split(data, data$blocks) block_estimates <- lapply(block_dfs, function(x) { horvitz_thompson_internal( data = x, condition1 = condition1, condition2 = condition2, condition_pr_mat = condition_pr_mat[c(x$index, N + x$index), c(x$index, N + x$index)], se_type = se_type, alpha = alpha ) }) block_estimates <- do.call(rbind, block_estimates) N_overall <- with(block_estimates, sum(nobs)) n_blocks <- nrow(block_estimates) diff <- with(block_estimates, sum(coefficients * nobs / N_overall)) if (se_type != "none") { std.error <- with( block_estimates, sqrt(sum(std.error^2 * (nobs / N_overall)^2)) ) } else { std.error <- NA } return_frame <- data.frame( coefficients = diff, std.error = std.error, nobs = N_overall, stringsAsFactors = FALSE ) } return_frame$df <- NA return_list <- add_cis_pvals(return_frame, alpha, ci, ttest = FALSE) # ----- # Build and return output # ----- return_list <- dim_like_return( return_list, alpha = alpha, formula = formula, conditions = list(condition1, condition2) ) if (return_condition_pr_mat) { return_list[["condition_pr_mat"]] <- condition_pr_mat } return_list[["se_type"]] <- se_type attr(return_list, "class") <- "horvitz_thompson" return(return_list) } var_ht_total_no_cov <- function(y, ps) { sum((1 - ps) * ps * y^2) } horvitz_thompson_internal <- function(condition_pr_mat = NULL, condition1 = NULL, condition2 = NULL, data, pair_matched = FALSE, se_type, alpha = .05) { # TODO, add estimator from Middleton & Aronow 2015 page 51 t2 <- which(data$t == condition2) t1 <- which(data$t == condition1) N <- length(t2) + length(t1) std.error <- NA collapsed <- !is.null(data$clusters) if (collapsed) { if (se_type == "constant") { stop( "`se_type` = 'constant' only supported for simple random designs ", "at the moment" ) } if (is.factor(data$clusters)) { data$clusters <- as.numeric(data$clusters) } # used for cluster randomized designs k <- length(unique(data$clusters)) y2_totals <- tapply(data$y[t2], data$clusters[t2], sum) y1_totals <- tapply(data$y[t1], data$clusters[t1], sum) to_drop <- which(duplicated(data$clusters)) t2 <- which(data$t[-to_drop] == condition2) t1 <- which(data$t[-to_drop] == condition1) # reorder totals because tapply above sorts on cluster y2_totals <- y2_totals[as.character(data$clusters[-to_drop][t2])] y1_totals <- y1_totals[as.character(data$clusters[-to_drop][t1])] prs <- data$condition_probabilities[-to_drop] ps2 <- prs[t2] ps1 <- 1 - prs[t1] # for now rescale, with joint pr need squared top alone Y2 <- y2_totals / ps2 Y1 <- y1_totals / ps1 diff <- (sum(Y2) - sum(Y1)) / N if (se_type != "none") { condition_pr_mat <- condition_pr_mat[-c(to_drop, N + to_drop), -c(to_drop, N + to_drop)] std.error <- sqrt( sum(Y2^2) + sum(Y1^2) + ht_var_partial( Y2, condition_pr_mat[(k + t2), (k + t2), drop = FALSE] ) + ht_var_partial( Y1, condition_pr_mat[t1, t1, drop = FALSE] ) - 2 * ht_covar_partial( Y2, Y1, condition_pr_mat[(k + t2), t1, drop = FALSE], ps2, ps1 ) ) / N } } else { # All non-clustered designs ps2 <- data$condition_probabilities[t2] ps1 <- 1 - data$condition_probabilities[t1] Y2 <- data$y[t2] / ps2 Y1 <- data$y[t1] / ps1 diff <- (sum(Y2) - sum(Y1)) / N if (is.null(condition_pr_mat)) { # Simple random assignment # joint inclusion probabilities = product of marginals if (se_type == "constant") { # Scale again y0 <- ifelse( data$t == condition1, data$y / (1 - data$condition_probabilities), (data$y - diff) / (1 - data$condition_probabilities) ) y1 <- ifelse( data$t == condition2, data$y / data$condition_probabilities, (data$y + diff) / data$condition_probabilities ) std.error <- sqrt( var_ht_total_no_cov(y1, data$condition_probabilities) + var_ht_total_no_cov(y0, 1 - data$condition_probabilities) + # TODO why is it +2 instead of - (looking at old samii/aronow) 2 * sum(c(data$y[t2], data$y[t1] + diff) * c(data$y[t2] - diff, data$y[t1])) ) / N } else if (se_type == "youngs") { # Young's inequality std.error <- sqrt(sum(Y2^2) + sum(Y1^2)) / N } } else { # Complete random assignment if (se_type == "constant") { stop( "`se_type` = 'constant' only supported for simple random designs ", "at the moment" ) } else if (se_type == "youngs") { # Young's inequality # this is the "clustered" estimator where each unit is a cluster # shouldn't apply to clustered designs but may if user passes a # condition_pr_mat varN2 <- sum(Y2^2) + sum(Y1^2) + ht_var_partial( Y2, condition_pr_mat[(N + t2), (N + t2), drop = FALSE] ) + ht_var_partial( Y1, condition_pr_mat[t1, t1, drop = FALSE] ) - 2 * ht_covar_partial( Y2, Y1, condition_pr_mat[(N + t2), t1, drop = FALSE], ps2, ps1 ) if (!is.nan(varN2)) { if (varN2 < 0) { warning("Variance below 0") std.error <- NA } else { std.error <- sqrt(varN2) / N } } else { warning( "Variance is NaN. This is likely the result ", "of a complex condition probability matrix" ) std.error <- NA } } } } return_frame <- data.frame( coefficients = diff, std.error = std.error, nobs = N, stringsAsFactors = FALSE ) return(return_frame) } estimatr/R/data.R0000644000176200001440000000230414747205231013343 0ustar liggesusers#' Replication data for Lin 2013 #' #' A dataset containing the data to replicate: #' Lin, Winston. 2013. "Agnostic notes on regression adjustments to experimental #' data: Reexamining Freedman's critique." The Annals of Applied Statistics. #' Stat. 7(1): 295-318. doi:10.1214/12-AOAS583. #' https://projecteuclid.org/euclid.aoas/1365527200. #' #' This data was originally taken from the following paper, subset to men who #' showed up to college, were in one of the arms with the support condition, #' and had GPA data for their first year in college. #' #' Angrist, Joshua, Daniel Lang, and Philip Oreopoulos. 2009. "Incentives and #' Services for College Achievement: Evidence from a Randomized Trial." American #' Economic Journal: Applied Economics 1(1): 136-63. #' https://www.aeaweb.org/articles?id=10.1257/app.1.1.136 #' #' @format A data frame with educational treatments and outcomes: #' \describe{ #' \item{gpa0}{high school GPA} #' \item{sfsp}{financial incentives and support treatment} #' \item{ssp}{support only treatment} #' \item{GPA_year1}{college GPA year 1} #' \item{GPA_year2}{college GPA year 2} #' } #' @source \url{https://www.aeaweb.org/articles?id=10.1257/app.1.1.136} "alo_star_men" estimatr/R/helper_lm_robust_fit.R0000644000176200001440000004547514747205231016661 0ustar liggesusers#' Internal method that creates linear fits #' #' @param y numeric outcome vector #' @param X numeric design matrix #' @param yoriginal numeric outcome vector, unprojected if there are fixed effects #' @param Xoriginal numeric design matrix, unprojected if there are fixed effects. Any column named \code{"(Intercept)" will be dropped} #' @param weights numeric weights vector #' @param cluster numeric cluster vector #' @param fixed_effects character matrix of fixed effect groups #' @param ci boolean that when T returns confidence intervals and p-values #' @param se_type character denoting which kind of SEs to return #' @param has_int logical, whether the model has an intercept, used for \eqn{R^2} #' @param alpha numeric denoting the test size for confidence intervals #' @param return_vcov logical, whether to return the vcov matrix for later usage #' @param return_fit logical, whether to return fitted values #' @param try_cholesky logical, whether to try using a cholesky decomposition to solve LS instead of a QR decomposition #' @param iv_stage list of length two, the first element denotes the stage of 2SLS IV estimation, where 0 is used for OLS. The second element is only used for the second stage of 2SLS and has the first stage design matrix. For OLS, the default, \code{list(0)}, for the first stage of 2SLS \code{list(1)}, for second stage of 2SLS \code{list(2, first_stage_design_mat)}. #' #' @export #' lm_robust_fit <- function(y, X, yoriginal = NULL, Xoriginal = NULL, weights, cluster, fixed_effects = NULL, ci = TRUE, se_type, has_int, # TODO get this out of here alpha = 0.05, return_vcov = TRUE, return_fit = TRUE, try_cholesky = FALSE, iv_stage = list(0)) { # ---------- # Check se type # ---------- clustered <- !is.null(cluster) fes <- !is.null(fixed_effects) weighted <- !is.null(weights) se_type <- check_se_type(se_type, clustered) if (weighted && se_type == "CR2" && fes) { stop( "Cannot use `fixed_effects` with weighted CR2 estimation at the moment. ", "Try setting `se_type` = \"stata\"" ) } # ----------- # Prep data for fitting # ----------- data <- list( y = as.matrix(y), X = X ) ny <- ncol(data[["y"]]) ynames <- colnames(data[["y"]]) multivariate <- ny > 1 if (weighted) { data[["weights"]] <- weights } if (iv_stage[[1]] == 2) { data[["X_first_stage"]] <- iv_stage[[2]] } if (clustered) { data[["cluster"]] <- cluster } k <- ncol(data[["X"]]) if (is.null(colnames(data[["X"]]))) { colnames(data[["X"]]) <- paste0("X", 1:k) } variable_names <- colnames(data[["X"]]) if (fes) { data[["fixed_effects"]] <- fixed_effects if (is.numeric(yoriginal)) { data[["yoriginal"]] <- as.matrix(yoriginal) } if (is.numeric(Xoriginal)) { # Drop (Intercept) if Xoriginal created by clean_model_data data[["Xoriginal"]] <- as.matrix(Xoriginal) data[["Xoriginal"]] <- data[["Xoriginal"]][ , colnames(data[["Xoriginal"]]) != "(Intercept)", drop = FALSE ] } fe_rank <- attr(data[["fixed_effects"]], "fe_rank") } else { fe_rank <- 0 } # Legacy, in case we want to only get some covs in the future which_covs <- setNames(rep(TRUE, k), variable_names) data <- prep_data( data = data, se_type = se_type, clustered = clustered, weighted = weighted, fes = fes, iv_stage = iv_stage ) # ----------- # Estimate coefficients # ----------- fit <- lm_solver( X = data[["X"]], y = data[["y"]], try_cholesky = try_cholesky ) fit$beta_hat <- as.matrix(fit$beta_hat) dimnames(fit$beta_hat) <- list(variable_names, ynames) # Use first model to get linear dependencies est_exists <- !is.na(fit$beta_hat) covs_used <- which(est_exists[, 1]) N <- nrow(data[["X"]]) x_rank <- length(covs_used) tot_rank <- x_rank + fe_rank if (multivariate) { return_list <- list( coefficients = fit$beta_hat, std.error = matrix(NA, k, ny), df = matrix(NA, k, ny) ) } else { return_list <- list( coefficients = setNames(as.vector(fit$beta_hat), variable_names), std.error = NA, df = NA ) } # ---------- # Estimate variance # ---------- if (se_type != "none" || return_fit) { # Drop NA columns from data and from beta_hat if (x_rank < ncol(data[["X"]])) { data <- drop_collinear(data, covs_used, weighted, iv_stage) fit$beta_hat <- fit$beta_hat[covs_used, , drop = FALSE] } # compute fitted.values and residuals fit_vals <- list() if (iv_stage[[1]] == 2) { X_name <- "X_first_stage" X_name_unweighted <- "X_first_stage_unweighted" } else { X_name <- "X" X_name_unweighted <- "Xunweighted" } fit_vals[["fitted.values"]] <- as.matrix( data[[X_name]][, seq_len(x_rank), drop = FALSE] %*% fit$beta_hat ) fit_vals[["ei"]] <- as.matrix(data[["y"]] - fit_vals[["fitted.values"]]) if (weighted) { fit_vals[["fitted.values.unweighted"]] <- as.matrix( data[[X_name_unweighted]] %*% fit$beta_hat ) fit_vals[["ei.unweighted"]] <- as.matrix( data[["yunweighted"]] - fit_vals[["fitted.values.unweighted"]] ) # For CR2 need X weighted by weights again # so that instead of having X * sqrt(W) we have X * W if (se_type == "CR2") { data[["X"]] <- data[["weights"]] * data[["X"]] if (fes) { data[["femat"]] <- data[["weights"]] * data[["femat"]] } } } # Also need second stage residuals for fstat if (iv_stage[[1]] == 2) { fit_vals[["fitted.values.iv"]] <- as.matrix(data[["X"]] %*% fit$beta_hat) fit_vals[["ei.iv"]] <- as.matrix(data[["y"]] - fit_vals[["fitted.values.iv"]]) if (weighted) { fit_vals[["ei.iv"]] <- data[["weights"]] * fit_vals[["ei.iv"]] } return_list[["ei.iv"]] <- fit_vals[["ei.iv"]] } if (se_type != "none") { vcov_fit <- lm_variance( X = if (se_type %in% c("HC2", "HC3", "CR2") && fes) cbind(data[["X"]], data[["femat"]]) else data[["X"]], Xunweighted = if (se_type %in% c("HC2", "HC3", "CR2") && fes && weighted) cbind(data[["Xunweighted"]], data[["fematunweighted"]]) else data[["Xunweighted"]], XtX_inv = fit$XtX_inv, ei = if (se_type == "CR2" && weighted) fit_vals[["ei.unweighted"]] else fit_vals[["ei"]], weight_mean = data[["weight_mean"]], cluster = data[["cluster"]], J = data[["J"]], ci = ci, se_type = se_type, which_covs = which_covs[covs_used], fe_rank = fe_rank ) return_list$std.error[est_exists] <- sqrt(diag(vcov_fit$Vcov_hat)) if (ci) { # If any not computed in variance fn, replace with NA return_list$df[est_exists] <- ifelse(vcov_fit$dof == -99, NA, vcov_fit$dof) } } } # ---------- # Augment return object # ---------- return_list <- add_cis_pvals(return_list, alpha, ci && se_type != "none") if (return_fit) { if (fes && iv_stage[[1]] != 1) { # Override previous fitted values with those that take into consideration # the fixed effects (unless IV first stage, where we stay w/ projected model) return_list[["fitted.values"]] <- as.matrix(data[["yoriginal"]] - fit_vals[["ei"]]) if (weighted) { return_list[["fitted.values"]] <- return_list[["fitted.values"]] / data[["weights"]] } } else { fitted.vals_name <- if (weighted) "fitted.values.unweighted" else "fitted.values" return_list[["fitted.values"]] <- as.matrix(fit_vals[[fitted.vals_name]]) } if (fes && (ncol(data[["fixed_effects"]]) == 1) && is.numeric(data[["Xoriginal"]])) { return_list[["fixed_effects"]] <- setNames( tapply( return_list[["fitted.values"]] - data[["Xoriginal"]] %*% fit$beta_hat, data[["fixed_effects"]], `[`, 1 ), colnames(data[["femat"]]) ) } # If we reordered to get SEs earlier, have to fix order if (clustered && se_type != "none") { return_list[["fitted.values"]] <- return_list[["fitted.values"]][order(data[["cl_ord"]]), , drop = FALSE] } colnames(return_list[["fitted.values"]]) <- ynames } return_list[["term"]] <- variable_names return_list[["outcome"]] <- ynames return_list[["alpha"]] <- alpha return_list[["se_type"]] <- se_type return_list[["weighted"]] <- weighted return_list[["fes"]] <- fes return_list[["clustered"]] <- clustered return_list[["df.residual"]] <- N - tot_rank return_list[["nobs"]] <- N if (clustered) { return_list[["nclusters"]] <- data[["J"]] } return_list[["k"]] <- k return_list[["rank"]] <- x_rank if (se_type != "none") { return_list[["res_var"]] <- get_resvar( data = data, ei = fit_vals[["ei"]], df.residual = return_list[["df.residual"]], vcov_fit = vcov_fit, weighted = weighted ) tss_r2s <- get_r2s( y = data[["y"]], return_list = return_list, has_int = has_int, yunweighted = data[["yunweighted"]], weights = data[["weights"]], weight_mean = data[["weight_mean"]] ) nomdf <- x_rank - as.numeric(!fes) * has_int if (clustered) { dendf <- data[["J"]] - 1 } else { dendf <- return_list[["df.residual"]] } if (nomdf > 0) { f <- get_fstat( tss_r2s = tss_r2s, return_list = return_list, iv_ei = fit_vals[["ei.iv"]], nomdf = nomdf, dendf = dendf, vcov_fit = vcov_fit, has_int = has_int, iv_stage = iv_stage ) } else { f <- NULL } if (!fes) { return_list <- c(return_list, tss_r2s) return_list[["fstatistic"]] <- f } else { return_list <- c(return_list, setNames(tss_r2s, paste0("proj_", names(tss_r2s)))) return_list[["proj_fstatistic"]] <- f tss_r2s <- get_r2s( y = data[["yoriginal"]], return_list = return_list, has_int = has_int, yunweighted = data[["yoriginalunweighted"]], weights = data[["weights"]], weight_mean = data[["weight_mean"]] ) return_list <- c(return_list, tss_r2s) # TODO (possibly) compute full fstatistic for fe models # return_list[["fstatistic"]] <- f } if (return_vcov) { # return_list$residuals <- fit$residuals return_list[["vcov"]] <- vcov_fit$Vcov_hat if (multivariate) { coef_names <- lapply( seq_len(ncol(est_exists)), function(j) return_list$term[est_exists[, j]] ) outcome_coef_names <- paste0( rep(paste0(return_list[["outcome"]], ":"), times = vapply(coef_names, length, integer(1))), unlist(coef_names, FALSE, FALSE) ) dimnames(return_list[["vcov"]]) <- list( outcome_coef_names, outcome_coef_names ) } else { dimnames(return_list[["vcov"]]) <- list( return_list$term[est_exists], return_list$term[est_exists] ) } } } attr(return_list, "class") <- "lm_robust" return(return_list) } check_se_type <- function(se_type, clustered) { # Allowable se_types with clustering cl_se_types <- c("CR0", "CR2", "stata") rob_se_types <- c("HC0", "HC1", "HC2", "HC3", "classical", "stata") # Parse cluster variable if (clustered) { # set/check se_type if (is.null(se_type)) { se_type <- "CR2" } else if (!(se_type %in% c(cl_se_types, "none"))) { stop( "`se_type` must be either 'CR0', 'stata', 'CR2', or 'none' when ", "`clusters` are specified.\nYou passed: ", se_type ) } } else { # set/check se_type if (is.null(se_type)) { se_type <- "HC2" } else if (se_type %in% setdiff(cl_se_types, "stata")) { stop( "`se_type` must be either 'HC0', 'HC1', 'stata', 'HC2', 'HC3', ", "'classical' or 'none' with no `clusters`.\nYou passed: ", se_type, " which is reserved for a case with clusters." ) } else if (!(se_type %in% c(rob_se_types, "none"))) { stop( "`se_type` must be either 'HC0', 'HC1', 'stata', 'HC2', 'HC3', ", "'classical' or 'none' with no `clusters`.\nYou passed: ", se_type ) } else if (se_type == "stata") { se_type <- "HC1" # In IV this is true with small option } } return(se_type) } get_resvar <- function(data, ei, df.residual, vcov_fit, weighted) { res_var <- if (weighted) colSums(ei^2 * data[["weight_mean"]]) / df.residual else as.vector(ifelse(vcov_fit[["res_var"]] < 0, NA, vcov_fit[["res_var"]])) return(res_var) } get_r2s <- function(y, return_list, has_int, yunweighted, weights, weight_mean) { N <- nrow(y) if (return_list[["weighted"]]) { if (has_int) { tss <- colSums(apply(yunweighted, 2, function(x) { weights^2 * (x - weighted.mean(x, weights^2))^2 })) * weight_mean } else { tss <- colSums(y^2 * weight_mean) } } else { if (has_int) { tss <- .rowSums(apply(y, 1, `-`, colMeans(y))^2, ncol(y), N) } else { tss <- colSums(y^2) } } tss <- as.vector(tss) r.squared <- 1 - ( return_list[["df.residual"]] * return_list[["res_var"]] / tss ) adj.r.squared <- 1 - ( (1 - r.squared) * ((N - has_int) / return_list[["df.residual"]]) ) return(list( tss = tss, r.squared = r.squared, adj.r.squared = adj.r.squared )) } get_fstat <- function(tss_r2s, return_list, iv_ei, nomdf, dendf, vcov_fit, has_int, iv_stage) { coefs <- as.matrix(return_list$coefficients) if (length(return_list[["outcome"]]) > 1) { fstat_names <- paste0(return_list[["outcome"]], ":value") } else { fstat_names <- "value" } if (iv_stage[[1]] != 2 && return_list[["se_type"]] == "classical") { fstat <- tss_r2s$r.squared * return_list[["df.residual"]] / ((1 - tss_r2s$r.squared) * (nomdf)) } else if (return_list[["se_type"]] == "classical" && iv_stage[[1]] == 2 && !return_list[["weighted"]]) { ivrss <- colSums(iv_ei^2) fstat <- ((tss_r2s$tss - ivrss) / nomdf) / return_list[["res_var"]] } else { indices <- seq.int(has_int + (!return_list[["fes"]]), return_list[["rank"]], by = 1) fstat <- compute_fstat( coef_matrix = coefs, coef_indices = indices, vcov_fit = vcov_fit$Vcov_hat, rank = return_list[["rank"]], nomdf = nomdf ) } f <- c( setNames(fstat, fstat_names), numdf = nomdf, dendf = dendf ) return(f) } compute_fstat <- function(coef_matrix, coef_indices, vcov_fit, rank, nomdf) { fstat <- numeric(ncol(coef_matrix)) for (i in seq_along(fstat)) { vcov_indices <- coef_indices + (i - 1) * rank fstat[i] <- tryCatch( { crossprod( coef_matrix[coef_indices, i], chol2inv(chol(vcov_fit[vcov_indices, vcov_indices])) %*% coef_matrix[coef_indices, i] ) / nomdf }, error = function(e) { NA_real_ } ) } fstat } prep_data <- function(data, se_type, clustered, weighted, fes, iv_stage) { # The se_type check also prevents first stage IV with clusters # from incorrectly reordering if (clustered && se_type != "none") { data[["cl_ord"]] <- order(data[["cluster"]]) data[["cluster"]] <- data[["cluster"]][data[["cl_ord"]]] data[["y"]] <- data[["y"]][data[["cl_ord"]], , drop = FALSE] data[["X"]] <- data[["X"]][data[["cl_ord"]], , drop = FALSE] if (fes) { data[["fixed_effects"]] <- data[["fixed_effects"]][data[["cl_ord"]], , drop = FALSE] data[["yoriginal"]] <- data[["yoriginal"]][data[["cl_ord"]], , drop = FALSE] data[["Xoriginal"]] <- data[["Xoriginal"]][data[["cl_ord"]], , drop = FALSE] } if (weighted) { data[["weights"]] <- data[["weights"]][data[["cl_ord"]]] } if (iv_stage[[1]] == 2) { data[["X_first_stage"]] <- data[["X_first_stage"]][data[["cl_ord"]], , drop = FALSE] } data[["J"]] <- length(unique(data[["cluster"]])) } else { data[["J"]] <- 1 } if (fes) { fe_dat <- as.data.frame(data[["fixed_effects"]], stringsAsFactors=TRUE) fe_levels <- vapply(fe_dat, nlevels, 0L) if (any(fe_levels == 1)) { if (ncol(fe_dat) != 1) { stop( "Can't have a fixed effect with only one group AND multiple fixed ", "effect variables" ) } data[["femat"]] <- matrix( 1, nrow(data[["fixed_effects"]]), dimnames = list( names(data[["fixed_effects"]]), paste0(colnames(data[["fixed_effects"]]), data[["fixed_effects"]][1]) ) ) } else { data[["femat"]] <- model.matrix( ~ 0 + ., data = fe_dat) } } if (weighted) { data[["Xunweighted"]] <- data[["X"]] data[["yunweighted"]] <- data[["y"]] data[["weight_mean"]] <- mean(data[["weights"]]) data[["weights"]] <- sqrt(data[["weights"]] / data[["weight_mean"]]) data[["X"]] <- data[["weights"]] * data[["X"]] data[["y"]] <- data[["weights"]] * data[["y"]] if (fes) { if (is.numeric(data[["yoriginal"]])) { data[["yoriginalunweighted"]] <- data[["yoriginal"]] data[["yoriginal"]] <- data[["weights"]] * data[["yoriginal"]] } data[["fematunweighted"]] <- data[["femat"]] data[["femat"]] <- data[["weights"]] * data[["femat"]] } if (iv_stage[[1]] == 2) { data[["X_first_stage_unweighted"]] <- data[["X_first_stage"]] data[["X_first_stage"]] <- data[["weights"]] * data[["X_first_stage"]] } } else { data[["weight_mean"]] <- 1 } return(data) } drop_collinear <- function(data, covs_used, weighted, iv_stage) { data[["X"]] <- data[["X"]][, covs_used, drop = FALSE] if (weighted) { data[["Xunweighted"]] <- data[["Xunweighted"]][, covs_used, drop = FALSE] } if (iv_stage[[1]] == 2) { data[["X_first_stage"]] <- data[["X_first_stage"]][, covs_used, drop = FALSE] if (weighted) { data[["X_first_stage_unweighted"]] <- data[["X_first_stage_unweighted"]][, covs_used, drop = FALSE] } } if (is.numeric(data[["Xoriginal"]])) { data[["Xoriginal"]] <- data[["Xoriginal"]][, covs_used, drop = FALSE] } return(data) } estimatr/R/estimatr_lm_lin.R0000644000176200001440000003266514760370122015626 0ustar liggesusers#' Linear regression with the Lin (2013) covariate adjustment #' #' @description This function is a wrapper for \code{\link{lm_robust}} that #' is useful for estimating treatment effects with pre-treatment covariate #' data. This implements the method described by Lin (2013). #' #' @param formula an object of class formula, as in \code{\link{lm}}, such as #' \code{Y ~ Z} with only one variable on the right-hand side, the treatment #' @param covariates a right-sided formula with pre-treatment covariates on #' the right hand side, such as \code{ ~ x1 + x2 + x3}. #' @param data A \code{data.frame} #' @param weights the bare (unquoted) names of the weights variable in the #' supplied data. #' @param subset An optional bare (unquoted) expression specifying a subset #' of observations to be used. #' @param clusters An optional bare (unquoted) name of the variable that #' corresponds to the clusters in the data. #' @param se_type The sort of standard error sought. If \code{clusters} is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the #' options are "CR0", "CR2" (default), or "stata" are #' permissible. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. #' @param return_vcov logical. Whether to return the variance-covariance #' matrix for later usage, TRUE by default. #' @param try_cholesky logical. Whether to try using a Cholesky #' decomposition to solve least squares instead of a QR decomposition, #' FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only #' be used if users are sure their model is full-rank (i.e., there is no #' perfect multi-collinearity) #' #' @details #' #' This function is simply a wrapper for \code{\link{lm_robust}} and implements #' the Lin estimator (see the reference below). This method #' pre-processes the data by taking the covariates specified in the #' \code{`covariates`} argument, centering them by subtracting from each covariate #' its mean, and interacting them with the treatment. If the treatment has #' multiple values, a series of dummies for each value is created and each of #' those is interacted with the demeaned covariates. More details can be found #' in the #' \href{https://declaredesign.org/r/estimatr/articles/getting-started.html}{Getting Started vignette} #' and the #' \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes}. #' #' @return An object of class \code{"lm_robust"}. #' #' The post-estimation commands functions \code{summary} and \code{\link{tidy}} #' return results in a \code{data.frame}. To get useful data out of the return, #' you can use these data frames, you can use the resulting list directly, or #' you can use the generic accessor functions \code{coef}, \code{vcov}, #' \code{confint}, and \code{predict}. Marginal effects and uncertainty about #' them can be gotten by passing this object to #' \code{\link[margins]{margins}} from the \pkg{margins}. #' #' Users who want to print the results in TeX of HTML can use the #' \code{\link[texreg]{extract}} function and the \pkg{texreg} package. #' #' An object of class \code{"lm_robust"} is a list containing at least the #' following components: #' \item{coefficients}{the estimated coefficients} #' \item{std.error}{the estimated standard errors} #' \item{statistic}{the t-statistic} #' \item{df}{the estimated degrees of freedom} #' \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} #' \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} #' \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} #' \item{term}{a character vector of coefficient names} #' \item{alpha}{the significance level specified by the user} #' \item{se_type}{the standard error type specified by the user} #' \item{res_var}{the residual variance} #' \item{N}{the number of observations used} #' \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} #' \item{rank}{the rank of the fitted model} #' \item{vcov}{the fitted variance covariance matrix} #' \item{r.squared}{The \eqn{R^2}, #' \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} #' is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, #' and \eqn{e[i]} is the ith residual.} #' \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} #' \item{weighted}{whether or not weights were applied} #' \item{call}{the original function call} #' \item{fitted.values}{the matrix of predicted means} #' We also return \code{terms}, \code{contrasts}, and \code{treatment_levels}, used by \code{predict}, #' and \code{scaled_center} (the means of each of the covariates used for centering them). #' #' @seealso \code{\link{lm_robust}} #' #' @references #' #' Freedman, David A. 2008. "On Regression Adjustments in Experiments with Several Treatments." The Annals of Applied Statistics. JSTOR, 176-96. \doi{10.1214/07-AOAS143}. #' #' Lin, Winston. 2013. "Agnostic Notes on Regression Adjustments to Experimental Data: Reexamining Freedman's Critique." The Annals of Applied Statistics 7 (1). Institute of Mathematical Statistics: 295-318. \doi{10.1214/12-AOAS583}. #' #' @examples #' library(fabricatr) #' library(randomizr) #' dat <- fabricate( #' N = 40, #' x = rnorm(N, mean = 2.3), #' x2 = rpois(N, lambda = 2), #' x3 = runif(N), #' y0 = rnorm(N) + x, #' y1 = rnorm(N) + x + 0.35 #' ) #' #' dat$z <- complete_ra(N = nrow(dat)) #' dat$y <- ifelse(dat$z == 1, dat$y1, dat$y0) #' #' # Same specification as lm_robust() with one additional argument #' lmlin_out <- lm_lin(y ~ z, covariates = ~ x, data = dat) #' tidy(lmlin_out) #' #' # Works with multiple pre-treatment covariates #' lm_lin(y ~ z, covariates = ~ x + x2, data = dat) #' #' # Also centers data AFTER evaluating any functions in formula #' lmlin_out2 <- lm_lin(y ~ z, covariates = ~ x + log(x3), data = dat) #' lmlin_out2$scaled_center["log(x3)"] #' mean(log(dat$x3)) #' #' # Works easily with clusters #' dat$clusterID <- rep(1:20, each = 2) #' dat$z_clust <- cluster_ra(clusters = dat$clusterID) #' #' lm_lin(y ~ z_clust, covariates = ~ x, data = dat, clusters = clusterID) #' #' # Works with multi-valued treatments, whether treatment is specified as a #' # factor or not #' dat$z_multi <- sample(1:3, size = nrow(dat), replace = TRUE) #' #' lm_lin(y ~ z_multi, covariates = ~ x, data = dat) #' lm_lin(y ~ factor(z_multi), covariates = ~ x, data = dat) #' #' # Stratified estimator with blocks #' dat$blockID <- rep(1:5, each = 8) #' dat$z_block <- block_ra(blocks = dat$blockID) #' #' lm_lin(y ~ z_block, ~ factor(blockID), data = dat) #' #' # Fitting the model without an intercept provides estimates of mean outcomes #' # under each respective treatment condition #' lm_lin(y ~ z_multi - 1, covariates = ~ x, data = dat) #' #' # Predictions are the same in equivalent models with and without an intercept #' lmlin_out3 <- lm_lin(y ~ z_multi - 1, covariates = ~ x, data = dat) #' lmlin_out4 <- lm_lin(y ~ z_multi, covariates = ~ x, data = dat) #' #' predict(lmlin_out3, newdata = dat, se.fit = TRUE, interval = "confidence") #' predict(lmlin_out4, newdata = dat, se.fit = TRUE, interval = "confidence") #' #' \dontrun{ #' # Can also use 'margins' package if you have it installed to get #' # marginal effects #' library(margins) #' # Instruct 'margins' to treat z as a factor #' lmlout <- lm_lin(y ~ factor(z_block), ~ x, data = dat) #' summary(margins(lmlout)) #' #' # Can output results using 'texreg' #' library(texreg) #' texregobj <- extract(lmlout) #' } #' #' @export lm_lin <- function(formula, covariates, data, weights, subset, clusters, se_type = NULL, ci = TRUE, alpha = .05, return_vcov = TRUE, try_cholesky = FALSE) { # ---------- # Check arguments # ---------- if (length(all.vars(f_rhs(formula))) > 1) { stop( "The `formula` argument, `", format(formula), "`, must only have the ", "treatment variable on the right-hand side of the formula. Covariates ", "should be specified in the `covariates` argument like so:\n`covariates = ", paste0("~ ", paste(all.vars(f_rhs(formula))[-1], sep = " + ")), "`.", "\n\n See ?lm_lin." ) } if (!inherits(covariates, "formula")) { stop( "The `covariates` argument must be specified as a formula:\n", "You passed an object of class ", class(covariates) ) } cov_terms <- terms(covariates) # Check covariates is right hand sided fn if (attr(cov_terms, "response") != 0) { stop( "Must not specify a response variable in `covariates`` formula.\n", "`covariates` must be a right-sided formula, such as '~ x1 + x2 + x3'" ) } if (length(attr(cov_terms, "order")) == 0) { stop( "`covariates` must have a variable on the right-hand side, not 0 or 1" ) } # ---------- # Get design matrix including `covariates` for centering # ---------- full_formula <- update( formula, reformulate(c(".", labels(cov_terms))) ) datargs <- enquos( formula = full_formula, weights = weights, subset = subset, cluster = clusters ) data <- enquo(data) model_data <- clean_model_data(data = data, datargs) outcome <- as.matrix(model_data$outcome) n <- nrow(outcome) design_matrix <- model_data$design_matrix weights <- model_data$weights cluster <- model_data$cluster # Get treatment columns has_intercept <- attr(terms(formula), "intercept") treat_col <- which(attr(design_matrix, "assign") == 1) treatment <- design_matrix[, treat_col, drop = FALSE] design_mat_treatment <- colnames(design_matrix)[treat_col] # Check case where treatment is not factor and is not binary if (any(!(treatment %in% c(0, 1))) | (!has_intercept&ncol(treatment) ==1) ) { # create dummies for non-factor treatment variable # Drop out first group if there is an intercept vals <- sort(unique(treatment)) if (has_intercept) vals <- vals[-1] n_treats <- length(vals) # Could warn if there are too many values # (i.e., if there are as many treatments as observations) names(vals) <- paste0(colnames(design_matrix)[treat_col], vals) # Create matrix of dummies treatment <- outer( drop(treatment), vals, function(x, y) as.numeric(x == y) ) } # ---------- # Center and interact variables # ---------- # Initialize as non-demeaned demeaned_covars <- design_matrix[ , setdiff(colnames(design_matrix), c(design_mat_treatment, "(Intercept)")), drop = FALSE ] # Choose what to center on! if (is.numeric(weights)) { center <- apply(demeaned_covars, 2, weighted.mean, weights) } else { center <- colMeans(demeaned_covars) } demeaned_covars <- sweep(demeaned_covars, 2, center) original_covar_names <- colnames(demeaned_covars) # Change name of centered covariates to end in "_c" # If covar name has `:` or a `(` not in the first position, # wrap the whole var name in parentheses first colnames(demeaned_covars) <- paste0( ifelse(grepl("\\:|(^.+\\()", colnames(demeaned_covars)), paste0("(", colnames(demeaned_covars), ")"), colnames(demeaned_covars)), "_c" ) n_treat_cols <- ncol(treatment) n_covars <- ncol(demeaned_covars) # Interacted # n_int_covar_cols <- n_covars * (n_treat_cols + has_intercept) n_int_covar_cols <- n_covars * (n_treat_cols) interacted_covars <- matrix(0, nrow = n, ncol = n_int_covar_cols) interacted_covars_names <- character(n_int_covar_cols) for (i in 1:n_covars) { covar_name <- colnames(demeaned_covars)[i] cols <- (i - 1) * n_treat_cols + (1:n_treat_cols) interacted_covars[, cols] <- treatment * demeaned_covars[, i] interacted_covars_names[cols] <- paste0(colnames(treatment), ":", covar_name) } colnames(interacted_covars) <- interacted_covars_names if (has_intercept) { # Have to manually create intercept if treatment wasn't a factor X <- cbind( matrix(1, nrow = n, ncol = 1, dimnames = list(NULL, "(Intercept)")), treatment, demeaned_covars, interacted_covars ) } else { X <- cbind( treatment, interacted_covars ) } # ---------- # Estimation # ---------- return_list <- lm_robust_fit( y = outcome, X = X, weights = weights, cluster = cluster, ci = ci, se_type = se_type, alpha = alpha, return_vcov = return_vcov, try_cholesky = try_cholesky, has_int = has_intercept, iv_stage = list(0) ) # ---------- # Build return object # ---------- return_list <- lm_return( return_list, model_data = model_data, formula = formula ) return_list[["scaled_center"]] <- center setNames(return_list[["scaled_center"]], original_covar_names) # Store unique treatment values if(attr(terms(model_data), "dataClasses")[attr(terms(model_data),"term.labels")[1]] == "factor"){ return_list[["treatment_levels"]] <- model_data$xlevels[[1]] } else { return_list[["treatment_levels"]] <- sort(unique(design_matrix[, design_mat_treatment])) } return_list[["call"]] <- match.call() return(return_list) } estimatr/R/helper_return.R0000644000176200001440000000603114747205231015311 0ustar liggesusers# This file has helper functions for returning the lists from various estimators lm_return <- function(return_list, model_data, formula) { if (!is.null(model_data)) { return_list[["contrasts"]] <- attr(model_data$design_matrix, "contrasts") return_list[["terms"]] <- model_data$terms return_list[["xlevels"]] <- model_data$xlevels return_list[["felevels"]] <- model_data$felevels return_list[["weights"]] <- model_data$weights if (is.matrix(model_data$outcome) && is.character(colnames(model_data$outcome))) { return_list[["outcome"]] <- colnames(model_data$outcome) } else { return_list[["outcome"]] <- deparse(formula[[2]], nlines = 5) } } # Name and flatten objects if (is.matrix(return_list[["std.error"]]) && ncol(return_list[["std.error"]]) > 1) { dimnames(return_list[["std.error"]]) <- dimnames(return_list[["coefficients"]]) } else { return_list[["coefficients"]] <- drop(return_list[["coefficients"]]) nms <- c("std.error", "statistic", "p.value", "df", "conf.low", "conf.high") for (nm in nms) { if (length(return_list[[nm]]) > 1 || !is.na(return_list[[nm]])) { return_list[[nm]] <- setNames( drop(return_list[[nm]]), names(return_list[["coefficients"]]) ) } } } if (return_list[["weighted"]]) { names(return_list[["weights"]]) <- if (is.matrix(return_list[["fitted.values"]])) rownames(return_list[["fitted.values"]]) else names(return_list[["fitted.values"]]) } return_list[["fitted.values"]] <- drop(return_list[["fitted.values"]]) return_list[["ei.iv"]] <- drop(return_list[["ei.iv"]]) return_list[["residuals"]] <- drop(return_list[["residuals"]]) return(return_list) } dim_like_return <- function(return_list, alpha, formula, conditions) { return_list[["alpha"]] <- alpha # get "max" condition to account for case with only 1 condition treat_condition <- conditions[[2]] # now we add the condition 2 value to coefficient name like it were a factor # Only add label if conditions aren't 0/1 add_label <- !(conditions[[2]] == 1 && conditions[[1]] == 0) # If horvitz_thompson and there is only one treatment, add_label will be NA # In this case, we add the non-null value if it's condition 2 if (is.na(add_label)) { add_label <- !is.null(conditions[[2]]) } fterms <- terms(formula) coef_name <- labels(fterms) if (add_label) { return_list[["term"]] <- paste0( coef_name, conditions[[2]] ) } else { return_list[["term"]] <- coef_name } return_list[["outcome"]] <- deparse(formula[[2]], nlines = 5) names(return_list[["coefficients"]]) <- names(return_list[["std.error"]]) <- names(return_list[["p.value"]]) <- names(return_list[["df"]]) <- return_list[["term"]] return_list[["condition2"]] <- conditions[[2]] return_list[["condition1"]] <- conditions[[1]] return_list[["vcov"]] <- matrix( data = return_list[["std.error"]] ^ 2, dimnames = list(return_list[["term"]], return_list[["term"]]) ) return(return_list) } estimatr/data/0000755000176200001440000000000014747205231013020 5ustar liggesusersestimatr/data/alo_star_men.rda0000644000176200001440000000333314747205231016155 0ustar liggesusersBZh91AY&SY\ Qu~Dܰ"XpFn}|ŋ $$i=&dlTS'h 41'҃SSC@hh  0M0LF42`#0$҉%=FM @44hL&a41`ɀLRzQzP4z|wPED%oMFզF4Sf`^avQ>g|,a11+(^*Dn"Z#3+^Jcx816xC|wbx iQJUMDB NdKȻD!;T("&y 4,e using namespace Rcpp; // [[Rcpp::export]] Eigen::ArrayXXd demeanMat2(const Eigen::MatrixXd& what, const Rcpp::IntegerMatrix& fes, const Rcpp::NumericVector& weights, const int& start_col, const double& eps) { int n = what.rows(); int p = what.cols() - start_col; Eigen::ArrayXXd out(n,p); Eigen::ArrayXd oldcol(n); double norm2 = 0.0; double neweps = 0.0; out = what.block(0, start_col, n, p); int nlevels = max(fes) + 1; // add one because factor codes are 1:N Eigen::ArrayXd group_sums(nlevels); Eigen::ArrayXXd group_weights(nlevels, fes.cols()); group_weights.setZero(); for (int j = 0; j < fes.cols(); ++j) { for (int i=0; i= neweps); } return out; } // Much of what follows is modified from RcppEigen Vignette by Douglas Bates and Dirk Eddelbuettel // https://cran.r-project.org/web/packages/RcppEigen/vignettes/RcppEigen-Introduction.pdf // [[Rcpp::export]] Eigen::MatrixXd AtA(const Eigen::MatrixXd& A) { int n(A.cols()); return Eigen::MatrixXd(n,n).setZero().selfadjointView() .rankUpdate(A.adjoint()); } // [[Rcpp::export]] Eigen::MatrixXd Kr(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B) { Eigen::MatrixXd AB(A.rows() * B.rows(), A.cols() * B.cols()); for (int i = 0; i < A.rows(); i++) { for (int j = 0; j < A.cols(); j++) { AB.block(i*B.rows(), j*B.cols(), B.rows(), B.cols()) = A(i, j) * B; } } return AB; } // Gets padded UtU matrix (where U = cbind(X, FE_dummies)) Eigen::MatrixXd getMeatXtX(Eigen::Map& X, const Eigen::MatrixXd& XtX_inv) { Eigen::ColPivHouseholderQR PQR(X); const Eigen::ColPivHouseholderQR::PermutationType Pmat(PQR.colsPermutation()); int r = PQR.rank(); int p = X.cols(); Eigen::MatrixXd R_inv = PQR.matrixQR().topLeftCorner(r, r).triangularView().solve(Eigen::MatrixXd::Identity(r, r)); // Get all column indices Eigen::ArrayXi Pmat_indices = Pmat.indices(); // Get the order for the columns you are keeping Eigen::ArrayXi Pmat_keep = Pmat_indices.head(r); // Get the indices for columns you are discarding Eigen::ArrayXi Pmat_toss = Pmat_indices.tail(p - r); for(Eigen::Index i=0; i P = Eigen::PermutationWrapper(Pmat_keep); R_inv = P * R_inv * P; Eigen::MatrixXd meatXtX_inv = R_inv * R_inv.transpose(); for (Eigen::Index i=0; i& X, const Eigen::Map& y, const bool& try_cholesky) { const int p(X.cols()), ny(y.cols()); int r = p; Eigen::MatrixXd XtX_inv, R_inv; Eigen::MatrixXd beta_out(Eigen::MatrixXd::Constant(p, ny, ::NA_REAL)); //Rcpp::Rcout << y << std::endl; bool do_qr = !try_cholesky; if (try_cholesky) { const Eigen::LLT llt(X.transpose() * X); // Catch case where X is rank-deficient if (llt.info() == Eigen::NumericalIssue) { do_qr = true; } else{ beta_out = llt.solve(X.adjoint() * y); R_inv = llt.matrixL().solve(Eigen::MatrixXd::Identity(p, p)); XtX_inv = R_inv.transpose() * R_inv; } } if (do_qr) { Eigen::ColPivHouseholderQR PQR(X); const Eigen::ColPivHouseholderQR::PermutationType Pmat(PQR.colsPermutation()); r = PQR.rank(); Eigen::MatrixXd R_inv = PQR.matrixQR().topLeftCorner(r, r).triangularView().solve(Eigen::MatrixXd::Identity(r, r)); // Get all column indices Eigen::ArrayXi Pmat_indices = Pmat.indices(); // Get the order for the columns you are keeping Eigen::ArrayXi Pmat_keep = Pmat_indices.head(r); // Get the indices for columns you are discarding Eigen::ArrayXi Pmat_toss = Pmat_indices.tail(p - r); for(Eigen::Index i=0; i P = Eigen::PermutationWrapper(Pmat_keep); Eigen::MatrixXd effects(PQR.householderQ().adjoint() * y); // Rcout << "effects:" << std::endl; // Rcout << effects << std::endl; beta_out.topRows(r) = R_inv * effects.topRows(r); // Rcout << "beta_out:" << std::endl; // Rcout << beta_out << std::endl; beta_out = PQR.colsPermutation() * beta_out; R_inv = P * R_inv * P; XtX_inv = R_inv * R_inv.transpose(); } return List::create( _["beta_hat"]= beta_out, _["XtX_inv"]= XtX_inv ); } // [[Rcpp::export]] List lm_variance(Eigen::Map& X, const Rcpp::Nullable & Xunweighted, const Eigen::Map& XtX_inv, const Eigen::Map& ei, const double weight_mean, const Rcpp::Nullable & cluster, const int& J, const bool& ci, const String se_type, const std::vector & which_covs, const int& fe_rank) { const int n(X.rows()), r(XtX_inv.cols()), ny(ei.cols()); // Rcout << "X:" << std::endl << X << std::endl; int r_fe = r + fe_rank; const bool clustered = ((se_type == "stata") || (se_type == "CR0") || (se_type == "CR2")); const int npars = r * ny; int sandwich_size = n; if (clustered) { sandwich_size = J; } // For CR2 Eigen::MatrixXd Xoriginal; Eigen::MatrixXd H1s; Eigen::MatrixXd H2s; Eigen::MatrixXd H3s; Eigen::MatrixXd P_diags; Eigen::MatrixXd M_U_ct; Eigen::MatrixXd MUWTWUM; Eigen::MatrixXd Omega_ct; Eigen::MatrixXd At_WX_inv; Eigen::MatrixXd Vcov_hat; Eigen::VectorXd dof = Eigen::VectorXd::Constant(npars, -99.0); Eigen::VectorXd res_var = Eigen::VectorXd::Constant(ny, -99.0); // Standard error calculations if (se_type == "classical") { // Classical Eigen::MatrixXd s2 = AtA(ei)/((double)n - (double)r_fe); Vcov_hat = Kr(s2, XtX_inv); res_var = s2.diagonal(); } else { // Robust Eigen::MatrixXd temp_omega = ei.array().pow(2); res_var = temp_omega.colwise().sum()/((double)n - (double)r_fe); Eigen::MatrixXd bread(npars, npars); Eigen::MatrixXd half_meat(sandwich_size, npars); if (ny == 1) { bread = XtX_inv; } else { bread = Kr(Eigen::MatrixXd::Identity(ny, ny), XtX_inv); } Eigen::MatrixXd meatXtX_inv; if ((se_type == "HC2") || (se_type == "HC3") || (se_type == "CR2")) { if (X.cols() > r) { meatXtX_inv = getMeatXtX(X, XtX_inv); r_fe = meatXtX_inv.cols(); } else { meatXtX_inv = XtX_inv; } } // Rcout << "meatXtX_inv:" << std::endl << meatXtX_inv << std::endl; if ( !clustered ) { // Rcout << "temp_omega:" << std::endl << temp_omega << std::endl; if ((se_type == "HC2") || (se_type == "HC3")) { Eigen::ArrayXd new_omega(ny); for (int i = 0; i < n; i++) { Eigen::VectorXd Xi = X.leftCols(r_fe).row(i); // Rcout << i << ":" << Xi << std::endl; if (se_type == "HC2") { new_omega = temp_omega.row(i) / (1.0 - (Xi.transpose() * meatXtX_inv * Xi)); } else if (se_type == "HC3") { new_omega = temp_omega.row(i) / (std::pow(1.0 - Xi.transpose() * meatXtX_inv * Xi, 2)); } // Perfect fits cause instability, but we can place 0s for those // observations and the rest of the estimation works new_omega = new_omega.unaryExpr([](double v) {return std::isfinite(v)? v : 0.0;}); temp_omega.row(i) = new_omega; } } // Rcout << "temp_omega:" << std::endl << temp_omega << std::endl; for (int m = 0; m < ny; m++) { if (ny > 1) { // Preserve signs for off-diagonal vcov blocks in mlm half_meat.block(0, r*m, n, r) = X.leftCols(r).array().colwise() * (ei.col(m).array().sign() * temp_omega.col(m).array().sqrt()); } else { half_meat.block(0, r*m, n, r) = X.leftCols(r).array().colwise() * temp_omega.col(m).array().sqrt(); } } } else { // clustered if (se_type == "CR2") { Xoriginal.resize(n, r); if (Xunweighted.isNotNull()) { Xoriginal = Rcpp::as >(Xunweighted); } else { Xoriginal = X; } H1s.resize(r_fe, r_fe*J); H2s.resize(r_fe, r_fe*J); H3s.resize(r_fe, r_fe*J); P_diags.resize(r_fe, J); M_U_ct = meatXtX_inv.llt().matrixL(); MUWTWUM = meatXtX_inv * X.leftCols(r_fe).transpose() * X.leftCols(r_fe) * meatXtX_inv; Omega_ct = MUWTWUM.llt().matrixL(); } Eigen::Map clusters = Rcpp::as >(cluster); double current_cluster = clusters(0); int clust_num = 0; int start_pos = 0; int len = 1; // iterate over unique cluster values for (int i = 1; i <= n; ++i){ if ((i == n) || (clusters(i) != current_cluster)) { if (se_type == "CR2") { // H is not symmetric if weighted CR2 Eigen::MatrixXd H = Xoriginal.block(start_pos, 0, len, r_fe) * meatXtX_inv * X.block(start_pos, 0, len, r_fe).transpose(); // Code from clubSandwich // uwTwu <- Map(function(uw, th) uw %*% th %*% t(uw), // uw = UW_list, th = Theta_list) // MUWTWUM <- M_U %*% Reduce("+", uwTwu) %*% M_U //(thet - h %*% thet - thet %*% t(h) + u %*% MUWTWUM %*% t(u)) // A' W R in clubSand notation // If no FEs Eigen::SelfAdjointEigenSolver At_WX( (Eigen::MatrixXd::Identity(len, len) - H) - H.transpose() + Xoriginal.block(start_pos, 0, len, r_fe) * MUWTWUM * Xoriginal.block(start_pos, 0, len, r_fe).transpose() ); Eigen::VectorXd eigvals = At_WX.eigenvalues(); for (int m = 0; m < eigvals.size(); ++m) { if (eigvals(m) > std::pow(10.0, -12.0)) { eigvals(m) = 1.0 / std::sqrt(eigvals(m)); } else { eigvals(m) = 0; } } At_WX_inv = At_WX.eigenvectors() * eigvals.asDiagonal() * At_WX.eigenvectors().transpose() * X.block(start_pos, 0, len, r_fe); if (ci) { Eigen::MatrixXd ME(r_fe, len); if (weight_mean != 1) { ME = (meatXtX_inv / weight_mean) * At_WX_inv.transpose(); } else { ME = meatXtX_inv * At_WX_inv.transpose(); } P_diags.col(clust_num) = ME.array().pow(2).rowwise().sum(); Eigen::MatrixXd MEU = ME * Xoriginal.block(start_pos, 0, len, r_fe); int p_pos = clust_num*r_fe; // Rcout << "p_pos: " << p_pos << std::endl; H1s.block(0, p_pos, r_fe, r_fe) = MEU * M_U_ct; H2s.block(0, p_pos, r_fe, r_fe) = ME * X.block(start_pos, 0, len, r_fe) * M_U_ct; H3s.block(0, p_pos, r_fe, r_fe) = MEU * Omega_ct; } } if (ny > 1) { // Stack residuals for this cluster from each model // Rcout << "len: " << len << std::endl; Eigen::MatrixXd ei_block = ei.block(start_pos, 0, len, ny); Eigen::Map ei_long(ei_block.data(), 1, len*ny); if (se_type == "CR2") { half_meat.block(clust_num, 0, 1, npars) = ei_long * Kr(Eigen::MatrixXd::Identity(ny, ny), At_WX_inv.leftCols(r)); } else { half_meat.block(clust_num, 0, 1, npars) = ei_long * Kr(Eigen::MatrixXd::Identity(ny, ny), X.block(start_pos, 0, len, r)); } } else { if (se_type == "CR2") { half_meat.row(clust_num) = ei.block(start_pos, 0, len, 1).transpose() * At_WX_inv.leftCols(r); } else { half_meat.row(clust_num) = ei.block(start_pos, 0, len, 1).transpose() * X.block(start_pos, 0, len, r); } } if (i < n) { current_cluster = clusters(i); len = 1; start_pos = i; } clust_num++; } else { len++; continue; } } } // Rcout << "bread:" << std::endl << bread << std::endl << std::endl; // Rcout << "half_meat:" << std::endl << half_meat << std::endl << std::endl; // Rcout << "meat:" << std::endl << (half_meat.transpose() * half_meat) << std::endl << std::endl; Vcov_hat = bread * (half_meat.transpose() * half_meat) * bread; } if (se_type == "HC1") { Vcov_hat = Vcov_hat * (double)n / ((double)n - (double)r_fe); } else if (se_type == "stata") { // Rcout << "correction: " << (((double)J * (n - 1)) / (((double)J - 1) * (n - r))) << std::endl; Vcov_hat = Vcov_hat * (((double)J * (n - 1)) / (((double)J - 1) * (n - r_fe))); } // Degrees of freedom if (ci) { if ( !clustered ) { dof.fill(n - r_fe); // regular } else if (se_type != "CR2") { dof.fill(J - 1); // clustered } else { for (int j = 0; j < r; j++) { if (which_covs[j]) { Eigen::MatrixXd H1t = H1s.row(j); Eigen::MatrixXd H2t = H2s.row(j); Eigen::MatrixXd H3t = H3s.row(j); H1t.resize(r_fe, J); H2t.resize(r_fe, J); H3t.resize(r_fe, J); Eigen::MatrixXd uf = H1t.transpose() * H2t; Eigen::MatrixXd P_row = P_diags.row(j).asDiagonal(); Eigen::MatrixXd P_array = (H3t.transpose()*H3t - uf - uf.transpose()) + P_row; double dof_j = std::pow(P_array.trace(), 2) / P_array.array().pow(2).sum(); for (int outcome_ix = 0; outcome_ix < ny; outcome_ix++) { dof(j + outcome_ix * r) = dof_j; } } } } } return List::create(_["Vcov_hat"]= Vcov_hat, _["dof"]= dof, _["res_var"]= res_var); } estimatr/src/RcppExports.cpp0000644000176200001440000001600414747205231015674 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // ht_covar_partial double ht_covar_partial(const Eigen::VectorXd& y1, const Eigen::VectorXd& y0, const Eigen::MatrixXd& p10, const Eigen::VectorXd& p1, const Eigen::VectorXd& p0); RcppExport SEXP _estimatr_ht_covar_partial(SEXP y1SEXP, SEXP y0SEXP, SEXP p10SEXP, SEXP p1SEXP, SEXP p0SEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type y1(y1SEXP); Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type y0(y0SEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type p10(p10SEXP); Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type p1(p1SEXP); Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type p0(p0SEXP); rcpp_result_gen = Rcpp::wrap(ht_covar_partial(y1, y0, p10, p1, p0)); return rcpp_result_gen; END_RCPP } // ht_var_partial double ht_var_partial(const Eigen::VectorXd& y, const Eigen::MatrixXd& p); RcppExport SEXP _estimatr_ht_var_partial(SEXP ySEXP, SEXP pSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type y(ySEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type p(pSEXP); rcpp_result_gen = Rcpp::wrap(ht_var_partial(y, p)); return rcpp_result_gen; END_RCPP } // demeanMat2 Eigen::ArrayXXd demeanMat2(const Eigen::MatrixXd& what, const Rcpp::IntegerMatrix& fes, const Rcpp::NumericVector& weights, const int& start_col, const double& eps); RcppExport SEXP _estimatr_demeanMat2(SEXP whatSEXP, SEXP fesSEXP, SEXP weightsSEXP, SEXP start_colSEXP, SEXP epsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type what(whatSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerMatrix& >::type fes(fesSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type weights(weightsSEXP); Rcpp::traits::input_parameter< const int& >::type start_col(start_colSEXP); Rcpp::traits::input_parameter< const double& >::type eps(epsSEXP); rcpp_result_gen = Rcpp::wrap(demeanMat2(what, fes, weights, start_col, eps)); return rcpp_result_gen; END_RCPP } // AtA Eigen::MatrixXd AtA(const Eigen::MatrixXd& A); RcppExport SEXP _estimatr_AtA(SEXP ASEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type A(ASEXP); rcpp_result_gen = Rcpp::wrap(AtA(A)); return rcpp_result_gen; END_RCPP } // Kr Eigen::MatrixXd Kr(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B); RcppExport SEXP _estimatr_Kr(SEXP ASEXP, SEXP BSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type B(BSEXP); rcpp_result_gen = Rcpp::wrap(Kr(A, B)); return rcpp_result_gen; END_RCPP } // lm_solver List lm_solver(const Eigen::Map& X, const Eigen::Map& y, const bool& try_cholesky); RcppExport SEXP _estimatr_lm_solver(SEXP XSEXP, SEXP ySEXP, SEXP try_choleskySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::Map& >::type X(XSEXP); Rcpp::traits::input_parameter< const Eigen::Map& >::type y(ySEXP); Rcpp::traits::input_parameter< const bool& >::type try_cholesky(try_choleskySEXP); rcpp_result_gen = Rcpp::wrap(lm_solver(X, y, try_cholesky)); return rcpp_result_gen; END_RCPP } // lm_variance List lm_variance(Eigen::Map& X, const Rcpp::Nullable& Xunweighted, const Eigen::Map& XtX_inv, const Eigen::Map& ei, const double weight_mean, const Rcpp::Nullable& cluster, const int& J, const bool& ci, const String se_type, const std::vector& which_covs, const int& fe_rank); RcppExport SEXP _estimatr_lm_variance(SEXP XSEXP, SEXP XunweightedSEXP, SEXP XtX_invSEXP, SEXP eiSEXP, SEXP weight_meanSEXP, SEXP clusterSEXP, SEXP JSEXP, SEXP ciSEXP, SEXP se_typeSEXP, SEXP which_covsSEXP, SEXP fe_rankSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Eigen::Map& >::type X(XSEXP); Rcpp::traits::input_parameter< const Rcpp::Nullable& >::type Xunweighted(XunweightedSEXP); Rcpp::traits::input_parameter< const Eigen::Map& >::type XtX_inv(XtX_invSEXP); Rcpp::traits::input_parameter< const Eigen::Map& >::type ei(eiSEXP); Rcpp::traits::input_parameter< const double >::type weight_mean(weight_meanSEXP); Rcpp::traits::input_parameter< const Rcpp::Nullable& >::type cluster(clusterSEXP); Rcpp::traits::input_parameter< const int& >::type J(JSEXP); Rcpp::traits::input_parameter< const bool& >::type ci(ciSEXP); Rcpp::traits::input_parameter< const String >::type se_type(se_typeSEXP); Rcpp::traits::input_parameter< const std::vector& >::type which_covs(which_covsSEXP); Rcpp::traits::input_parameter< const int& >::type fe_rank(fe_rankSEXP); rcpp_result_gen = Rcpp::wrap(lm_variance(X, Xunweighted, XtX_inv, ei, weight_mean, cluster, J, ci, se_type, which_covs, fe_rank)); return rcpp_result_gen; END_RCPP } // naomitwhy DataFrame naomitwhy(DataFrame df, Function recursive_subset); RcppExport SEXP _estimatr_naomitwhy(SEXP dfSEXP, SEXP recursive_subsetSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP); Rcpp::traits::input_parameter< Function >::type recursive_subset(recursive_subsetSEXP); rcpp_result_gen = Rcpp::wrap(naomitwhy(df, recursive_subset)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_estimatr_ht_covar_partial", (DL_FUNC) &_estimatr_ht_covar_partial, 5}, {"_estimatr_ht_var_partial", (DL_FUNC) &_estimatr_ht_var_partial, 2}, {"_estimatr_demeanMat2", (DL_FUNC) &_estimatr_demeanMat2, 5}, {"_estimatr_AtA", (DL_FUNC) &_estimatr_AtA, 1}, {"_estimatr_Kr", (DL_FUNC) &_estimatr_Kr, 2}, {"_estimatr_lm_solver", (DL_FUNC) &_estimatr_lm_solver, 3}, {"_estimatr_lm_variance", (DL_FUNC) &_estimatr_lm_variance, 11}, {"_estimatr_naomitwhy", (DL_FUNC) &_estimatr_naomitwhy, 2}, {NULL, NULL, 0} }; RcppExport void R_init_estimatr(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } estimatr/src/Makevars.win0000644000176200001440000000006114747205231015163 0ustar liggesusers PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) estimatr/src/Makevars0000644000176200001440000000006014747205231014366 0ustar liggesusersPKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) estimatr/src/horvitz_thompson_variance.cpp0000644000176200001440000000262314747205231020711 0ustar liggesusers// [[Rcpp::depends(RcppEigen)]] // [[Rcpp::plugins(cpp11)]] #include using namespace Rcpp; // These functions help compute the variance for the Horvitz-Thompson estimators // TODO use symmetry and matrices to improve speed // [[Rcpp::export]] double ht_covar_partial(const Eigen::VectorXd & y1, const Eigen::VectorXd & y0, const Eigen::MatrixXd & p10, const Eigen::VectorXd & p1, const Eigen::VectorXd & p0) { double cov_total = 0.0; for (int i = 0; i < y1.size(); ++i) { for(int j = 0; j < y0.size(); ++j) { if(p10(i, j) == 0) { cov_total += y1(i) * y0(j) * (p10(i, j) - p1(i) * p0(j)); } else { cov_total += y1(i) * y0(j) * (p10(i, j) - p1(i) * p0(j)) / p10(i, j); } } } return cov_total; } // [[Rcpp::export]] double ht_var_partial(const Eigen::VectorXd & y, const Eigen::MatrixXd & p) { double var_total = 0.0; for (int i = 0; i < y.size(); ++i) { for(int j = 0; j < y.size(); ++j) { if(i != j) { if (p(i, j) == 0) { var_total += y(i) * y(j) * (p(i, j) - p(i,i) * p(j,j)) + std::pow(y(i), 2) * p(i, i) / 2.0 + std::pow(y(j), 2) * p(j, j) / 2.0; } else { var_total += y(i) * y(j) * (p(i, j) - p(i,i) * p(j,j)) / p(i, j); } } } } return var_total; } estimatr/src/naomit.cpp0000644000176200001440000000611114747205231014670 0ustar liggesusers// [[Rcpp::depends(RcppEigen)]] // [[Rcpp::plugins(cpp11)]] #include using namespace Rcpp; template Vector generic_logical_subset_impl( Vector xin, LogicalVector w){ return xin[w] ; } SEXP generic_logical_subset( SEXP xin , LogicalVector w){ RCPP_RETURN_VECTOR(generic_logical_subset_impl, xin, w) ; } // [[Rcpp::export]] DataFrame naomitwhy(DataFrame df, Function recursive_subset) { int m = df.nrow(); int n = df.ncol(); Function isna("is.na"); CharacterVector df_names = df.names(); LogicalVector omit = LogicalVector(m); int omit_count = 0; List why_omit(n); why_omit.names() = df_names; LogicalVector why_omit_idx(n); for (int j = 0; j < n; j++) { std::vector why_omit_j; LogicalVector v_isna = isna(df[j]); for(int ii = m; ii < LENGTH(v_isna); ){ for(int i = 0; i < m; i++, ii++) v_isna[i] |= v_isna[ii]; } for(int i = 0; i < m; i++){ if(v_isna[i]){ if(!omit[i]){ why_omit_j.push_back(i + 1); } omit[i] = true; }; } if(why_omit_j.size() > 0){ why_omit[j] = wrap(why_omit_j); why_omit_idx[j] = true; omit_count += why_omit_j.size(); } } if(omit_count == 0){ return(df); } IntegerVector omit_idx = IntegerVector(omit_count); for(int i = 0, ii=0; ii < omit_count; i++){ if(omit[i]) omit_idx[ii++] = i+1; } CharacterVector rownames = df.attr("row.names"); omit_idx.attr("names") = rownames[omit]; omit_idx.attr("why_omit") = why_omit[why_omit_idx]; omit_idx.attr("class") = CharacterVector::create("omit", "detailed"); omit = !omit; List out(n); for(int i = 0; i < n; i++){ SEXP dfi = df(i); if(LENGTH(dfi) == m){ out[i] = generic_logical_subset(dfi, omit); } else { out[i] = recursive_subset(dfi, omit); } } out.names() = df_names; out.attr("row.names") = rownames[omit]; out.attr("na.action") = omit_idx; out.attr("class") = df.attr("class"); return(out); } // require(microbenchmark) // df <- expand.grid(x=c(1:100, NA), y=c(1:5, NA), z=c(1:8, NA), q=c(NA,2:5)) // df2 <- na.omit(df) // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms") // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms") // df <- rbind(df, df2, df) // df2 <- rbind(df2, df2, df2) // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms") // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms") // df <- cbind(df, df,df) // df2 <- cbind(df2, df2, df2) // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms") // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms") // sleep[c("sleep", "foo")] = list(sleep, matrix(1:40, 20)) // sleep[cbind(c(1,5,9), c(2,1,3))] <- NA // sleep$sleep[cbind(1+c(1,5,9), c(2,1,3))] <- NA // sleep$foo[12,1] <- NA // attributes(estimatr:::na.omit_detailed.data.frame(sleep)) estimatr/NAMESPACE0000644000176200001440000000660514747205231013335 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(confint,difference_in_means) S3method(confint,horvitz_thompson) S3method(confint,iv_robust) S3method(confint,lh) S3method(confint,lh_robust) S3method(confint,lm_robust) S3method(glance,difference_in_means) S3method(glance,horvitz_thompson) S3method(glance,iv_robust) S3method(glance,lh_robust) S3method(glance,lm_robust) S3method(nobs,difference_in_means) S3method(nobs,horvitz_thompson) S3method(nobs,iv_robust) S3method(nobs,lh_robust) S3method(nobs,lm_robust) S3method(nobs,summary.lm_robust) S3method(predict,iv_robust) S3method(predict,lm_robust) S3method(print,difference_in_means) S3method(print,horvitz_thompson) S3method(print,iv_robust) S3method(print,lh) S3method(print,lh_robust) S3method(print,lm_robust) S3method(print,summary.iv_robust) S3method(print,summary.lh) S3method(print,summary.lh_robust) S3method(print,summary.lm_robust) S3method(summary,difference_in_means) S3method(summary,horvitz_thompson) S3method(summary,iv_robust) S3method(summary,lh) S3method(summary,lh_robust) S3method(summary,lm_robust) S3method(tidy,difference_in_means) S3method(tidy,horvitz_thompson) S3method(tidy,iv_robust) S3method(tidy,lh) S3method(tidy,lh_robust) S3method(tidy,lm_robust) S3method(update,iv_robust) S3method(vcov,difference_in_means) S3method(vcov,horvitz_thompson) S3method(vcov,iv_robust) S3method(vcov,lm_robust) export(commarobust) export(declaration_to_condition_pr_mat) export(difference_in_means) export(extract.iv_robust) export(extract.lm_robust) export(gen_pr_matrix_cluster) export(glance) export(horvitz_thompson) export(iv_robust) export(lh_robust) export(lm_lin) export(lm_robust) export(lm_robust_fit) export(permutations_to_condition_pr_mat) export(starprep) export(tidy) importFrom(Formula,Formula) importFrom(Formula,as.Formula) importFrom(Rcpp,evalCpp) importFrom(generics,glance) importFrom(generics,tidy) importFrom(methods,className) importFrom(methods,isGeneric) importFrom(methods,setGeneric) importFrom(methods,setMethod) importFrom(rlang,"%||%") importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,eval_tidy) importFrom(rlang,f_rhs) importFrom(rlang,quo) importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_missing) importFrom(rlang,quo_set_expr) importFrom(rlang,quos) importFrom(rlang,sym) importFrom(stats,.checkMFClasses) importFrom(stats,.getXlevels) importFrom(stats,coef) importFrom(stats,complete.cases) importFrom(stats,confint) importFrom(stats,delete.response) importFrom(stats,df.residual) importFrom(stats,fitted.values) importFrom(stats,formula) importFrom(stats,getCall) importFrom(stats,lm) importFrom(stats,lm.fit) importFrom(stats,model.extract) importFrom(stats,model.frame) importFrom(stats,model.frame.default) importFrom(stats,model.matrix) importFrom(stats,model.matrix.default) importFrom(stats,model.matrix.lm) importFrom(stats,model.response) importFrom(stats,na.omit) importFrom(stats,na.pass) importFrom(stats,nobs) importFrom(stats,pchisq) importFrom(stats,pf) importFrom(stats,printCoefmat) importFrom(stats,pt) importFrom(stats,qt) importFrom(stats,reformulate) importFrom(stats,resid) importFrom(stats,residuals) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,terms) importFrom(stats,update) importFrom(stats,var) importFrom(stats,vcov) importFrom(stats,weighted.mean) importFrom(stats,weights) importFrom(utils,getS3method) importFrom(utils,packageVersion) useDynLib(estimatr, .registration = TRUE) estimatr/LICENSE0000644000176200001440000000016614747205231013117 0ustar liggesusersYEAR: 2015-2018 COPYRIGHT HOLDER: Graeme Blair, Jasper Cooper, Alexander Coppock, Macartan Humphreys, and Luke Sonnet estimatr/NEWS.md0000644000176200001440000001247314760371723013221 0ustar liggesusers# estimatr 1.0.6 * Allows for prediction with lm_lin() when treatment is a factor and/or multi-valued. * Adds saved treatment_levels to the returned lm_lin model object. * Stops prediction for lm_lin if the treatment values in new data are not a subset of treatment_levels. * Standardizes model fit for lm_lin() models with no intercept. * Adds tests to ensure identical predictions from lm_lin() models where treatment is either numeric or factorial, and fit with/without an intercept. * Adds relevant examples to predict and lm_robust and lm_lin documentation. * Adds Molly Offer-Westort as a contributor. * Remove functionality for using `lh_robust` with multiple hypotheses. * Restricted functionality for using `lh_robust` with clustered standard errors to CR0 standard errors. # estimatr 1.0.4 * Test suite changes for M1 mac stay current on CRAN. # estimatr 1.0.2 * Minor documentation changes to stay current on CRAN. # estimatr 1.0.0 * Version bump to coincide with DeclareDesign package version 1.0.0 * Tests edited # estimatr 0.30.6 * Fix tests to address CRAN failures # estimatr 0.30.4 * Bug fix of tidy handling of conf.level * Bug fix of lh_robust tidy # estimatr 0.30.2 * Remove lfe from tests # estimatr 0.30.0 * Test suite changes (skip if not installed for checking against other packages) # estimatr 0.28.0 * Test suite changes # estimatr 0.26.0 * Test suite changes # estimatr 0.24.0 * tidy: rename nobs, nclusters, nblocks * tidy: new arguments conf.int, conf.level * Added `update.iv_robust()` * Bug fix regarding fixed effects with large numbers # estimatr 0.22.0 * Bug fixes # estimatr 0.20.0 * Added support for `emmeans` (thanks @rvlenth)! * Fixed bug when estimating `diagnostics` in `iv_robust()` without explicitly specifying `se_type` (issue #310) * Support for `rlang` 0.4.0 # estimatr 0.18.0 * Fixed bug where collinear covariates caused fixed effects estimator to crash (issue #294) * Added `glance.lh_robust()` and fixed some issues with printing and summarizing `lh_robust()` objects (issues #295 and #296) * Fixes CRAN errors in testing with new `clubSandwich` package # estimatr 0.16.0 * Add `diagnostics` to `iv_robust()` * Add `glance()` methods for all estimators * Add `lh_robust()` for easy interface to `car::linearHypothesis()` * Fixed minor bug with a formula such as `is.na(var)` in the `covariates` formula in `lm_lin()` (issue #283) # estimatr 0.14.0 * Removes `broom` hack for `tidy` method and instead relies on importing `generics` # estimatr 0.12.0 * Fixed ambiguity about how interacted covariates were centered in `lm_lin` * A series of fixes for bugs that occurred with multiple outcomes (multivariate regression): * Fixed bug pointed out by James Pustejovsky via the `sandwich` version 2.5-0 and off-diagonal blocks of multivariate regression vcov matrices * Fixed bugs in `lm_lin` preventing multivariate regression * Fixed bug that truncated degrees of freedom with "CR2" standard errors * Fixed bug that returned incorrect R-squared for the second or later outcomes * Fixed bug preventing integration with latest version of `margins` * Fixed bug with `difference_in_means` when using `condition1` and `condition2` to subset a treatment vector with more than two treatment conditions. Previous estimates and standard errors were incorrect. # estimatr 0.10.0 * Changed names of confidence interval columns in tidied data from `ci.lower` and `ci.upper` to `conf.low` and `conf.high` to be in line with other tidy methods * Added support for `fixed_effects` that are just one block * Added support for specifing `condition_prs` in `horvitz_thompson()` as a single number * Added t- and z-statistics to output * Limit unnecessary messaging in `horvitz_thompson()` # estimatr 0.8.0 * Added support for absorbing fixed effects in `lm_robust` and `iv_robust` * Added `commarobust` and `starprep` for stargazer integration * Added `texreg` support for 2SLS IV models * Fixed bugs for incorrect F-statistics with robust standard errors * Refactor of main fitting engine for linear models # estimatr 0.6.0 * Added support for multivariate linear models * Added support for instrumental variables regression * Major change to name of object output elements to mostly match with `broom::tidy` * old -> new * `coefficient_names` -> `term` * `se` -> `std.error` * `p` -> `p.values` * `ci_lower` -> `ci.lower` * `ci_upper` -> `ci.upper` * All of the above changes are also made to the column names on the output of `tidy`; furthermore for `tidy` objects one further name change from `coefficients` -> `estimate` has been made * Fixed bug that caused variances, standard errors, and p-values to be wrong for weighted "CR2" variance estimation * Fixed incorrect estimates when both weights and blocks were passed to `difference_in_means` * Rewrite NSE handling to be done by `rlang` * Rewrite `na.omit` handler in R * Major refactor of C++ underlying regression estimators # estimatr 0.4.0 * Changed suffix added to centered variables in `lm_lin()` from `_bar` to `_c` * Added all vignettes to `.Rbuildignore`, only available on website now * Fixed `lm_robust_helper.cpp` algorithm to not catch own exception and to deal with `valgrind` memory errors * Bugfix where passing a formula as an object within a function would fail * Simplified some tests for various CRAN test platforms # estimatr 0.2.0 * First **CRAN** upload estimatr/build/0000755000176200001440000000000014760401121013175 5ustar liggesusersestimatr/build/partial.rdb0000644000176200001440000000007514760401121015324 0ustar liggesusersb```b`aab`b1g``d`aҬy@D?M7estimatr/man/0000755000176200001440000000000014760370352012664 5ustar liggesusersestimatr/man/lh_robust.Rd0000644000176200001440000000600614760370352015156 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_lh_robust.R \name{lh_robust} \alias{lh_robust} \title{Linear Hypothesis for Ordinary Least Squares with Robust Standard Errors} \usage{ lh_robust(..., data, linear_hypothesis) } \arguments{ \item{...}{Other arguments to be passed to \code{\link{lm_robust}}} \item{data}{A \code{data.frame}} \item{linear_hypothesis}{A length 1 character string or a matrix specifying combination, to be passed to the hypothesis.matrix argument of car::linearHypothesis. Joint hypotheses are currently not handled by lh_robust. See \code{\link[car]{linearHypothesis}} for more details.} } \value{ An object of class \code{"lh_robust"} containing the two following components: \item{lm_robust}{an object as returned by \code{lm_robust}.} \item{lh}{A data frame with most of its columns pulled from \code{linearHypothesis}' output.} The only analysis directly performed by \code{lh_robust} is a \code{t-test} for the null hypothesis of no effects of the linear combination of coefficients as specified by the user. All other output components are either extracted from \code{linearHypothesis} or \code{lm_robust}. Note that the estimate returned is the value of the LHS of an equation of the form f(X) = 0. Hyptheses "x - z = 1", "x +1= z + 2" and "x-z-1=0" will all return the value for "x-y-1" The original output returned by \code{linearHypothesis} is added as an attribute under the \code{"linear_hypothesis"} attribute. } \description{ This function fits a linear model with robust standard errors and performs linear hypothesis test. } \details{ This function is a wrapper for \code{\link{lm_robust}} and for \code{\link[car]{linearHypothesis}}. It first runs \code{lm_robust} and next passes \code{"lm_robust"} object as an argument to \code{linearHypothesis}. Currently CR2 standard errors are not handled by lh_robust. } \examples{ library(fabricatr) dat <- fabricate( N = 40, y = rpois(N, lambda = 4), x = rnorm(N), z = rbinom(N, 1, prob = 0.4), clusterID = sample(1:4, 40, replace = TRUE) ) # Default variance estimator is HC2 robust standard errors lhro <- lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") # The linear hypothesis argument can be specified equivalently as: lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z = 2x") lh_robust(y ~ x + z, data = dat, linear_hypothesis = "2*x +1*z") lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") # Also recovers other sorts of standard erorrs just as specified in \code{\link{lm_robust}} lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0", se_type = "classical") lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0", se_type = "HC1") # Can tidy() main output and subcomponents in to a data.frame lhro <- lh_robust(y ~ x + z, data = dat, linear_hypothesis = "z + 2x = 0") tidy(lhro ) tidy(lhro$lm_robust) tidy(lhro$lh) # Can use summary() to get more statistics on the main output and subcomponents. summary(lhro) summary(lhro$lm_robust) summary(lhro$lh) } estimatr/man/starprep.Rd0000644000176200001440000000440414747205231015013 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_starprep.R \name{starprep} \alias{starprep} \title{Prepare model fits for stargazer} \usage{ starprep( ..., stat = c("std.error", "statistic", "p.value", "ci", "df"), se_type = NULL, clusters = NULL, alpha = 0.05 ) } \arguments{ \item{...}{a list of lm_robust or lm objects} \item{stat}{either "std.error" (the default), "statistic" (the t-statistic), "p.value", "ci", or "df"} \item{se_type}{(optional) if any of the objects are lm objects, what standard errors should be used. Must only be one type and will be used for all lm objects passed to starprep. See \code{commarobust} for more.} \item{clusters}{(optional) if any of the objects are lm objects, what clusters should be used, if clusters should be used. Must only be one vector and will be used for all lm objects passed to starprep. See \code{commarobust} for more.} \item{alpha}{(optional) if any of the objects are lm objects, what significance level should be used for the p-values or confidence intervals} } \value{ a list of vectors of extracted statistics for stargazers } \description{ Prepare model fits for stargazer } \details{ Used to help extract statistics from lists of model fits for stargazer. Prefers lm_robust objects, but because \code{stargazer} does not work with \code{lm_robust} objects, \code{starprep} can also take \code{lm} objects and calls \code{commarobust} to get the preferred, robust statistics. } \examples{ library(stargazer) lm1 <- lm(mpg ~ hp, data = mtcars) lm2 <- lm(mpg ~ hp + wt, data = mtcars) # Use default "HC2" standard errors stargazer(lm1, lm2, se = starprep(lm1, lm2), p = starprep(lm1, lm2, stat = "p.value"), omit.stat = "f") # NB: We remove the F-stat because stargazer only can use original F-stat # which uses classical SEs # Use default "CR2" standard errors with clusters stargazer(lm1, lm2, se = starprep(lm1, lm2, clusters = mtcars$carb), p = starprep(lm1, lm2, clusters = mtcars$carb, stat = "p.value"), omit.stat = "f") # Can also specify significance levels and different standard errors stargazer(lm1, lm2, ci.custom = starprep(lm1, lm2, se_type = "HC3", alpha = 0.1, stat = "ci"), omit.stat = "f") } estimatr/man/iv_robust.Rd0000644000176200001440000002352414747205231015173 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_iv_robust.R \name{iv_robust} \alias{iv_robust} \title{Two-Stage Least Squares Instrumental Variables Regression} \usage{ iv_robust( formula, data, weights, subset, clusters, fixed_effects, se_type = NULL, ci = TRUE, alpha = 0.05, diagnostics = FALSE, return_vcov = TRUE, try_cholesky = FALSE ) } \arguments{ \item{formula}{an object of class formula of the regression and the instruments. For example, the formula \code{y ~ x1 + x2 | z1 + z2} specifies \code{x1} and \code{x2} as endogenous regressors and \code{z1} and \code{z2} as their respective instruments.} \item{data}{A \code{data.frame}} \item{weights}{the bare (unquoted) names of the weights variable in the supplied data.} \item{subset}{An optional bare (unquoted) expression specifying a subset of observations to be used.} \item{clusters}{An optional bare (unquoted) name of the variable that corresponds to the clusters in the data.} \item{fixed_effects}{An optional right-sided formula containing the fixed effects that will be projected out of the data, such as \code{~ blockID}. Do not pass multiple-fixed effects with intersecting groups. Speed gains are greatest for variables with large numbers of groups and when using "HC1" or "stata" standard errors. See 'Details'.} \item{se_type}{The sort of standard error sought. If \code{clusters} is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} \item{diagnostics}{logical. Whether to compute and return instrumental variable diagnostic statistics and tests.} \item{return_vcov}{logical. Whether to return the variance-covariance matrix for later usage, TRUE by default.} \item{try_cholesky}{logical. Whether to try using a Cholesky decomposition to solve least squares instead of a QR decomposition, FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only be used if users are sure their model is full-rank (i.e., there is no perfect multi-collinearity)} } \value{ An object of class \code{"iv_robust"}. The post-estimation commands functions \code{summary} and \code{\link{tidy}} return results in a \code{data.frame}. To get useful data out of the return, you can use these data frames, you can use the resulting list directly, or you can use the generic accessor functions \code{coef}, \code{vcov}, \code{confint}, and \code{predict}. An object of class \code{"iv_robust"} is a list containing at least the following components: \item{coefficients}{the estimated coefficients} \item{std.error}{the estimated standard errors} \item{statistic}{the t-statistic} \item{df}{the estimated degrees of freedom} \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} \item{term}{a character vector of coefficient names} \item{alpha}{the significance level specified by the user} \item{se_type}{the standard error type specified by the user} \item{res_var}{the residual variance} \item{nobs}{the number of observations used} \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} \item{rank}{the rank of the fitted model} \item{vcov}{the fitted variance covariance matrix} \item{r.squared}{the \eqn{R^2} of the second stage regression} \item{adj.r.squared}{the \eqn{R^2} of the second stage regression, but penalized for having more parameters, \code{rank}} \item{fstatistic}{a vector with the value of the second stage F-statistic with the numerator and denominator degrees of freedom} \item{firststage_fstatistic}{a vector with the value of the first stage F-statistic with the numerator and denominator degrees of freedom, useful for a test for weak instruments} \item{weighted}{whether or not weights were applied} \item{call}{the original function call} \item{fitted.values}{the matrix of predicted means} We also return \code{terms} with the second stage terms and \code{terms_regressors} with the first stage terms, both of which used by \code{predict}. If \code{fixed_effects} are specified, then we return \code{proj_fstatistic}, \code{proj_r.squared}, and \code{proj_adj.r.squared}, which are model fit statistics that are computed on the projected model (after demeaning the fixed effects). We also return various diagnostics when \code{`diagnostics` == TRUE}. These are stored in \code{diagnostic_first_stage_fstatistic}, \code{diagnostic_endogeneity_test}, and \code{diagnostic_overid_test}. They have the test statistic, relevant degrees of freedom, and p.value in a named vector. See 'Details' for more. These are printed in a formatted table when the model object is passed to \code{summary()}. } \description{ This formula estimates an instrumental variables regression using two-stage least squares with a variety of options for robust standard errors } \details{ This function performs two-stage least squares estimation to fit instrumental variables regression. The syntax is similar to that in \code{ivreg} from the \code{AER} package. Regressors and instruments should be specified in a two-part formula, such as \code{y ~ x1 + x2 | z1 + z2 + z3}, where \code{x1} and \code{x2} are regressors and \code{z1}, \code{z2}, and \code{z3} are instruments. Unlike \code{ivreg}, you must explicitly specify all exogenous regressors on both sides of the bar. The default variance estimators are the same as in \code{\link{lm_robust}}. Without clusters, we default to \code{HC2} standard errors, and with clusters we default to \code{CR2} standard errors. 2SLS variance estimates are computed using the same estimators as in \code{\link{lm_robust}}, however the design matrix used are the second-stage regressors, which includes the estimated endogenous regressors, and the residuals used are the difference between the outcome and a fit produced by the second-stage coefficients and the first-stage (endogenous) regressors. More notes on this can be found at \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{the mathematical appendix}. If \code{fixed_effects} are specified, both the outcome, regressors, and instruments are centered using the method of alternating projections (Halperin 1962; Gaure 2013). Specifying fixed effects in this way will result in large speed gains with standard error estimators that do not need to invert the matrix of fixed effects. This means using "classical", "HC0", "HC1", "CR0", or "stata" standard errors will be faster than other standard error estimators. Be wary when specifying fixed effects that may result in perfect fits for some observations or if there are intersecting groups across multiple fixed effect variables (e.g. if you specify both "year" and "country" fixed effects with an unbalanced panel where one year you only have data for one country). If \code{diagnostics} are requested, we compute and return three sets of diagnostics. First, we return tests for weak instruments using first-stage F-statistics (\code{diagnostic_first_stage_fstatistic}). Specifically, the F-statistics reported compare the model regressing each endogeneous variable on both the included exogenous variables and the instruments to a model where each endogenous variable is regressed only on the included exogenous variables (without the instruments). A significant F-test for weak instruments provides evidence against the null hypothesis that the instruments are weak. Second, we return tests for the endogeneity of the endogenous variables, often called the Wu-Hausman test (\code{diagnostic_endogeneity_test}). We implement the regression test from Hausman (1978), which allows for robust variance estimation. A significant endogeneity test provides evidence against the null that all the variables are exogenous. Third, we return a test for the correlation between the instruments and the error term (\code{diagnostic_overid_test}). We implement the Wooldridge (1995) robust score test, which is identical to Sargan's (1958) test with classical standard errors. This test is only reported if the model is overidentified (i.e. the number of instruments is greater than the number of endogenous regressors), and if no weights are specified. } \examples{ library(fabricatr) dat <- fabricate( N = 40, Y = rpois(N, lambda = 4), Z = rbinom(N, 1, prob = 0.4), D = Z * rbinom(N, 1, prob = 0.8), X = rnorm(N), G = sample(letters[1:4], N, replace = TRUE) ) # Instrument for treatment `D` with encouragement `Z` tidy(iv_robust(Y ~ D + X | Z + X, data = dat)) # Instrument with Stata's `ivregress 2sls , small rob` HC1 variance tidy(iv_robust(Y ~ D | Z, data = dat, se_type = "stata")) # With clusters, we use CR2 errors by default dat$cl <- rep(letters[1:5], length.out = nrow(dat)) tidy(iv_robust(Y ~ D | Z, data = dat, clusters = cl)) # Again, easy to replicate Stata (again with `small` correction in Stata) tidy(iv_robust(Y ~ D | Z, data = dat, clusters = cl, se_type = "stata")) # We can also specify fixed effects, that will be taken as exogenous regressors # Speed gains with fixed effects are greatests with "stata" or "HC1" std.errors tidy(iv_robust(Y ~ D | Z, data = dat, fixed_effects = ~ G, se_type = "HC1")) } \references{ Gaure, Simon. 2013. "OLS with multiple high dimensional category variables." Computational Statistics & Data Analysis 66: 8-1. \doi{10.1016/j.csda.2013.03.024} Halperin, I. 1962. "The product of projection operators." Acta Scientiarum Mathematicarum (Szeged) 23(1-2): 96-99. } estimatr/man/na.omit_detailed.data.frame.Rd0000644000176200001440000000111414747205231020347 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_na_omit_detailed.R \name{na.omit_detailed.data.frame} \alias{na.omit_detailed.data.frame} \title{Extra logging on na.omit handler} \usage{ na.omit_detailed.data.frame(object) } \arguments{ \item{object}{a data.frame} } \value{ a normal \code{omit} object, with the extra attribute \code{why_omit}, which contains the leftmost column containing an NA for each row that was dropped, by column name, if any were dropped. } \description{ Extra logging on na.omit handler } \seealso{ \code{\link{na.omit}} } estimatr/man/gen_pr_matrix_cluster.Rd0000644000176200001440000000157214747205231017555 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_condition_pr_matrix.R \name{gen_pr_matrix_cluster} \alias{gen_pr_matrix_cluster} \title{Generate condition probability matrix given clusters and probabilities} \usage{ gen_pr_matrix_cluster(clusters, treat_probs, simple) } \arguments{ \item{clusters}{A vector of clusters} \item{treat_probs}{A vector of treatment (condition 2) probabilities} \item{simple}{A boolean for whether the assignment is a random sample assignment (TRUE, default) or complete random assignment (FALSE)} } \value{ a numeric 2n*2n matrix of marginal and joint condition treatment probabilities to be passed to the \code{condition_pr_mat} argument of \code{\link{horvitz_thompson}}. } \description{ Generate condition probability matrix given clusters and probabilities } \seealso{ \code{\link{declaration_to_condition_pr_mat}} } estimatr/man/alo_star_men.Rd0000644000176200001440000000251114747205231015613 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{alo_star_men} \alias{alo_star_men} \title{Replication data for Lin 2013} \format{ A data frame with educational treatments and outcomes: \describe{ \item{gpa0}{high school GPA} \item{sfsp}{financial incentives and support treatment} \item{ssp}{support only treatment} \item{GPA_year1}{college GPA year 1} \item{GPA_year2}{college GPA year 2} } } \source{ \url{https://www.aeaweb.org/articles?id=10.1257/app.1.1.136} } \usage{ alo_star_men } \description{ A dataset containing the data to replicate: Lin, Winston. 2013. "Agnostic notes on regression adjustments to experimental data: Reexamining Freedman's critique." The Annals of Applied Statistics. Stat. 7(1): 295-318. doi:10.1214/12-AOAS583. https://projecteuclid.org/euclid.aoas/1365527200. } \details{ This data was originally taken from the following paper, subset to men who showed up to college, were in one of the arms with the support condition, and had GPA data for their first year in college. Angrist, Joshua, Daniel Lang, and Philip Oreopoulos. 2009. "Incentives and Services for College Achievement: Evidence from a Randomized Trial." American Economic Journal: Applied Economics 1(1): 136-63. https://www.aeaweb.org/articles?id=10.1257/app.1.1.136 } \keyword{datasets} estimatr/man/lm_robust.Rd0000644000176200001440000002572714747205231015174 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_lm_robust.R \name{lm_robust} \alias{lm_robust} \title{Ordinary Least Squares with Robust Standard Errors} \usage{ lm_robust( formula, data, weights, subset, clusters, fixed_effects, se_type = NULL, ci = TRUE, alpha = 0.05, return_vcov = TRUE, try_cholesky = FALSE ) } \arguments{ \item{formula}{an object of class formula, as in \code{\link{lm}}} \item{data}{A \code{data.frame}} \item{weights}{the bare (unquoted) names of the weights variable in the supplied data.} \item{subset}{An optional bare (unquoted) expression specifying a subset of observations to be used.} \item{clusters}{An optional bare (unquoted) name of the variable that corresponds to the clusters in the data.} \item{fixed_effects}{An optional right-sided formula containing the fixed effects that will be projected out of the data, such as \code{~ blockID}. Do not pass multiple-fixed effects with intersecting groups. Speed gains are greatest for variables with large numbers of groups and when using "HC1" or "stata" standard errors. See 'Details'.} \item{se_type}{The sort of standard error sought. If \code{clusters} is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} \item{return_vcov}{logical. Whether to return the variance-covariance matrix for later usage, TRUE by default.} \item{try_cholesky}{logical. Whether to try using a Cholesky decomposition to solve least squares instead of a QR decomposition, FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only be used if users are sure their model is full-rank (i.e., there is no perfect multi-collinearity)} } \value{ An object of class \code{"lm_robust"}. The post-estimation commands functions \code{summary} and \code{\link{tidy}} return results in a \code{data.frame}. To get useful data out of the return, you can use these data frames, you can use the resulting list directly, or you can use the generic accessor functions \code{coef}, \code{vcov}, \code{confint}, and \code{predict}. Marginal effects and uncertainty about them can be gotten by passing this object to \code{\link[margins]{margins}} from the \pkg{margins}, or to \code{emmeans} in the \pkg{emmeans} package. Users who want to print the results in TeX of HTML can use the \code{\link[texreg]{extract}} function and the \pkg{texreg} package. If users specify a multivariate linear regression model (multiple outcomes), then some of the below components will be of higher dimension to accommodate the additional models. An object of class \code{"lm_robust"} is a list containing at least the following components: \item{coefficients}{the estimated coefficients} \item{std.error}{the estimated standard errors} \item{statistic}{the t-statistic} \item{df}{the estimated degrees of freedom} \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} \item{term}{a character vector of coefficient names} \item{alpha}{the significance level specified by the user} \item{se_type}{the standard error type specified by the user} \item{res_var}{the residual variance} \item{N}{the number of observations used} \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} \item{rank}{the rank of the fitted model} \item{vcov}{the fitted variance covariance matrix} \item{r.squared}{The \eqn{R^2}, \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, and \eqn{e[i]} is the ith residual.} \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} \item{fstatistic}{a vector with the value of the F-statistic with the numerator and denominator degrees of freedom} \item{weighted}{whether or not weights were applied} \item{call}{the original function call} \item{fitted.values}{the matrix of predicted means} We also return \code{terms} and \code{contrasts}, used by \code{predict}. If \code{fixed_effects} are specified, then we return \code{proj_fstatistic}, \code{proj_r.squared}, and \code{proj_adj.r.squared}, which are model fit statistics that are computed on the projected model (after demeaning the fixed effects). } \description{ This formula fits a linear model, provides a variety of options for robust standard errors, and conducts coefficient tests } \details{ This function performs linear regression and provides a variety of standard errors. It takes a formula and data much in the same was as \code{\link{lm}} does, and all auxiliary variables, such as clusters and weights, can be passed either as quoted names of columns, as bare column names, or as a self-contained vector. Examples of usage can be seen below and in the \href{https://declaredesign.org/r/estimatr/articles/getting-started.html}{Getting Started vignette}. The mathematical notes in \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{this vignette} specify the exact estimators used by this function. The default variance estimators have been chosen largely in accordance with the procedures in \href{https://github.com/acoppock/Green-Lab-SOP/blob/master/Green_Lab_SOP.pdf}{this manual}. The default for the case without clusters is the HC2 estimator and the default with clusters is the analogous CR2 estimator. Users can easily replicate Stata standard errors in the clustered or non-clustered case by setting \code{`se_type` = "stata"}. The function estimates the coefficients and standard errors in C++, using the \code{RcppEigen} package. By default, we estimate the coefficients using Column-Pivoting QR decomposition from the Eigen C++ library, although users could get faster solutions by setting \code{`try_cholesky` = TRUE} to use a Cholesky decomposition instead. This will likely result in quicker solutions, but the algorithm does not reliably detect when there are linear dependencies in the model and may fail silently if they exist. If \code{`fixed_effects`} are specified, both the outcome and design matrix are centered using the method of alternating projections (Halperin 1962; Gaure 2013). Specifying fixed effects in this way will result in large speed gains with standard error estimators that do not need to invert the matrix of fixed effects. This means using "classical", "HC0", "HC1", "CR0", or "stata" standard errors will be faster than other standard error estimators. Be wary when specifying fixed effects that may result in perfect fits for some observations or if there are intersecting groups across multiple fixed effect variables (e.g. if you specify both "year" and "country" fixed effects with an unbalanced panel where one year you only have data for one country). As with \code{`lm()`}, multivariate regression (multiple outcomes) will only admit observations into the estimation that have no missingness on any outcome. } \examples{ set.seed(15) library(fabricatr) dat <- fabricate( N = 40, y = rpois(N, lambda = 4), x = rnorm(N), z = rbinom(N, 1, prob = 0.4) ) # Default variance estimator is HC2 robust standard errors lmro <- lm_robust(y ~ x + z, data = dat) # Can tidy() the data in to a data.frame tidy(lmro) # Can use summary() to get more statistics summary(lmro) # Can also get coefficients three ways lmro$coefficients coef(lmro) tidy(lmro)$estimate # Can also get confidence intervals from object or with new 1 - `alpha` lmro$conf.low confint(lmro, level = 0.8) # Can recover classical standard errors lmclassic <- lm_robust(y ~ x + z, data = dat, se_type = "classical") tidy(lmclassic) # Can easily match Stata's robust standard errors lmstata <- lm_robust(y ~ x + z, data = dat, se_type = "stata") tidy(lmstata) # Easy to specify clusters for cluster-robust inference dat$clusterID <- sample(1:10, size = 40, replace = TRUE) lmclust <- lm_robust(y ~ x + z, data = dat, clusters = clusterID) tidy(lmclust) # Can also match Stata's clustered standard errors lm_robust( y ~ x + z, data = dat, clusters = clusterID, se_type = "stata" ) # Works just as LM does with functions in the formula dat$blockID <- rep(c("A", "B", "C", "D"), each = 10) lm_robust(y ~ x + z + factor(blockID), data = dat) # Weights are also easily specified dat$w <- runif(40) lm_robust( y ~ x + z, data = dat, weights = w, clusters = clusterID ) # Subsetting works just as in `lm()` lm_robust(y ~ x, data = dat, subset = z == 1) # One can also choose to set the significance level for different CIs lm_robust(y ~ x + z, data = dat, alpha = 0.1) # We can also specify fixed effects # Speed gains with fixed effects are greatest with "stata" or "HC1" std.errors tidy(lm_robust(y ~ z, data = dat, fixed_effects = ~ blockID, se_type = "HC1")) \dontrun{ # Can also use 'margins' or 'emmeans' package if you have them installed # to get marginal effects library(margins) lmrout <- lm_robust(y ~ x + z, data = dat) summary(margins(lmrout)) # Can output results using 'texreg' library(texreg) texreg(lmrout) # Using emmeans to obtain covariate-adjusted means library(emmeans) fiber.rlm <- lm_robust(strength ~ diameter + machine, data = fiber) emmeans(fiber.rlm, "machine") } } \references{ Abadie, Alberto, Susan Athey, Guido W Imbens, and Jeffrey Wooldridge. 2017. "A Class of Unbiased Estimators of the Average Treatment Effect in Randomized Experiments." arXiv Pre-Print. \url{https://arxiv.org/abs/1710.02926v2}. Bell, Robert M, and Daniel F McCaffrey. 2002. "Bias Reduction in Standard Errors for Linear Regression with Multi-Stage Samples." Survey Methodology 28 (2): 169-82. Gaure, Simon. 2013. "OLS with multiple high dimensional category variables." Computational Statistics & Data Analysis 66: 8-1. \doi{10.1016/j.csda.2013.03.024} Halperin, I. 1962. "The product of projection operators." Acta Scientiarum Mathematicarum (Szeged) 23(1-2): 96-99. MacKinnon, James, and Halbert White. 1985. "Some Heteroskedasticity-Consistent Covariance Matrix Estimators with Improved Finite Sample Properties." Journal of Econometrics 29 (3): 305-25. \doi{10.1016/0304-4076(85)90158-7}. Pustejovsky, James E, and Elizabeth Tipton. 2016. "Small Sample Methods for Cluster-Robust Variance Estimation and Hypothesis Testing in Fixed Effects Models." Journal of Business & Economic Statistics. Taylor & Francis. \doi{10.1080/07350015.2016.1247004}. Samii, Cyrus, and Peter M Aronow. 2012. "On Equivalencies Between Design-Based and Regression-Based Variance Estimators for Randomized Experiments." Statistics and Probability Letters 82 (2). \doi{10.1016/j.spl.2011.10.024}. } estimatr/man/estimatr.Rd0000644000176200001440000000266414760370122015006 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr.R \docType{package} \name{estimatr} \alias{estimatr-package} \alias{estimatr} \title{estimatr} \description{ Fast procedures for small set of commonly-used, design-appropriate estimators with robust standard errors and confidence intervals. Includes estimators for linear regression, instrumental variables regression, difference-in-means, Horvitz-Thompson estimation, and regression improving precision of experimental estimates by interacting treatment with centered pre-treatment covariates introduced by Lin (2013) . } \seealso{ Useful links: \itemize{ \item \url{https://declaredesign.org/r/estimatr/} \item \url{https://github.com/DeclareDesign/estimatr} \item Report bugs at \url{https://github.com/DeclareDesign/estimatr/issues} } } \author{ \strong{Maintainer}: Graeme Blair \email{graeme.blair@gmail.com} Authors: \itemize{ \item Jasper Cooper \email{jjc2247@columbia.edu} \item Alexander Coppock \email{alex.coppock@yale.edu} \item Macartan Humphreys \email{macartan@gmail.com} \item Luke Sonnet \email{luke.sonnet@gmail.com} } Other contributors: \itemize{ \item Neal Fultz \email{nfultz@gmail.com} [contributor] \item Lily Medina \email{lilymiru@gmail.com} [contributor] \item Russell Lenth \email{russell-lenth@uiowa.edu} [contributor] \item Molly Offer-Westort \email{mollyow@uchicago.edu} [contributor] } } estimatr/man/lm_lin.Rd0000644000176200001440000001716114760370122014426 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_lm_lin.R \name{lm_lin} \alias{lm_lin} \title{Linear regression with the Lin (2013) covariate adjustment} \usage{ lm_lin( formula, covariates, data, weights, subset, clusters, se_type = NULL, ci = TRUE, alpha = 0.05, return_vcov = TRUE, try_cholesky = FALSE ) } \arguments{ \item{formula}{an object of class formula, as in \code{\link{lm}}, such as \code{Y ~ Z} with only one variable on the right-hand side, the treatment} \item{covariates}{a right-sided formula with pre-treatment covariates on the right hand side, such as \code{ ~ x1 + x2 + x3}.} \item{data}{A \code{data.frame}} \item{weights}{the bare (unquoted) names of the weights variable in the supplied data.} \item{subset}{An optional bare (unquoted) expression specifying a subset of observations to be used.} \item{clusters}{An optional bare (unquoted) name of the variable that corresponds to the clusters in the data.} \item{se_type}{The sort of standard error sought. If \code{clusters} is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata" are permissible.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} \item{return_vcov}{logical. Whether to return the variance-covariance matrix for later usage, TRUE by default.} \item{try_cholesky}{logical. Whether to try using a Cholesky decomposition to solve least squares instead of a QR decomposition, FALSE by default. Using a Cholesky decomposition may result in speed gains, but should only be used if users are sure their model is full-rank (i.e., there is no perfect multi-collinearity)} } \value{ An object of class \code{"lm_robust"}. The post-estimation commands functions \code{summary} and \code{\link{tidy}} return results in a \code{data.frame}. To get useful data out of the return, you can use these data frames, you can use the resulting list directly, or you can use the generic accessor functions \code{coef}, \code{vcov}, \code{confint}, and \code{predict}. Marginal effects and uncertainty about them can be gotten by passing this object to \code{\link[margins]{margins}} from the \pkg{margins}. Users who want to print the results in TeX of HTML can use the \code{\link[texreg]{extract}} function and the \pkg{texreg} package. An object of class \code{"lm_robust"} is a list containing at least the following components: \item{coefficients}{the estimated coefficients} \item{std.error}{the estimated standard errors} \item{statistic}{the t-statistic} \item{df}{the estimated degrees of freedom} \item{p.value}{the p-values from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} \item{term}{a character vector of coefficient names} \item{alpha}{the significance level specified by the user} \item{se_type}{the standard error type specified by the user} \item{res_var}{the residual variance} \item{N}{the number of observations used} \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} \item{rank}{the rank of the fitted model} \item{vcov}{the fitted variance covariance matrix} \item{r.squared}{The \eqn{R^2}, \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, and \eqn{e[i]} is the ith residual.} \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} \item{weighted}{whether or not weights were applied} \item{call}{the original function call} \item{fitted.values}{the matrix of predicted means} We also return \code{terms}, \code{contrasts}, and \code{treatment_levels}, used by \code{predict}, and \code{scaled_center} (the means of each of the covariates used for centering them). } \description{ This function is a wrapper for \code{\link{lm_robust}} that is useful for estimating treatment effects with pre-treatment covariate data. This implements the method described by Lin (2013). } \details{ This function is simply a wrapper for \code{\link{lm_robust}} and implements the Lin estimator (see the reference below). This method pre-processes the data by taking the covariates specified in the \code{`covariates`} argument, centering them by subtracting from each covariate its mean, and interacting them with the treatment. If the treatment has multiple values, a series of dummies for each value is created and each of those is interacted with the demeaned covariates. More details can be found in the \href{https://declaredesign.org/r/estimatr/articles/getting-started.html}{Getting Started vignette} and the \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes}. } \examples{ library(fabricatr) library(randomizr) dat <- fabricate( N = 40, x = rnorm(N, mean = 2.3), x2 = rpois(N, lambda = 2), x3 = runif(N), y0 = rnorm(N) + x, y1 = rnorm(N) + x + 0.35 ) dat$z <- complete_ra(N = nrow(dat)) dat$y <- ifelse(dat$z == 1, dat$y1, dat$y0) # Same specification as lm_robust() with one additional argument lmlin_out <- lm_lin(y ~ z, covariates = ~ x, data = dat) tidy(lmlin_out) # Works with multiple pre-treatment covariates lm_lin(y ~ z, covariates = ~ x + x2, data = dat) # Also centers data AFTER evaluating any functions in formula lmlin_out2 <- lm_lin(y ~ z, covariates = ~ x + log(x3), data = dat) lmlin_out2$scaled_center["log(x3)"] mean(log(dat$x3)) # Works easily with clusters dat$clusterID <- rep(1:20, each = 2) dat$z_clust <- cluster_ra(clusters = dat$clusterID) lm_lin(y ~ z_clust, covariates = ~ x, data = dat, clusters = clusterID) # Works with multi-valued treatments, whether treatment is specified as a # factor or not dat$z_multi <- sample(1:3, size = nrow(dat), replace = TRUE) lm_lin(y ~ z_multi, covariates = ~ x, data = dat) lm_lin(y ~ factor(z_multi), covariates = ~ x, data = dat) # Stratified estimator with blocks dat$blockID <- rep(1:5, each = 8) dat$z_block <- block_ra(blocks = dat$blockID) lm_lin(y ~ z_block, ~ factor(blockID), data = dat) # Fitting the model without an intercept provides estimates of mean outcomes # under each respective treatment condition lm_lin(y ~ z_multi - 1, covariates = ~ x, data = dat) # Predictions are the same in equivalent models with and without an intercept lmlin_out3 <- lm_lin(y ~ z_multi - 1, covariates = ~ x, data = dat) lmlin_out4 <- lm_lin(y ~ z_multi, covariates = ~ x, data = dat) predict(lmlin_out3, newdata = dat, se.fit = TRUE, interval = "confidence") predict(lmlin_out4, newdata = dat, se.fit = TRUE, interval = "confidence") \dontrun{ # Can also use 'margins' package if you have it installed to get # marginal effects library(margins) # Instruct 'margins' to treat z as a factor lmlout <- lm_lin(y ~ factor(z_block), ~ x, data = dat) summary(margins(lmlout)) # Can output results using 'texreg' library(texreg) texregobj <- extract(lmlout) } } \references{ Freedman, David A. 2008. "On Regression Adjustments in Experiments with Several Treatments." The Annals of Applied Statistics. JSTOR, 176-96. \doi{10.1214/07-AOAS143}. Lin, Winston. 2013. "Agnostic Notes on Regression Adjustments to Experimental Data: Reexamining Freedman's Critique." The Annals of Applied Statistics 7 (1). Institute of Mathematical Statistics: 295-318. \doi{10.1214/12-AOAS583}. } \seealso{ \code{\link{lm_robust}} } estimatr/man/permutations_to_condition_pr_mat.Rd0000644000176200001440000000371514747205231022023 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_condition_pr_matrix.R \name{permutations_to_condition_pr_mat} \alias{permutations_to_condition_pr_mat} \title{Builds condition probability matrices for Horvitz-Thompson estimation from permutation matrix} \usage{ permutations_to_condition_pr_mat(permutations) } \arguments{ \item{permutations}{A matrix where the rows are units and the columns are different treatment permutations; treated units must be represented with a 1 and control units with a 0} } \value{ a numeric 2n*2n matrix of marginal and joint condition treatment probabilities to be passed to the \code{condition_pr_mat} argument of \code{\link{horvitz_thompson}}. } \description{ Builds condition probability matrices for Horvitz-Thompson estimation from permutation matrix } \details{ This function takes a matrix of permutations, for example from the \code{\link[randomizr]{obtain_permutation_matrix}} function in \pkg{randomizr} or through simulation and returns a 2n*2n matrix that can be used to fully specify the design for \code{\link{horvitz_thompson}} estimation. You can read more about these matrices in the documentation for the \code{\link{declaration_to_condition_pr_mat}} function. This is done by passing this matrix to the \code{condition_pr_mat} argument of } \examples{ # Complete randomization perms <- replicate(1000, sample(rep(0:1, each = 50))) comp_pr_mat <- permutations_to_condition_pr_mat(perms) # Arbitrary randomization possible_treats <- cbind( c(1, 1, 0, 1, 0, 0, 0, 1, 1, 0), c(0, 1, 1, 0, 1, 1, 0, 1, 0, 1), c(1, 0, 1, 1, 1, 1, 1, 0, 0, 0) ) arb_pr_mat <- permutations_to_condition_pr_mat(possible_treats) # Simulating a column to be realized treatment z <- possible_treats[, sample(ncol(possible_treats), size = 1)] y <- rnorm(nrow(possible_treats)) horvitz_thompson(y ~ z, condition_pr_mat = arb_pr_mat) } \seealso{ \code{\link[randomizr]{declare_ra}}, \code{\link{declaration_to_condition_pr_mat}} } estimatr/man/estimatr_tidiers.Rd0000644000176200001440000000325714747205231016533 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/S3_tidy.R \name{estimatr_tidiers} \alias{estimatr_tidiers} \alias{tidy.lm_robust} \alias{tidy.iv_robust} \alias{tidy.difference_in_means} \alias{tidy.horvitz_thompson} \alias{tidy.lh_robust} \alias{tidy.lh} \title{Tidy an estimatr object} \usage{ \method{tidy}{lm_robust}(x, conf.int = TRUE, conf.level = NULL, ...) \method{tidy}{iv_robust}(x, conf.int = TRUE, conf.level = NULL, ...) \method{tidy}{difference_in_means}(x, conf.int = TRUE, conf.level = NULL, ...) \method{tidy}{horvitz_thompson}(x, conf.int = TRUE, conf.level = NULL, ...) \method{tidy}{lh_robust}(x, conf.int = TRUE, conf.level = NULL, ...) \method{tidy}{lh}(x, conf.int = TRUE, conf.level = NULL, ...) } \arguments{ \item{x}{An object returned by one of the estimators} \item{conf.int}{Logical indicating whether or not to include a confidence interval in the tidied output. Defaults to ‘TRUE’.} \item{conf.level}{The confidence level to use for the confidence interval if ‘conf.int = TRUE’. Must be strictly greater than 0 and less than 1. Defaults to 0.95, which corresponds to a 95 percent confidence interval.} \item{...}{extra arguments (not used)} } \value{ A data.frame with columns for coefficient names, estimates, standard errors, confidence intervals, p-values, degrees of freedom, and the name of the outcome variable } \description{ Tidy an estimatr object } \seealso{ \code{\link[generics:tidy]{generics::tidy()}}, \code{\link[=lm_robust]{lm_robust()}}, \code{\link[=iv_robust]{iv_robust()}}, \code{\link[=difference_in_means]{difference_in_means()}}, \code{\link[=horvitz_thompson]{horvitz_thompson()}} } \concept{estimatr tidiers} estimatr/man/lm_robust_fit.Rd0000644000176200001440000000352014747205231016021 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_lm_robust_fit.R \name{lm_robust_fit} \alias{lm_robust_fit} \title{Internal method that creates linear fits} \usage{ lm_robust_fit( y, X, yoriginal = NULL, Xoriginal = NULL, weights, cluster, fixed_effects = NULL, ci = TRUE, se_type, has_int, alpha = 0.05, return_vcov = TRUE, return_fit = TRUE, try_cholesky = FALSE, iv_stage = list(0) ) } \arguments{ \item{y}{numeric outcome vector} \item{X}{numeric design matrix} \item{yoriginal}{numeric outcome vector, unprojected if there are fixed effects} \item{Xoriginal}{numeric design matrix, unprojected if there are fixed effects. Any column named \code{"(Intercept)" will be dropped}} \item{weights}{numeric weights vector} \item{cluster}{numeric cluster vector} \item{fixed_effects}{character matrix of fixed effect groups} \item{ci}{boolean that when T returns confidence intervals and p-values} \item{se_type}{character denoting which kind of SEs to return} \item{has_int}{logical, whether the model has an intercept, used for \eqn{R^2}} \item{alpha}{numeric denoting the test size for confidence intervals} \item{return_vcov}{logical, whether to return the vcov matrix for later usage} \item{return_fit}{logical, whether to return fitted values} \item{try_cholesky}{logical, whether to try using a cholesky decomposition to solve LS instead of a QR decomposition} \item{iv_stage}{list of length two, the first element denotes the stage of 2SLS IV estimation, where 0 is used for OLS. The second element is only used for the second stage of 2SLS and has the first stage design matrix. For OLS, the default, \code{list(0)}, for the first stage of 2SLS \code{list(1)}, for second stage of 2SLS \code{list(2, first_stage_design_mat)}.} } \description{ Internal method that creates linear fits } estimatr/man/reexports.Rd0000644000176200001440000000071014747205231015202 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/S3_glance.R, R/S3_tidy.R \docType{import} \name{reexports} \alias{reexports} \alias{glance} \alias{tidy} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{generics}{\code{\link[generics]{glance}}, \code{\link[generics]{tidy}}} }} estimatr/man/estimatr_glancers.Rd0000644000176200001440000000733214747205231016664 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/S3_glance.R \name{estimatr_glancers} \alias{estimatr_glancers} \alias{glance.lm_robust} \alias{glance.lh_robust} \alias{glance.iv_robust} \alias{glance.difference_in_means} \alias{glance.horvitz_thompson} \title{Glance at an estimatr object} \usage{ \method{glance}{lm_robust}(x, ...) \method{glance}{lh_robust}(x, ...) \method{glance}{iv_robust}(x, ...) \method{glance}{difference_in_means}(x, ...) \method{glance}{horvitz_thompson}(x, ...) } \arguments{ \item{x}{An object returned by one of the estimators} \item{...}{extra arguments (not used)} } \value{ For \code{glance.lm_robust}, a data.frame with columns: \item{r.squared}{the \eqn{R^2}, \deqn{R^2 = 1 - Sum(e[i]^2) / Sum((y[i] - y^*)^2),} where \eqn{y^*} is the mean of \eqn{y[i]} if there is an intercept and zero otherwise, and \eqn{e[i]} is the ith residual.} \item{adj.r.squared}{the \eqn{R^2} but penalized for having more parameters, \code{rank}} \item{se_type}{the standard error type specified by the user} \item{statistic}{the value of the F-statistic} \item{p.value}{p-value from the F test} \item{df.residual}{residual degrees of freedom} \item{nobs}{the number of observations used} For \code{glance.lh_robust}, we glance the \code{lm_robust} component only. You can access the linear hypotheses as a data.frame directy from the \code{lh} component of the \code{lh_robust} object For \code{glance.iv_robust}, a data.frame with columns: \item{r.squared}{The \eqn{R^2} of the second stage regression} \item{adj.r.squared}{The \eqn{R^2} but penalized for having more parameters, \code{rank}} \item{df.residual}{residual degrees of freedom} \item{N}{the number of observations used} \item{se_type}{the standard error type specified by the user} \item{statistic}{the value of the F-statistic} \item{p.value}{p-value from the F test} \item{statistic.weakinst}{the value of the first stage F-statistic, useful for the weak instruments test; only reported if there is only one endogenous variable} \item{p.value.weakinst}{p-value from the first-stage F test, a test of weak instruments; only reported if there is only one endogenous variable} \item{statistic.endogeneity}{the value of the F-statistic for the test of endogeneity; often called the Wu-Hausman statistic, with robust standard errors, we employ the regression based test} \item{p.value.endogeneity}{p-value from the F-test for endogeneity} \item{statistic.overid}{the value of the chi-squared statistic for the test of instrument correlation with the error term; only reported with overidentification} \item{p.value.overid}{p-value from the chi-squared test; only reported with overidentification} For \code{glance.difference_in_means}, a data.frame with columns: \item{design}{the design used, and therefore the estimator used} \item{df}{the degrees of freedom} \item{nobs}{the number of observations used} \item{nblocks}{the number of blocks, if used} \item{nclusters}{the number of clusters, if used} \item{condition2}{the second, "treatment", condition} \item{condition1}{the first, "control", condition} For \code{glance.horvitz_thompson}, a data.frame with columns: \item{nobs}{the number of observations used} \item{se_type}{the type of standard error estimator used} \item{condition2}{the second, "treatment", condition} \item{condition1}{the first, "control", condition} } \description{ Glance at an estimatr object } \seealso{ \code{\link[generics:glance]{generics::glance()}}, \code{\link[=lm_robust]{lm_robust()}}, \code{\link[=lm_lin]{lm_lin()}}, \code{\link[=iv_robust]{iv_robust()}}, \code{\link[=difference_in_means]{difference_in_means()}}, \code{\link[=horvitz_thompson]{horvitz_thompson()}} } \concept{estimatr glancers} estimatr/man/difference_in_means.Rd0000644000176200001440000002026114747205231017115 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_difference_in_means.R \name{difference_in_means} \alias{difference_in_means} \title{Design-based difference-in-means estimator} \usage{ difference_in_means( formula, data, blocks, clusters, weights, subset, se_type = c("default", "none"), condition1 = NULL, condition2 = NULL, ci = TRUE, alpha = 0.05 ) } \arguments{ \item{formula}{an object of class formula, as in \code{\link{lm}}, such as \code{Y ~ Z} with only one variable on the right-hand side, the treatment.} \item{data}{A \code{data.frame}.} \item{blocks}{An optional bare (unquoted) name of the block variable. Use for blocked designs only.} \item{clusters}{An optional bare (unquoted) name of the variable that corresponds to the clusters in the data; used for cluster randomized designs. For blocked designs, clusters must nest within blocks.} \item{weights}{the bare (unquoted) names of the weights variable in the supplied data.} \item{subset}{An optional bare (unquoted) expression specifying a subset of observations to be used.} \item{se_type}{An optional string that can be one of \code{c("default", "none")}. If "default" (the default), it will use the default standard error estimator for the design, and if "none" then standard errors will not be computed which may speed up run time if only the point estimate is required.} \item{condition1}{value in the treatment vector of the condition to be the control. Effects are estimated with \code{condition1} as the control and \code{condition2} as the treatment. If unspecified, \code{condition1} is the "first" condition and \code{condition2} is the "second" according to levels if the treatment is a factor or according to a sortif it is a numeric or character variable (i.e if unspecified and the treatment is 0s and 1s, \code{condition1} will by default be 0 and \code{condition2} will be 1). See the examples for more.} \item{condition2}{value in the treatment vector of the condition to be the treatment. See \code{condition1}.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} } \value{ Returns an object of class \code{"difference_in_means"}. The post-estimation commands functions \code{summary} and \code{\link{tidy}} return results in a \code{data.frame}. To get useful data out of the return, you can use these data frames, you can use the resulting list directly, or you can use the generic accessor functions \code{coef} and \code{confint}. An object of class \code{"difference_in_means"} is a list containing at least the following components: \item{coefficients}{the estimated difference in means} \item{std.error}{the estimated standard error} \item{statistic}{the t-statistic} \item{df}{the estimated degrees of freedom} \item{p.value}{the p-value from a two-sided t-test using \code{coefficients}, \code{std.error}, and \code{df}} \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} \item{term}{a character vector of coefficient names} \item{alpha}{the significance level specified by the user} \item{N}{the number of observations used} \item{outcome}{the name of the outcome variable} \item{design}{the name of the design learned from the arguments passed} } \description{ Difference-in-means estimators that selects the appropriate point estimate, standard errors, and degrees of freedom for a variety of designs: unit randomized, cluster randomized, block randomized, block-cluster randomized, matched-pairs, and matched-pair cluster randomized designs } \details{ This function implements a difference-in-means estimator, with support for blocked, clustered, matched-pairs, block-clustered, and matched-pair clustered designs. One specifies their design by passing the blocks and clusters in their data and this function chooses which estimator is most appropriate. If you pass only \code{blocks}, if all blocks are of size two, we will infer that the design is a matched-pairs design. If they are all size four or larger, we will infer that it is a regular blocked design. If you pass both \code{blocks} and \code{clusters}, we will similarly infer whether it is a matched-pairs clustered design or a block-clustered design the number of clusters per block. If the user passes only \code{clusters}, we will infer that the design was cluster-randomized. If the user specifies neither the \code{blocks} nor the \code{clusters}, a regular Welch's t-test will be performed. Importantly, if the user specifies weights, the estimation is handed off to \code{\link{lm_robust}} with the appropriate robust standard errors as weighted difference-in-means estimators are not implemented here. More details of the about each of the estimators can be found in the \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes}. } \examples{ library(fabricatr) library(randomizr) # Get appropriate standard errors for unit-randomized designs # ---------- # 1. Unit randomized # ---------- dat <- fabricate( N = 100, Y = rnorm(100), Z_comp = complete_ra(N, prob = 0.4), ) table(dat$Z_comp) difference_in_means(Y ~ Z_comp, data = dat) # ---------- # 2. Cluster randomized # ---------- # Accurates estimates and standard errors for clustered designs dat$clust <- sample(20, size = nrow(dat), replace = TRUE) dat$Z_clust <- cluster_ra(dat$clust, prob = 0.6) table(dat$Z_clust, dat$clust) summary(difference_in_means(Y ~ Z_clust, clusters = clust, data = dat)) # ---------- # 3. Block randomized # ---------- dat$block <- rep(1:10, each = 10) dat$Z_block <- block_ra(dat$block, prob = 0.5) table(dat$Z_block, dat$block) difference_in_means(Y ~ Z_block, blocks = block, data = dat) # ---------- # 4. Block cluster randomized # ---------- # Learns this design if there are two clusters per block dat$small_clust <- rep(1:50, each = 2) dat$big_blocks <- rep(1:5, each = 10) dat$Z_blcl <- block_and_cluster_ra( blocks = dat$big_blocks, clusters = dat$small_clust ) difference_in_means( Y ~ Z_blcl, blocks = big_blocks, clusters = small_clust, data = dat ) # ---------- # 5. Matched-pairs # ---------- # Matched-pair estimates and standard errors are also accurate # Specified same as blocked design, function learns that # it is matched pair from size of blocks! dat$pairs <- rep(1:50, each = 2) dat$Z_pairs <- block_ra(dat$pairs, prob = 0.5) table(dat$pairs, dat$Z_pairs) difference_in_means(Y ~ Z_pairs, blocks = pairs, data = dat) # ---------- # 6. Matched-pair cluster randomized # ---------- # Learns this design if there are two clusters per block dat$small_clust <- rep(1:50, each = 2) dat$cluster_pairs <- rep(1:25, each = 4) table(dat$cluster_pairs, dat$small_clust) dat$Z_mpcl <- block_and_cluster_ra( blocks = dat$cluster_pairs, clusters = dat$small_clust ) difference_in_means( Y ~ Z_mpcl, blocks = cluster_pairs, clusters = small_clust, data = dat ) # ---------- # Other examples # ---------- # Also works with multi-valued treatments if users specify # comparison of interest dat$Z_multi <- simple_ra( nrow(dat), conditions = c("Treatment 2", "Treatment 1", "Control"), prob_each = c(0.4, 0.4, 0.2) ) # Only need to specify which condition is treated `condition2` and # which is control `condition1` difference_in_means( Y ~ Z_multi, condition1 = "Treatment 2", condition2 = "Control", data = dat ) difference_in_means( Y ~ Z_multi, condition1 = "Treatment 1", condition2 = "Control", data = dat ) # Specifying weights will result in estimation via lm_robust() dat$w <- runif(nrow(dat)) difference_in_means(Y ~ Z_comp, weights = w, data = dat) lm_robust(Y ~ Z_comp, weights = w, data = dat) } \references{ Gerber, Alan S, and Donald P Green. 2012. Field Experiments: Design, Analysis, and Interpretation. New York: W.W. Norton. Imai, Kosuke, Gary King, Clayton Nall. 2009. "The Essential Role of Pair Matching in Cluster-Randomized Experiments, with Application to the Mexican Universal Health Insurance Evaluation." Statistical Science 24 (1). Institute of Mathematical Statistics: 29-53. \doi{10.1214/08-STS274}. } \seealso{ \code{\link{lm_lin}} } estimatr/man/predict.lm_robust.Rd0000644000176200001440000000746614760370122016622 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/S3_predict.R \name{predict.lm_robust} \alias{predict.lm_robust} \title{Predict method for \code{lm_robust} object} \usage{ \method{predict}{lm_robust}( object, newdata, se.fit = FALSE, interval = c("none", "confidence", "prediction"), alpha = 0.05, na.action = na.pass, pred.var = NULL, weights, ... ) } \arguments{ \item{object}{an object of class 'lm_robust'} \item{newdata}{a data frame in which to look for variables with which to predict} \item{se.fit}{logical. Whether standard errors are required, default = FALSE} \item{interval}{type of interval calculation. Can be abbreviated, default = none} \item{alpha}{numeric denoting the test size for confidence intervals} \item{na.action}{function determining what should be done with missing values in newdata. The default is to predict NA.} \item{pred.var}{the variance(s) for future observations to be assumed for prediction intervals.} \item{weights}{variance weights for prediction. This can be a numeric vector or a bare (unquoted) name of the weights variable in the supplied newdata.} \item{...}{other arguments, unused} } \description{ Predict method for \code{lm_robust} object } \details{ Produces predicted values, obtained by evaluating the regression function in the frame \code{newdata} for fits from \code{lm_robust} and \code{lm_lin}. If the logical se.fit is TRUE, standard errors of the predictions are calculated. Setting intervals specifies computation of confidence or prediction (tolerance) intervals at the specified level, sometimes referred to as narrow vs. wide intervals. The equation used for the standard error of a prediction given a row of data \eqn{x} is: \eqn{\sqrt(x \Sigma x')}, where \eqn{\Sigma} is the estimated variance-covariance matrix from \code{lm_robust}. The prediction intervals are for a single observation at each case in \code{newdata} with error variance(s) \code{pred.var}. The the default is to assume that future observations have the same error variance as those used for fitting, which is gotten from the fit \code{\link{lm_robust}} object. If weights is supplied, the inverse of this is used as a scale factor. If the fit was weighted, the default is to assume constant prediction variance, with a warning. } \examples{ # Set seed set.seed(42) # Simulate data n <- 10 dat <- data.frame(y = rnorm(n), x = rnorm(n)) # Fit lm lm_out <- lm_robust(y ~ x, data = dat) # Get predicted fits fits <- predict(lm_out, newdata = dat) # With standard errors and confidence intervals fits <- predict(lm_out, newdata = dat, se.fit = TRUE, interval = "confidence") # Use new data as well new_dat <- data.frame(x = runif(n, 5, 8)) predict(lm_out, newdata = new_dat) # You can also supply custom variance weights for prediction intervals new_dat$w <- runif(n) predict(lm_out, newdata = new_dat, weights = w, interval = "prediction") # Works for 'lm_lin' models as well dat$z <- sample(1:3, size = nrow(dat), replace = TRUE) lmlin_out1 <- lm_lin(y ~ z, covariates = ~ x, data = dat) predict(lmlin_out1, newdata = dat, interval = "prediction") # Predictions from Lin models are equivalent with and without an intercept # and for multi-level treatments entered as numeric or factor variables lmlin_out2 <- lm_lin(y ~ z - 1, covariates = ~ x, data = dat) lmlin_out3 <- lm_lin(y ~ factor(z), covariates = ~ x, data = dat) lmlin_out4 <- lm_lin(y ~ factor(z) - 1, covariates = ~ x, data = dat) predict(lmlin_out2, newdata = dat, interval = "prediction") predict(lmlin_out3, newdata = dat, interval = "prediction") predict(lmlin_out4, newdata = dat, interval = "prediction") # In Lin models, predict will stop with an error message if new # treatment levels are supplied in the new data new_dat$z <- sample(0:3, size = nrow(new_dat), replace = TRUE) # predict(lmlin_out, newdata = new_dat) } estimatr/man/declaration_to_condition_pr_mat.Rd0000644000176200001440000001062514747205231021554 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_condition_pr_matrix.R \name{declaration_to_condition_pr_mat} \alias{declaration_to_condition_pr_mat} \title{Builds condition probability matrices for Horvitz-Thompson estimation from \pkg{randomizr} declaration} \usage{ declaration_to_condition_pr_mat( ra_declaration, condition1 = NULL, condition2 = NULL, prob_matrix = NULL ) } \arguments{ \item{ra_declaration}{An object of class \code{"ra_declaration"}, generated by the \code{\link[randomizr]{declare_ra}} function in \pkg{randomizr}. This object contains the experimental design that will be represented in a condition probability matrix} \item{condition1}{The name of the first condition, often the control group. If \code{NULL}, defaults to first condition in randomizr declaration. Either both \code{condition1} and \code{condition2} have to be specified or both left as \code{NULL}.} \item{condition2}{The name of the second condition, often the treatment group. If \code{NULL}, defaults to second condition in randomizr declaration. Either both \code{condition1} and \code{condition2} have to be specified or both left as \code{NULL}.} \item{prob_matrix}{An optional probability matrix to override the one in \code{ra_declaration}} } \value{ a numeric 2n*2n matrix of marginal and joint condition treatment probabilities to be passed to the \code{condition_pr_mat} argument of \code{\link{horvitz_thompson}}. See details. } \description{ Builds condition probability matrices for Horvitz-Thompson estimation from \pkg{randomizr} declaration } \details{ This function takes a \code{"ra_declaration"}, generated by the \code{\link[randomizr]{declare_ra}} function in \pkg{randomizr} and returns a 2n*2n matrix that can be used to fully specify the design for \code{\link{horvitz_thompson}} estimation. This is done by passing this matrix to the \code{condition_pr_mat} argument of \code{\link{horvitz_thompson}}. Currently, this function can learn the condition probability matrix for a wide variety of randomizations: simple, complete, simple clustered, complete clustered, blocked, block-clustered. A condition probability matrix is made up of four submatrices, each of which corresponds to the joint and marginal probability that each observation is in one of the two treatment conditions. The upper-left quadrant is an n*n matrix. On the diagonal is the marginal probability of being in condition 1, often control, for every unit (Pr(Z_i = Condition1) where Z represents the vector of treatment conditions). The off-diagonal elements are the joint probabilities of each unit being in condition 1 with each other unit, Pr(Z_i = Condition1, Z_j = Condition1) where i indexes the rows and j indexes the columns. The upper-right quadrant is also an n*n matrix. On the diagonal is the joint probability of a unit being in condition 1 and condition 2, often the treatment, and thus is always 0. The off-diagonal elements are the joint probability of unit i being in condition 1 and unit j being in condition 2, Pr(Z_i = Condition1, Z_j = Condition2). The lower-left quadrant is also an n*n matrix. On the diagonal is the joint probability of a unit being in condition 1 and condition 2, and thus is always 0. The off-diagonal elements are the joint probability of unit i being in condition 2 and unit j being in condition 1, Pr(Z_i = Condition2, Z_j = Condition1). The lower-right quadrant is an n*n matrix. On the diagonal is the marginal probability of being in condition 2, often treatment, for every unit (Pr(Z_i = Condition2)). The off-diagonal elements are the joint probability of each unit being in condition 2 together, Pr(Z_i = Condition2, Z_j = Condition2). } \examples{ # Learn condition probability matrix from complete blocked design library(randomizr) n <- 100 dat <- data.frame( blocks = sample(letters[1:10], size = n, replace = TRUE), y = rnorm(n) ) # Declare complete blocked randomization bl_declaration <- declare_ra(blocks = dat$blocks, prob = 0.4, simple = FALSE) # Get probabilities block_pr_mat <- declaration_to_condition_pr_mat(bl_declaration, 0, 1) # Do randomiztion dat$z <- conduct_ra(bl_declaration) horvitz_thompson(y ~ z, data = dat, condition_pr_mat = block_pr_mat) # When you pass a declaration to horvitz_thompson, this function is called # Equivalent to above call horvitz_thompson(y ~ z, data = dat, ra_declaration = bl_declaration) } \seealso{ \code{\link{permutations_to_condition_pr_mat}} } estimatr/man/extract.lm_robust.Rd0000644000176200001440000000302614747205231016631 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_extract.R \name{extract.robust_default} \alias{extract.robust_default} \alias{extract.lm_robust} \alias{extract.iv_robust} \title{Extract model data for \pkg{texreg} package} \usage{ extract.robust_default( model, include.ci = TRUE, include.rsquared = TRUE, include.adjrs = TRUE, include.nobs = TRUE, include.fstatistic = FALSE, include.rmse = TRUE, include.nclusts = TRUE, ... ) extract.lm_robust( model, include.ci = TRUE, include.rsquared = TRUE, include.adjrs = TRUE, include.nobs = TRUE, include.fstatistic = FALSE, include.rmse = TRUE, include.nclusts = TRUE, ... ) extract.iv_robust( model, include.ci = TRUE, include.rsquared = TRUE, include.adjrs = TRUE, include.nobs = TRUE, include.fstatistic = FALSE, include.rmse = TRUE, include.nclusts = TRUE, ... ) } \arguments{ \item{model}{an object of class \code{\link{lm_robust}} or \code{"iv_robust"}} \item{include.ci}{logical. Defaults to TRUE} \item{include.rsquared}{logical. Defaults to TRUE} \item{include.adjrs}{logical. Defaults to TRUE} \item{include.nobs}{logical. Defaults to TRUE} \item{include.fstatistic}{logical. Defaults to TRUE} \item{include.rmse}{logical. Defaults to TRUE} \item{include.nclusts}{logical. Defaults to TRUE if clusters in \code{model}} \item{...}{unused} } \description{ Prepares a \code{"lm_robust"} or \code{"iv_robust"} object for the \pkg{texreg} package. This is largely a clone of the \code{extract.lm} method. } estimatr/man/commarobust.Rd0000644000176200001440000000221114747205231015500 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/helper_starprep.R \name{commarobust} \alias{commarobust} \title{Build lm_robust object from lm fit} \usage{ commarobust(model, se_type = NULL, clusters = NULL, ci = TRUE, alpha = 0.05) } \arguments{ \item{model}{an lm model object} \item{se_type}{The sort of standard error sought. If \code{clusters} is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or "classical". If \code{clusters} is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients.} \item{clusters}{A vector corresponding to the clusters in the data.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} } \value{ an \code{\link{lm_robust}} object. } \description{ Build lm_robust object from lm fit } \examples{ lmo <- lm(mpg ~ hp, data = mtcars) # Default HC2 commarobust(lmo) commarobust(lmo, se_type = "HC3") commarobust(lmo, se_type = "stata", clusters = mtcars$carb) } estimatr/man/horvitz_thompson.Rd0000644000176200001440000002616114750715463016622 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/estimatr_horvitz_thompson.R \name{horvitz_thompson} \alias{horvitz_thompson} \title{Horvitz-Thompson estimator for two-armed trials} \usage{ horvitz_thompson( formula, data, blocks, clusters, simple = NULL, condition_prs, condition_pr_mat = NULL, ra_declaration = NULL, subset, condition1 = NULL, condition2 = NULL, se_type = c("youngs", "constant", "none"), ci = TRUE, alpha = 0.05, return_condition_pr_mat = FALSE ) } \arguments{ \item{formula}{an object of class formula, as in \code{\link{lm}}, such as \code{Y ~ Z} with only one variable on the right-hand side, the treatment.} \item{data}{A data.frame.} \item{blocks}{An optional bare (unquoted) name of the block variable. Use for blocked designs only. See details.} \item{clusters}{An optional bare (unquoted) name of the variable that corresponds to the clusters in the data; used for cluster randomized designs. For blocked designs, clusters must be within blocks.} \item{simple}{logical, optional. Whether the randomization is simple (TRUE) or complete (FALSE). This is ignored if \code{blocks} are specified, as all blocked designs use complete randomization, or either \code{ra_declaration} or \code{condition_pr_mat} are passed. Otherwise, it defaults to \code{TRUE}.} \item{condition_prs}{An optional bare (unquoted) name of the variable with the condition 2 (treatment) probabilities. See details. May also use a single number for the condition 2 probability if it is constant.} \item{condition_pr_mat}{An optional 2n * 2n matrix of marginal and joint probabilities of all units in condition1 and condition2. See details.} \item{ra_declaration}{An object of class \code{"ra_declaration"}, from the \code{\link[randomizr]{declare_ra}} function in the \pkg{randomizr} package. This is the third way that one can specify a design for this estimator. Cannot be used along with any of \code{condition_prs}, \code{blocks}, \code{clusters}, or \code{condition_pr_mat}. See details.} \item{subset}{An optional bare (unquoted) expression specifying a subset of observations to be used.} \item{condition1}{value in the treatment vector of the condition to be the control. Effects are estimated with \code{condition1} as the control and \code{condition2} as the treatment. If unspecified, \code{condition1} is the "first" condition and \code{condition2} is the "second" according to levels if the treatment is a factor or according to a sortif it is a numeric or character variable (i.e if unspecified and the treatment is 0s and 1s, \code{condition1} will by default be 0 and \code{condition2} will be 1). See the examples for more.} \item{condition2}{value in the treatment vector of the condition to be the treatment. See \code{condition1}.} \item{se_type}{can be one of \code{c("youngs", "constant", "none")} and corresponds the estimator of the standard errors. Default estimator uses Young's inequality (and is conservative) while the other uses a constant treatment effects assumption and only works for simple randomized designs at the moment. If "none" then standard errors will not be computed which may speed up run time if only the point estimate is required.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} \item{alpha}{The significance level, 0.05 by default.} \item{return_condition_pr_mat}{logical. Whether to return the condition probability matrix. Returns NULL if the design is simple randomization, FALSE by default.} } \value{ Returns an object of class \code{"horvitz_thompson"}. The post-estimation commands functions \code{summary} and \code{\link{tidy}} return results in a \code{data.frame}. To get useful data out of the return, you can use these data frames, you can use the resulting list directly, or you can use the generic accessor functions \code{coef} and \code{confint}. An object of class \code{"horvitz_thompson"} is a list containing at least the following components: \item{coefficients}{the estimated difference in totals} \item{std.error}{the estimated standard error} \item{statistic}{the z-statistic} \item{df}{the estimated degrees of freedom} \item{p.value}{the p-value from a two-sided z-test using \code{coefficients} and \code{std.error}} \item{conf.low}{the lower bound of the \code{1 - alpha} percent confidence interval} \item{conf.high}{the upper bound of the \code{1 - alpha} percent confidence interval} \item{term}{a character vector of coefficient names} \item{alpha}{the significance level specified by the user} \item{nobs}{the number of observations used} \item{outcome}{the name of the outcome variable} \item{condition_pr_mat}{the condition probability matrix if \code{return_condition_pr_mat} is TRUE} } \description{ Horvitz-Thompson estimators that are unbiased for designs in which the randomization scheme is known } \details{ This function implements the Horvitz-Thompson estimator for treatment effects for two-armed trials. This estimator is useful for estimating unbiased treatment effects given any randomization scheme as long as the randomization scheme is known. In short, the Horvitz-Thompson estimator essentially reweights each unit by the probability of it being in its observed condition. Pivotal to the estimation of treatment effects using this estimator are the marginal condition probabilities (i.e., the probability that any one unit is in a particular treatment condition). Pivotal to estimating the variance whenever the design is more complicated than simple randomization are the joint condition probabilities (i.e., the probabilities that any two units have a particular set of treatment conditions, either the same or different). The estimator we provide here considers the case with two treatment conditions. Users interested in more details can see the \href{https://declaredesign.org/r/estimatr/articles/mathematical-notes.html}{mathematical notes} for more information and references, or see the references below. There are three distinct ways that users can specify the design to the function. The preferred way is to use the \code{\link[randomizr]{declare_ra}} function in the \pkg{randomizr} package. This function takes several arguments, including blocks, clusters, treatment probabilities, whether randomization is simple or not, and more. Passing the outcome of that function, an object of class \code{"ra_declaration"} to the \code{ra_declaration} argument in this function, will lead to a call of the \code{\link{declaration_to_condition_pr_mat}} function which generates the condition probability matrix needed to estimate treatment effects and standard errors. We provide many examples below of how this could be done. The second way is to pass the names of vectors in your \code{data} to \code{condition_prs}, \code{blocks}, and \code{clusters}. You can further specify whether the randomization was simple or complete using the \code{simple} argument. Note that if \code{blocks} are specified the randomization is always treated as complete. From these vectors, the function learns how to build the condition probability matrix that is used in estimation. In the case where \code{condition_prs} is specified, this function assumes those probabilities are the marginal probability that each unit is in condition2 and then uses the other arguments (\code{blocks}, \code{clusters}, \code{simple}) to learn the rest of the design. If users do not pass \code{condition_prs}, this function learns the probability of being in condition2 from the data. That is, none of these arguments are specified, we assume that there was a simple randomization where the probability of each unit being in condition2 was the average of all units in condition2. Similarly, we learn the block-level probability of treatment within \code{blocks} by looking at the mean number of units in condition2 if \code{condition_prs} is not specified. The third way is to pass a \code{condition_pr_mat} directly. One can see more about this object in the documentation for \code{\link{declaration_to_condition_pr_mat}} and \code{\link{permutations_to_condition_pr_mat}}. Essentially, this 2n * 2n matrix allows users to specify marginal and joint marginal probabilities of units being in conditions 1 and 2 of arbitrary complexity. Users should only use this option if they are certain they know what they are doing. } \examples{ # Set seed set.seed(42) # Simulate data n <- 10 dat <- data.frame(y = rnorm(n)) library(randomizr) #---------- # 1. Simple random assignment #---------- dat$p <- 0.5 dat$z <- rbinom(n, size = 1, prob = dat$p) # If you only pass condition_prs, we assume simple random sampling horvitz_thompson(y ~ z, data = dat, condition_prs = p) # Assume constant effects instead horvitz_thompson(y ~ z, data = dat, condition_prs = p, se_type = "constant") # Also can use randomizr to pass a declaration srs_declaration <- declare_ra(N = nrow(dat), prob = 0.5, simple = TRUE) horvitz_thompson(y ~ z, data = dat, ra_declaration = srs_declaration) #---------- # 2. Complete random assignment #---------- dat$z <- sample(rep(0:1, each = n/2)) # Can use a declaration crs_declaration <- declare_ra(N = nrow(dat), prob = 0.5, simple = FALSE) horvitz_thompson(y ~ z, data = dat, ra_declaration = crs_declaration) # Can precompute condition_pr_mat and pass it # (faster for multiple runs with same condition probability matrix) crs_pr_mat <- declaration_to_condition_pr_mat(crs_declaration) horvitz_thompson(y ~ z, data = dat, condition_pr_mat = crs_pr_mat) #---------- # 3. Clustered treatment, complete random assigment #----------- # Simulating data dat$cl <- rep(1:4, times = c(2, 2, 3, 3)) dat$prob <- 0.5 clust_crs_decl <- declare_ra(N = nrow(dat), clusters = dat$cl, prob = 0.5) dat$z <- conduct_ra(clust_crs_decl) # Easiest to specify using declaration ht_cl <- horvitz_thompson(y ~ z, data = dat, ra_declaration = clust_crs_decl) # Also can pass the condition probability and the clusters ht_cl_manual <- horvitz_thompson( y ~ z, data = dat, clusters = cl, condition_prs = prob, simple = FALSE ) ht_cl ht_cl_manual # Blocked estimators specified similarly #---------- # More complicated assignment #---------- # arbitrary permutation matrix possible_treats <- cbind( c(1, 1, 0, 1, 0, 0, 0, 1, 1, 0), c(0, 1, 1, 0, 1, 1, 0, 1, 0, 1), c(1, 0, 1, 1, 1, 1, 1, 0, 0, 0) ) arb_pr_mat <- permutations_to_condition_pr_mat(possible_treats) # Simulating a column to be realized treatment dat$z <- possible_treats[, sample(ncol(possible_treats), size = 1)] horvitz_thompson(y ~ z, data = dat, condition_pr_mat = arb_pr_mat) } \references{ Aronow, Peter M, and Joel A Middleton. 2013. "A Class of Unbiased Estimators of the Average Treatment Effect in Randomized Experiments." Journal of Causal Inference 1 (1): 135-54. \doi{10.1515/jci-2012-0009}. Aronow, Peter M, and Cyrus Samii. 2017. "Estimating Average Causal Effects Under Interference Between Units." Annals of Applied Statistics, forthcoming. \url{https://arxiv.org/abs/1305.6156v3}. Middleton, Joel A, and Peter M Aronow. 2015. "Unbiased Estimation of the Average Treatment Effect in Cluster-Randomized Experiments." Statistics, Politics and Policy 6 (1-2): 39-75. \doi{10.1515/spp-2013-0002}. } \seealso{ \code{\link[randomizr]{declare_ra}} } estimatr/DESCRIPTION0000644000176200001440000000446614760407272013633 0ustar liggesusersPackage: estimatr Type: Package Title: Fast Estimators for Design-Based Inference Version: 1.0.6 Authors@R: c(person("Graeme", "Blair", email = "graeme.blair@gmail.com", role = c("aut", "cre")), person("Jasper", "Cooper", email = "jjc2247@columbia.edu", role = c("aut")), person("Alexander", "Coppock", email = "alex.coppock@yale.edu", role = c("aut")), person("Macartan", "Humphreys", email = "macartan@gmail.com", role = c("aut")), person("Luke", "Sonnet", email = "luke.sonnet@gmail.com", role = c("aut")), person("Neal", "Fultz", email = "nfultz@gmail.com", role = c("ctb")), person("Lily", "Medina", email = "lilymiru@gmail.com", role = c("ctb")), person("Russell", "Lenth", email = "russell-lenth@uiowa.edu", role = c("ctb")), person("Molly", "Offer-Westort", email = "mollyow@uchicago.edu", role = c("ctb"))) Description: Fast procedures for small set of commonly-used, design-appropriate estimators with robust standard errors and confidence intervals. Includes estimators for linear regression, instrumental variables regression, difference-in-means, Horvitz-Thompson estimation, and regression improving precision of experimental estimates by interacting treatment with centered pre-treatment covariates introduced by Lin (2013) . URL: https://declaredesign.org/r/estimatr/, https://github.com/DeclareDesign/estimatr BugReports: https://github.com/DeclareDesign/estimatr/issues License: MIT + file LICENSE Depends: R (>= 3.6.0) Imports: Formula, generics, methods, Rcpp (>= 0.12.16), rlang (>= 0.2.0) LinkingTo: Rcpp, RcppEigen Encoding: UTF-8 RoxygenNote: 7.3.2 LazyData: true Suggests: fabricatr (>= 0.10.0), randomizr (>= 0.20.0), AER, clubSandwich, emmeans (>= 1.4), estimability, margins, modelsummary, prediction, RcppEigen, sandwich, stargazer, testthat, car Enhances: texreg NeedsCompilation: yes Packaged: 2025-02-28 18:37:05 UTC; gblair Author: Graeme Blair [aut, cre], Jasper Cooper [aut], Alexander Coppock [aut], Macartan Humphreys [aut], Luke Sonnet [aut], Neal Fultz [ctb], Lily Medina [ctb], Russell Lenth [ctb], Molly Offer-Westort [ctb] Maintainer: Graeme Blair Repository: CRAN Date/Publication: 2025-02-28 19:30:02 UTC