sjstats/ 0000755 0001762 0000144 00000000000 15022765352 011754 5 ustar ligges users sjstats/tests/ 0000755 0001762 0000144 00000000000 14620351262 013110 5 ustar ligges users sjstats/tests/testthat/ 0000755 0001762 0000144 00000000000 15022765352 014756 5 ustar ligges users sjstats/tests/testthat/test-t_test.R 0000644 0001762 0000144 00000002733 14623373000 017354 0 ustar ligges users skip_if_not_installed("datawizard")
skip_if_not_installed("effectsize")
test_that("t_test", {
data(efc)
set.seed(123)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
expect_snapshot(t_test(efc, "e17age"))
expect_snapshot(t_test(efc, "e17age", "e16sex"))
expect_snapshot(t_test(efc, c("e17age", "c160age")))
expect_snapshot(t_test(efc, c("e17age", "c160age"), paired = TRUE))
expect_snapshot(t_test(efc, "e17age", weights = "weight"))
expect_snapshot(t_test(efc, "e17age", "e16sex", weights = "weight"))
expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight"))
expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE))
out1 <- t_test(efc, "e17age")
out2 <- t.test(efc$e17age ~ 1)
expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$effect_size, 9.774916, tolerance = 1e-4, ignore_attr = TRUE)
out1 <- t_test(efc, "e17age", "e16sex")
out2 <- t.test(efc$e17age ~ efc$e16sex)
expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$effect_size, -0.5641989, tolerance = 1e-4, ignore_attr = TRUE)
efc$e17age <- as.factor(efc$e17age)
expect_error(t_test(efc, "e17age", "c161sex"), regex = "Variable provided in `select` must be numeric")
})
sjstats/tests/testthat/test-wilcoxon_test.R 0000644 0001762 0000144 00000001574 14620351262 020760 0 ustar ligges users skip_if_not_installed("survey")
skip_if_not_installed("datawizard")
skip_if_not_installed("coin")
test_that("wilcoxon_test", {
data(mtcars)
out1 <- wilcoxon_test(mtcars, "mpg")
out2 <- wilcox.test(mtcars$mpg ~ 1, exact = FALSE)
expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
out1 <- wilcoxon_test(mtcars, c("mpg", "hp"))
out2 <- wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE)
expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
data(iris)
d <- iris[iris$Species != "setosa", ]
out <- wilcoxon_test(d, "Sepal.Width", by = "Species")
expect_snapshot(print(out))
})
sjstats/tests/testthat/test-kruskal_wallis_test.R 0000644 0001762 0000144 00000002656 14620351262 022147 0 ustar ligges users skip_if_not_installed("survey")
skip_if_not_installed("datawizard")
test_that("kruskal_wallis_test", {
data(efc)
set.seed(123)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code")
out2 <- kruskal.test(e17age ~ c172code, data = efc)
expect_equal(out1$Chi2, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
set.seed(123)
wide_data <- data.frame(
scale1 = runif(20),
scale2 = runif(20),
scale3 = runif(20)
)
long_data <- data.frame(
scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3),
groups = as.factor(rep(c("A", "B", "C"), each = 20)),
stringsAsFactors = FALSE
)
out1 <- kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3"))
out2 <- kruskal_wallis_test(long_data, select = "scales", by = "groups")
out3 <- kruskal.test(scales ~ groups, data = long_data)
expect_equal(out1$Chi2, out2$Chi2, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$Chi2, out3$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out3$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code", weights = "weight")
})
sjstats/tests/testthat/test-autoprior.R 0000644 0001762 0000144 00000001771 14620351262 020102 0 ustar ligges users .runThisTest <- Sys.getenv("RunAllsjstatsTests") == "yes"
if (.runThisTest) {
if (suppressWarnings(
require("testthat") &&
require("sjstats") &&
require("brms")
)) {
context("sjstats, autoprior")
data(efc)
efc$c172code <- as.factor(efc$c172code)
efc$c161sex <- to_label(efc$c161sex)
efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
test_that("auto_prior", {
mf <- formula(neg_c_7 ~ c161sex + c160age + c172code)
expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior")
mf <- formula(neg_c_7 ~ c161sex + c160age + c172code + c12hour + e17age)
expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior")
expect_error(auto_prior(mf, efc))
mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age)
expect_s3_class(auto_prior(mf, efc, FALSE), "brmsprior")
expect_s3_class(auto_prior(mf, efc), "brmsprior")
expect_warning(auto_prior(mf, efc, TRUE))
})
}
}
sjstats/tests/testthat/test-mann_whitney_test.R 0000644 0001762 0000144 00000003644 14620351262 021616 0 ustar ligges users skip_if_not_installed("coin")
skip_if_not_installed("survey")
skip_if_not_installed("datawizard")
test_that("mann_whitney_test", {
data(efc)
set.seed(123)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
out1 <- mann_whitney_test(efc, "e17age", by = "e16sex")
out2 <- wilcox.test(e17age ~ e16sex, data = efc)
expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$estimate, -151, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$r, 0.2571254, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
set.seed(123)
wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20))
out1 <- mann_whitney_test(wide_data, select = c("scale1", "scale2"))
out2 <- wilcox.test(wide_data$scale1, wide_data$scale2)
expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$r, 0.05132394, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
out <- mann_whitney_test(efc, "e17age", by = "e16sex", weights = "weight")
expect_equal(out$p, 1.976729e-14, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out$estimate, 0.1594972, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out$r, 0.2599877, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out))
})
test_that("mann_whitney_test, sanity checks", {
data(efc)
expect_error(mann_whitney_test(efc, "e17age", by = "c172code"), regex = "Only two groups are")
expect_error(mann_whitney_test(efc, c("e17age", "c172code", "e16sex")), regex = "You may only specify")
expect_error(mann_whitney_test(efc, c("e17age", "c172code"), by = "e17age"), regex = "If `select` specifies more")
expect_error(mann_whitney_test(efc, "e17age"), regex = "Only one variable provided")
})
sjstats/tests/testthat/_snaps/ 0000755 0001762 0000144 00000000000 14623373000 016230 5 ustar ligges users sjstats/tests/testthat/_snaps/t_test.md 0000644 0001762 0000144 00000005743 14620351262 020070 0 ustar ligges users # t_test
Code
t_test(efc, "e17age")
Output
# One Sample t-test
Data: e17age
Group 1: e17age (mean = 79.12)
Alternative hypothesis: true mean is not equal to 0
t = 291.78, Cohen's d = 9.77 (large effect), df = 890, p < .001
---
Code
t_test(efc, "e17age", "e16sex")
Output
# Welch Two Sample t-test
Data: e17age by e16sex
Group 1: 1 (n = 294, mean = 76.16)
Group 2: 2 (n = 596, mean = 80.57)
Alternative hypothesis: true difference in means is not equal to 0
t = -8.05, Cohen's d = -0.56 (medium effect), df = 610.8, p < .001
---
Code
t_test(efc, c("e17age", "c160age"))
Output
# Welch Two Sample t-test
Data: e17age by c160age
Group 1: c160age (n = 890, mean = 53.42)
Group 2: e17age (n = 890, mean = 79.12)
Alternative hypothesis: true difference in means is not equal to 0
t = -49.22, Cohen's d = -2.33 (large effect), df = 1468.1, p < .001
---
Code
t_test(efc, c("e17age", "c160age"), paired = TRUE)
Output
# Paired t-test
Data: e17age and c160age (mean difference = 25.70)
Alternative hypothesis: true mean is not equal to 0
t = 54.11, Cohen's d = 1.81 (large effect), df = 889, p < .001
---
Code
t_test(efc, "e17age", weights = "weight")
Output
# One Sample t-test (weighted)
Data: e17age
Group 1: e17age (n = 897, mean = 79.17)
Alternative hypothesis: true mean is not equal to 0
t = 291.31, Cohen's d = 3.17 (large effect), df = 890, p < .001
---
Code
t_test(efc, "e17age", "e16sex", weights = "weight")
Output
# Two-Sample t-test (weighted)
Data: e17age by e16sex
Group 1: 1 (n = 600, mean = 80.63)
Group 2: 2 (n = 296, mean = 76.19)
Alternative hypothesis: true difference in means is not equal to 0
t = 8.03, Cohen's d = -0.17 (very small effect), df = 604.5, p < .001
---
Code
t_test(efc, c("e17age", "c160age"), weights = "weight")
Output
# Two-Sample t-test (weighted)
Data: e17age by c160age
Group 1: c160age (n = 896, mean = 79.17)
Group 2: e17age (n = 896, mean = 53.40)
Alternative hypothesis: true difference in means is not equal to 0
t = 49.31, Cohen's d = -1.12 (large effect), df = 1470.0, p < .001
---
Code
t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE)
Output
# Paired t-test (weighted)
Data: e17age and c160age (mean difference = 25.77)
Alternative hypothesis: true mean difference is not equal to 0
t = 54.37, Cohen's d = 1.54 (large effect), df = 889, p < .001
sjstats/tests/testthat/_snaps/wilcoxon_test.md 0000644 0001762 0000144 00000001272 14620351262 021460 0 ustar ligges users # wilcoxon_test
Code
print(out1)
Output
# One Sample Wilcoxon signed rank test
Alternative hypothesis: true location shift is not equal to 0
V = 528, p < .001
---
Code
print(out1)
Output
# Paired Wilcoxon signed rank test
Alternative hypothesis: true location shift is not equal to 0
V = 0, r = 0.87, Z = -4.94, p < .001
---
Code
print(out)
Output
# Paired Wilcoxon signed rank test
Alternative hypothesis: true location shift is not equal to 0
V = 247, r = 0.39, Z = -2.76, p = 0.006
sjstats/tests/testthat/_snaps/chi_squared_test.md 0000644 0001762 0000144 00000002464 14623373000 022106 0 ustar ligges users # chi_squared_test
Code
print(out1)
Output
# Chi-squared test for contingency tables
Data: c161sex by e16sex (n = 900)
χ² = 2.233, ϕ = 0.053 (very small effect), df = 1, p = 0.135
---
Code
print(out)
Output
# Chi-squared test for contingency tables (weighted)
Data: c161sex by e16sex (n = 904)
χ² = 2.416, ϕ = 0.054 (very small effect), df = 1, p = 0.120
---
Code
print(out1)
Output
# Chi-squared test for given probabilities
Data: c161sex against probabilities 30% and 70% (n = 901)
χ² = 16.162, פ = 0.088 (very small effect), df = 1, p < .001
---
Code
print(out)
Output
# Chi-squared test for given probabilities (weighted)
Data: c161sex against probabilities 30% and 70% (n = 906)
χ² = 20.074, פ = 0.097 (very small effect), df = 1, p < .001
---
Code
print(out1)
Output
# Chi-squared test for contingency tables
(using McNemar's test for paired data)
Data: survey_1 by survey_2 (n = 1000)
χ² = 10.868, ϕ = 0.032 (tiny effect), df = 1, p < .001
sjstats/tests/testthat/_snaps/mann_whitney_test.md 0000644 0001762 0000144 00000001650 14620351262 022316 0 ustar ligges users # mann_whitney_test
Code
print(out1)
Output
# Mann-Whitney test
Group 1: male (n = 294, rank mean = 147.50)
Group 2: female (n = 596, rank mean = 298.50)
Alternative hypothesis: true location shift is not equal to 0
W = 59684 , r = 0.26, Z = -7.75, p < .001
---
Code
print(out1)
Output
# Mann-Whitney test
Group 1: scale1 (n = 20, rank mean = 10.50)
Group 2: scale2 (n = 20, rank mean = 10.50)
Alternative hypothesis: true location shift is not equal to 0
W = 188 , r = 0.05, Z = -0.32, p = 0.758
---
Code
print(out)
Output
# Mann-Whitney test (weighted)
Group 1: male (n = 296, rank mean = 147.58)
Group 2: female (n = 600, rank mean = 299.42)
r = 0.26, Z = 7.78, p < .001
sjstats/tests/testthat/_snaps/kruskal_wallis_test.md 0000644 0001762 0000144 00000000661 14620351262 022646 0 ustar ligges users # kruskal_wallis_test
Code
print(out1)
Output
# Kruskal-Wallis test
Data: e17age by c172code (3 groups, n = 506, 180 and 156)
χ² = 4.05, df = 2, p = 0.132
---
Code
print(out1)
Output
# Kruskal-Wallis test
Data: scale1 by scale2 (3 groups, n = 20, 20 and 20)
χ² = 4.86, df = 2, p = 0.088
sjstats/tests/testthat/test-wtd.R 0000644 0001762 0000144 00000000427 14620351262 016651 0 ustar ligges users test_that("wtd", {
data(efc)
set.seed(123)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
expect_equal(weighted_se(efc$c12hour, weights = efc$weight), 1.704182, tolerance = 1e-5)
expect_equal(weighted_se(efc$c12hour, weights = NULL), 1.691623, tolerance = 1e-5)
})
sjstats/tests/testthat/test-chi_squared_test.R 0000644 0001762 0000144 00000004100 14623373035 021376 0 ustar ligges users skip_if_not_installed("effectsize")
skip_if_not_installed("datawizard")
skip_if_not_installed("MASS")
test_that("chi_squared_test", {
data(efc)
set.seed(123)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
out1 <- chi_squared_test(efc, "c161sex", by = "e16sex")
out2 <- chisq.test(efc$c161sex, efc$e16sex)
expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
out <- chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")
expect_equal(out$statistic, 2.415755, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out$effect_size, 0.05448519, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out$p, 0.1201201, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out))
out1 <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
out2 <- chisq.test(table(efc$c161sex), p = c(0.3, 0.7))
expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
out <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7), weights = "weight")
expect_equal(out$statistic, 20.07379, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out$effect_size, 0.0974456, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out))
set.seed(1234)
d <- data.frame(
survey_1 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.45, 0.55)),
survey_2 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.42, 0.58))
)
out1 <- chi_squared_test(d, "survey_1", "survey_2", paired = TRUE)
out2 <- mcnemar.test(table(d))
expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE)
expect_equal(out1$effect_size, 0.03170437, tolerance = 1e-4, ignore_attr = TRUE)
expect_snapshot(print(out1))
})
sjstats/tests/testthat.R 0000644 0001762 0000144 00000000074 14620351262 015074 0 ustar ligges users library(testthat)
library(sjstats)
test_check("sjstats")
sjstats/MD5 0000644 0001762 0000144 00000011352 15022765352 012266 0 ustar ligges users c1aef7ed54f2637f188181a52473501b *DESCRIPTION
74455df2ddc77d6e8704d5268f9c6204 *NAMESPACE
2842f48f167c6406c8424f945c136b3b *NEWS.md
e10dd983c750c5e52dcbd672c628fbf2 *R/Deprecated.R
d9dba7573d21e93660ec5a360b45ce43 *R/S3-methods.R
292fb71d8ced0f748dd0f4e66f29925b *R/anova_stats.R
94f104911f33da856c41be988c125168 *R/auto_prior.R
afa0b3edfad717886d11f70b5a3b8dc2 *R/boot_ci.R
9e96fc4c88cf6dde542dad506fbd024c *R/bootstrap.R
dae151ca1921e12cdc50d34ec4aaba98 *R/chi_squared_test.R
bea37c3b504f502316ea0c0ea13ace6d *R/confint_ncg.R
fdc049d40c1128eda89e331c5c90d9ba *R/cramer.R
a4d0e395069f0b478afde3129cc335f1 *R/cv.R
d4bd61851d9cbf5c43ec4eb85307155c *R/cv_error.R
b826d04f20826fbf7e12669244ca6e1f *R/design_effect.R
1ad1908f342e4326610442ec1f87a733 *R/find_beta.R
9e41625c78d4f0fc616fda45af0ddb8c *R/gmd.R
999e7912c1aefa456570911f5d78fe6c *R/gof.R
0cc0afccd5e8618261a77a39fcaf48ab *R/helpfunctions.R
6a846d9f6d8e46234264fafb97ccb650 *R/inequ_trends.R
1fa8b40681d24aee9e3f61fe2f5c3280 *R/is_prime.R
67314769d13f100aa42347989682118c *R/kruskal_wallis_test.R
1d5d666a8a158249ded68970dbc2b1a7 *R/mann_whitney_test.R
95ac928d84740007386966a5acd85477 *R/nhanes_sample.R
d6b62025eb44e0cec4b5b2b3ac35d95c *R/phi.R
0862472d8822eb19a02df43afed8252f *R/prop.R
3b73410ea4df483db4d088b26cd67fcd *R/re-exports.R
e05b4f907d36a356776c91a2efd4dd64 *R/samplesize_mixed.R
361bc5f678c9c2319dbdb8eeac7eb45a *R/se_ybar.R
086354712673aab0cc83fd08f1989359 *R/select_helpers.R
7d974672824a933c8324bcd8d01469a2 *R/sjStatistics.R
045c06ea2a7397394b2953a9b583298c *R/svy_median.R
6493371b66b336aa577c64dcb106c184 *R/svyglmnb.R
93a3a551f4b1f0aee327253aa5c8900e *R/svyglmzip.R
3e75c25d915766e171526bdf56c751be *R/t_test.R
b31743de39c751d4d377fa619511b60f *R/var_pop.R
974429e0f1399cf4bf3fd978cd48ea88 *R/weight.R
f116a04b43b7ccd58e83a073d3127d92 *R/wilcoxon_test.R
87f10c7fc8851aadc42785eb054f1513 *R/wtd_cor.R
3d485f8685d0eab81d5ef06afc2a6ebe *R/wtd_se.R
0ff693defb2e755fcf9335563a05af75 *R/wtd_variance.R
5aabab241149572088cd6eee7608af2d *R/xtab_statistics.R
ae778cf204793e0dfedee9997d09129b *README.md
cb3b20077bebc5da31fc1b46c657c0ff *build/partial.rdb
3172b22b3d87d0f86d78326bc26891fc *data/efc.RData
ad3f18b79c24699a3c122d8b377b5028 *data/nhanes_sample.RData
87f8a6a5cb2f59e0b2110feef3926498 *inst/CITATION
e7b45979874646edfe06751f551ecac0 *man/anova_stats.Rd
59646cdd3bf69f527ab406e00ab87fa9 *man/auto_prior.Rd
70644108d79f2803c7942ac793429ff5 *man/boot_ci.Rd
1145032561d5db541e2d6507fb7a0767 *man/bootstrap.Rd
aa75706bfe65fddfef67c466659bb300 *man/chi_squared_test.Rd
d442b8e1e4f45c610e7f36156b45f337 *man/chisq_gof.Rd
7c79e41fdc70e1658f589abeb6b461e8 *man/crosstable_statistics.Rd
ffdb50b8e82e22adbf2a02c0109ad4df *man/cv.Rd
cedc450189ecfdec77f4b1c854d684cc *man/cv_error.Rd
4ca804b5ac14c5bd99da78f47255d2ba *man/design_effect.Rd
07b7d0ca4d598d4623ef85ce91255f2c *man/efc.Rd
00a3f8db1d678464d37705968d42286f *man/figures/logo.png
5972d9fb77c826cf2dfc1bedf013ae6d *man/find_beta.Rd
1d3f4f4138dfbce4505b8d4a115b50e3 *man/gmd.Rd
aba99aa993b4a557f0263a74ac9d03e3 *man/inequ_trend.Rd
4ce0bf4951990d5e54c63e58c6655b9a *man/is_prime.Rd
7d0da9e0dddae98ae29ef61067d92858 *man/kruskal_wallis_test.Rd
a260eeaa63dfd35efd31bef099bc52ca *man/mann_whitney_test.Rd
2caeab2b1a18d198a10c8f2922a3f569 *man/nhanes_sample.Rd
29b6feb7a0b67ad073cde5e2bba6adb2 *man/prop.Rd
489cce3aeab9138295c8f9b96bc8b411 *man/r2.Rd
df2053d8e790c20ba0628b8e14c6661f *man/reexports.Rd
8b6bca7c1d0ca430e195d7f85957d2b3 *man/samplesize_mixed.Rd
6376e9c3db371cfc7941e2867d848567 *man/se_ybar.Rd
d805390222777ca589b1ae881b21b86d *man/svyglm.nb.Rd
08b6aac1d7829ce80aa4a86ec3e32e17 *man/svyglm.zip.Rd
bf3b5ec64335984fc48f271d39f15f9a *man/t_test.Rd
1fc30c16002e33aedd18a861df8bb685 *man/table_values.Rd
12f085a686effc5f7215f57dbe9a337a *man/var_pop.Rd
ce9b25cf83dceadb0d8312b9ec0ef64b *man/weight.Rd
2e7b20401a5f7788b47893d95ee60a29 *man/weighted_se.Rd
fe079ad00575d5017383eadab1fb321c *man/wilcoxon_test.Rd
bcdd319f419f3a3c026bd5c6cf6c6606 *tests/testthat.R
a7fdaa4db0db6dcb721eda69951ac277 *tests/testthat/_snaps/chi_squared_test.md
cdc4c6c33056ce23e1f8ea10c81c6153 *tests/testthat/_snaps/kruskal_wallis_test.md
44f5aa1953dacf2508276030089c1b7d *tests/testthat/_snaps/mann_whitney_test.md
8abd67404361072251463e4ec47310c4 *tests/testthat/_snaps/t_test.md
4dc8c45020c146986def33cf0994b64f *tests/testthat/_snaps/wilcoxon_test.md
79df90b4a1ef0dfa51a14229e13354ec *tests/testthat/test-autoprior.R
289430580685f1f24b5e8b0ae055708a *tests/testthat/test-chi_squared_test.R
26a2f03b97851963289a3b6ea589cf36 *tests/testthat/test-kruskal_wallis_test.R
0fe1150cbf1b8f688c88f6dc6ebfea8f *tests/testthat/test-mann_whitney_test.R
113db343ce6d3454faa6f62cc812a317 *tests/testthat/test-t_test.R
b70593e19a84da47a7fd5631e0817458 *tests/testthat/test-wilcoxon_test.R
3ddf1a5493907c2646b615b2aaeb56e4 *tests/testthat/test-wtd.R
sjstats/R/ 0000755 0001762 0000144 00000000000 15022763073 012153 5 ustar ligges users sjstats/R/mann_whitney_test.R 0000644 0001762 0000144 00000042322 14623373000 016031 0 ustar ligges users #' @title Mann-Whitney test
#' @name mann_whitney_test
#' @description This function performs a Mann-Whitney test (or Wilcoxon rank
#' sum test for _unpaired_ samples). Unlike the underlying base R function
#' `wilcox.test()`, this function allows for weighted tests and automatically
#' calculates effect sizes. For _paired_ (dependent) samples, or for one-sample
#' tests, please use the `wilcoxon_test()` function.
#'
#' A Mann-Whitney test is a non-parametric test for the null hypothesis that two
#' _independent_ samples have identical continuous distributions. It can be used
#' for ordinal scales or when the two continuous variables are not normally
#' distributed. For large samples, or approximately normally distributed variables,
#' the `t_test()` function can be used.
#'
#' @param data A data frame.
#' @param select Name(s) of the continuous variable(s) (as character vector)
#' to be used as samples for the test. `select` can be one of the following:
#'
#' - `select` can be used in combination with `by`, in which case `select` is
#' the name of the continous variable (and `by` indicates a grouping factor).
#' - `select` can also be a character vector of length two or more (more than
#' two names only apply to `kruskal_wallis_test()`), in which case the two
#' continuous variables are treated as samples to be compared. `by` must be
#' `NULL` in this case.
#' - If `select` select is of length **two** and `paired = TRUE`, the two samples
#' are considered as *dependent* and a paired test is carried out.
#' - If `select` specifies **one** variable and `by = NULL`, a one-sample test
#' is carried out (only applicable for `t_test()` and `wilcoxon_test()`)
#' - For `chi_squared_test()`, if `select` specifies **one** variable and
#' both `by` and `probabilities` are `NULL`, a one-sample test against given
#' probabilities is automatically conducted, with equal probabilities for
#' each level of `select`.
#' @param by Name of the variable indicating the groups. Required if `select`
#' specifies only one variable that contains all samples to be compared in the
#' test. If `by` is not a factor, it will be coerced to a factor. For
#' `chi_squared_test()`, if `probabilities` is provided, `by` must be `NULL`.
#' @param weights Name of an (optional) weighting variable to be used for the test.
#' @param alternative A character string specifying the alternative hypothesis,
#' must be one of `"two.sided"` (default), `"greater"` or `"less"`. See `?t.test`
#' and `?wilcox.test`.
#' @param mu The hypothesized difference in means (for `t_test()`) or location
#' shift (for `wilcoxon_test()` and `mann_whitney_test()`). The default is 0.
#' @param ... Additional arguments passed to `wilcox.test()` (for unweighted
#' tests, i.e. when `weights = NULL`).
#'
#' @section Which test to use:
#' The following table provides an overview of which test to use for different
#' types of data. The choice of test depends on the scale of the outcome
#' variable and the number of samples to compare.
#'
#' | **Samples** | **Scale of Outcome** | **Significance Test** |
#' |-----------------|------------------------|---------------------------------|
#' | 1 | binary / nominal | `chi_squared_test()` |
#' | 1 | continuous, not normal | `wilcoxon_test()` |
#' | 1 | continuous, normal | `t_test()` |
#' | 2, independent | binary / nominal | `chi_squared_test()` |
#' | 2, independent | continuous, not normal | `mann_whitney_test()` |
#' | 2, independent | continuous, normal | `t_test()` |
#' | 2, dependent | binary (only 2x2) | `chi_squared_test(paired=TRUE)` |
#' | 2, dependent | continuous, not normal | `wilcoxon_test()` |
#' | 2, dependent | continuous, normal | `t_test(paired=TRUE)` |
#' | >2, independent | continuous, not normal | `kruskal_wallis_test()` |
#' | >2, independent | continuous, normal | `datawizard::means_by_group()` |
#' | >2, dependent | continuous, not normal | _not yet implemented_ (1) |
#' | >2, dependent | continuous, normal | _not yet implemented_ (2) |
#'
#' (1) More than two dependent samples are considered as _repeated measurements_.
#' For ordinal or not-normally distributed outcomes, these samples are
#' usually tested using a [`friedman.test()`], which requires the samples
#' in one variable, the groups to compare in another variable, and a third
#' variable indicating the repeated measurements (subject IDs).
#'
#' (2) More than two dependent samples are considered as _repeated measurements_.
#' For normally distributed outcomes, these samples are usually tested using
#' a ANOVA for repeated measurements. A more sophisticated approach would
#' be using a linear mixed model.
#'
#' @seealso
#' - [`t_test()`] for parametric t-tests of dependent and independent samples.
#' - [`mann_whitney_test()`] for non-parametric tests of unpaired (independent)
#' samples.
#' - [`wilcoxon_test()`] for Wilcoxon rank sum tests for non-parametric tests
#' of paired (dependent) samples.
#' - [`kruskal_wallis_test()`] for non-parametric tests with more than two
#' independent samples.
#' - [`chi_squared_test()`] for chi-squared tests (two categorical variables,
#' dependent and independent).
#'
#' @return A data frame with test results. The function returns p and Z-values
#' as well as effect size r and group-rank-means.
#'
#' @references
#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
#' \doi{10.3390/math11091982}
#'
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`]
#' (the latter to extract effect sizes). The weighted version of the test is
#' based on [`survey::svyranktest()`].
#'
#' Interpretation of the effect size **r**, as a rule-of-thumb:
#'
#' - small effect >= 0.1
#' - medium effect >= 0.3
#' - large effect >= 0.5
#'
#' **r** is calcuated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}.
#'
#' @examplesIf requireNamespace("coin") && requireNamespace("survey")
#' data(efc)
#' # Mann-Whitney-U tests for elder's age by elder's sex.
#' mann_whitney_test(efc, "e17age", by = "e16sex")
#' # base R equivalent
#' wilcox.test(e17age ~ e16sex, data = efc)
#'
#' # when data is in wide-format, specify all relevant continuous
#' # variables in `select` and omit `by`
#' set.seed(123)
#' wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20))
#' mann_whitney_test(wide_data, select = c("scale1", "scale2"))
#' # base R equivalent
#' wilcox.test(wide_data$scale1, wide_data$scale2)
#' # same as if we had data in long format, with grouping variable
#' long_data <- data.frame(
#' scales = c(wide_data$scale1, wide_data$scale2),
#' groups = as.factor(rep(c("A", "B"), each = 20))
#' )
#' mann_whitney_test(long_data, select = "scales", by = "groups")
#' # base R equivalent
#' wilcox.test(scales ~ groups, long_data)
#' @export
mann_whitney_test <- function(data,
select = NULL,
by = NULL,
weights = NULL,
mu = 0,
alternative = "two.sided",
...) {
insight::check_if_installed("datawizard")
alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
# sanity checks
.sanitize_htest_input(data, select, by, weights, test = "mann_whitney_test")
# alternative only if weights are NULL
if (!is.null(weights) && alternative != "two.sided") {
insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.")
}
# does select indicate more than one variable?
if (length(select) > 1) {
# we convert the data into long format, and create a grouping variable
data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
by <- select[2]
select <- select[1]
# after converting to long, we have the "grouping" variable first in the data
colnames(data) <- c(by, select)
}
# get data
dv <- data[[select]]
grp <- data[[by]]
# coerce to factor
grp <- datawizard::to_factor(grp)
# only two groups allowed
if (insight::n_unique(grp) > 2) {
insight::format_error("Only two groups are allowed for Mann-Whitney test. Please use `kruskal_wallis_test()` for more than two groups.") # nolint
}
# value labels
group_labels <- names(attr(data[[by]], "labels", exact = TRUE))
if (is.null(group_labels)) {
group_labels <- levels(droplevels(grp))
}
if (is.null(weights)) {
.calculate_mwu(dv, grp, alternative, mu, group_labels, ...)
} else {
.calculate_weighted_mwu(dv, grp, data[[weights]], group_labels)
}
}
# Mann-Whitney-Test for two groups --------------------------------------------
.calculate_mwu <- function(dv, grp, alternative, mu, group_labels, ...) {
insight::check_if_installed("coin")
# prepare data
wcdat <- data.frame(dv, grp)
# perfom wilcox test
wt <- coin::wilcox_test(dv ~ grp, data = wcdat)
# for rank mean
group_levels <- levels(grp)
# compute statistics
u <- as.numeric(coin::statistic(wt, type = "linear"))
z <- as.numeric(coin::statistic(wt, type = "standardized"))
r <- abs(z / sqrt(length(dv)))
htest <- suppressWarnings(stats::wilcox.test(
dv ~ grp,
data = wcdat,
alternative = alternative,
mu = mu,
...
))
w <- htest$statistic
p <- htest$p.value
# group means
dat_gr1 <- stats::na.omit(dv[grp == group_levels[1]])
dat_gr2 <- stats::na.omit(dv[grp == group_levels[2]])
rank_mean_1 <- mean(rank(dat_gr1))
rank_mean_2 <- mean(rank(dat_gr2))
# compute n for each group
n_grp1 <- length(dat_gr1)
n_grp2 <- length(dat_gr2)
out <- data.frame(
group1 = group_levels[1],
group2 = group_levels[2],
estimate = rank_mean_1 - rank_mean_2,
u = u,
w = w,
z = z,
r = r,
p = as.numeric(p),
mu = mu,
alternative = alternative
)
attr(out, "rank_means") <- stats::setNames(
c(rank_mean_1, rank_mean_2),
c("Mean Group 1", "Mean Group 2")
)
attr(out, "n_groups") <- stats::setNames(
c(n_grp1, n_grp2),
c("N Group 1", "N Group 2")
)
attr(out, "group_labels") <- group_labels
attr(out, "method") <- "wilcoxon"
attr(out, "weighted") <- FALSE
class(out) <- c("sj_htest_mwu", "data.frame")
out
}
# Weighted Mann-Whitney-Test for two groups ----------------------------------
.calculate_weighted_mwu <- function(dv, grp, weights, group_labels) {
# check if pkg survey is available
insight::check_if_installed("survey")
dat <- stats::na.omit(data.frame(dv, grp, weights))
colnames(dat) <- c("x", "g", "w")
design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
result <- survey::svyranktest(formula = x ~ g, design, test = "wilcoxon")
# for rank mean
group_levels <- levels(droplevels(grp))
# subgroups
dat_gr1 <- dat[dat$g == group_levels[1], ]
dat_gr2 <- dat[dat$g == group_levels[2], ]
dat_gr1$rank_x <- rank(dat_gr1$x)
dat_gr2$rank_x <- rank(dat_gr2$x)
# rank means
design_mean1 <- survey::svydesign(
ids = ~0,
data = dat_gr1,
weights = ~w
)
rank_mean_1 <- survey::svymean(~rank_x, design_mean1)
design_mean2 <- survey::svydesign(
ids = ~0,
data = dat_gr2,
weights = ~w
)
rank_mean_2 <- survey::svymean(~rank_x, design_mean2)
# group Ns
n_grp1 <- round(sum(dat_gr1$w))
n_grp2 <- round(sum(dat_gr2$w))
# statistics and effect sizes
z <- result$statistic
r <- abs(z / sqrt(sum(n_grp1, n_grp2)))
out <- data_frame(
group1 = group_levels[1],
group2 = group_levels[2],
estimate = result$estimate,
z = z,
r = r,
p = as.numeric(result$p.value),
alternative = "two.sided"
)
attr(out, "rank_means") <- stats::setNames(
c(rank_mean_1, rank_mean_2),
c("Mean Group 1", "Mean Group 2")
)
attr(out, "n_groups") <- stats::setNames(
c(n_grp1, n_grp2),
c("N Group 1", "N Group 2")
)
attr(out, "group_labels") <- group_labels
attr(out, "weighted") <- TRUE
class(out) <- c("sj_htest_mwu", "data.frame")
out
}
# helper ----------------------------------------------------------------------
.sanitize_htest_input <- function(data, select, by, weights, test = NULL) {
# check if arguments are NULL
if (is.null(select)) {
insight::format_error("Argument `select` is missing.")
}
# sanity check - may only specify two variable names
if (identical(test, "mann_whitney_test") && length(select) > 2) {
insight::format_error("You may only specify two variables for Mann-Whitney test.")
}
if (identical(test, "mann_whitney_test") && length(select) == 1 && is.null(by)) {
insight::format_error("Only one variable provided in `select`, but none in `by`. You need to specify a second continuous variable in `select`, or a grouping variable in `by` for Mann-Whitney test.") # nolint
}
# sanity check - may only specify two variable names
if (identical(test, "t_test") && length(select) > 2) {
insight::format_error("You may only specify two variables for Student's t test.")
}
if ((!is.null(test) && test %in% c("t_test", "kruskal_wallis_test", "mann_whitney_test")) && length(select) > 1 && !is.null(by)) { # nolint
insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
}
# check if arguments have correct length or are of correct type
if (!is.character(select)) {
insight::format_error("Argument `select` must be a character string with the name(s) of the variable(s).")
}
if (!is.null(by) && (length(by) != 1 || !is.character(by))) {
insight::format_error("Argument `by` must be a character string with the name of a single variable.")
}
if (!is.null(weights) && (length(weights) != 1 || !is.character(weights))) {
insight::format_error("Argument `weights` must be a character string with the name of a single variable.")
}
# check if "select" is in data
if (!all(select %in% colnames(data))) {
not_found <- setdiff(select, colnames(data))[1]
insight::format_error(
sprintf("Variable '%s' not found in data frame.", not_found),
.misspelled_string(colnames(data), not_found, "Maybe misspelled?")
)
}
# check if "by" is in data
if (!is.null(by) && !by %in% colnames(data)) {
insight::format_error(
sprintf("Variable '%s' not found in data frame.", by),
.misspelled_string(colnames(data), by, "Maybe misspelled?")
)
}
# check if "weights" is in data
if (!is.null(weights) && !weights %in% colnames(data)) {
insight::format_error(
sprintf("Weighting variable '%s' not found in data frame.", weights),
.misspelled_string(colnames(data), weights, "Maybe misspelled?")
)
}
# select variable type for certain tests
if (identical(test, "t_test") && !all(vapply(data[select], is.numeric, logical(1)))) {
insight::format_error("Variable provided in `select` must be numeric for Student's t test.")
}
}
# methods ---------------------------------------------------------------------
#' @export
print.sj_htest_mwu <- function(x, ...) {
# fetch attributes
group_labels <- attributes(x)$group_labels
rank_means <- attributes(x)$rank_means
n_groups <- attributes(x)$n_groups
weighted <- attributes(x)$weighted
if (weighted) {
weight_string <- " (weighted)"
} else {
weight_string <- ""
}
# same width
group_labels <- format(group_labels)
# header
insight::print_color(sprintf("# Mann-Whitney test%s\n\n", weight_string), "blue")
# group-1-info
insight::print_color(
sprintf(
" Group 1: %s (n = %i, rank mean = %s)\n",
group_labels[1], n_groups[1], insight::format_value(rank_means[1], protect_integers = TRUE)
), "cyan"
)
# group-2-info
insight::print_color(
sprintf(
" Group 2: %s (n = %i, rank mean = %s)\n",
group_labels[2], n_groups[2], insight::format_value(rank_means[2], protect_integers = TRUE)
), "cyan"
)
# alternative hypothesis
if (!is.null(x$alternative) && !is.null(x$mu)) {
alt_string <- switch(x$alternative,
two.sided = "not equal to",
less = "less than",
greater = "greater than"
)
alt_string <- paste("true location shift is", alt_string, x$mu)
insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan")
}
if (!is.null(x$w)) {
w_stat <- paste("W =", insight::format_value(x$w, protect_integers = TRUE), ", ")
} else {
w_stat <- ""
}
cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", w_stat, x$r, x$z, insight::format_p(x$p)))
}
sjstats/R/wtd_cor.R 0000644 0001762 0000144 00000004216 14620351262 013736 0 ustar ligges users #' @rdname weighted_se
#' @export
weighted_correlation <- function(data, ...) {
UseMethod("weighted_correlation")
}
#' @rdname weighted_se
#' @export
weighted_correlation.default <- function(data, x, y, weights, ci.lvl = 0.95, ...) {
if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
insight::format_error("'ci.lvl' must be a single number between 0 and 1.")
x.name <- deparse(substitute(x))
y.name <- deparse(substitute(y))
w.name <- deparse(substitute(weights))
if (w.name == "NULL") {
w.name <- "weights"
data$weights <- 1
}
# create string with variable names
vars <- c(x.name, y.name, w.name)
# get data
dat <- suppressMessages(data[vars])
dat <- stats::na.omit(dat)
xv <- dat[[x.name]]
yv <- dat[[y.name]]
wv <- dat[[w.name]]
weighted_correlation_helper(xv, yv, wv, ci.lvl)
}
#' @rdname weighted_se
#' @export
weighted_correlation.formula <- function(formula, data, ci.lvl = 0.95, ...) {
if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
insight::format_error("'ci.lvl' must be a single number between 0 and 1.")
vars <- all.vars(formula)
if (length(vars) < 3) {
vars <- c(vars, "weights")
data$weights <- 1
}
# get data
dat <- suppressMessages(data[vars])
dat <- stats::na.omit(dat)
xv <- dat[[vars[1]]]
yv <- dat[[vars[2]]]
wv <- dat[[vars[3]]]
weighted_correlation_helper(xv, yv, wv, ci.lvl)
}
weighted_correlation_helper <- function(xv, yv, wv, ci.lvl) {
x <- xv - weighted_mean(xv, weights = wv)
y <- yv - weighted_mean(yv, weights = wv)
x <- x / weighted_sd(x, weights = wv)
y <- y / weighted_sd(y, weights = wv)
results <- stats::coef(summary(stats::lm(y ~ x, weights = wv)))[2, ]
ci <- ci.lvl - ((1 - ci.lvl) / 2)
ci <- results[1] + (stats::qnorm(ci) * c(-1, 1) * results[2])
structure(
class = "sj_wcor",
list(
estimate = results[1],
method = "Pearson's Correlation Coefficient",
p.value = results[4],
ci = ci,
ci.lvl = ci.lvl
)
)
}
sjstats/R/kruskal_wallis_test.R 0000644 0001762 0000144 00000014511 15022763073 016366 0 ustar ligges users #' @title Kruskal-Wallis test
#' @name kruskal_wallis_test
#' @description This function performs a Kruskal-Wallis rank sum test, which is
#' a non-parametric method to test the null hypothesis that the population median
#' of all of the groups are equal. The alternative is that they differ in at
#' least one. Unlike the underlying base R function `kruskal.test()`, this
#' function allows for weighted tests.
#'
#' @inheritParams mann_whitney_test
#' @inherit mann_whitney_test seealso
#'
#' @return A data frame with test results.
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @references
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @details The function simply is a wrapper around [`kruskal.test()`]. The
#' weighted version of the Kruskal-Wallis test is based on the **survey** package,
#' using [`survey::svyranktest()`].
#'
#' @examples
#' data(efc)
#' # Kruskal-Wallis test for elder's age by education
#' kruskal_wallis_test(efc, "e17age", by = "c172code")
#'
#' # when data is in wide-format, specify all relevant continuous
#' # variables in `select` and omit `by`
#' set.seed(123)
#' wide_data <- data.frame(
#' scale1 = runif(20),
#' scale2 = runif(20),
#' scale3 = runif(20)
#' )
#' kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3"))
#'
#' # same as if we had data in long format, with grouping variable
#' long_data <- data.frame(
#' scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3),
#' groups = rep(c("A", "B", "C"), each = 20)
#' )
#' kruskal_wallis_test(long_data, select = "scales", by = "groups")
#' # base R equivalent
#' kruskal.test(scales ~ groups, data = long_data)
#' @export
kruskal_wallis_test <- function(data,
select = NULL,
by = NULL,
weights = NULL) {
insight::check_if_installed("datawizard")
# sanity checks
.sanitize_htest_input(data, select, by, weights, test = "kruskal_wallis_test")
# does select indicate more than one variable?
if (length(select) > 1) {
# we convert the data into long format, and create a grouping variable
data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
by <- select[2]
select <- select[1]
# after converting to long, we have the "grouping" variable first in the data
colnames(data) <- c(by, select)
}
# get data
dv <- data[[select]]
grp <- data[[by]]
# coerce to factor
grp <- datawizard::to_factor(grp)
# only two groups allowed
if (insight::n_unique(grp) < 2) {
insight::format_error("At least two groups are required, i.e. data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint
}
if (is.null(weights)) {
.calculate_kw(dv, grp, group_labels = c(select, by))
} else {
.calculate_weighted_kw(dv, grp, data[[weights]], group_labels = c(select, by))
}
}
# Kruskal-Wallis-Test --------------------------------------------
.calculate_kw <- function(dv, grp, paired = FALSE, group_labels = NULL) {
# prepare data
wcdat <- data.frame(dv, grp)
if (paired) {
# perfom friedman test for paired data
wt <- stats::friedman.test(table(wcdat))
} else {
# perfom kruskal wallis test
wt <- stats::kruskal.test(dv ~ grp, data = wcdat)
}
# number of groups
n_groups <- vapply(
stats::na.omit(unique(grp)),
function(g) sum(grp == g, na.rm = TRUE),
numeric(1)
)
out <- data.frame(
data = paste(group_labels[1], "by", group_labels[2]),
Chi2 = wt$statistic,
df = wt$parameter,
p = as.numeric(wt$p.value),
stringsAsFactors = FALSE
)
attr(out, "n_groups") <- n_groups
attr(out, "method") <- ifelse(paired, "friedman", "kruskal")
attr(out, "weighted") <- FALSE
class(out) <- c("sj_htest_kw", "data.frame")
out
}
# Weighted Mann-Whitney-Test for two groups ----------------------------------
.calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE, group_labels = NULL) {
# check if pkg survey is available
insight::check_if_installed("survey")
dat <- stats::na.omit(data.frame(dv, grp, weights))
colnames(dat) <- c("x", "g", "w")
# number of groups
n_groups <- vapply(stats::na.omit(unique(grp)), function(i) {
round(sum(dat$w[dat$g == i], na.rm = TRUE))
}, numeric(1))
if (paired) {
## TODO: paired no working. should call `friedman.test()`
insight::format_error("Paired Kruskal-Wallis test is using weights is not implemented yet.")
} else {
design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
result <- survey::svyranktest(formula = x ~ g, design, test = "KruskalWallis")
}
out <- data.frame(
data = paste(group_labels[1], "by", group_labels[2]),
Chi2 = result$statistic,
df = result$parameter,
p = as.numeric(result$p.value),
stringsAsFactors = FALSE
)
attr(out, "n_groups") <- n_groups
attr(out, "method") <- ifelse(paired, "friedman", "kruskal")
attr(out, "weighted") <- TRUE
class(out) <- c("sj_htest_kw", "data.frame")
out
}
# methods ---------------------------------------------------------------------
#' @export
print.sj_htest_kw <- function(x, ...) {
insight::check_if_installed("datawizard")
# fetch attributes
n_groups <- attributes(x)$n_groups
weighted <- attributes(x)$weighted
method <- attributes(x)$method
if (weighted) {
weight_string <- " (weighted)"
} else {
weight_string <- ""
}
# header
if (identical(method, "kruskal")) {
insight::print_color(sprintf("# Kruskal-Wallis test%s\n\n", weight_string), "blue")
} else {
insight::print_color(sprintf("# Friedman test%s\n\n", weight_string), "blue")
}
# data info
insight::print_color(
sprintf(
" Data: %s (%i groups, n = %s)\n",
x$data, length(n_groups), datawizard::text_concatenate(n_groups)
), "cyan"
)
stat_symbol <- .format_symbols("Chi2")
cat(sprintf(
"\n %s = %.2f, df = %i, %s\n\n",
stat_symbol, x$Chi2, round(x$df), insight::format_p(x$p)
))
}
sjstats/R/auto_prior.R 0000644 0001762 0000144 00000011576 14623373000 014464 0 ustar ligges users #' @title Create default priors for brms-models
#' @name auto_prior
#'
#' @description This function creates default priors for brms-regression
#' models, based on the same automatic prior-scale adjustment as in
#' \pkg{rstanarm}.
#'
#' @param formula A formula describing the model, which just needs to contain
#' the model terms, but no notation of interaction, splines etc. Usually,
#' you want only those predictors in the formula, for which automatic
#' priors should be generated. Add informative priors afterwards to the
#' returned \code{brmsprior}-object.
#' @param data The data that will be used to fit the model.
#' @param gaussian Logical, if the outcome is gaussian or not.
#' @param locations A numeric vector with location values for the priors. If
#' \code{locations = NULL}, \code{0} is used as location parameter.
#'
#' @return A \code{brmsprior}-object.
#'
#' @details \code{auto_prior()} is a small, convenient function to create
#' some default priors for brms-models with automatically adjusted prior
#' scales, in a similar way like \pkg{rstanarm} does. The default scale for
#' the intercept is 10, for coefficients 2.5. If the outcome is gaussian,
#' both scales are multiplied with \code{sd(y)}. Then, for categorical
#' variables, nothing more is changed. For numeric variables, the scales
#' are divided by the standard deviation of the related variable.
#' \cr \cr
#' All prior distributions are \emph{normal} distributions. \code{auto_prior()}
#' is intended to quickly create default priors with feasible scales. If
#' more precise definitions of priors is necessary, this needs to be done
#' directly with brms-functions like \code{set_prior()}.
#'
#' @note As \code{auto_prior()} also sets priors on the intercept, the model
#' formula used in \code{brms::brm()} must be rewritten to something like
#' \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}.
#'
#' @examplesIf requireNamespace("brms")
#' data(efc)
#' efc$c172code <- as.factor(efc$c172code)
#' efc$c161sex <- as.factor(efc$c161sex)
#'
#' mf <- formula(neg_c_7 ~ c161sex + c160age + c172code)
#' auto_prior(mf, efc, TRUE)
#'
#' ## compare to
#' # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200)
#' # ps <- prior_summary(m)
#' # ps$prior_intercept$adjusted_scale
#' # ps$prior$adjusted_scale
#'
#' ## usage
#' # ap <- auto_prior(mf, efc, TRUE)
#' # brm(mf, data = efc, prior = ap)
#'
#' # add informative priors
#' mf <- formula(neg_c_7 ~ c161sex + c172code)
#'
#' auto_prior(mf, efc, TRUE) +
#' brms::prior(normal(.1554, 40), class = "b", coef = "c160age")
#'
#' # example with binary response
#' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
#' mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age)
#' auto_prior(mf, efc, FALSE)
#' @export
auto_prior <- function(formula, data, gaussian, locations = NULL) {
insight::check_if_installed("brms")
scale.b <- 2.5
scale.y <- 10
pred <- insight::find_predictors(formula, effects = "all", flatten = TRUE)
y.name <- insight::find_response(formula, combine = TRUE)
data <- stats::na.omit(data[c(y.name, pred)])
y <- data[[y.name]]
# check if response is binary
if (missing(gaussian) && insight::n_unique(y) == 2) gaussian <- FALSE
if (isTRUE(gaussian) && insight::n_unique(y) == 2)
insight::format_alert("Priors were calculated based on assumption that the response is Gaussian, however it seems to be binary.") # nolint
if (gaussian) {
scale.factor <- stats::sd(y, na.rm = TRUE)
scale.b <- scale.b * scale.factor
scale.y <- scale.y * scale.factor
}
if (!is.null(locations))
location.y <- locations[1]
else
location.y <- 0
priors <- brms::set_prior(
sprintf("normal(%s, %s)", round(location.y, 2), round(scale.y, 2)),
class = "Intercept"
)
is.fac <- NULL
term.names <- NULL
scale.pred <- NULL
# we need to check which predictors are categorical and then "mimic"
# their coefficient name as it is represented in the model (i.e. variable
# name + category name)
for (i in pred) {
f <- data[[i]]
if (is.factor(f)) {
i <- sprintf("%s%s", i, levels(f)[2:nlevels(f)])
is.fac <- c(is.fac, rep(TRUE, nlevels(f) - 1))
scale.pred <- c(scale.pred, rep(scale.b, nlevels(f) - 1))
} else {
is.fac <- c(is.fac, FALSE)
scale.pred <- c(scale.pred, scale.b / stats::sd(f, na.rm = TRUE))
}
term.names <- c(term.names, i)
}
for (i in seq_along(term.names)) {
if (!is.null(locations) && length(locations) >= (i + 1))
location.b <- locations[i + 1]
else
location.b <- 0
priors <- priors + brms::set_prior(
sprintf("normal(%s, %s)", round(location.b, 2), round(scale.pred[i], 2)),
class = "b",
coef = term.names[i]
)
}
priors
}
sjstats/R/select_helpers.R 0000644 0001762 0000144 00000001545 14620351262 015300 0 ustar ligges users string_starts_with <- function(pattern, x) {
pattern <- paste0("^\\Q", pattern, "\\E")
grep(pattern, x, perl = TRUE)
}
string_contains <- function(pattern, x) {
pattern <- paste0("\\Q", pattern, "\\E")
grep(pattern, x, perl = TRUE)
}
string_ends_with <- function(pattern, x) {
pattern <- paste0("\\Q", pattern, "\\E$")
grep(pattern, x, perl = TRUE)
}
string_one_of <- function(pattern, x) {
m <- unlist(lapply(pattern, grep, x = x, fixed = TRUE, useBytes = TRUE))
x[m]
}
rownames_as_column <- function(x, var = "rowname") {
rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE)
x <- cbind(rn, x)
colnames(x)[1] <- var
rownames(x) <- NULL
x
}
obj_has_name <- function(x, name) {
name %in% names(x)
}
obj_has_rownames <- function(x) {
!identical(as.character(seq_len(nrow(x))), rownames(x))
}
sjstats/R/prop.R 0000644 0001762 0000144 00000015306 15022763073 013263 0 ustar ligges users #' @title Proportions of values in a vector
#' @name prop
#'
#' @description `prop()` calculates the proportion of a value or category
#' in a variable. `props()` does the same, but allows for
#' multiple logical conditions in one statement. It is similar
#' to `mean()` with logical predicates, however, both
#' `prop()` and `props()` work with grouped data frames.
#'
#' @param data A data frame. May also be a grouped data frame (see 'Examples').
#' @param ... One or more value pairs of comparisons (logical predicates). Put
#' variable names the left-hand-side and values to match on the
#' right hand side. Expressions may be quoted or unquoted. See
#' 'Examples'.
#' @param weights Vector of weights that will be applied to weight all observations.
#' Must be a vector of same length as the input vector. Default is
#' `NULL`, so no weights are used.
#' @param na.rm Logical, whether to remove NA values from the vector when the
#' proportion is calculated. `na.rm = FALSE` gives you the raw
#' percentage of a value in a vector, `na.rm = TRUE` the valid
#' percentage.
#' @param digits Amount of digits for returned values.
#'
#' @details `prop()` only allows one logical statement per comparison,
#' while `props()` allows multiple logical statements per comparison.
#' However, `prop()` supports weighting of variables before calculating
#' proportions, and comparisons may also be quoted. Hence, `prop()`
#' also processes comparisons, which are passed as character vector
#' (see 'Examples').
#'
#'
#' @return For one condition, a numeric value with the proportion of the values
#' inside a vector. For more than one condition, a data frame with one column
#' of conditions and one column with proportions. For grouped data frames,
#' returns a data frame with one column per group with grouping categories,
#' followed by one column with proportions per condition.
#'
#' @examplesIf getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE)
#' data(efc)
#'
#' # proportion of value 1 in e42dep
#' prop(efc, e42dep == 1)
#'
#' # expression may also be completely quoted
#' prop(efc, "e42dep == 1")
#'
#' # use "props()" for multiple logical statements
#' props(efc, e17age > 70 & e17age < 80)
#'
#' # proportion of value 1 in e42dep, and all values greater
#' # than 2 in e42dep, including missing values. will return a data frame
#' prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE)
#'
#' # for factors or character vectors, use quoted or unquoted values
#' library(datawizard)
#' # convert numeric to factor, using labels as factor levels
#' efc$e16sex <- to_factor(efc$e16sex)
#' efc$n4pstu <- to_factor(efc$n4pstu)
#'
#' # get proportion of female older persons
#' prop(efc, e16sex == female)
#'
#' # get proportion of male older persons
#' prop(efc, e16sex == "male")
#'
#' # "props()" needs quotes around non-numeric factor levels
#' props(efc,
#' e17age > 70 & e17age < 80,
#' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3'
#' )
#' @export
prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) {
# check argument
if (!is.data.frame(data)) {
insight::format_error("`data` needs to be a data frame.")
}
dots <- match.call(expand.dots = FALSE)[["..."]]
.proportions(data, dots = dots, weight.by = weights, na.rm, digits, multi_logical = FALSE)
}
#' @rdname prop
#' @export
props <- function(data, ..., na.rm = TRUE, digits = 4) {
# check argument
if (!is.data.frame(data)) {
insight::format_error("`data` needs to be a data frame.")
}
dots <- match.call(expand.dots = FALSE)[["..."]]
.proportions(data, dots = dots, NULL, na.rm, digits, multi_logical = TRUE)
}
.proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) {
# remember comparisons
comparisons <- lapply(dots, function(x) {
# to character, and remove spaces and quotes
x <- gsub(" ", "", deparse(x), fixed = TRUE)
x <- gsub("\"", "", x, fixed = TRUE)
x
})
if (inherits(data, "grouped_df")) {
grps <- attributes(data)$groups
result <- lapply(grps[[".rows"]], function(x) {
.process_prop(data[x, , drop = FALSE], comparisons, dots, multi_logical, na.rm, digits, weight.by)
})
} else {
result <- .process_prop(data, comparisons, dots, multi_logical, na.rm, digits, weight.by)
}
result
}
.process_prop <- function(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) {
# iterate dots (comparing conditions)
if (multi_logical)
result <- lapply(dots, get_multiple_proportion, data, na.rm, digits)
else
result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits)
# if we have more than one proportion, return a data frame. this allows us
# to save more information, the condition and the proportion value
if (length(comparisons) > 1) {
return(data_frame(
condition = as.character(unlist(comparisons)),
prop = unlist(result)
))
}
unlist(result)
}
get_proportion <- function(x, data, weight.by, na.rm, digits) {
# to character, and remove spaces and quotes
x <- gsub(" ", "", deparse(x), fixed = TRUE)
x <- gsub("\"", "", x, fixed = TRUE)
# split expression at ==, < or >
x.parts <- unlist(regmatches(x, gregexpr("[!=]=|[<>]|(?:(?![=!]=)[^<>])+", x, perl = TRUE)))
# shorter version, however, does not split variable names with dots
# x.parts <- unlist(regmatches(x, regexec("(\\w+)(\\W+)(\\w+)", x)))[-1]
# correct == assignment?
if (length(x.parts) < 3) {
message("?Syntax error in argument. You possibly used `=` instead of `==`.")
return(NULL)
}
# get variable from data and value from equation
f <- data[[x.parts[1]]]
v <- suppressWarnings(as.numeric(x.parts[3]))
# if we have factor, values maybe non-numeric
if (is.na(v)) v <- x.parts[3]
# weight vector?
if (!is.null(weight.by)) f <- weight(f, weights = weight.by)
# get proportions
dummy <- switch(x.parts[2],
"==" = f == v,
"!=" = f != v,
"<" = f < v,
">" = f > v,
f == v
)
# remove missings?
if (na.rm) dummy <- stats::na.omit(dummy)
# get proportion
round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits)
}
get_multiple_proportion <- function(x, data, na.rm, digits) {
# evaluate argument
dummy <- with(data, eval(parse(text = deparse(x))))
# remove missings?
if (na.rm) dummy <- stats::na.omit(dummy)
# get proportion
round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits)
}
sjstats/R/svy_median.R 0000644 0001762 0000144 00000000607 14620351262 014433 0 ustar ligges users #' @rdname weighted_se
#' @export
survey_median <- function(x, design) {
# check if pkg survey is available
insight::check_if_installed("survey")
# deparse
v <- stats::as.formula(paste("~", as.character(substitute(x))))
as.vector(
survey::svyquantile(
v,
design = design,
quantiles = 0.5,
ci = FALSE,
na.rm = TRUE
)
)
}
sjstats/R/design_effect.R 0000644 0001762 0000144 00000003642 13563265750 015077 0 ustar ligges users #' @title Design effects for two-level mixed models
#' @name design_effect
#'
#' @description Compute the design effect (also called \emph{Variance Inflation Factor})
#' for mixed models with two-level design.
#'
#' @param n Average number of observations per grouping cluster (i.e. level-2 unit).
#' @param icc Assumed intraclass correlation coefficient for multilevel-model.
#'
#' @return The design effect (Variance Inflation Factor) for the two-level model.
#'
#' @references Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20.
#' \cr \cr
#' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230}
#' \cr \cr
#' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492}
#' \cr \cr
#' Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347}
#'
#' @details The formula for the design effect is simply \code{(1 + (n - 1) * icc)}.
#'
#' @examples
#' # Design effect for two-level model with 30 observations per
#' # cluster group (level-2 unit) and an assumed intraclass
#' # correlation coefficient of 0.05.
#' design_effect(n = 30)
#'
#' # Design effect for two-level model with 24 observation per cluster
#' # group and an assumed intraclass correlation coefficient of 0.2.
#' design_effect(n = 24, icc = 0.2)
#'
#' @export
design_effect <- function(n, icc = 0.05) {
1 + (n - 1) * icc
}
sjstats/R/anova_stats.R 0000644 0001762 0000144 00000015777 14620363055 014640 0 ustar ligges users #' @title Effect size statistics for anova
#' @name anova_stats
#' @description Returns the (partial) eta-squared, (partial) omega-squared,
#' epsilon-squared statistic or Cohen's F for all terms in an anovas.
#' \code{anova_stats()} returns a tidy summary, including all these statistics
#' and power for each term.
#'
#' @param model A fitted anova-model of class \code{aov} or \code{anova}. Other
#' models are coerced to \code{\link[stats]{anova}}.
#' @param digits Amount of digits for returned values.
#'
#' @return A data frame with all statistics is returned (excluding confidence intervals).
#'
#' @references Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research.
#' \cr \cr
#' Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model.
#'
#' @examplesIf requireNamespace("car")
#' # load sample data
#' data(efc)
#'
#' # fit linear model
#' fit <- aov(
#' c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
#' data = efc
#' )
#' anova_stats(car::Anova(fit, type = 2))
#' @export
anova_stats <- function(model, digits = 3) {
# .Deprecated("effectsize::effectsize()", package = "effectsize")
# get tidy summary table
aov.sum <- aov_stat_summary(model)
# compute all model statistics
etasq <- aov_stat_core(aov.sum, type = "eta")
partial.etasq <- aov_stat_core(aov.sum, type = "peta")
omegasq <- aov_stat_core(aov.sum, type = "omega")
partial.omegasq <- aov_stat_core(aov.sum, type = "pomega")
epsilonsq <- aov_stat_core(aov.sum, type = "epsilon")
# compute power for each estimate
cohens.f <- sqrt(partial.etasq / (1 - partial.etasq))
# bind as data frame
anov_stat <- rbind(
data.frame(etasq, partial.etasq, omegasq, partial.omegasq, epsilonsq, cohens.f),
data.frame(etasq = NA, partial.etasq = NA, omegasq = NA, partial.omegasq = NA, epsilonsq = NA, cohens.f = NA)
)
anov_stat <- cbind(anov_stat, data.frame(aov.sum))
# get nr of terms
nt <- nrow(anov_stat) - 1
# finally, compute power
as_power <- tryCatch(
c(.calculate_power(
df1 = anov_stat$df[1:nt],
df2 = anov_stat$df[nrow(anov_stat)],
effect_size = anov_stat$cohens.f[1:nt]^2
),
NA
),
error = function(x) {
NA
}
)
out <- cbind(anov_stat, data.frame(power = as_power))
out[] <- lapply(out, function(i) {
if (is.numeric(i)) {
round(i, digits)
} else {
i
}
})
class(out) <- c("sj_anova_stat", class(out))
out
}
aov_stat <- function(model, type) {
aov.sum <- aov_stat_summary(model)
aov.res <- aov_stat_core(aov.sum, type)
if (obj_has_name(aov.sum, "stratum"))
attr(aov.res, "stratum") <- aov.sum[["stratum"]]
aov.res
}
aov_stat_summary <- function(model) {
insight::check_if_installed("parameters")
# check if we have a mixed model
mm <- is_merMod(model)
ori.model <- model
# check that model inherits from correct class
# else, try to coerce to anova table
if (!inherits(model, c("Gam", "aov", "anova", "anova.rms", "aovlist")))
model <- stats::anova(model)
# get summary table
aov.sum <- insight::standardize_names(as.data.frame(parameters::model_parameters(model)), style = "broom")
# for mixed models, add information on residuals
if (mm) {
res <- stats::residuals(ori.model)
aov.sum <- rbind(
aov.sum,
data_frame(
term = "Residuals",
df = length(res) - sum(aov.sum[["df"]]),
sumsq = sum(res^2, na.rm = TRUE),
meansq = mse(ori.model),
statistic = NA
)
)
}
# check if object has sums of square
if (!obj_has_name(aov.sum, "sumsq")) {
stop("Model object has no sums of squares. Cannot compute effect size statistic.", call. = FALSE)
}
# need special handling for rms-anova
if (inherits(model, "anova.rms"))
colnames(aov.sum) <- c("term", "df", "sumsq", "meansq", "statistic", "p.value")
# for car::Anova, the meansq-column might be missing, so add it manually
if (!obj_has_name(aov.sum, "meansq")) {
pos_sumsq <- which(colnames(aov.sum) == "sumsq")
aov.sum <- cbind(
aov.sum[1:pos_sumsq],
data.frame(meansq = aov.sum$sumsq / aov.sum$df),
aov.sum[(pos_sumsq + 1):ncol(aov.sum)]
)
}
intercept <- .which_intercept(aov.sum$term)
if (length(intercept) > 0) {
aov.sum <- aov.sum[-intercept, ]
}
aov.sum
}
aov_stat_core <- function(aov.sum, type) {
intercept <- .which_intercept(aov.sum$term)
if (length(intercept) > 0) {
aov.sum <- aov.sum[-intercept, ]
}
# get mean squared of residuals
meansq.resid <- aov.sum[["meansq"]][nrow(aov.sum)]
# get total sum of squares
ss.total <- sum(aov.sum[["sumsq"]])
# get sum of squares of residuals
ss.resid <- aov.sum[["sumsq"]][nrow(aov.sum)]
# number of terms in model
n_terms <- nrow(aov.sum) - 1
# number of observations
N <- sum(aov.sum[["df"]]) + 1
aovstat <- switch(type,
# compute omega squared for each model term
omega = unlist(lapply(1:n_terms, function(x) {
ss.term <- aov.sum[["sumsq"]][x]
df.term <- aov.sum[["df"]][x]
(ss.term - df.term * meansq.resid) / (ss.total + meansq.resid)
})),
# compute partial omega squared for each model term
pomega = unlist(lapply(1:n_terms, function(x) {
df.term <- aov.sum[["df"]][x]
meansq.term <- aov.sum[["meansq"]][x]
(df.term * (meansq.term - meansq.resid)) / (df.term * meansq.term + (N - df.term) * meansq.resid)
})),
# compute epsilon squared for each model term
epsilon = unlist(lapply(1:n_terms, function(x) {
ss.term <- aov.sum[["sumsq"]][x]
df.term <- aov.sum[["df"]][x]
(ss.term - df.term * meansq.resid) / ss.total
})),
# compute eta squared for each model term
eta = unlist(lapply(1:n_terms, function(x) {
aov.sum[["sumsq"]][x] / sum(aov.sum[["sumsq"]])
})),
# compute partial eta squared for each model term
cohens.f = ,
peta = unlist(lapply(1:n_terms, function(x) {
aov.sum[["sumsq"]][x] / (aov.sum[["sumsq"]][x] + ss.resid)
}))
)
# compute Cohen's F
if (type == "cohens.f") aovstat <- sqrt(aovstat / (1 - aovstat))
# give values names of terms
names(aovstat) <- aov.sum[["term"]][1:n_terms]
aovstat
}
.which_intercept <- function(x) {
which(tolower(x) %in% c("(intercept)_zi", "intercept (zero-inflated)", "intercept", "zi_intercept", "(intercept)", "b_intercept", "b_zi_intercept"))
}
.calculate_power <- function(df1, df2, effect_size) {
if (any(effect_size < 0)) {
return(NA)
}
if (!is.null(df1) && any(df1 < 1)) {
return(NA)
}
if (!is.null(df2) && any(df2 < 1)) {
return(NA)
}
lambda <- effect_size * (df1 + df2 + 1)
stats::pf(
stats::qf(0.05, df1 = df1, df2 = df2, lower.tail = FALSE),
df1 = df1,
df2 = df2,
ncp = lambda,
lower.tail = FALSE
)
}
sjstats/R/gof.R 0000644 0001762 0000144 00000007272 14620351262 013055 0 ustar ligges users #' @title Compute model quality
#' @name chisq_gof
#'
#' @description For logistic regression models, performs a Chi-squared
#' goodness-of-fit-test.
#'
#' @param x A numeric vector or a \code{glm}-object.
#' @param prob Vector of probabilities (indicating the population probabilities)
#' of the same length as \code{x}'s amount of categories / factor levels.
#' Use \code{nrow(table(x))} to determine the amount of necessary values
#' for \code{prob}. Only used, when \code{x} is a vector, and not a
#' \code{glm}-object.
#' @param weights Vector with weights, used to weight \code{x}.
#'
#' @references
#' Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc.
#'
#' @details For vectors, this function is a convenient function for the
#' \code{chisq.test()}, performing goodness-of-fit test. For
#' \code{glm}-objects, this function performs a goodness-of-fit test.
#' A well-fitting model shows \emph{no} significant difference between the
#' model and the observed data, i.e. the reported p-values should be
#' greater than 0.05.
#'
#' @return For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}.
#' For \code{glm}-objects, an object of class \code{chisq_gof} with
#' following values: \code{p.value}, the p-value for the goodness-of-fit test;
#' \code{z.score}, the standardized z-score for the goodness-of-fit test;
#' \code{rss}, the residual sums of squares term and \code{chisq}, the pearson
#' chi-squared statistic.
#'
#' @examples
#' data(efc)
#' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
#' m <- glm(
#' neg_c_7d ~ c161sex + barthtot + c172code,
#' data = efc,
#' family = binomial(link = "logit")
#' )
#'
#' # goodness-of-fit test for logistic regression
#' chisq_gof(m)
#'
#' # goodness-of-fit test for vectors against probabilities
#' # differing from population
#' chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28))
#'
#' # equal to population
#' chisq_gof(efc$e42dep, prop.table(table(efc$e42dep)))
#'
#' @export
chisq_gof <- function(x, prob = NULL, weights = NULL) {
if (inherits(x, "glm")) {
# This is an adapted version from the
# "binomTools" package. The "X2GOFtest()"
# function did not work when model data frame
# had missing values.
y_hat <- stats::fitted(x)
wt <- x$prior.weight
vJ <- wt * y_hat * (1 - y_hat)
cJ <- (1 - 2 * y_hat) / vJ
X2 <- sum(stats::resid(x, type = "pearson")^2)
form <- stats::as.formula(x$formula)
form[[2]] <- as.name("cJ")
# use model matrix instead of data values,
# because data may contain more variables
# than needed, and due to missing may have
# different row length
dat <- stats::na.omit(x$model)
dat$cJ <- cJ
dat$vJ <- vJ
RSS <- sum(vJ * stats::resid(stats::lm(form, data = dat, weights = vJ))^2)
A <- 2 * (length(y_hat) - sum(1 / wt))
z <- (X2 - x$df.residual) / sqrt(A + RSS)
p.value <- 2 * stats::pnorm(abs(z), lower.tail = FALSE)
chi2gof <- list(
p.value = p.value,
z.score = z,
rss = RSS,
chisq = X2
)
class(chi2gof) <- c("sj_chi2gof", "list")
} else {
# check if we have probs
if (is.null(prob)) {
warning("`prob` needs to be specified.", call. = F)
return(invisible(NULL))
}
# performs a Chi-square goodnes-of-fit-test
if (!is.null(weights)) x <- weight(x, weights)
dummy <- as.vector(table(x))
# goodness of fit-test. x is one-dimensional and
# y not given
chi2gof <- stats::chisq.test(dummy, p = prob)
}
chi2gof
}
sjstats/R/phi.R 0000644 0001762 0000144 00000002257 14620351262 013060 0 ustar ligges users #' @rdname crosstable_statistics
#' @export
phi <- function(tab, ...) {
UseMethod("phi")
}
#' @export
phi.table <- function(tab, ...) {
.phi(tab)
}
#' @export
phi.ftable <- function(tab, ...) {
.phi(tab)
}
#' @export
phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
formula_terms <- all.vars(formula)
tab <- table(data[[formula_terms[1]]], data[[formula_terms[2]]])
method <- match.arg(method)
if (is.null(ci.lvl) || is.na(ci.lvl)) {
.cramers_v(tab)
} else {
straps <- sjstats::bootstrap(data[formula_terms], n)
tables <- lapply(straps$strap, function(x) {
dat <- as.data.frame(x)
table(dat[[1]], dat[[2]])
})
phis <- sapply(tables, .phi)
ci <- boot_ci(phis, ci.lvl = ci.lvl, method = method)
data_frame(
phi = .phi(tab),
conf.low = ci$conf.low,
conf.high = ci$conf.high
)
}
}
.phi <- function(tab) {
insight::check_if_installed("MASS")
# convert to flat table
if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)
tb <- summary(MASS::loglm(~1 + 2, tab))$tests
sqrt(tb[2, 1] / sum(tab))
}
sjstats/R/cv_error.R 0000644 0001762 0000144 00000005212 14620351262 014113 0 ustar ligges users #' @title Test and training error from model cross-validation
#' @name cv_error
#'
#' @description \code{cv_error()} computes the root mean squared error from a model fitted
#' to kfold cross-validated test-training-data. \code{cv_compare()}
#' does the same, for multiple formulas at once (by calling \code{cv_error()}
#' for each formula).
#'
#' @param data A data frame.
#' @param formula The formula to fit the linear model for the test and training data.
#' @param formulas A list of formulas, to fit linear models for the test and training data.
#' @param k The number of folds for the kfold-crossvalidation.
#'
#' @return A data frame with the root mean squared errors for the training and test data.
#'
#' @details \code{cv_error()} first generates cross-validated test-training pairs, using
#' \code{\link[modelr]{crossv_kfold}} and then fits a linear model, which
#' is described in \code{formula}, to the training data. Then, predictions
#' for the test data are computed, based on the trained models.
#' The \emph{training error} is the mean value of the \code{\link{rmse}} for
#' all \emph{trained} models; the \emph{test error} is the rmse based on all
#' residuals from the test data.
#'
#' @examples
#' data(efc)
#' cv_error(efc, neg_c_7 ~ barthtot + c161sex)
#'
#' cv_compare(efc, formulas = list(
#' neg_c_7 ~ barthtot + c161sex,
#' neg_c_7 ~ barthtot + c161sex + e42dep,
#' neg_c_7 ~ barthtot + c12hour
#' ))
#'
#' @export
cv_error <- function(data, formula, k = 5) {
insight::check_if_installed("datawizard")
# response
resp <- insight::find_response(formula)
# compute cross validation data
cv_data <- lapply(k, function(i) {
datawizard::data_partition(data, proportion = 0.8)
})
# get train and test datasets
train_data <- lapply(cv_data, function(cvdat) cvdat[[1]])
test_data <- lapply(cv_data, function(cvdat) cvdat[[2]])
# fit models on datasets
trained_models <- lapply(train_data, function(x) stats::lm(formula, data = x))
test_models <- lapply(test_data, function(x) stats::lm(formula, data = x))
# RMSE
train_error <- mean(vapply(trained_models, performance::rmse, numeric(1)), na.rm = TRUE)
test_error <- mean(vapply(test_models, performance::rmse, numeric(1)), na.rm = TRUE)
data_frame(
model = deparse(formula),
train.error = round(train_error, 4),
test.error = round(test_error, 4)
)
}
#' @rdname cv_error
#' @export
cv_compare <- function(data, formulas, k = 5) {
do.call(rbind, lapply(formulas, function(f) cv_error(data, formula = f, k = k)))
}
sjstats/R/cv.R 0000644 0001762 0000144 00000003113 14620351262 012700 0 ustar ligges users #' @title Compute model quality
#' @name cv
#'
#' @description Compute the coefficient of variation.
#'
#' @param x Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4})
#' or \code{lme} (\pkg{nlme}).
#' @param ... More fitted model objects, to compute multiple coefficients of
#' variation at once.
#'
#' @details The advantage of the cv is that it is unitless. This allows
#' coefficient of variation to be compared to each other in ways
#' that other measures, like standard deviations or root mean
#' squared residuals, cannot be.
#'
#' @return Numeric, the coefficient of variation.
#'
#' @examples
#' data(efc)
#' fit <- lm(barthtot ~ c160age + c12hour, data = efc)
#' cv(fit)
#'
#' @export
cv <- function(x, ...) {
# return value
cv_ <- cv_helper(x)
# check if we have multiple parameters
if (nargs() > 1) {
# get input list
params_ <- list(...)
cv_ <- c(cv_, sapply(params_, cv_helper))
}
cv_
}
cv_helper <- function(x) {
insight::check_if_installed("performance")
# check if we have a fitted linear model
if (inherits(x, c("lm", "lmerMod", "lme", "merModLmerTest")) && !inherits(x, "glm")) {
# get response
dv <- insight::get_response(x)
mw <- mean(dv, na.rm = TRUE)
stddev <- performance::rmse(x)
} else {
mw <- mean(x, na.rm = TRUE)
stddev <- stats::sd(x, na.rm = TRUE)
}
# check if mean is zero?
if (mw == 0)
stop("Mean of dependent variable is zero. Cannot compute model's coefficient of variation.", call. = F)
stddev / mw
}
sjstats/R/is_prime.R 0000644 0001762 0000144 00000001214 14620351262 014077 0 ustar ligges users #' @title Find prime numbers
#' @name is_prime
#'
#' @description This functions checks whether a number is, or numbers in a
#' vector are prime numbers.
#'
#' @param x An integer, or a vector of integers.
#'
#' @return `TRUE` for each prime number in `x`, `FALSE` otherwise.
#'
#' @examples
#' is_prime(89)
#' is_prime(15)
#' is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
#'
#' @export
is_prime <- function(x) {
if (is.numeric(x) && !all(x %% 1 == 0, na.rm = TRUE)) {
insight::format_error("`x` needs to be an integer value.")
}
vapply(x, function(.x) .x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0), logical(1))
}
sjstats/R/confint_ncg.R 0000644 0001762 0000144 00000007437 14620351262 014574 0 ustar ligges users # This function is a modified version from package MBESS
# copied from https://github.com/cran/MBESS/blob/master/R/conf.limits.ncf.R
# Author: Ken Kelley
# License: GPL-3
confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = NULL) {
alpha.lower <- alpha.upper <- (1 - conf.level) / 2
tol <- 1e-09
Jumping.Prop <- 0.1
FAILED <- NULL
LL.0 <- stats::qf(p = alpha.lower * 5e-04, df1 = df.1, df2 = df.2)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) - (1 - alpha.lower)
if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < (1 - alpha.lower)) {
FAILED <- if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = 0) < 1 - alpha.lower)
LL.0 <- 1e-08
if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < 1 - alpha.lower)
FAILED <- TRUE
}
if (is.null(FAILED)) {
LL.1 <- LL.2 <- LL.0
while (Diff > tol) {
LL.2 <- LL.1 * (1 + Jumping.Prop)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.2) - (1 - alpha.lower)
LL.1 <- LL.2
}
LL.1 <- LL.2 / (1 + Jumping.Prop)
LL.Bounds <- c(LL.1, (LL.1 + LL.2) / 2, LL.2)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower)
while (abs(Diff) > tol) {
Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[1]) - (1 - alpha.lower) > tol
Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) > tol
Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[3]) - (1 - alpha.lower) > tol
if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) {
LL.Bounds <-
c(LL.Bounds[2], (LL.Bounds[2] + LL.Bounds[3]) / 2, LL.Bounds[3])
}
if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) {
LL.Bounds <-
c(LL.Bounds[1], (LL.Bounds[1] + LL.Bounds[2]) / 2, LL.Bounds[2])
}
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower)
}
LL <- LL.Bounds[2]
}
if (!is.null(FAILED)) LL <- NA
FAILED.Up <- NULL
UL.0 <- stats::qf(p = 1 - alpha.upper * 5e-04, df1 = df.1, df2 = df.2)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper
if (Diff < 0) UL.0 <- 1e-08
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper
if (Diff < 0) FAILED.Up <- TRUE
if (is.null(FAILED.Up)) {
UL.1 <- UL.2 <- UL.0
while (Diff > tol) {
UL.2 <- UL.1 * (1 + Jumping.Prop)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.2) - alpha.upper
UL.1 <- UL.2
}
UL.1 <- UL.2 / (1 + Jumping.Prop)
UL.Bounds <- c(UL.1, (UL.1 + UL.2) / 2, UL.2)
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper
while (abs(Diff) > tol) {
Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[1]) - alpha.upper > tol
Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper > tol
Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[3]) - alpha.upper > tol
if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) {
UL.Bounds <-
c(UL.Bounds[2], (UL.Bounds[2] + UL.Bounds[3]) / 2, UL.Bounds[3])
}
if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) {
UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] + UL.Bounds[2]) / 2, UL.Bounds[2])
}
Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper
}
UL <- UL.Bounds[2]
}
if (!is.null(FAILED.Up)) UL <- NA
list(Lower.Limit = LL, Upper.Limit = UL)
}
sjstats/R/chi_squared_test.R 0000644 0001762 0000144 00000024747 14623373000 015633 0 ustar ligges users #' @title Chi-Squared test
#' @name chi_squared_test
#' @description This function performs a \eqn{\chi^2} test for contingency
#' tables or tests for given probabilities. The returned effects sizes are
#' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi})
#' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against
#' given probabilities (see _Ben-Shachar et al. 2023_).
#'
#' @param probabilities A numeric vector of probabilities for each cell in the
#' contingency table. The length of the vector must match the number of cells
#' in the table, i.e. the number of unique levels of the variable specified
#' in `select`. If `probabilities` is provided, a chi-squared test for given
#' probabilities is conducted. Furthermore, if `probabilities` is given, `by`
#' must be `NULL`. The probabilities must sum to 1.
#' @param paired Logical, if `TRUE`, a McNemar test is conducted for 2x2 tables.
#' Note that `paired` only works for 2x2 tables.
#' @param ... Additional arguments passed down to [`chisq.test()`].
#' @inheritParams mann_whitney_test
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @inherit mann_whitney_test seealso
#'
#' @return A data frame with test results. The returned effects sizes are
#' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi})
#' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against
#' given probabilities.
#'
#' @details The function is a wrapper around [`chisq.test()`] and
#' [`fisher.test()`] (for small expected values) for contingency tables, and
#' `chisq.test()` for given probabilities. When `probabilities` are provided,
#' these are rescaled to sum to 1 (i.e. `rescale.p = TRUE`). When `fisher.test()`
#' is called, simulated p-values are returned (i.e. `simulate.p.value = TRUE`,
#' see `?fisher.test`). If `paired = TRUE` and a 2x2 table is provided,
#' a McNemar test (see [`mcnemar.test()`]) is conducted.
#'
#' The weighted version of the chi-squared test is based on the a weighted
#' table, using [`xtabs()`] as input for `chisq.test()`.
#'
#' Interpretation of effect sizes are based on rules described in
#' [`effectsize::interpret_phi()`], [`effectsize::interpret_cramers_v()`],
#' and [`effectsize::interpret_fei()`]. Use these function directly to get other
#' interpretations, by providing the returned effect size as argument, e.g.
#' `interpret_phi(0.35, rules = "gignac2016")`.
#'
#' @references
#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
#' \doi{10.3390/math11091982}
#'
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @examplesIf requireNamespace("effectsize") && requireNamespace("MASS")
#' data(efc)
#' efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
#'
#' # Chi-squared test
#' chi_squared_test(efc, "c161sex", by = "e16sex")
#'
#' # weighted Chi-squared test
#' chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")
#'
#' # Chi-squared test for given probabilities
#' chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
#' @export
chi_squared_test <- function(data,
select = NULL,
by = NULL,
probabilities = NULL,
weights = NULL,
paired = FALSE,
...) {
# sanity check - if we only have one variable in "select" and "by" and
# "probabilities" are NULL, set probalities
if (is.null(probabilities) && !is.null(select) && is.null(by) && length(select) == 1) {
probabilities <- rep(1 / length(data[[select]]), length(data[[select]]))
}
if (is.null(probabilities)) {
.calculate_chisq(data, select, by, weights, paired, ...)
} else {
# sanity check - `paired = TRUE` is not available for given probabilities
if (paired) {
insight::format_error("When `probabilities` are provided, `paired = TRUE` is not available.") # nolint
}
.calculate_chisq_gof(data, select, probabilities, weights, ...)
}
}
# Mann-Whitney-Test for two groups --------------------------------------------
.calculate_chisq <- function(data, select, by, weights, paired = FALSE, ...) {
insight::check_if_installed(c("datawizard", "MASS"))
# sanity checks
.sanitize_htest_input(data, select, by, weights)
# get data
grp1 <- data[[select]]
grp2 <- data[[by]]
# if paired = TRUE, we only allow a 2x2 table
if (paired && (length(stats::na.omit(unique(grp1))) != 2 || length(stats::na.omit(unique(grp2))) != 2)) {
insight::format_error("When `paired = TRUE`, only 2x2 tables are allowed (i.e. both variables must have exactly two levels).") # nolint
}
# create data frame for table
x <- data.frame(
grp1 = datawizard::to_factor(grp1),
grp2 = datawizard::to_factor(grp2)
)
# add weights
if (!is.null(weights)) {
x$weights <- data[[weights]]
}
# remove missings
x <- stats::na.omit(x)
# contingency table
if (is.null(weights)) {
tab <- table(x)
} else {
tab <- as.table(round(stats::xtabs(x[[3]] ~ x[[1]] + x[[2]])))
class(tab) <- "table"
}
# expected values, to identify whether Fisher's test is needed
expected_values <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # nolint
# paired? mc-nemar test
if (paired) {
htest <- suppressWarnings(stats::mcnemar.test(tab, ...))
test_statistic <- htest$statistic
} else {
# chi-squared test
htest <- suppressWarnings(stats::chisq.test(tab, ...))
test_statistic <- htest$statistic
# need fisher?
if (min(expected_values) < 5 || (min(expected_values) < 10 && htest$parameter == 1)) {
htest <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)
}
}
p_value <- htest$p.value
# effect size
if (nrow(tab) > 2 || ncol(tab) > 2) {
effect_size <- stats::setNames(cramer(tab), "Cramer's V")
} else {
effect_size <- stats::setNames(phi(tab), "Phi")
}
# return result
out <- data.frame(
data = paste(select, "by", by),
statistic_name = "Chi-squared",
statistic = test_statistic,
effect_size_name = names(effect_size),
effect_size = as.numeric(effect_size),
p = p_value,
df = (nrow(tab) - 1) * (ncol(tab) - 1),
n_obs = sum(tab, na.rm = TRUE),
stringsAsFactors = FALSE
)
class(out) <- c("sj_htest_chi", "data.frame")
attr(out, "weighted") <- !is.null(weights)
attr(out, "fisher") <- isTRUE(startsWith(htest$method, "Fisher"))
attr(out, "mcnemar") <- isTRUE(paired)
attr(out, "caption") <- "contingency tables"
out
}
.calculate_chisq_gof <- function(data, select, probabilities, weights, ...) {
insight::check_if_installed("effectsize")
# get data
x <- data.frame(grp = data[[select]])
# add weights
if (!is.null(weights)) {
x$weights <- data[[weights]]
}
# remove missings
x <- stats::na.omit(x)
# contingency table
if (is.null(weights)) {
tab <- table(x)
} else {
tab <- as.table(round(stats::xtabs(x[[2]] ~ x[[1]])))
class(tab) <- "table"
}
# table dimensions
n_rows <- nlevels(droplevels(as.factor(x$grp)))
# sanity check
if (length(probabilities) != n_rows) {
insight::format_error("Length of probabilities must match number of cells in table (i.e. number of levels of input factor).") # nolint
}
if (!isTRUE(all.equal(sum(probabilities), 1))) {
insight::format_error("Probabilities must sum to 1.")
}
# chi-squared test
htest <- suppressWarnings(stats::chisq.test(tab, p = probabilities, rescale.p = TRUE, ...))
test_statistic <- htest$statistic
p_value <- htest$p.value
effect_size <- effectsize::chisq_to_fei(
test_statistic,
n = sum(tab),
nrow = n_rows,
ncol = 1,
p = probabilities,
alternative = "two.sided"
)$Fei
# return result
out <- data.frame(
data = paste(
select,
"against probabilities",
datawizard::text_concatenate(sprintf("%i%%", round(100 * probabilities)))
),
statistic_name = "Chi-squared",
statistic = test_statistic,
effect_size_name = "Fei",
effect_size = as.numeric(effect_size),
p = p_value,
df = n_rows - 1,
n_obs = sum(tab, na.rm = TRUE),
stringsAsFactors = FALSE
)
class(out) <- c("sj_htest_chi", "data.frame")
attr(out, "caption") <- "given probabilities"
attr(out, "weighted") <- !is.null(weights)
out
}
# methods ---------------------------------------------------------------------
#' @export
print.sj_htest_chi <- function(x, ...) {
weighted <- attributes(x)$weighted
if (weighted) {
weight_string <- " (weighted)"
} else {
weight_string <- ""
}
fisher <- attributes(x)$fisher
mcnemar <- attributes(x)$mcnemar
# headline
insight::print_color(sprintf(
"# Chi-squared test for %s%s\n",
attributes(x)$caption,
weight_string
), "blue")
# Fisher's exact test?
if (isTRUE(fisher)) {
insight::print_color(" (using Fisher's exact test due to small expected values)\n", "blue") # nolint
} else if (isTRUE(mcnemar)) {
insight::print_color(" (using McNemar's test for paired data)\n", "blue") # nolint
}
cat("\n")
# data info
insight::print_color(
sprintf(" Data: %s (n = %i)\n", x$data, round(x$n_obs)),
"cyan"
)
# prepare and align strings
eff_symbol <- .format_symbols(x$effect_size_name)
stat_symbol <- .format_symbols(x$statistic_name)
# string for effectsizes
eff_string <- switch(x$effect_size_name,
Fei = sprintf(
"%s = %.3f (%s effect)",
eff_symbol,
x$effect_size,
effectsize::interpret_fei(x$effect_size)
),
Phi = sprintf(
"%s = %.3f (%s effect)",
eff_symbol,
x$effect_size,
effectsize::interpret_phi(x$effect_size)
),
sprintf(
"Cramer's V = %.3f (%s effect)",
x$effect_size,
effectsize::interpret_cramers_v(x$effect_size)
)
)
cat(sprintf(
"\n %s = %.3f, %s, df = %i, %s\n\n",
stat_symbol, x$statistic, eff_string, round(x$df), insight::format_p(x$p)
))
}
sjstats/R/svyglmzip.R 0000644 0001762 0000144 00000007675 14620351262 014355 0 ustar ligges users utils::globalVariables("scaled.weights")
#' @title Survey-weighted zero-inflated Poisson model
#' @name svyglm.zip
#' @description \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package
#' to fit survey-weighted zero-inflated Poisson models. It uses
#' \code{\link[survey]{svymle}} to fit sampling-weighted
#' maximum likelihood estimates, based on starting values provided
#' by \code{\link[pscl]{zeroinfl}}.
#'
#'
#' @param formula An object of class \code{formula}, i.e. a symbolic description
#' of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}.
#' @param design An object of class \code{\link[survey]{svydesign}}, providing
#' a specification of the survey design.
#' @param ... Other arguments passed down to \code{\link[pscl]{zeroinfl}}.
#'
#' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip},
#' with some additional information about the model.
#'
#' @details Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/.
#'
#' @examples
#' if (require("survey")) {
#' data(nhanes_sample)
#' set.seed(123)
#' nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2)
#' nhanes_sample$malepartners[sample(1:2992, 400)] <- 0
#'
#' # create survey design
#' des <- svydesign(
#' id = ~SDMVPSU,
#' strat = ~SDMVSTRA,
#' weights = ~WTINT2YR,
#' nest = TRUE,
#' data = nhanes_sample
#' )
#'
#' # fit negative binomial regression
#' fit <- svyglm.zip(
#' malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1),
#' des
#' )
#'
#' # print coefficients and standard errors
#' fit
#' }
#' @export
svyglm.zip <- function(formula, design, ...) {
insight::check_if_installed(c("survey", "pscl"))
# get design weights. we need to scale these weights for the glm.nb() function
dw <- stats::weights(design)
# update design with scaled weights
design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))
# fit ZIP model, with scaled design weights
mod <- suppressWarnings(pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...))
ff <- insight::find_formula(mod)
# fit survey model, using maximum likelihood estimation
svyfit <-
survey::svymle(
loglike = sjstats_loglik_zip,
grad = sjstats_score_zip,
design = design,
formulas = list(eta = ff$conditional, logitp = ff$zero_inflated),
start = stats::coef(mod),
na.action = "na.omit"
)
# add additoinal information
class(svyfit) <- c("svyglm.zip", class(svyfit))
attr(svyfit, "zip.terms") <- all.vars(formula)
attr(svyfit, "zip.formula") <- formula
svyfit$deviance <- mod$deviance
svyfit$df.residuals <- mod$df.residuals
svyfit$df <- length(stats::coef(mod)) + 1
svyfit$aic <- mod$aic
svyfit
}
# log-likelihood function used in "svymle()"
sjstats_loglik_zip <- function(y, eta, logitp) {
mu <- exp(eta)
p <- exp(logitp) / (1 + exp(logitp))
log(p * (y == 0) + (1 - p) * stats::dpois(y, mu))
}
sjstats_dlogitp = function(y, eta, logitp) {
mu <- exp(eta)
p <- exp(logitp) / (1 + exp(logitp))
dexpit <- p / (1 + p) ^ 2
num <- dexpit * (y == 0) - dexpit * stats::dpois(y, mu)
denom <- p * (y == 0) + (1 - p) * stats::dpois(y, mu)
num / denom
}
# derivative
sjstats_deta_zip <- function(y, eta, logitp) {
mu <- exp(eta)
p <- exp(logitp) / (1 + exp(logitp))
dmutoy <- 0 * y
dmutoy[y > 0] = exp(-mu[y > 0]) * mu[y > 0] ^ (y[y > 0] - 1) / factorial(y[y > 0] - 1)
num = (1 - p) * (-stats::dpois(y, mu) + dmutoy)
denom = p * (y == 0) + (1 - p) * stats::dpois(y, mu)
num / denom
}
# score function, combines derivatives
sjstats_score_zip <- function(y, eta, logitp) {
cbind(sjstats_deta_zip(y, eta, logitp), sjstats_dlogitp(y, eta, logitp))
}
sjstats/R/S3-methods.R 0000644 0001762 0000144 00000026121 14620351262 014222 0 ustar ligges users #' @export
print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) {
se <- match.arg(se)
sm <- tidy_svyglm.nb(x, digits, v_se = se)[-1, -2]
pan <- ifelse(sm$p.value < 0.001, "<0.001 ***",
ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint
ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint
ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint
sprintf("%.*f ", digits, sm$p.value)
)
)
)
)
sm$p.value <- pan
print(sm, ...)
# add dispersion parameter
cat(sprintf("\nDispersion parameter Theta: %.*f", digits, attr(x, "nb.theta", exact = TRUE)))
cat(sprintf("\n Standard Error of Theta: %.*f", digits, attr(x, "nb.theta.se", exact = TRUE)))
message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se))
}
#' @export
print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) {
se <- match.arg(se)
sm <- tidy_svyglm.zip(x, digits, v_se = se)[-1, ]
pan <- ifelse(sm$p.value < 0.001, "<0.001 ***",
ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint
ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint
ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint
sprintf("%.*f ", digits, sm$p.value)
)
)
)
)
sm$p.value <- pan
print(sm, ...)
message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se))
}
tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) {
v_se <- match.arg(v_se)
if (!isNamespaceLoaded("survey"))
requireNamespace("survey", quietly = TRUE)
# keep original value, not rounded
est <- stats::coef(x)
se <- sqrt(diag(stats::vcov(x, stderr = v_se)))
data_frame(
term = substring(names(stats::coef(x)), 5),
estimate = round(est, digits),
irr = round(exp(est), digits),
std.error = round(se, digits),
conf.low = round(exp(est - stats::qnorm(0.975) * se), digits),
conf.high = round(exp(est + stats::qnorm(0.975) * se), digits),
p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits)
)
}
tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) {
v_se <- match.arg(v_se)
if (!isNamespaceLoaded("survey"))
requireNamespace("survey", quietly = TRUE)
# keep original value, not rounded
est <- stats::coef(x)
se <- sqrt(diag(stats::vcov(x, stderr = v_se)))
data_frame(
term = substring(names(stats::coef(x)), 5),
estimate = round(est, digits),
std.error = round(se, digits),
conf.low = round(exp(est - stats::qnorm(0.975) * se), digits),
conf.high = round(exp(est + stats::qnorm(0.975) * se), digits),
p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits)
)
}
#' @export
model.frame.svyglm.nb <- function(formula, ...) {
pred <- attr(formula, "nb.terms", exact = TRUE)
formula$design$variables[intersect(pred, colnames(formula$design$variables))]
}
#' @export
model.frame.svyglm.zip <- function(formula, ...) {
pred <- attr(formula, "zip.terms", exact = TRUE)
formula$design$variables[intersect(pred, colnames(formula$design$variables))]
}
#' @importFrom stats family
#' @export
family.svyglm.nb <- function(object, ...) {
attr(object, "family", exact = TRUE)
}
#' @export
formula.svyglm.nb <- function(x, ...) {
attr(x, "nb.formula", exact = TRUE)
}
#' @export
formula.svyglm.zip <- function(x, ...) {
attr(x, "zip.formula", exact = TRUE)
}
#' @export
predict.svyglm.nb <- function(object, newdata = NULL,
type = c("link", "response", "terms"),
se.fit = FALSE, dispersion = NULL, terms = NULL,
na.action = stats::na.pass, ...) {
insight::check_if_installed(c("survey", "MASS"))
fnb <- MASS::glm.nb(
attr(object, "nb.formula", exact = TRUE),
data = object$design$variables,
weights = scaled.weights
)
cf <- stats::coef(fnb)
names.cf <- names(cf)
cf <- stats::coef(object)[-1]
cf <- stats::setNames(cf, names.cf)
fnb$coefficients <- cf
stats::predict.glm(
object = fnb,
newdata = newdata,
type = type,
se.fit = se.fit,
dispersion = dispersion,
terms = terms,
na.action = na.action,
...
)
}
#' @export
residuals.svyglm.nb <- function(object, ...) {
if (!isNamespaceLoaded("survey"))
requireNamespace("survey", quietly = TRUE)
fnb <- MASS::glm.nb(
attr(object, "nb.formula", exact = TRUE),
data = object$design$variables,
weights = scaled.weights
)
y <- insight::get_response(fnb)
mu <- stats::predict.glm(fnb, type = "response")
wts <- fnb$prior.weights
(y - mu) * sqrt(wts) / sqrt(fnb$family$variance(mu))
}
#' @export
terms.svyglm.nb <- function(x, ...) {
if (!isNamespaceLoaded("survey"))
requireNamespace("survey", quietly = TRUE)
stats::terms(stats::formula(x), ...)
}
#' @export
AIC.svyglm.nb <- function(object, ...) {
## FIXME this one just returns the AIC of the underlying glm.nb() model
aics <- lapply(list(object, ...), getaic)
as.data.frame(do.call(rbind, aics))
}
getaic <- function(x) {
c(df = x$df, AIC = x$aic)
}
#' @export
deviance.svyglm.nb <- function(object, ...) {
## FIXME this one just returns the deviance of the underlying glm.nb() model
object$deviance
}
#' @export
as.integer.sj_resample <- function(x, ...) {
x$id
}
#' @export
as.data.frame.sj_resample <- function(x, ...) {
x$data[x$id, , drop = FALSE]
}
#' @export
print.sj_resample <- function(x, ...) {
n <- length(x$id)
if (n > 12)
id10 <- c(x$id[1:12], "...")
else
id10 <- x$id
cat("<", paste0(
"id's of resample [", prettyNum(nrow(x$data), big.mark = ","), " x ",
prettyNum(ncol(x$data), big.mark = ","), "]"
), "> ",
toString(id10), "\n",
sep = ""
)
}
#' @export
plot.sj_inequ_trend <- function(x, ...) {
.data <- NULL
insight::check_if_installed("ggplot2")
# add time indicator
x$data$zeit <- seq_len(nrow(x$data))
# get gather column names
gather.cols1 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "lo", "hi")]
gather.cols2 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "rr", "rd")]
# gather data to plot rr and rd
dat1 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y")
# gather data for raw prevalences
dat2 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y")
# Proper value names, for facet labels
dat1$grp[dat1$grp == "rr"] <- "Rate Ratios"
dat1$grp[dat1$grp == "rd"] <- "Rate Differences"
# plot prevalences
gp1 <- ggplot2::ggplot(dat2, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
ggplot2::geom_smooth(method = "loess", se = FALSE) +
ggplot2::labs(title = "Prevalance Rates for Lower and Higher SES Groups",
y = "Prevalances", x = "Time", colour = "") +
ggplot2::scale_color_manual(values = c("darkblue", "darkred"), labels = c("High SES", "Low SES"))
# plot rr and rd
gp2 <- ggplot2::ggplot(dat1, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
ggplot2::geom_smooth(method = "loess", se = FALSE) +
ggplot2::facet_wrap(~grp, ncol = 1, scales = "free") +
ggplot2::labs(title = "Proportional Change in Rate Ratios and Rate Differences",
colour = NULL, y = NULL, x = "Time") +
ggplot2::guides(colour = "none")
suppressMessages(graphics::plot(gp1))
suppressMessages(graphics::plot(gp2))
}
#' @export
print.sj_xtab_stat <- function(x, ...) {
# get length of method name, to align output
l <- max(nchar(c(x$method, x$stat.name, "p-value", "Observations")))
# headline
insight::print_color("\n# Measure of Association for Contingency Tables\n", "blue")
# used fisher?
if (x$fisher)
insight::print_color(" (using Fisher's Exact Test)\n", "blue")
cat("\n")
# print test statistic
cat(sprintf(" %*s: %.4f\n", l, x$stat.name, x$statistic))
cat(sprintf(" %*s: %.4f\n", l, x$method, x$estimate))
cat(sprintf(" %*s: %g\n", l, "df", x$df))
cat(sprintf(" %*s: %s\n", l, "p-value", insight::format_p(x$p.value, stars = TRUE, name = NULL)))
cat(sprintf(" %*s: %g\n", l, "Observations", x$n_obs))
}
#' @export
print.sj_chi2gof <- function(x, ...) {
insight::print_color("\n# Chi-squared Goodness-of-Fit Test\n\n", "blue")
v1 <- sprintf("%.3f", x$chisq)
v2 <- sprintf("%.3f", x$z.score)
v3 <- sprintf("%.3f", x$p.value)
space <- max(nchar(c(v1, v2, v3)))
cat(sprintf(" Chi-squared: %*s\n", space, v1))
cat(sprintf(" z-score: %*s\n", space, v2))
cat(sprintf(" p-value: %*s\n\n", space, v3))
if (x$p.value >= 0.05)
message("Summary: model seems to fit well.")
else
message("Summary: model does not fit well.")
}
#' @export
print.sj_ttest <- function(x, ...) {
insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue")
group <- attr(x, "group.name", exact = TRUE)
xn <- attr(x, "x.name", exact = TRUE)
yn <- attr(x, "y.name", exact = TRUE)
if (!is.null(group))
verbs <- c("of", "by")
else
verbs <- c("between", "and")
st <- sprintf("# t=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$df), x$p.value)
if (!is.null(yn)) {
insight::print_color(sprintf("\n# comparison %s %s %s %s\n", verbs[1], xn, verbs[2], yn), "cyan")
}
insight::print_color(st, "cyan")
if (!is.null(yn)) {
if (!is.null(group)) {
l1 <- sprintf("mean in group %s", group[1])
l2 <- sprintf("mean in group %s", group[2])
} else {
l1 <- sprintf("mean of %s", xn)
l2 <- sprintf("mean of %s", yn)
}
l3 <- "difference of mean"
slen <- max(nchar(c(l1, l2, l3)))
cat(sprintf(" %s: %.3f\n", format(l1, width = slen), x$estimate[1]))
cat(sprintf(" %s: %.3f\n", format(l2, width = slen), x$estimate[2]))
cat(sprintf(" %s: %.3f [%.3f %.3f]\n", format(l3, width = slen), x$estimate[1] - x$estimate[2], x$ci[1], x$ci[2]))
} else {
cat(sprintf(" mean of %s: %.3f [%.3f, %.3f]\n", xn, x$estimate[1], x$ci[1], x$ci[2]))
}
cat("\n")
}
#' @export
print.sj_wcor <- function(x, ...) {
insight::print_color(sprintf("\nWeighted %s\n\n", x$method), "blue")
if (!is.null(x$ci)) {
cilvl <- sprintf("%.2i%%", as.integer(100 * x$ci.lvl))
cat(sprintf(" estimate [%s CI]: %.3f [%.3f %.3f]\n", cilvl, x$estimate, x$ci[1], x$ci[2]))
cat(sprintf(" p-value: %.3f\n\n", x$p.value))
} else {
cat(sprintf(" estimate: %.3f\n", x$estimate))
cat(sprintf(" p-value: %.3f\n\n", x$p.value))
}
}
#' @export
print.sj_anova_stat <- function(x, digits = 3, ...) {
x$p.value <- insight::format_p(x$p.value, name = NULL)
cat(insight::export_table(x, digits = digits, protect_integers = TRUE))
}
sjstats/R/find_beta.R 0000644 0001762 0000144 00000014217 14620351262 014212 0 ustar ligges users #' @title Determining distribution parameters
#' @name find_beta
#'
#' @description `find_beta()`, `find_normal()` and `find_cauchy()` find the
#' shape, mean and standard deviation resp. the location and scale parameters
#' to describe the beta, normal or cauchy distribution, based on two
#' percentiles. `find_beta2()` finds the shape parameters for a Beta
#' distribution, based on a probability value and its standard error
#' or confidence intervals.
#'
#' @param x1 Value for the first percentile.
#' @param p1 Probability of the first percentile.
#' @param x2 Value for the second percentile.
#' @param p2 Probability of the second percentile.
#' @param x Numeric, a probability value between 0 and 1. Typically indicates
#' a prevalence rate of an outcome of interest; Or an integer value
#' with the number of observed events. In this case, specify `n`
#' to indicate the toral number of observations.
#' @param se The standard error of `x`. Either `se` or `ci` must
#' be specified.
#' @param ci The upper limit of the confidence interval of `x`. Either
#' `se` or `ci` must be specified.
#' @param n Numeric, number of total observations. Needs to be specified, if
#' `x` is an integer (number of observed events), and no
#' probability. See 'Examples'.
#'
#' @return A list of length two, with the two distribution parameters than can
#' be used to define the distribution, which (best) describes
#' the shape for the given input parameters.
#'
#' @details These functions can be used to find parameter for various distributions,
#' to define prior probabilities for Bayesian analyses. `x1`, `p1`, `x2` and
#' `p2` are parameters that describe two quantiles. Given this knowledge, the
#' distribution parameters are returned.
#'
#' Use `find_beta2()`, if the known parameters are, e.g. a prevalence rate or
#' similar probability, and its standard deviation or confidence interval. In
#' this case. `x` should be a probability, for example a prevalence rate of a
#' certain event. `se` then needs to be the standard error for this probability.
#' Alternatively, `ci` can be specified, which should indicate the upper limit
#' of the confidence interval od the probability (prevalence rate) `x`. If the
#' number of events out of a total number of trials is known (e.g. 12 heads out
#' of 30 coin tosses), `x` can also be the number of observed events, while `n`
#' indicates the total amount of trials (in the above example, the function
#' call would be: `find_beta2(x = 12, n = 30)`).
#'
#' @references Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF})
#'
#' @examples
#' # example from blogpost:
#' # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/
#' # 10% of patients respond within 30 days of treatment
#' # and 80% respond within 90 days of treatment
#' find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#'
#' parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' curve(
#' dnorm(x, mean = parms$mean, sd = parms$sd),
#' from = 0, to = 200
#' )
#'
#' parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' curve(
#' dcauchy(x, location = parms$location, scale = parms$scale),
#' from = 0, to = 200
#' )
#'
#'
#' find_beta2(x = .25, ci = .5)
#'
#' shapes <- find_beta2(x = .25, ci = .5)
#' curve(dbeta(x, shapes[[1]], shapes[[2]]))
#'
#' # find Beta distribution for 3 events out of 20 observations
#' find_beta2(x = 3, n = 20)
#'
#' shapes <- find_beta2(x = 3, n = 20)
#' curve(dbeta(x, shapes[[1]], shapes[[2]]))
#'
#' @export
find_beta <- function(x1, p1, x2, p2) {
logK <- seq(-5, 10, length = 200)
K <- exp(logK)
m <- unlist(lapply(K, betaprior, x = x1, p = p1))
prob2 <- stats::pbeta(x2, K * m, K * (1 - m))
ind <- ((prob2 > 0) & (prob2 < 1))
app <- stats::approx(prob2[ind], logK[ind], p2)
K0 <- exp(app$y)
m0 <- betaprior(K0, x1, p1)
s1 <- K0 * m0
s2 <- K0 * (1 - m0)
list(shape1 = s1, shape2 = s2)
}
betaprior <- function(K, x, p) {
m.lo <- 0
m.hi <- 1
flag <- TRUE
while (flag) {
m0 <- (m.lo + m.hi) / 2
p0 <- stats::pbeta(x, K * m0, K * (1 - m0))
if (p0 < p)
m.hi <- m0
else
m.lo <- m0
if (abs(p0 - p) < 1e-04) flag <- FALSE
}
m0
}
#' @rdname find_beta
#' @export
find_beta2 <- function(x, se, ci, n) {
# check if all required arguments are given
if (missing(se) && missing(ci) && missing(n)) {
insight::format_error("Either `se` or `ci`, or `n` must be specified.")
}
# for number of observations, compute variance of beta distribution
if (!missing(n)) {
if (!is.integer(x) && x < 1)
insight::format_error("If `n` is given, x` must be an integer value greater than 0.")
# compute 2 SD from beta variance
bvar <- 2 * sqrt((x * n) / ((x + n)^2 * (x + n + 1)))
# need to compute proportion
x <- x / n
p2 <- 0.95
x2 <- x + bvar
}
# for standard errors, we assume a 68% quantile
if (!missing(se)) {
p2 <- 0.68
x2 <- x + se
}
# for CI, we assume a 68% quantile
if (!missing(ci)) {
p2 <- 0.95
x2 <- ci
}
# the probability is assumed to be the median
p1 <- 0.5
x1 <- x
find_beta(x1, p1, x2, p2)
}
#' @rdname find_beta
#' @export
find_cauchy <- function(x1, p1, x2, p2) {
# find location paramater
l <- (x1 * stats::qcauchy(p2) ^ -1 - x2 * stats::qcauchy(p1) ^ -1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1)
s <- (x2 - x1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1)
list(location = l, scale = s)
}
#' @rdname find_beta
#' @export
find_normal <- function(x1, p1, x2, p2) {
# find location paramater
mw <- (x1 * stats::qnorm(p2) ^ -1 - x2 * stats::qnorm(p1) ^ -1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1)
stddev <- (x2 - x1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1)
list(mean = mw, sd = stddev)
}
sjstats/R/bootstrap.R 0000644 0001762 0000144 00000010204 14620351262 014304 0 ustar ligges users #' @title Generate nonparametric bootstrap replications
#' @name bootstrap
#'
#' @description Generates \code{n} bootstrap samples of \code{data} and
#' returns the bootstrapped data frames as list-variable.
#'
#' @param data A data frame.
#' @param n Number of bootstraps to be generated.
#' @param size Optional, size of the bootstrap samples. May either be a number
#' between 1 and \code{nrow(data)} or a value between 0 and 1 to sample
#' a proportion of observations from \code{data} (see 'Examples').
#'
#' @return A data frame with one column: a list-variable
#' \code{strap}, which contains resample-objects of class \code{sj_resample}.
#' These resample-objects are lists with three elements:
#' \enumerate{
#' \item the original data frame, \code{data}
#' \item the rownmumbers \code{id}, i.e. rownumbers of \code{data}, indicating the resampled rows with replacement
#' \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap})
#' }
#'
#' @details By default, each bootstrap sample has the same number of observations
#' as \code{data}. To generate bootstrap samples without resampling
#' same observations (i.e. sampling without replacement), use
#' \code{size} to get bootstrapped data with a specific number
#' of observations. However, specifying the \code{size}-argument is much
#' less memory-efficient than the bootstrap with replacement. Hence,
#' it is recommended to ignore the \code{size}-argument, if it is
#' not really needed.
#'
#' @note This function applies nonparametric bootstrapping, i.e. the function
#' draws samples with replacement.
#' \cr \cr
#' There is an \code{as.data.frame}- and a \code{print}-method to get or
#' print the resampled data frames. See 'Examples'. The \code{as.data.frame}-
#' method automatically applies whenever coercion is done because a data
#' frame is required as input. See 'Examples' in \code{\link{boot_ci}}.
#'
#'
#' @seealso \code{\link{boot_ci}} to calculate confidence intervals from
#' bootstrap samples.
#'
#' @examples
#' data(efc)
#' bs <- bootstrap(efc, 5)
#'
#' # now run models for each bootstrapped sample
#' lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x))
#'
#' # generate bootstrap samples with 600 observations for each sample
#' bs <- bootstrap(efc, 5, 600)
#'
#' # generate bootstrap samples with 70% observations of the original sample size
#' bs <- bootstrap(efc, 5, .7)
#'
#' # compute standard error for a simple vector from bootstraps
#' # use the `as.data.frame()`-method to get the resampled
#' # data frame
#' bs <- bootstrap(efc, 100)
#' bs$c12hour <- unlist(lapply(bs$strap, function(x) {
#' mean(as.data.frame(x)$c12hour, na.rm = TRUE)
#' }))
#'
#' # bootstrapped standard error
#' boot_se(bs, "c12hour")
#'
#' # bootstrapped CI
#' boot_ci(bs, "c12hour")
#' @export
bootstrap <- function(data, n, size) {
if (!missing(size) && !is.null(size)) {
# check for valid range
if (size < 0 || size > nrow(data))
stop("`size` must be greater than 0, but not greater than number of rows of `data`.", call. = F)
# check if we want proportions
if (size < 1) size <- as.integer(nrow(data) * size)
# generate bootstraps w/o replacement
repl <- FALSE
} else {
# size = observations
size <- nrow(data)
# generate bootstraps with replacement
repl <- TRUE
}
# generate bootstrap resamples
strap <- replicate(n, resample(data, size, repl), simplify = FALSE)
# add resample ID, may be used for other functions (like 'se()' for 'icc()')
for (i in seq_len(length(strap))) strap[[i]]$resample.id <- i
# return daza frame
data.frame(strap = I(strap))
}
resample <- function(data, size, replace) {
structure(
class = "sj_resample",
list(
data = data,
id = sample(nrow(data), size = size, replace = replace)
))
}
sjstats/R/wtd_variance.R 0000644 0001762 0000144 00000000401 14620351262 014733 0 ustar ligges users weighted_variance <- function(x, w) {
if (is.null(w)) w <- rep(1, length(x))
x[is.na(w)] <- NA
w[is.na(x)] <- NA
w <- stats::na.omit(w)
x <- stats::na.omit(x)
xbar <- sum(w * x) / sum(w)
sum(w * ((x - xbar)^2)) / (sum(w) - 1)
}
sjstats/R/re-exports.R 0000644 0001762 0000144 00000000700 14620367120 014377 0 ustar ligges users #' @importFrom performance mse
#' @export
performance::mse
#' @importFrom performance rmse
#' @export
performance::rmse
#' @importFrom insight link_inverse
#' @export
insight::link_inverse
#' @importFrom datawizard weighted_sd
#' @export
datawizard::weighted_sd
#' @importFrom datawizard weighted_mean
#' @export
datawizard::weighted_mean
#' @importFrom datawizard weighted_median
#' @export
datawizard::weighted_median
sjstats/R/samplesize_mixed.R 0000644 0001762 0000144 00000011332 14620351262 015634 0 ustar ligges users #' @title Sample size for linear mixed models
#' @name samplesize_mixed
#'
#' @description Compute an approximated sample size for linear mixed models
#' (two-level-designs), based on power-calculation for standard
#' design and adjusted for design effect for 2-level-designs.
#'
#' @param eff.size Effect size.
#' @param df.n Optional argument for the degrees of freedom for numerator. See 'Details'.
#' @param power Power of test (1 minus Type II error probability).
#' @param sig.level Significance level (Type I error probability).
#' @param k Number of cluster groups (level-2-unit) in multilevel-design.
#' @param n Optional, number of observations per cluster groups
#' (level-2-unit) in multilevel-design.
#' @param icc Expected intraclass correlation coefficient for multilevel-model.
#'
#' @return A list with two values: The number of subjects per cluster, and the
#' total sample size for the linear mixed model.
#'
#' @references Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum.
#' \cr \cr
#' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257.
#' \cr \cr
#' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd.
#'
#' @details The sample size calculation is based on a power-calculation for the
#' standard design. If \code{df.n} is not specified, a power-calculation
#' for an unpaired two-sample t-test will be computed (using
#' \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package).
#' If \code{df.n} is given, a power-calculation for general linear models
#' will be computed (using \code{\link[pwr]{pwr.f2.test}} of the
#' \pkg{pwr}-package). The sample size of the standard design
#' is then adjusted for the design effect of two-level-designs (see
#' \code{\link{design_effect}}). Thus, the sample size calculation is appropriate
#' in particular for two-level-designs (see \cite{Snijders 2005}). Models that
#' additionally include repeated measures (three-level-designs) may work
#' as well, however, the computed sample size may be less accurate.
#'
#' @examplesIf requireNamespace("pwr")
#' # Sample size for multilevel model with 30 cluster groups and a small to
#' # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
#' # hence a total sample size of about 802 observations is needed.
#' samplesize_mixed(eff.size = .3, k = 30)
#'
#' # Sample size for multilevel model with 20 cluster groups and a medium
#' # to large effect size for linear models of 0.2. Five subjects per cluster and
#' # hence a total sample size of about 107 observations is needed.
#' samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)
#' @export
samplesize_mixed <- function(eff.size,
df.n = NULL,
power = 0.8,
sig.level = 0.05,
k,
n,
icc = 0.05) {
if (!requireNamespace("pwr", quietly = TRUE)) {
stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE)
}
# compute sample size for standard design
if (is.null(df.n))
# if we have no degrees of freedom specified, use t-test
obs <- 2 * pwr::pwr.t.test(d = eff.size, sig.level = sig.level, power = power)$n
else
# we have df, so power-calc for linear models
obs <- pwr::pwr.f2.test(u = df.n, f2 = eff.size, sig.level = sig.level, power = power)$v + df.n + 1
# if we have no information on the number of observations per cluster,
# compute this number now
if (missing(n) || is.null(n)) {
n <- (obs * (1 - icc)) / (k - (obs * icc))
if (n < 1) {
warning("Minimum required number of subjects per cluster is negative and was adjusted to be positive. You may reduce the requirements for the multi-level structure (i.e. reduce `k` or `icc`), or you can increase the effect-size.", call. = FALSE)
n <- 1
}
}
# adjust standard design by design effect
total.n <- obs * design_effect(n = n, icc = icc)
# sample size for each group and total n
smpsz <- list(round(total.n / k), round(total.n))
# name list
names(smpsz) <- c("Subjects per Cluster", "Total Sample Size")
smpsz
}
#' @rdname samplesize_mixed
#' @export
smpsize_lmm <- samplesize_mixed
sjstats/R/t_test.R 0000644 0001762 0000144 00000032000 14623373000 013564 0 ustar ligges users #' @title Student's t test
#' @name t_test
#' @description This function performs a Student's t test for two independent
#' samples, for paired samples, or for one sample. It's a parametric test for
#' the null hypothesis that the means of two independent samples are equal, or
#' that the mean of one sample is equal to a specified value. The hypothesis
#' can be one- or two-sided.
#'
#' Unlike the underlying base R function `t.test()`, this function allows for
#' weighted tests and automatically calculates effect sizes. Cohen's _d_ is
#' returned for larger samples (n > 20), while Hedges' _g_ is returned for
#' smaller samples.
#'
#' @inheritParams mann_whitney_test
#' @param paired Logical, whether to compute a paired t-test for dependent
#' samples.
#' @inherit mann_whitney_test seealso
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @details Interpretation of effect sizes are based on rules described in
#' [`effectsize::interpret_cohens_d()`] and [`effectsize::interpret_hedges_g()`].
#' Use these function directly to get other interpretations, by providing the
#' returned effect size (_Cohen's d_ or _Hedges's g_ in this case) as argument,
#' e.g. `interpret_cohens_d(0.35, rules = "sawilowsky2009")`.
#'
#' @return A data frame with test results. Effectsize Cohen's _d_ is returned
#' for larger samples (n > 20), while Hedges' _g_ is returned for smaller samples.
#'
#' @references
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @examplesIf requireNamespace("effectsize")
#' data(sleep)
#' # one-sample t-test
#' t_test(sleep, "extra")
#' # base R equivalent
#' t.test(extra ~ 1, data = sleep)
#'
#' # two-sample t-test, by group
#' t_test(mtcars, "mpg", by = "am")
#' # base R equivalent
#' t.test(mpg ~ am, data = mtcars)
#'
#' # paired t-test
#' t_test(mtcars, c("mpg", "hp"), paired = TRUE)
#' # base R equivalent
#' t.test(mtcars$mpg, mtcars$hp, data = mtcars, paired = TRUE)
#' @export
t_test <- function(data,
select = NULL,
by = NULL,
weights = NULL,
paired = FALSE,
mu = 0,
alternative = "two.sided") {
insight::check_if_installed(c("datawizard", "effectsize"))
alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
# sanity checks
.sanitize_htest_input(data, select, by, weights, test = "t_test")
data_name <- NULL
# filter and remove NA
data <- stats::na.omit(data[c(select, by, weights)])
# does select indicate more than one variable? We than reshape the data
# to have one continous scale and one grouping variable
if (length(select) > 1) {
# paired?
if (paired) {
# subtract the two variables for paired t-test, and "set" by to NULL
data[[select[1]]] <- data[[select[1]]] - data[[select[2]]]
data_name <- paste(select[1], "and", select[2])
select <- select[1]
by <- NULL
} else {
# we convert the data into long format, and create a grouping variable
data <- datawizard::data_to_long(
data[c(select, weights)],
select = select,
names_to = "group",
values_to = "scale"
)
by <- select[2]
select <- select[1]
# after converting to long, we have the "grouping" variable first in the data
colnames(data) <- c(weights, by, select)
}
}
# get data
dv <- data[[select]]
# for two-sample t-test...
if (!is.null(by)) {
grp <- data[[by]]
# coerce to factor
grp <- datawizard::to_factor(grp)
# only two groups allowed
if (insight::n_unique(grp) > 2) {
insight::format_error("Only two groups are allowed for Student's t test.") # nolint
}
# value labels
group_labels <- names(attr(data[[by]], "labels", exact = TRUE))
if (is.null(group_labels)) {
group_labels <- levels(droplevels(grp))
}
data_name <- paste(select, "by", by)
} else {
# one-sample t-test...
grp <- NULL
group_labels <- select
if (is.null(data_name)) {
data_name <- select
}
}
if (is.null(weights)) {
.calculate_ttest(dv, grp, mu, paired, alternative, group_labels, data_name)
} else {
.calculate_weighted_ttest(dv, grp, mu, paired, alternative, data[[weights]], group_labels, data_name)
}
}
# Mann-Whitney-Test for two groups --------------------------------------------
.calculate_ttest <- function(dv, grp, mu, paired, alternative, group_labels, data_name) {
insight::check_if_installed("effectsize")
# prepare data
if (is.null(grp)) {
tdat <- data.frame(dv)
t_formula <- stats::as.formula("dv ~ 1")
} else {
tdat <- data.frame(dv, grp)
t_formula <- stats::as.formula("dv ~ grp")
}
# perfom wilcox test
htest <- stats::t.test(
t_formula,
data = tdat,
alternative = alternative,
mu = mu
)
test_statistic <- htest$statistic
if (nrow(tdat) > 20) {
effect_size <- stats::setNames(
effectsize::cohens_d(
t_formula,
data = tdat,
alternative = alternative,
mu = mu
)$Cohens_d,
"Cohens_d"
)
} else {
effect_size <- stats::setNames(
effectsize::hedges_g(
t_formula,
data = tdat,
alternative = alternative,
mu = mu
)$Hedges_g,
"Hedges_g"
)
}
# return result
out <- data.frame(
data = data_name,
statistic_name = "t",
statistic = test_statistic,
effect_size_name = names(effect_size),
effect_size = as.numeric(effect_size),
p = as.numeric(htest$p.value),
df = as.numeric(htest$parameter),
method = ifelse(paired, "Paired t-test", htest$method),
alternative = alternative,
mu = mu,
stringsAsFactors = FALSE
)
class(out) <- c("sj_htest_t", "data.frame")
attr(out, "group_labels") <- group_labels
attr(out, "means") <- as.numeric(htest$estimate)
attr(out, "paired") <- isTRUE(paired)
attr(out, "one_sample") <- is.null(grp)
attr(out, "weighted") <- FALSE
if (!is.null(grp)) {
attr(out, "n_groups") <- stats::setNames(
c(as.numeric(table(grp))),
c("N Group 1", "N Group 2")
)
}
out
}
# Weighted Mann-Whitney-Test for two groups ----------------------------------
.calculate_weighted_ttest <- function(dv, grp, mu, paired, alternative, weights, group_labels, data_name) {
insight::check_if_installed(c("datawizard", "effectsize"))
if (is.null(grp)) {
dat <- stats::na.omit(data.frame(dv, weights))
colnames(dat) <- c("y", "w")
x_values <- dat$y
x_weights <- dat$w
y_values <- NULL
# group N's
n_groups <- stats::setNames(round(sum(x_weights)), "N Group 1")
} else {
dat <- stats::na.omit(data.frame(dv, grp, weights))
colnames(dat) <- c("y", "g", "w")
# unique groups
groups <- unique(dat$g)
# values for sample 1
x_values <- dat$y[dat$g == groups[1]]
x_weights <- dat$w[dat$g == groups[1]]
# values for sample 2
y_values <- dat$y[dat$g == groups[2]]
y_weights <- dat$w[dat$g == groups[2]]
# group N's
n_groups <- stats::setNames(
c(round(sum(x_weights)), round(sum(y_weights))),
c("N Group 1", "N Group 2")
)
}
mu_x <- stats::weighted.mean(x_values, x_weights, na.rm = TRUE)
var_x <- datawizard::weighted_sd(x_values, x_weights)^2
se_x <- sqrt(var_x / length(x_values))
if (paired || is.null(y_values)) {
# paired
se <- se_x
dof <- length(x_values) - 1
test_statistic <- (mu_x - mu) / se
estimate <- mu_x
method <- if (paired) "Paired t-test" else "One Sample t-test"
} else {
# unpaired t-test
mu_y <- stats::weighted.mean(y_values, y_weights)
var_y <- datawizard::weighted_sd(y_values, y_weights)^2
se_y <- sqrt(var_y / length(y_values))
se <- sqrt(se_x^2 + se_y^2)
dof <- se^4 / (se_x^4 / (length(x_values) - 1) + se_y^4 / (length(y_values) - 1))
test_statistic <- (mu_x - mu_y - mu) / se
estimate <- c(mu_x, mu_y)
method <- "Two-Sample t-test"
}
# p-values
if (alternative == "less") {
pval <- stats::pt(test_statistic, dof)
} else if (alternative == "greater") {
pval <- stats::pt(test_statistic, dof, lower.tail = FALSE)
} else {
pval <- 2 * stats::pt(-abs(test_statistic), dof)
}
# effect size
dat$y <- dat$y * dat$w
if (is.null(y_values)) {
t_formula <- stats::as.formula("y ~ 1")
} else {
t_formula <- stats::as.formula("y ~ g")
}
if (nrow(dat) > 20) {
effect_size <- stats::setNames(
effectsize::cohens_d(
t_formula,
data = dat,
alternative = alternative,
mu = mu,
paired = FALSE
)$Cohens_d,
"Cohens_d"
)
} else {
effect_size <- stats::setNames(
effectsize::hedges_g(
t_formula,
data = dat,
alternative = alternative,
mu = mu,
paired = FALSE
)$Hedges_g,
"Hedges_g"
)
}
# return result
out <- data.frame(
data = data_name,
statistic_name = "t",
statistic = test_statistic,
effect_size_name = names(effect_size),
effect_size = as.numeric(effect_size),
p = pval,
df = dof,
method = method,
alternative = alternative,
mu = mu,
stringsAsFactors = FALSE
)
class(out) <- c("sj_htest_t", "data.frame")
attr(out, "means") <- estimate
attr(out, "n_groups") <- n_groups
attr(out, "means") <- estimate
attr(out, "group_labels") <- group_labels
attr(out, "paired") <- isTRUE(paired)
attr(out, "one_sample") <- is.null(y_values) && !isTRUE(paired)
attr(out, "weighted") <- TRUE
out
}
# methods ---------------------------------------------------------------------
#' @export
print.sj_htest_t <- function(x, ...) {
insight::check_if_installed("effectsize")
# fetch attributes
group_labels <- attributes(x)$group_labels
means <- attributes(x)$means
n_groups <- attributes(x)$n_groups
weighted <- attributes(x)$weighted
paired <- isTRUE(attributes(x)$paired)
one_sample <- isTRUE(attributes(x)$one_sample)
if (weighted) {
weight_string <- " (weighted)"
} else {
weight_string <- ""
}
# same width
group_labels <- format(group_labels)
# header
insight::print_color(sprintf("# %s%s\n\n", x$method, weight_string), "blue")
# print for paired t-test
if (paired) {
# data
insight::print_color(sprintf(
" Data: %s (mean difference = %s)\n",
x$data,
insight::format_value(means[1], protect_integers = TRUE)
), "cyan")
} else {
# data
insight::print_color(sprintf(" Data: %s\n", x$data), "cyan")
# group-1-info
if (is.null(n_groups)) {
insight::print_color(
sprintf(
" Group 1: %s (mean = %s)\n",
group_labels[1], insight::format_value(means[1], protect_integers = TRUE)
), "cyan"
)
} else {
insight::print_color(
sprintf(
" Group 1: %s (n = %i, mean = %s)\n",
group_labels[1], n_groups[1], insight::format_value(means[1], protect_integers = TRUE)
), "cyan"
)
}
# group-2-info
if (length(group_labels) > 1) {
if (is.null(n_groups)) {
insight::print_color(
sprintf(
" Group 2: %s (mean = %s)\n",
group_labels[2], insight::format_value(means[2], protect_integers = TRUE)
), "cyan"
)
} else {
insight::print_color(
sprintf(
" Group 2: %s (n = %i, mean = %s)\n",
group_labels[2], n_groups[2], insight::format_value(means[2], protect_integers = TRUE)
), "cyan"
)
}
}
}
# alternative hypothesis
alt_string <- switch(x$alternative,
two.sided = "not equal to",
less = "less than",
greater = "greater than"
)
if (one_sample) {
alt_string <- paste("true mean is", alt_string, x$mu)
} else if (paired) {
alt_string <- paste("true mean difference is", alt_string, x$mu)
} else {
alt_string <- paste("true difference in means is", alt_string, x$mu)
}
insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan")
# string for effectsizes
if (x$effect_size_name == "Cohens_d") {
eff_string <- sprintf(
"Cohen's d = %.2f (%s effect)",
x$effect_size,
effectsize::interpret_cohens_d(x$effect_size)
)
} else {
eff_string <- sprintf(
"Hedges' g = %.2f (%s effect)",
x$effect_size,
effectsize::interpret_hedges_g(x$effect_size)
)
}
cat(sprintf(
"\n t = %.2f, %s, df = %s, %s\n\n",
x$statistic,
eff_string,
insight::format_value(x$df, digits = 1, protect_integers = TRUE),
insight::format_p(x$p)
))
}
sjstats/R/wtd_se.R 0000644 0001762 0000144 00000004733 14620351262 013566 0 ustar ligges users #' @title Weighted statistics for variables
#' @name weighted_se
#' @description
#' `weighted_se()` computes weighted standard errors of a variable or for
#' all variables of a data frame. `survey_median()` computes the median for
#' a variable in a survey-design (see [`survey::svydesign()]`).
#' `weighted_correlation()` computes a weighted correlation for a two-sided
#' alternative hypothesis.
#'
#' @param x (Numeric) vector or a data frame. For `survey_median()` or `weighted_ttest()`,
#' the bare (unquoted) variable name, or a character vector with the variable name.
#' @param weights Bare (unquoted) variable name, or a character vector with
#' the variable name of the numeric vector of weights. If `weights = NULL`,
#' unweighted statistic is reported.
#' @param data A data frame.
#' @param formula A formula of the form `lhs ~ rhs1 + rhs2` where `lhs` is a
#' numeric variable giving the data values and `rhs1` a factor with two
#' levels giving the corresponding groups and `rhs2` a variable with weights.
#' @param y Optional, bare (unquoted) variable name, or a character vector with
#' the variable name.
#' @param ci.lvl Confidence level of the interval.
#' @param ... Currently not used.
#'
#' @inheritParams svyglm.nb
#'
#' @return The weighted (test) statistic.
#'
#' @examplesIf requireNamespace("survey")
#' data(efc)
#' weighted_se(efc$c12hour, abs(runif(n = nrow(efc))))
#'
#' # survey_median ----
#' # median for variables from weighted survey designs
#' data(nhanes_sample)
#'
#' des <- survey::svydesign(
#' id = ~SDMVPSU,
#' strat = ~SDMVSTRA,
#' weights = ~WTINT2YR,
#' nest = TRUE,
#' data = nhanes_sample
#' )
#' survey_median(total, des)
#' survey_median("total", des)
#' @export
weighted_se <- function(x, weights = NULL) {
UseMethod("weighted_se")
}
#' @export
weighted_se.data.frame <- function(x, weights = NULL) {
se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights)
names(se_result) <- colnames(x)
se_result
}
#' @export
weighted_se.matrix <- function(x, weights = NULL) {
se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights)
names(se_result) <- colnames(x)
se_result
}
#' @export
weighted_se.default <- function(x, weights = NULL) {
weighted_se_helper(x, weights)
}
weighted_se_helper <- function(x, weights) {
if (is.null(weights)) weights <- rep(1, length(x))
sqrt(weighted_variance(x, weights) / length(stats::na.omit(x)))
}
sjstats/R/nhanes_sample.R 0000644 0001762 0000144 00000001505 13737567502 015126 0 ustar ligges users #' @docType data
#' @title Sample dataset from the National Health and Nutrition Examination Survey
#' @name nhanes_sample
#' @keywords data
#'
#' @description Selected variables from the National Health and Nutrition Examination
#' Survey that are used in the example from Lumley (2010), Appendix E.
#' See \code{\link{svyglm.nb}} for examples.
#'
#' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
NULL
#' @docType data
#' @title Sample dataset from the EUROFAMCARE project
#' @name efc
#' @keywords data
#'
#' @description German data set from the European study on family care of older people.
#'
#' @references Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008.
NULL
sjstats/R/Deprecated.R 0000644 0001762 0000144 00000003405 14617352174 014345 0 ustar ligges users #' @title Deprecated functions
#' @name r2
#' @description A list of deprecated functions.
#'
#' @param x An object.
#' @param ... Currently not used.
#'
#' @return Nothing.
#'
#' @export
r2 <- function(x) {
.Defunct("performance::r2()")
performance::r2(x)
}
#' @rdname r2
#' @export
cohens_f <- function(x, ...) {
.Defunct("effectsize::cohens_f()")
effectsize::cohens_f(x)
}
#' @rdname r2
#' @export
eta_sq <- function(x, ...) {
.Defunct("effectsize::eta_squared()")
effectsize::eta_squared(x)
}
#' @rdname r2
#' @export
epsilon_sq <- function(x, ...) {
.Defunct("effectsize::epsilon_squared()")
effectsize::epsilon_squared(x)
}
#' @rdname r2
#' @export
omega_sq <- function(x, ...) {
.Defunct("effectsize::omega_sqared()")
effectsize::omega_squared(x)
}
#' @rdname r2
#' @export
scale_weights <- function(x, ...) {
.Defunct("datawizard::rescale_weights()")
datawizard::rescale_weights(x, ...)
}
#' @rdname r2
#' @export
robust <- function(x, ...) {
.Defunct("parameters::standard_error()")
parameters::standard_error(x, ...)
}
#' @rdname r2
#' @export
icc <- function(x) {
.Defunct("performance::icc()")
performance::icc(x)
}
#' @rdname r2
#' @export
p_value <- function(x, ...) {
.Defunct("parameters::p_value()")
parameters::p_value(x)
}
#' @rdname r2
#' @export
se <- function(x, ...) {
.Defunct("parameters::standard_error()")
parameters::standard_error(x)
}
#' @rdname r2
#' @export
means_by_group <- function(x, ...) {
.Defunct("datawizard::means_by_group()")
datawizard::means_by_group(x, ...)
}
#' @rdname r2
#' @export
mean_n <- function(x, ...) {
.Defunct("datawizard::row_means()")
datawizard::row_means(x, ...)
}
sjstats/R/helpfunctions.R 0000644 0001762 0000144 00000006720 14620351262 015160 0 ustar ligges users # Help-functions
data_frame <- function(...) {
x <- data.frame(..., stringsAsFactors = FALSE)
rownames(x) <- NULL
x
}
is_merMod <- function(fit) {
inherits(fit, c("lmerMod", "glmerMod", "nlmerMod", "merModLmerTest"))
}
.compact_character <- function(x) {
x[!sapply(x, function(i) is.null(i) || !nzchar(i, keepNA = TRUE) || is.na(i) || any(i == "NULL", na.rm = TRUE))]
}
.format_symbols <- function(x) {
if (.unicode_symbols()) {
x <- gsub("Delta", "\u0394", x, ignore.case = TRUE)
x <- gsub("Phi", "\u03D5", x, ignore.case = TRUE)
x <- gsub("Eta", "\u03B7", x, ignore.case = TRUE)
x <- gsub("Epsilon", "\u03b5", x, ignore.case = TRUE)
x <- gsub("Omega", "\u03b5", x, ignore.case = TRUE)
x <- gsub("R2", "R\u00b2", x, ignore.case = TRUE)
x <- gsub("Chi2", "\u03C7\u00b2", x, ignore.case = TRUE)
x <- gsub("Chi-squared", "\u03C7\u00b2", x, ignore.case = TRUE)
x <- gsub("Chi", "\u03C7", x, ignore.case = TRUE)
x <- gsub("Sigma", "\u03C3", x, ignore.case = TRUE)
x <- gsub("Rho", "\u03C1", x, ignore.case = TRUE)
x <- gsub("Mu", "\u03BC", x, ignore.case = TRUE)
x <- gsub("Theta", "\u03B8", x, ignore.case = TRUE)
x <- gsub("Fei", "\u05E4\u200E", x, ignore.case = TRUE)
}
x
}
.unicode_symbols <- function() {
win_os <- tryCatch(
{
si <- Sys.info()
if (is.null(si["sysname"])) {
FALSE
} else {
si["sysname"] == "Windows" || startsWith(R.version$os, "mingw")
}
},
error = function(e) {
TRUE
}
)
l10n_info()[["UTF-8"]] && ((win_os && getRversion() >= "4.2") || (!win_os && getRversion() >= "4.0"))
}
.is_pseudo_numeric <- function(x) {
(is.character(x) && !anyNA(suppressWarnings(as.numeric(stats::na.omit(x[nzchar(x, keepNA = TRUE)]))))) || (is.factor(x) && !anyNA(suppressWarnings(as.numeric(levels(x))))) # nolint
}
.misspelled_string <- function(source, searchterm, default_message = NULL) {
if (is.null(searchterm) || length(searchterm) < 1) {
return(default_message)
}
# used for many matches
more_found <- ""
# init default
msg <- ""
# remove matching strings
same <- intersect(source, searchterm)
searchterm <- setdiff(searchterm, same)
source <- setdiff(source, same)
# guess the misspelled string
possible_strings <- unlist(lapply(searchterm, function(s) {
source[.fuzzy_grep(source, s)] # nolint
}), use.names = FALSE)
if (length(possible_strings)) {
msg <- "Did you mean "
if (length(possible_strings) > 1) {
# make sure we don't print dozens of alternatives for larger data frames
if (length(possible_strings) > 5) {
more_found <- sprintf(
" We even found %i more possible matches, not shown here.",
length(possible_strings) - 5
)
possible_strings <- possible_strings[1:5]
}
msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\"")))
} else {
msg <- paste0(msg, "\"", possible_strings, "\"")
}
msg <- paste0(msg, "?", more_found)
} else {
msg <- default_message
}
# no double white space
insight::trim_ws(msg)
}
.fuzzy_grep <- function(x, pattern, precision = NULL) {
if (is.null(precision)) {
precision <- round(nchar(pattern) / 3)
}
if (precision > nchar(pattern)) {
return(NULL)
}
p <- sprintf("(%s){~%i}", pattern, precision)
grep(pattern = p, x = x, ignore.case = FALSE)
}
sjstats/R/cramer.R 0000644 0001762 0000144 00000002354 14620351262 013547 0 ustar ligges users #' @rdname crosstable_statistics
#' @export
cramers_v <- function(tab, ...) {
UseMethod("cramers_v")
}
#' @rdname crosstable_statistics
#' @export
cramer <- cramers_v
#' @export
cramers_v.table <- function(tab, ...) {
.cramers_v(tab)
}
#' @export
cramers_v.ftable <- function(tab, ...) {
.cramers_v(tab)
}
#' @rdname crosstable_statistics
#' @export
cramers_v.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
fterms <- all.vars(formula)
tab <- table(data[[fterms[1]]], data[[fterms[2]]])
method <- match.arg(method)
if (is.null(ci.lvl) || is.na(ci.lvl)) {
.cramers_v(tab)
} else {
straps <- sjstats::bootstrap(data[fterms], n)
tables <- lapply(straps$strap, function(x) {
dat <- as.data.frame(x)
table(dat[[1]], dat[[2]])
})
cramers <- sapply(tables, .cramers_v)
ci <- boot_ci(cramers, ci.lvl = ci.lvl, method = method)
data_frame(
cramer = .cramers_v(tab),
conf.low = ci$conf.low,
conf.high = ci$conf.high
)
}
}
.cramers_v <- function(tab) {
# convert to flat table
if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)
sqrt(phi(tab)^2 / min(dim(tab) - 1))
}
sjstats/R/boot_ci.R 0000644 0001762 0000144 00000013614 14620351262 013715 0 ustar ligges users #' @title Standard error and confidence intervals for bootstrapped estimates
#' @name boot_ci
#'
#' @description Compute nonparametric bootstrap estimate, standard error,
#' confidence intervals and p-value for a vector of bootstrap
#' replicate estimates.
#'
#' @param data A data frame that containts the vector with bootstrapped
#' estimates, or directly the vector (see 'Examples').
#' @param ci.lvl Numeric, the level of the confidence intervals.
#' @param select Optional, unquoted names of variables (as character vector)
#' with bootstrapped estimates. Required, if either `data` is a data frame
#' (and no vector), and only selected variables from `data` should be processed.
#' @param method Character vector, indicating if confidence intervals should be
#' based on bootstrap standard error, multiplied by the value of the quantile
#' function of the t-distribution (default), or on sample quantiles of the
#' bootstrapped values. See 'Details' in `boot_ci()`. May be abbreviated.
#'
#' @return A data frame with either bootstrap estimate, standard error, the
#' lower and upper confidence intervals or the p-value for all bootstrapped
#' estimates.
#'
#' @details The methods require one or more vectors of bootstrap replicate
#' estimates as input.
#'
#' - `boot_est()`: returns the bootstrapped estimate, simply by computing
#' the mean value of all bootstrap estimates.
#' - `boot_se()`: computes the nonparametric bootstrap standard error by
#' calculating the standard deviation of the input vector.
#' - The mean value of the input vector and its standard error is used by
#' `boot_ci()` to calculate the lower and upper confidence interval,
#' assuming a t-distribution of bootstrap estimate replicates (for
#' `method = "dist"`, the default, which is
#' `mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)`); for
#' `method = "quantile"`, 95\% sample quantiles are used to compute the
#' confidence intervals (`quantile(x, probs = c(0.025, 0.975))`). Use
#' `ci.lvl` to change the level for the confidence interval.
#' - P-values from `boot_p()` are also based on t-statistics, assuming normal
#' distribution.
#'
#' @references Carpenter J, Bithell J. Bootstrap confdence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164
#'
#' @seealso []`bootstrap()`] to generate nonparametric bootstrap samples.
#'
#' @examples
#' data(efc)
#' bs <- bootstrap(efc, 100)
#'
#' # now run models for each bootstrapped sample
#' bs$models <- lapply(
#' bs$strap,
#' function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x)
#' )
#'
#' # extract coefficient "dependency" and "gender" from each model
#' bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1))
#' bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1))
#'
#' # get bootstrapped confidence intervals
#' boot_ci(bs$dependency)
#'
#' # compare with model fit
#' fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc)
#' confint(fit)[2, ]
#'
#' # alternative function calls.
#' boot_ci(bs$dependency)
#' boot_ci(bs, "dependency")
#' boot_ci(bs, c("dependency", "gender"))
#' boot_ci(bs, c("dependency", "gender"), method = "q")
#'
#'
#' # compare coefficients
#' mean(bs$dependency)
#' boot_est(bs$dependency)
#' coef(fit)[2]
#' @export
boot_ci <- function(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) {
# match arguments
method <- match.arg(method)
# evaluate arguments, generate data
if (is.null(select)) {
.dat <- as.data.frame(data)
} else {
.dat <- data[select]
}
# compute confidence intervals for all values
transform_boot_result(lapply(.dat, function(x) {
# check if method should be based on t-distribution of
# bootstrap values or quantiles
if (method == "dist") {
# get bootstrap standard error
bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = TRUE)
# lower and upper confidence interval
ci <- mean(x, na.rm = TRUE) + c(-bootse, bootse)
} else {
# CI based on quantiles of bootstrapped values
ci <- stats::quantile(x, probs = c((1 - ci.lvl) / 2, (1 + ci.lvl) / 2))
}
# give proper names
names(ci) <- c("conf.low", "conf.high")
ci
}))
}
#' @rdname boot_ci
#' @export
boot_se <- function(data, select = NULL) {
# evaluate arguments, generate data
if (is.null(select)) {
.dat <- as.data.frame(data)
} else {
.dat <- data[select]
}
# compute confidence intervalls for all values
transform_boot_result(lapply(.dat, function(x) {
# get bootstrap standard error
se <- stats::sd(x, na.rm = TRUE)
names(se) <- "std.err"
se
}))
}
#' @rdname boot_ci
#' @export
boot_p <- function(data, select = NULL) {
# evaluate arguments, generate data
if (is.null(select)) {
.dat <- as.data.frame(data)
} else {
.dat <- data[select]
}
# compute confidence intervalls for all values
transform_boot_result(lapply(.dat, function(x) {
# compute t-statistic
t.stat <- mean(x, na.rm = TRUE) / stats::sd(x, na.rm = TRUE)
# compute p-value
p <- 2 * stats::pt(abs(t.stat), df = length(x) - 1, lower.tail = FALSE)
names(p) <- "p.value"
p
}))
}
#' @rdname boot_ci
#' @export
boot_est <- function(data, select = NULL) {
# evaluate arguments, generate data
if (is.null(select)) {
.dat <- as.data.frame(data)
} else {
.dat <- data[select]
}
# compute mean for all values (= bootstrapped estimate)
transform_boot_result(lapply(.dat, function(x) {
estimate <- mean(x, na.rm = TRUE)
names(estimate) <- "estimate"
estimate
}))
}
transform_boot_result <- function(res) {
# transform a bit, so we have each estimate in a row, and ci's as columns...
rownames_as_column(as.data.frame(t(as.data.frame(res))), var = "term")
}
sjstats/R/weight.R 0000644 0001762 0000144 00000006235 14620351262 013567 0 ustar ligges users #' @title Weight a variable
#' @name weight
#' @description These functions weight the variable \code{x} by
#' a specific vector of \code{weights}.
#'
#' @param x (Unweighted) variable.
#' @param weights Vector with same length as \code{x}, which
#' contains weight factors. Each value of \code{x} has a
#' specific assigned weight in \code{weights}.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding the weighted values. By default, this value is
#' \code{0}, i.e. the returned values are integer values.
#'
#' @return The weighted \code{x}.
#'
#' @details \code{weight2()} sums up all \code{weights} values of the associated
#' categories of \code{x}, whereas \code{weight()} uses a
#' \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()}
#' may return a vector of different length than \code{x}.
#'
#' @note The values of the returned vector are in sorted order, whereas the values'
#' order of the original \code{x} may be spread randomly. Hence, \code{x} can't be
#' used, for instance, for further cross tabulation. In case you want to have
#' weighted contingency tables or (grouped) box plots etc., use the \code{weightBy}
#' argument of most functions.
#'
#' @examples
#' v <- sample(1:4, 20, TRUE)
#' table(v)
#' w <- abs(rnorm(20))
#' table(weight(v, w))
#' table(weight2(v, w))
#'
#' set.seed(1)
#' x <- sample(letters[1:5], size = 20, replace = TRUE)
#' w <- runif(n = 20)
#'
#' table(x)
#' table(weight(x, w))
#'
#' @export
weight <- function(x, weights, digits = 0) {
# remember if x is numeric
x.is.num <- is.numeric(x)
# init values
weightedvar <- c()
wtab <- round(stats::xtabs(weights ~ x,
data = data.frame(weights = weights, x = x),
na.action = stats::na.pass,
exclude = NULL),
digits = digits)
# iterate all table values
for (w in seq_len(length(wtab))) {
# retrieve count of each table cell
w_count <- wtab[[w]]
# retrieve "cell name" which is identical to the variable value
# first check whether values are numeric or not
nval_ <- suppressWarnings(as.numeric(names(wtab[w])))
# if value is not numeric, use as is
if (is.na(nval_))
w_value <- names(wtab[w])
else
# else, use numeric value
w_value <- nval_
# append variable value, repeating it "w_count" times.
weightedvar <- c(weightedvar, rep(w_value, w_count))
}
# if we have NA values, weighted var is coerced to character.
# coerce back to numeric then here
if (!is.numeric(weightedvar) && x.is.num) {
weightedvar <- datawizard::to_numeric(weightedvar, dummy_factors = FALSE)
}
# return result
weightedvar
}
#' @rdname weight
#' @export
weight2 <- function(x, weights) {
items <- unique(x)
newvar <- c()
for (i in seq_len(length(items))) {
newcount <- round(sum(weights[which(x == items[i])]))
newvar <- c(newvar, rep(items[i], newcount))
}
newvar
}
sjstats/R/se_ybar.R 0000644 0001762 0000144 00000003121 14620351262 013713 0 ustar ligges users #' @title Standard error of sample mean for mixed models
#' @name se_ybar
#'
#' @description Compute the standard error for the sample mean for mixed models,
#' regarding the extent to which clustering affects the standard errors.
#' May be used as part of the multilevel power calculation for cluster sampling
#' (see \cite{Gelman and Hill 2007, 447ff}).
#'
#' @param fit Fitted mixed effects model (\code{\link[lme4]{merMod}}-class).
#'
#' @return The standard error of the sample mean of \code{fit}.
#'
#' @references Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. Cambridge, New York: Cambridge University Press
#'
#' @examplesIf require("lme4")
#' fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
#' se_ybar(fit)
#' @export
se_ybar <- function(fit) {
# get model icc
vars <- insight::get_variance(fit, verbose = FALSE)
# get group variances
tau.00 <- unname(vars$var.intercept)
# total variance
tot_var <- sum(tau.00, vars$var.residual)
# get number of groups
m.cnt <- vapply(fit@flist, nlevels, 1)
# compute number of observations per group (level-2-unit)
obs <- round(stats::nobs(fit) / m.cnt)
# compute simple icc
icc <- tau.00 / tot_var
# compute standard error of sample mean
se <- unlist(lapply(seq_len(length(m.cnt)), function(.x) {
sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x]))
}))
# give names for se, so user sees, which random effect has what impact
names(se) <- names(m.cnt)
se
}
sjstats/R/xtab_statistics.R 0000644 0001762 0000144 00000022402 14620351262 015502 0 ustar ligges users #' @title Measures of association for contingency tables
#' @name crosstable_statistics
#'
#' @description This function calculates various measure of association for
#' contingency tables and returns the statistic and p-value.
#' Supported measures are Cramer's V, Phi, Spearman's rho,
#' Kendall's tau and Pearson's r.
#'
#' @param data A data frame or a table object. If a table object, `x1` and
#' `x2` will be ignored. For Kendall's _tau_, Spearman's _rho_ or Pearson's
#' product moment correlation coefficient, `data` needs to be a data frame.
#' If `x1` and `x2` are not specified, the first two columns of the data
#' frames are used as variables to compute the crosstab.
#' @param formula A formula of the form `lhs ~ rhs` where `lhs` is a
#' numeric variable giving the data values and `rhs` a factor giving the
#' corresponding groups.
#' @param tab A [`table()`] or [`ftable()`]. Tables of class [`xtabs()`] and
#' other will be coerced to `ftable` objects.
#' @param x1 Name of first variable that should be used to compute the
#' contingency table. If `data` is a table object, this argument will be
#' irgnored.
#' @param x2 Name of second variable that should be used to compute the
#' contingency table. If `data` is a table object, this argument will be
#' irgnored.
#' @param statistics Name of measure of association that should be computed. May
#' be one of `"auto"`, `"cramer"`, `"phi"`, `"spearman"`, `"kendall"`,
#' `"pearson"` or `"fisher"`. See 'Details'.
#' @param ci.lvl Scalar between 0 and 1. If not `NULL`, returns a data
#' frame including lower and upper confidence intervals.
#' @param weights Name of variable in `x` that indicated the vector of weights
#' that will be applied to weight all observations. Default is `NULL`, so no
#' weights are used.
#' @param ... Other arguments, passed down to the statistic functions
#' [`chisq.test()`], [`fisher.test()`] or [`cor.test()`].
#'
#' @inheritParams bootstrap
#' @inheritParams boot_ci
#'
#' @return For [`phi()`], the table's Phi value. For [`cramers_v()]`, the
#' table's Cramer's V.
#'
#' For `crosstable_statistics()`, a list with following components:
#'
#' - `estimate`: the value of the estimated measure of association.
#' - `p.value`: the p-value for the test.
#' - `statistic`: the value of the test statistic.
#' - `stat.name`: the name of the test statistic.
#' - `stat.html`: if applicable, the name of the test statistic, in HTML-format.
#' - `df`: the degrees of freedom for the contingency table.
#' - `method`: character string indicating the name of the measure of association.
#' - `method.html`: if applicable, the name of the measure of association, in HTML-format.
#' - `method.short`: the short form of association measure, equals the `statistics`-argument.
#' - `fisher`: logical, if Fisher's exact test was used to calculate the p-value.
#'
#' @details The p-value for Cramer's V and the Phi coefficient are based
#' on `chisq.test()`. If any expected value of a table cell is smaller than 5,
#' or smaller than 10 and the df is 1, then `fisher.test()` is used to compute
#' the p-value, unless `statistics = "fisher"`; in this case, the use of
#' `fisher.test()` is forced to compute the p-value. The test statistic is
#' calculated with `cramers_v()` resp. `phi()`.
#'
#' Both test statistic and p-value for Spearman's rho, Kendall's tau and
#' Pearson's r are calculated with `cor.test()`.
#'
#' When `statistics = "auto"`, only Cramer's V or Phi are calculated, based on
#' the dimension of the table (i.e. if the table has more than two rows or
#' columns, Cramer's V is calculated, else Phi).
#'
#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
#' \doi{10.3390/math11091982}
#'
#' @examples
#' # Phi coefficient for 2x2 tables
#' tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE))
#' phi(tab)
#'
#' # Cramer's V for nominal variables with more than 2 categories
#' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE))
#' cramer(tab)
#'
#' # formula notation
#' data(efc)
#' cramer(e16sex ~ c161sex, data = efc)
#'
#' # bootstrapped confidence intervals
#' cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100)
#'
#' # 2x2 table, compute Phi automatically
#' crosstable_statistics(efc, e16sex, c161sex)
#'
#' # more dimensions than 2x2, compute Cramer's V automatically
#' crosstable_statistics(efc, c172code, c161sex)
#'
#' # ordinal data, use Kendall's tau
#' crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall")
#'
#' # calcilate Spearman's rho, with continuity correction
#' crosstable_statistics(efc,
#' e42dep,
#' quol_5,
#' statistics = "spearman",
#' exact = FALSE,
#' continuity = TRUE
#' )
#' @export
crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ...) {
# match arguments
statistics <- match.arg(statistics)
# name for test statistics in HTML
stat.html <- NULL
# check if data is a table
if (is.table(data)) {
# 'data' is a table - copy to table object
tab <- data
# check if statistics are possible to compute
if (statistics %in% c("spearman", "kendall", "pearson")) {
stop(
sprintf(
"Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
statistics
),
call. = FALSE
)
}
} else {
# evaluate unquoted names
x1 <- deparse(substitute(x1))
x2 <- deparse(substitute(x2))
weights <- deparse(substitute(weights))
# if names were quotes, remove quotes
x1 <- gsub("\"", "", x1, fixed = TRUE)
x2 <- gsub("\"", "", x2, fixed = TRUE)
weights <- gsub("\"", "", weights, fixed = TRUE)
if (insight::is_empty_object(weights) || weights == "NULL")
weights <- NULL
else
weights <- data[[weights]]
# check for "NULL" and get data
if (x1 != "NULL" && x2 != "NULL")
data <- data[, c(x1, x2)]
else
data <- data[, 1:2]
if (!is.null(weights)) data <- cbind(data, weights)
# make table
if (!is.null(weights)) {
tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]])))
class(tab) <- "table"
} else {
tab <- table(data)
}
}
# get expected values
tab.val <- table_values(tab)
# remember whether fisher's exact test was used or not
use.fisher <- FALSE
# select statistics automatically, based on number of rows/columns
if (statistics %in% c("auto", "cramer", "phi", "fisher")) {
# get chisq-statistics, for df and p-value
chsq <- suppressWarnings(stats::chisq.test(tab, ...))
pv <- chsq$p.value
test <- chsq$statistic
# set statistics name
names(test) <- "Chi-squared"
stat.html <- "χ2"
# check row/columns
if ((nrow(tab) > 2 || ncol(tab) > 2 || statistics %in% c("cramer", "fisher")) && statistics != "phi") {
# get cramer's V
s <- cramer(tab)
# if minimum expected values below 5, compute fisher's exact test
if (statistics == "fisher" ||
min(tab.val$expected) < 5 ||
(min(tab.val$expected) < 10 && chsq$parameter == 1)) {
pv <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)$p.value
use.fisher <- TRUE
}
# set statistics
statistics <- "cramer"
} else {
# get Phi
s <- phi(tab)
# if minimum expected values below 5 and df=1, compute fisher's exact test
if (min(tab.val$expected) < 5 ||
(min(tab.val$expected) < 10 && chsq$parameter == 1)) {
pv <- stats::fisher.test(tab, ...)$p.value
use.fisher <- TRUE
}
# set statistics
statistics <- "phi"
}
} else {
# compute correlation coefficient
cv <- stats::cor.test(x = data[[1]], y = data[[2]], method = statistics, ...)
# get statistics and p-value
s <- cv$estimate
pv <- cv$p.value
test <- cv$statistic
stat.html <- names(test)
}
# compute method string
method <- ifelse(statistics == "kendall", "Kendall's tau",
ifelse(statistics == "spearman", "Spearman's rho", # nolint
ifelse(statistics == "pearson", "Pearson's r", # nolint
ifelse(statistics == "cramer", "Cramer's V", "Phi") # nolint
)
)
)
# compute method string
method.html <- ifelse(statistics == "kendall", "Kendall's τ",
ifelse(statistics == "spearman", "Spearman's ρ", # nolint
ifelse(statistics == "pearson", "Pearson's r", # nolint
ifelse(statistics == "cramer", "Cramer's V", "&phi") # nolint
)
)
)
# return result
structure(class = "sj_xtab_stat", list(
estimate = s,
p.value = pv,
statistic = test,
stat.name = names(test),
stat.html = stat.html,
df = (nrow(tab) - 1) * (ncol(tab) - 1),
n_obs = sum(tab, na.rm = TRUE),
method = method,
method.html = method.html,
method.short = statistics,
fisher = use.fisher
))
}
#' @rdname crosstable_statistics
#' @export
xtab_statistics <- crosstable_statistics
sjstats/R/sjStatistics.R 0000644 0001762 0000144 00000003525 14402055204 014761 0 ustar ligges users #' @title Expected and relative table values
#' @name table_values
#' @description This function calculates a table's cell, row and column percentages as
#' well as expected values and returns all results as lists of tables.
#'
#' @param tab Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which
#' cell, row and column percentages as well as expected values are calculated.
#' Tables of class \code{\link[stats]{xtabs}} and other will be coerced to
#' \code{ftable} objects.
#' @param digits Amount of digits for the table percentage values.
#'
#' @return (Invisibly) returns a list with four tables:
#' \enumerate{
#' \item \code{cell} a table with cell percentages of \code{tab}
#' \item \code{row} a table with row percentages of \code{tab}
#' \item \code{col} a table with column percentages of \code{tab}
#' \item \code{expected} a table with expected values of \code{tab}
#' }
#'
#' @examples
#' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE))
#' # show expected values
#' table_values(tab)$expected
#' # show cell percentages
#' table_values(tab)$cell
#'
#' @export
table_values <- function(tab, digits = 2) {
# convert to ftable object
if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)
tab.cell <- round(100 * prop.table(tab), digits)
tab.row <- round(100 * prop.table(tab, 1), digits)
tab.col <- round(100 * prop.table(tab, 2), digits)
tab.expected <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab)))
# return results
invisible(structure(class = "sjutablevalues",
list(cell = tab.cell,
row = tab.row,
col = tab.col,
expected = tab.expected)))
}
sjstats/R/inequ_trends.R 0000644 0001762 0000144 00000010023 14620351262 014766 0 ustar ligges users #' @title Compute trends in status inequalities
#' @name inequ_trend
#'
#' @description This method computes the proportional change of absolute
#' (rate differences) and relative (rate ratios) inequalities
#' of prevalence rates for two different status groups, as proposed
#' by Mackenbach et al. (2015).
#'
#' @param data A data frame that contains the variables with prevalence rates for both low
#' and high status groups (see 'Examples').
#' @param prev.low The name of the variable with the prevalence rates for
#' the low status groups.
#' @param prev.hi The name of the variable with the prevalence rates for
#' the hi status groups.
#'
#' @return A data frame with the prevalence rates as well as the values for the
#' proportional change in absolute (\code{rd}) and relative (\code{rr})
#' ineqqualities.
#'
#' @references Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018}
#'
#' @details Given the time trend of prevalence rates of an outcome for two status
#' groups (e.g. the mortality rates for people with lower and higher
#' socioeconomic status over 40 years), this function computes the
#' proportional change of absolute and relative inequalities, expressed
#' in changes in rate differences and rate ratios. The function implements
#' the algorithm proposed by \emph{Mackenbach et al. 2015}.
#'
#' @examplesIf requireNamespace("ggplot2")
#' # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5
#'
#' # 40 simulated time points, with an initial rate ratio of 2 and
#' # a rate difference of 100 (i.e. low status group starts with a
#' # prevalence rate of 200, the high status group with 100)
#'
#' # annual decline of prevalence is 1% for the low, and 3% for the
#' # high status group
#'
#' n <- 40
#' time <- seq(1, n, by = 1)
#' lo <- rep(200, times = n)
#' for (i in 2:n) lo[i] <- lo[i - 1] * .99
#'
#' hi <- rep(100, times = n)
#' for (i in 2:n) hi[i] <- hi[i - 1] * .97
#'
#' prev.data <- data.frame(lo, hi)
#'
#' # print values
#' inequ_trend(prev.data, "lo", "hi")
#'
#' # plot trends - here we see that the relative inequalities
#' # are increasing over time, while the absolute inequalities
#' # are first increasing as well, but later are decreasing
#' # (while rel. inequ. are still increasing)
#' plot(inequ_trend(prev.data, "lo", "hi"))
#'
#' @export
inequ_trend <- function(data, prev.low, prev.hi) {
# prepare data for prevalence rates for low and hi status groups
if (is.null(data) || missing(data)) {
dat <- data.frame(prev.low, prev.hi)
} else {
dat <- data[c(prev.low, prev.hi)]
}
# ensure common column names
colnames(dat) <- c("lo", "hi")
# trends in rate ratios
# compute relative inequality for first time point, needed
# as reference to compute proportional change over time
dat$rr <- dat$lo[1] / dat$hi[1]
# compute proportional change of relative inequalities over time
for (t in 2:nrow(dat)) {
delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1]
delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1]
dat$rr[t] <- dat$rr[t - 1] * ((1 + delta.low) / (1 + delta.hi))
}
# trends in rate difference
# compute absolute inequality for first time point, needed
# as reference to compute proportional change over time
dat$rd <- dat$lo[1] - dat$hi[1]
# compute proportional change of absolute inequalities over time
for (t in 2:nrow(dat)) {
delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1]
delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1]
dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1] * delta.low - dat$hi[t - 1] * delta.hi)
}
# return
structure(class = "sj_inequ_trend", list(data = dat))
}
sjstats/R/gmd.R 0000644 0001762 0000144 00000002636 14620351262 013050 0 ustar ligges users #' @title Gini's Mean Difference
#' @name gmd
#' @description `gmd()` computes Gini's mean difference for a numeric vector
#' or for all numeric vectors in a data frame.
#'
#' @param x A vector or data frame.
#' @param select Optional, names of variables as character vector that should be
#' selected for further processing. Required, if `x` is a data frame (and no vector)
#' and only selected variables from `x` should be processed.
#'
#' @return For numeric vectors, Gini's mean difference. For non-numeric vectors
#' or vectors of length < 2, returns `NA`.
#'
#' @note Gini's mean difference is defined as the mean absolute difference between
#' any two distinct elements of a vector. Missing values from `x` are silently
#' removed.
#'
#' @references David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575
#'
#' @examples
#' data(efc)
#' gmd(efc$e17age)
#' gmd(efc, c("e17age", "c160age", "c12hour"))
#'
#' @export
gmd <- function(x, select = NULL) {
if (is.data.frame(x)) {
do.call(rbind, lapply(select, function(i) {
data.frame(
variable = i,
gmd = gmd_helper(x[[i]])
)
}))
} else {
gmd_helper(x)
}
}
gmd_helper <- function(x) {
if (!is.numeric(x)) return(NA)
x <- stats::na.omit(x)
n <- length(x)
if (n < 2) return(NA)
w <- 4 * ((1:n) - (n - 1) / 2) / n / (n - 1)
sum(w * sort(x - mean(x)))
}
sjstats/R/wilcoxon_test.R 0000644 0001762 0000144 00000016753 14620351262 015207 0 ustar ligges users #' @title Wilcoxon rank sum test
#' @name wilcoxon_test
#' @description This function performs Wilcoxon rank sum tests for one sample
#' or for two _paired_ (dependent) samples. For _unpaired_ (independent)
#' samples, please use the `mann_whitney_test()` function.
#'
#' A Wilcoxon rank sum test is a non-parametric test for the null hypothesis
#' that two samples have identical continuous distributions. The implementation
#' in `wilcoxon_test()` is only used for _paired_, i.e. _dependent_ samples. For
#' independent (unpaired) samples, use `mann_whitney_test()`.
#'
#' `wilcoxon_test()` can be used for ordinal scales or when the continuous
#' variables are not normally distributed. For large samples, or approximately
#' normally distributed variables, the `t_test()` function can be used (with
#' `paired = TRUE`).
#'
#' @inheritParams mann_whitney_test
#' @inherit mann_whitney_test seealso
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @return A data frame with test results. The function returns p and Z-values
#' as well as effect size r and group-rank-means.
#'
#' @references
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @examplesIf requireNamespace("coin")
#' data(mtcars)
#' # one-sample test
#' wilcoxon_test(mtcars, "mpg")
#' # base R equivalent, we set exact = FALSE to avoid a warning
#' wilcox.test(mtcars$mpg ~ 1, exact = FALSE)
#'
#' # paired test
#' wilcoxon_test(mtcars, c("mpg", "hp"))
#' # base R equivalent, we set exact = FALSE to avoid a warning
#' wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE)
#'
#' # when `by` is specified, each group must be of same length
#' data(iris)
#' d <- iris[iris$Species != "setosa", ]
#' wilcoxon_test(d, "Sepal.Width", by = "Species")
#' @export
wilcoxon_test <- function(data,
select = NULL,
by = NULL,
weights = NULL,
mu = 0,
alternative = "two.sided",
...) {
insight::check_if_installed("datawizard")
alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
# sanity checks
.sanitize_htest_input(data, select, by, weights, test = "wilcoxon_test")
# alternative only if weights are NULL
if (!is.null(weights) && alternative != "two.sided") {
insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.")
}
# for paired two-sample, do groups all have same length?
if (!is.null(by)) {
group_len <- as.numeric(table(as.vector(data[[by]])))
if (!all(group_len == group_len[1])) {
insight::format_error("For paired two-sample Wilcoxon test, all groups specified in `by` must have the same length.") # nolint
}
# convert to wide format
out <- split(data[select], as.character(data[[by]]))
data <- stats::setNames(do.call(cbind, out), names(out))
select <- colnames(data)
}
# value labels
group_labels <- select
x <- data[[select[1]]]
if (length(select) > 1) {
y <- data[[select[2]]]
} else {
y <- NULL
}
if (is.null(weights)) {
.calculate_wilcox(x, y, alternative, mu, group_labels, ...)
} else {
.calculate_weighted_mwu(x, y, data[[weights]], group_labels)
}
}
# Mann-Whitney-Test for two groups --------------------------------------------
.calculate_wilcox <- function(x, y, alternative, mu, group_labels, ...) {
insight::check_if_installed("coin")
# for paired Wilcoxon test, we have effect sizes
if (!is.null(y)) {
# prepare data
wcdat <- data.frame(x, y)
# perfom wilcox test
wt <- coin::wilcoxsign_test(x ~ y, data = wcdat)
# compute statistics
u <- as.numeric(coin::statistic(wt, type = "linear"))
z <- as.numeric(coin::statistic(wt, type = "standardized"))
r <- abs(z / sqrt(nrow(wcdat)))
} else {
wt <- u <- z <- r <- NULL
}
# prepare data
if (is.null(y)) {
dv <- x
} else {
dv <- x - y
}
htest <- suppressWarnings(stats::wilcox.test(
dv ~ 1,
alternative = alternative,
mu = mu,
...
))
v <- htest$statistic
p <- htest$p.value
out <- data.frame(
group1 = group_labels[1],
v = v,
p = as.numeric(p),
mu = mu,
alternative = alternative
)
# two groups?
if (length(group_labels) > 1) {
out$group2 <- group_labels[2]
}
# add effectsizes, when we have
if (!is.null(wt)) {
out$u <- u
out$z <- z
out$r <- r
}
attr(out, "group_labels") <- group_labels
attr(out, "method") <- "wilcoxon"
attr(out, "weighted") <- FALSE
attr(out, "one_sample") <- length(group_labels) == 1
class(out) <- c("sj_htest_wilcox", "data.frame")
out
}
# Weighted Mann-Whitney-Test for two groups ----------------------------------
.calculate_weighted_wilcox <- function(x, y, weights, group_labels) {
# check if pkg survey is available
insight::check_if_installed("survey")
# prepare data
if (is.null(y)) {
dv <- x
} else {
dv <- x - y
}
dat <- stats::na.omit(data.frame(dv, weights))
colnames(dat) <- c("y", "w")
design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
result <- survey::svyranktest(formula = y ~ 1, design, test = "wilcoxon")
# statistics and effect sizes
z <- result$statistic
r <- abs(z / sqrt(nrow(dat)))
out <- data_frame(
group1 = group_labels[1],
estimate = result$estimate,
z = z,
r = r,
p = as.numeric(result$p.value),
mu = 0,
alternative = "two.sided"
)
# two groups?
if (length(group_labels) > 1) {
out$group2 <- group_labels[2]
}
attr(out, "group_labels") <- group_labels
attr(out, "weighted") <- TRUE
attr(out, "one_sample") <- length(group_labels) == 1
attr(out, "method") <- "wilcoxon"
class(out) <- c("sj_htest_wilcox", "data.frame")
out
}
# methods ---------------------------------------------------------------------
#' @export
print.sj_htest_wilcox <- function(x, ...) {
# fetch attributes
group_labels <- attributes(x)$group_labels
weighted <- attributes(x)$weighted
one_sample <- attributes(x)$one_sample
if (weighted) {
weight_string <- " (weighted)"
} else {
weight_string <- ""
}
if (one_sample) {
onesample_string <- "One Sample"
} else {
onesample_string <- "Paired"
}
# same width
group_labels <- format(group_labels)
# header
insight::print_color(sprintf(
"# %s Wilcoxon signed rank test%s\n\n",
onesample_string,
weight_string
), "blue")
# alternative hypothesis
if (!is.null(x$alternative) && !is.null(x$mu)) {
alt_string <- switch(x$alternative,
two.sided = "not equal to",
less = "less than",
greater = "greater than"
)
alt_string <- paste("true location shift is", alt_string, x$mu)
insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan")
}
if (!is.null(x[["v"]])) {
v_stat <- sprintf("V = %i, ", round(x$v))
} else {
v_stat <- ""
}
if (!is.null(x[["r"]])) {
cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", v_stat, x$r, x$z, insight::format_p(x$p)))
} else {
cat(sprintf("\n %s%s\n\n", v_stat, insight::format_p(x$p)))
}
}
sjstats/R/var_pop.R 0000644 0001762 0000144 00000002762 14620351262 013747 0 ustar ligges users #' @title Calculate population variance and standard deviation
#' @name var_pop
#' @description Calculate the population variance or standard deviation of a vector.
#'
#' @param x (Numeric) vector.
#'
#' @return The population variance or standard deviation of \code{x}.
#'
#' @details Unlike \code{\link[stats]{var}}, which returns the sample variance,
#' \code{var_pop()} returns the population variance. \code{sd_pop()}
#' returns the standard deviation based on the population variance.
#'
#' @examples
#' data(efc)
#'
#' # sampling variance
#' var(efc$c12hour, na.rm = TRUE)
#' # population variance
#' var_pop(efc$c12hour)
#'
#' # sampling sd
#' sd(efc$c12hour, na.rm = TRUE)
#' # population sd
#' sd_pop(efc$c12hour)
#' @export
var_pop <- function(x) {
insight::check_if_installed("datawizard")
# check for categorical
if (is.factor(x)) {
# only allow numeric factors
if (!.is_pseudo_numeric(x)) {
insight::format_error("`x` must be numeric vector or a factor with numeric levels.")
}
# convert factor to numeric
x <- datawizard::to_numeric(x, dummy_factors = FALSE)
}
# remove NA
x <- stats::na.omit(x)
n <- length(x)
# population variance
stats::var(x) * ((n - 1) / n)
}
#' @rdname var_pop
#' @export
sd_pop <- function(x) {
# get population variance
pv <- var_pop(x)
# factors with non-numeric level return NULL
if (!is.null(pv) && !is.na(pv))
sqrt(pv)
else
NA
}
sjstats/R/svyglmnb.R 0000644 0001762 0000144 00000011617 14620351262 014141 0 ustar ligges users utils::globalVariables("scaled.weights")
#' @title Survey-weighted negative binomial generalised linear model
#' @name svyglm.nb
#' @description \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package
#' to fit survey-weighted negative binomial models. It uses
#' \code{\link[survey]{svymle}} to fit sampling-weighted
#' maximum likelihood estimates, based on starting values provided
#' by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley
#' (2010, pp249)}.
#'
#'
#' @param formula An object of class \code{formula}, i.e. a symbolic description
#' of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}.
#' @param design An object of class \code{\link[survey]{svydesign}}, providing
#' a specification of the survey design.
#' @param ... Other arguments passed down to \code{\link[MASS]{glm.nb}}.
#'
#' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb},
#' with some additional information about the model.
#'
#' @details For details on the computation method, see Lumley (2010), Appendix E
#' (especially 254ff.)
#' \cr \cr
#' \pkg{sjstats} implements following S3-methods for \code{svyglm.nb}-objects:
#' \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()},
#' \code{predict()} and \code{residuals()}. However, these functions have some
#' limitations:
#' \itemize{
#' \item{\code{family()} simply returns the family-object from the
#' underlying \code{\link[MASS]{glm.nb}}-model.}
#' \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model
#' with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients}
#' from this model-object with the coefficients from the returned
#' \code{\link[survey]{svymle}}-object and finally calls
#' \code{\link[stats]{predict.glm}} to compute the predicted values.}
#' \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with
#' \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals
#' from the \code{glm.nb}-object.}
#' }
#'
#'
#' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
#'
#' @examples
#' # ------------------------------------------
#' # This example reproduces the results from
#' # Lumley 2010, figure E.7 (Appendix E, p256)
#' # ------------------------------------------
#' if (require("survey")) {
#' data(nhanes_sample)
#'
#' # create survey design
#' des <- svydesign(
#' id = ~SDMVPSU,
#' strat = ~SDMVSTRA,
#' weights = ~WTINT2YR,
#' nest = TRUE,
#' data = nhanes_sample
#' )
#'
#' # fit negative binomial regression
#' fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des)
#'
#' # print coefficients and standard errors
#' fit
#' }
#' @export
svyglm.nb <- function(formula, design, ...) {
insight::check_if_installed(c("survey", "MASS"))
# get design weights. we need to scale these weights for the glm.nb() function
dw <- stats::weights(design)
# update design with scaled weights
design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))
# fit negative binomial model, with scaled design weights
mod <- MASS::glm.nb(formula, data = stats::model.frame(design), weights = scaled.weights, ...)
fam <- stats::family(mod)
# fit survey model, using maximum likelihood estimation
svyfit <-
survey::svymle(
loglike = sjstats_loglik,
grad = sjstats_score,
design = design,
formulas = list(theta = ~1, eta = formula),
start = c(mod$theta, stats::coef(mod)),
na.action = "na.omit"
)
# add additoinal information
class(svyfit) <- c("svyglm.nb", class(svyfit))
attr(svyfit, "nb.terms") <- all.vars(formula)
attr(svyfit, "nb.formula") <- formula
attr(svyfit, "family") <- fam
attr(svyfit, "nb.theta") <- mod[["theta"]]
attr(svyfit, "nb.theta.se") <- mod[["SE.theta"]]
svyfit$deviance <- mod$deviance
svyfit$df.residuals <- mod$df.residuals
svyfit$df <- length(stats::coef(mod)) + 1
svyfit$aic <- mod$aic
svyfit
}
# log-likelihood function used in "svymle()"
sjstats_loglik <- function(y, theta, eta) {
mu <- exp(eta)
return(
lgamma(theta + y) - lgamma(theta) - lgamma(y + 1) + theta * log(theta) + y * log(mu + (y == 0)) - (theta + y) * log(theta + mu)
)
}
# derivative
sjstats_deta <- function(y, theta, eta) {
mu <- exp(eta)
dmu <- y / mu - (theta + y) / (theta + mu)
dmu * mu
}
# derivative
sjstats_dtheta <- function(y, theta, eta) {
mu <- exp(eta)
digamma(theta + y) - digamma(theta) + log(theta) + 1 - log(theta + mu) - (y + theta) / (mu + theta)
}
# score function, combines derivatives
sjstats_score <- function(y, theta, eta) {
cbind(sjstats_dtheta(y, theta,eta), sjstats_deta(y, theta, eta))
}
sjstats/data/ 0000755 0001762 0000144 00000000000 13737567502 012675 5 ustar ligges users sjstats/data/efc.RData 0000644 0001762 0000144 00000042134 13563265750 014350 0 ustar ligges users ՝]qߏIJ9 1*TRhjIlHI)AIˡ. ÿ ( p/gιV_3ܻKw{{'ϧ?.|rsXq{?9/
pӍ~<=|o11}z
G7~g9 Pv^1r<371?Wü+1y5KAFO>
}z6̼cz'ZcB?AS|n<_#}DXK?3UӍaEG||]
|+w< A#:;wgc="˾9GUҏqƥ_zKBn#ZW?6<_Q/`ޛh~艾_"eOyѮ71ۀE@# :_
A#Kq?x9/{s{ u`a܈@z^MSqsK#O1 ޫh˟T1_H/z^釶zM?qߒCr)owcݠ/OET_}ճʺ'fK鳀u81ƕW*QrI^':
<1X2?'ᾥy)=~3?Yw=v'/bu\puR;_A
.\>E)oH?W/E:G[+u~-d?W(uR?!}|:W?-swGWP/;U|kS?\RP柕2x.O|y;=q!wCyP5@/>Aj?ެKX Kq8к>SYgJWЯ~+qd\K{
^tq<ȼsYi\tY${J ^J:
ΒCr3