sjstats/0000755000176200001440000000000015022765352011754 5ustar liggesuserssjstats/tests/0000755000176200001440000000000014620351262013110 5ustar liggesuserssjstats/tests/testthat/0000755000176200001440000000000015022765352014756 5ustar liggesuserssjstats/tests/testthat/test-t_test.R0000644000176200001440000000273314623373000017354 0ustar liggesusersskip_if_not_installed("datawizard") skip_if_not_installed("effectsize") test_that("t_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) expect_snapshot(t_test(efc, "e17age")) expect_snapshot(t_test(efc, "e17age", "e16sex")) expect_snapshot(t_test(efc, c("e17age", "c160age"))) expect_snapshot(t_test(efc, c("e17age", "c160age"), paired = TRUE)) expect_snapshot(t_test(efc, "e17age", weights = "weight")) expect_snapshot(t_test(efc, "e17age", "e16sex", weights = "weight")) expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight")) expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE)) out1 <- t_test(efc, "e17age") out2 <- t.test(efc$e17age ~ 1) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, 9.774916, tolerance = 1e-4, ignore_attr = TRUE) out1 <- t_test(efc, "e17age", "e16sex") out2 <- t.test(efc$e17age ~ efc$e16sex) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, -0.5641989, tolerance = 1e-4, ignore_attr = TRUE) efc$e17age <- as.factor(efc$e17age) expect_error(t_test(efc, "e17age", "c161sex"), regex = "Variable provided in `select` must be numeric") }) sjstats/tests/testthat/test-wilcoxon_test.R0000644000176200001440000000157414620351262020760 0ustar liggesusersskip_if_not_installed("survey") skip_if_not_installed("datawizard") skip_if_not_installed("coin") test_that("wilcoxon_test", { data(mtcars) out1 <- wilcoxon_test(mtcars, "mpg") out2 <- wilcox.test(mtcars$mpg ~ 1, exact = FALSE) expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out1 <- wilcoxon_test(mtcars, c("mpg", "hp")) out2 <- wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE) expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) data(iris) d <- iris[iris$Species != "setosa", ] out <- wilcoxon_test(d, "Sepal.Width", by = "Species") expect_snapshot(print(out)) }) sjstats/tests/testthat/test-kruskal_wallis_test.R0000644000176200001440000000265614620351262022147 0ustar liggesusersskip_if_not_installed("survey") skip_if_not_installed("datawizard") test_that("kruskal_wallis_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code") out2 <- kruskal.test(e17age ~ c172code, data = efc) expect_equal(out1$Chi2, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) set.seed(123) wide_data <- data.frame( scale1 = runif(20), scale2 = runif(20), scale3 = runif(20) ) long_data <- data.frame( scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3), groups = as.factor(rep(c("A", "B", "C"), each = 20)), stringsAsFactors = FALSE ) out1 <- kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3")) out2 <- kruskal_wallis_test(long_data, select = "scales", by = "groups") out3 <- kruskal.test(scales ~ groups, data = long_data) expect_equal(out1$Chi2, out2$Chi2, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$Chi2, out3$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out3$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code", weights = "weight") }) sjstats/tests/testthat/test-autoprior.R0000644000176200001440000000177114620351262020102 0ustar liggesusers.runThisTest <- Sys.getenv("RunAllsjstatsTests") == "yes" if (.runThisTest) { if (suppressWarnings( require("testthat") && require("sjstats") && require("brms") )) { context("sjstats, autoprior") data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- to_label(efc$c161sex) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) test_that("auto_prior", { mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior") mf <- formula(neg_c_7 ~ c161sex + c160age + c172code + c12hour + e17age) expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior") expect_error(auto_prior(mf, efc)) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) expect_s3_class(auto_prior(mf, efc, FALSE), "brmsprior") expect_s3_class(auto_prior(mf, efc), "brmsprior") expect_warning(auto_prior(mf, efc, TRUE)) }) } } sjstats/tests/testthat/test-mann_whitney_test.R0000644000176200001440000000364414620351262021616 0ustar liggesusersskip_if_not_installed("coin") skip_if_not_installed("survey") skip_if_not_installed("datawizard") test_that("mann_whitney_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- mann_whitney_test(efc, "e17age", by = "e16sex") out2 <- wilcox.test(e17age ~ e16sex, data = efc) expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$estimate, -151, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$r, 0.2571254, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) set.seed(123) wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20)) out1 <- mann_whitney_test(wide_data, select = c("scale1", "scale2")) out2 <- wilcox.test(wide_data$scale1, wide_data$scale2) expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$r, 0.05132394, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- mann_whitney_test(efc, "e17age", by = "e16sex", weights = "weight") expect_equal(out$p, 1.976729e-14, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$estimate, 0.1594972, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$r, 0.2599877, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) }) test_that("mann_whitney_test, sanity checks", { data(efc) expect_error(mann_whitney_test(efc, "e17age", by = "c172code"), regex = "Only two groups are") expect_error(mann_whitney_test(efc, c("e17age", "c172code", "e16sex")), regex = "You may only specify") expect_error(mann_whitney_test(efc, c("e17age", "c172code"), by = "e17age"), regex = "If `select` specifies more") expect_error(mann_whitney_test(efc, "e17age"), regex = "Only one variable provided") }) sjstats/tests/testthat/_snaps/0000755000176200001440000000000014623373000016230 5ustar liggesuserssjstats/tests/testthat/_snaps/t_test.md0000644000176200001440000000574314620351262020070 0ustar liggesusers# t_test Code t_test(efc, "e17age") Output # One Sample t-test Data: e17age Group 1: e17age (mean = 79.12) Alternative hypothesis: true mean is not equal to 0 t = 291.78, Cohen's d = 9.77 (large effect), df = 890, p < .001 --- Code t_test(efc, "e17age", "e16sex") Output # Welch Two Sample t-test Data: e17age by e16sex Group 1: 1 (n = 294, mean = 76.16) Group 2: 2 (n = 596, mean = 80.57) Alternative hypothesis: true difference in means is not equal to 0 t = -8.05, Cohen's d = -0.56 (medium effect), df = 610.8, p < .001 --- Code t_test(efc, c("e17age", "c160age")) Output # Welch Two Sample t-test Data: e17age by c160age Group 1: c160age (n = 890, mean = 53.42) Group 2: e17age (n = 890, mean = 79.12) Alternative hypothesis: true difference in means is not equal to 0 t = -49.22, Cohen's d = -2.33 (large effect), df = 1468.1, p < .001 --- Code t_test(efc, c("e17age", "c160age"), paired = TRUE) Output # Paired t-test Data: e17age and c160age (mean difference = 25.70) Alternative hypothesis: true mean is not equal to 0 t = 54.11, Cohen's d = 1.81 (large effect), df = 889, p < .001 --- Code t_test(efc, "e17age", weights = "weight") Output # One Sample t-test (weighted) Data: e17age Group 1: e17age (n = 897, mean = 79.17) Alternative hypothesis: true mean is not equal to 0 t = 291.31, Cohen's d = 3.17 (large effect), df = 890, p < .001 --- Code t_test(efc, "e17age", "e16sex", weights = "weight") Output # Two-Sample t-test (weighted) Data: e17age by e16sex Group 1: 1 (n = 600, mean = 80.63) Group 2: 2 (n = 296, mean = 76.19) Alternative hypothesis: true difference in means is not equal to 0 t = 8.03, Cohen's d = -0.17 (very small effect), df = 604.5, p < .001 --- Code t_test(efc, c("e17age", "c160age"), weights = "weight") Output # Two-Sample t-test (weighted) Data: e17age by c160age Group 1: c160age (n = 896, mean = 79.17) Group 2: e17age (n = 896, mean = 53.40) Alternative hypothesis: true difference in means is not equal to 0 t = 49.31, Cohen's d = -1.12 (large effect), df = 1470.0, p < .001 --- Code t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE) Output # Paired t-test (weighted) Data: e17age and c160age (mean difference = 25.77) Alternative hypothesis: true mean difference is not equal to 0 t = 54.37, Cohen's d = 1.54 (large effect), df = 889, p < .001 sjstats/tests/testthat/_snaps/wilcoxon_test.md0000644000176200001440000000127214620351262021460 0ustar liggesusers# wilcoxon_test Code print(out1) Output # One Sample Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 528, p < .001 --- Code print(out1) Output # Paired Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 0, r = 0.87, Z = -4.94, p < .001 --- Code print(out) Output # Paired Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 247, r = 0.39, Z = -2.76, p = 0.006 sjstats/tests/testthat/_snaps/chi_squared_test.md0000644000176200001440000000246414623373000022106 0ustar liggesusers# chi_squared_test Code print(out1) Output # Chi-squared test for contingency tables Data: c161sex by e16sex (n = 900) χ² = 2.233, ϕ = 0.053 (very small effect), df = 1, p = 0.135 --- Code print(out) Output # Chi-squared test for contingency tables (weighted) Data: c161sex by e16sex (n = 904) χ² = 2.416, ϕ = 0.054 (very small effect), df = 1, p = 0.120 --- Code print(out1) Output # Chi-squared test for given probabilities Data: c161sex against probabilities 30% and 70% (n = 901) χ² = 16.162, פ‎ = 0.088 (very small effect), df = 1, p < .001 --- Code print(out) Output # Chi-squared test for given probabilities (weighted) Data: c161sex against probabilities 30% and 70% (n = 906) χ² = 20.074, פ‎ = 0.097 (very small effect), df = 1, p < .001 --- Code print(out1) Output # Chi-squared test for contingency tables (using McNemar's test for paired data) Data: survey_1 by survey_2 (n = 1000) χ² = 10.868, ϕ = 0.032 (tiny effect), df = 1, p < .001 sjstats/tests/testthat/_snaps/mann_whitney_test.md0000644000176200001440000000165014620351262022316 0ustar liggesusers# mann_whitney_test Code print(out1) Output # Mann-Whitney test Group 1: male (n = 294, rank mean = 147.50) Group 2: female (n = 596, rank mean = 298.50) Alternative hypothesis: true location shift is not equal to 0 W = 59684 , r = 0.26, Z = -7.75, p < .001 --- Code print(out1) Output # Mann-Whitney test Group 1: scale1 (n = 20, rank mean = 10.50) Group 2: scale2 (n = 20, rank mean = 10.50) Alternative hypothesis: true location shift is not equal to 0 W = 188 , r = 0.05, Z = -0.32, p = 0.758 --- Code print(out) Output # Mann-Whitney test (weighted) Group 1: male (n = 296, rank mean = 147.58) Group 2: female (n = 600, rank mean = 299.42) r = 0.26, Z = 7.78, p < .001 sjstats/tests/testthat/_snaps/kruskal_wallis_test.md0000644000176200001440000000066114620351262022646 0ustar liggesusers# kruskal_wallis_test Code print(out1) Output # Kruskal-Wallis test Data: e17age by c172code (3 groups, n = 506, 180 and 156) χ² = 4.05, df = 2, p = 0.132 --- Code print(out1) Output # Kruskal-Wallis test Data: scale1 by scale2 (3 groups, n = 20, 20 and 20) χ² = 4.86, df = 2, p = 0.088 sjstats/tests/testthat/test-wtd.R0000644000176200001440000000042714620351262016651 0ustar liggesuserstest_that("wtd", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) expect_equal(weighted_se(efc$c12hour, weights = efc$weight), 1.704182, tolerance = 1e-5) expect_equal(weighted_se(efc$c12hour, weights = NULL), 1.691623, tolerance = 1e-5) }) sjstats/tests/testthat/test-chi_squared_test.R0000644000176200001440000000410014623373035021376 0ustar liggesusersskip_if_not_installed("effectsize") skip_if_not_installed("datawizard") skip_if_not_installed("MASS") test_that("chi_squared_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- chi_squared_test(efc, "c161sex", by = "e16sex") out2 <- chisq.test(efc$c161sex, efc$e16sex) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight") expect_equal(out$statistic, 2.415755, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$effect_size, 0.05448519, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$p, 0.1201201, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) out1 <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7)) out2 <- chisq.test(table(efc$c161sex), p = c(0.3, 0.7)) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7), weights = "weight") expect_equal(out$statistic, 20.07379, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$effect_size, 0.0974456, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) set.seed(1234) d <- data.frame( survey_1 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.45, 0.55)), survey_2 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.42, 0.58)) ) out1 <- chi_squared_test(d, "survey_1", "survey_2", paired = TRUE) out2 <- mcnemar.test(table(d)) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, 0.03170437, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) }) sjstats/tests/testthat.R0000644000176200001440000000007414620351262015074 0ustar liggesuserslibrary(testthat) library(sjstats) test_check("sjstats") sjstats/MD50000644000176200001440000001135215022765352012266 0ustar liggesusersc1aef7ed54f2637f188181a52473501b *DESCRIPTION 74455df2ddc77d6e8704d5268f9c6204 *NAMESPACE 2842f48f167c6406c8424f945c136b3b *NEWS.md e10dd983c750c5e52dcbd672c628fbf2 *R/Deprecated.R d9dba7573d21e93660ec5a360b45ce43 *R/S3-methods.R 292fb71d8ced0f748dd0f4e66f29925b *R/anova_stats.R 94f104911f33da856c41be988c125168 *R/auto_prior.R afa0b3edfad717886d11f70b5a3b8dc2 *R/boot_ci.R 9e96fc4c88cf6dde542dad506fbd024c *R/bootstrap.R dae151ca1921e12cdc50d34ec4aaba98 *R/chi_squared_test.R bea37c3b504f502316ea0c0ea13ace6d *R/confint_ncg.R fdc049d40c1128eda89e331c5c90d9ba *R/cramer.R a4d0e395069f0b478afde3129cc335f1 *R/cv.R d4bd61851d9cbf5c43ec4eb85307155c *R/cv_error.R b826d04f20826fbf7e12669244ca6e1f *R/design_effect.R 1ad1908f342e4326610442ec1f87a733 *R/find_beta.R 9e41625c78d4f0fc616fda45af0ddb8c *R/gmd.R 999e7912c1aefa456570911f5d78fe6c *R/gof.R 0cc0afccd5e8618261a77a39fcaf48ab *R/helpfunctions.R 6a846d9f6d8e46234264fafb97ccb650 *R/inequ_trends.R 1fa8b40681d24aee9e3f61fe2f5c3280 *R/is_prime.R 67314769d13f100aa42347989682118c *R/kruskal_wallis_test.R 1d5d666a8a158249ded68970dbc2b1a7 *R/mann_whitney_test.R 95ac928d84740007386966a5acd85477 *R/nhanes_sample.R d6b62025eb44e0cec4b5b2b3ac35d95c *R/phi.R 0862472d8822eb19a02df43afed8252f *R/prop.R 3b73410ea4df483db4d088b26cd67fcd *R/re-exports.R e05b4f907d36a356776c91a2efd4dd64 *R/samplesize_mixed.R 361bc5f678c9c2319dbdb8eeac7eb45a *R/se_ybar.R 086354712673aab0cc83fd08f1989359 *R/select_helpers.R 7d974672824a933c8324bcd8d01469a2 *R/sjStatistics.R 045c06ea2a7397394b2953a9b583298c *R/svy_median.R 6493371b66b336aa577c64dcb106c184 *R/svyglmnb.R 93a3a551f4b1f0aee327253aa5c8900e *R/svyglmzip.R 3e75c25d915766e171526bdf56c751be *R/t_test.R b31743de39c751d4d377fa619511b60f *R/var_pop.R 974429e0f1399cf4bf3fd978cd48ea88 *R/weight.R f116a04b43b7ccd58e83a073d3127d92 *R/wilcoxon_test.R 87f10c7fc8851aadc42785eb054f1513 *R/wtd_cor.R 3d485f8685d0eab81d5ef06afc2a6ebe *R/wtd_se.R 0ff693defb2e755fcf9335563a05af75 *R/wtd_variance.R 5aabab241149572088cd6eee7608af2d *R/xtab_statistics.R ae778cf204793e0dfedee9997d09129b *README.md cb3b20077bebc5da31fc1b46c657c0ff *build/partial.rdb 3172b22b3d87d0f86d78326bc26891fc *data/efc.RData ad3f18b79c24699a3c122d8b377b5028 *data/nhanes_sample.RData 87f8a6a5cb2f59e0b2110feef3926498 *inst/CITATION e7b45979874646edfe06751f551ecac0 *man/anova_stats.Rd 59646cdd3bf69f527ab406e00ab87fa9 *man/auto_prior.Rd 70644108d79f2803c7942ac793429ff5 *man/boot_ci.Rd 1145032561d5db541e2d6507fb7a0767 *man/bootstrap.Rd aa75706bfe65fddfef67c466659bb300 *man/chi_squared_test.Rd d442b8e1e4f45c610e7f36156b45f337 *man/chisq_gof.Rd 7c79e41fdc70e1658f589abeb6b461e8 *man/crosstable_statistics.Rd ffdb50b8e82e22adbf2a02c0109ad4df *man/cv.Rd cedc450189ecfdec77f4b1c854d684cc *man/cv_error.Rd 4ca804b5ac14c5bd99da78f47255d2ba *man/design_effect.Rd 07b7d0ca4d598d4623ef85ce91255f2c *man/efc.Rd 00a3f8db1d678464d37705968d42286f *man/figures/logo.png 5972d9fb77c826cf2dfc1bedf013ae6d *man/find_beta.Rd 1d3f4f4138dfbce4505b8d4a115b50e3 *man/gmd.Rd aba99aa993b4a557f0263a74ac9d03e3 *man/inequ_trend.Rd 4ce0bf4951990d5e54c63e58c6655b9a *man/is_prime.Rd 7d0da9e0dddae98ae29ef61067d92858 *man/kruskal_wallis_test.Rd a260eeaa63dfd35efd31bef099bc52ca *man/mann_whitney_test.Rd 2caeab2b1a18d198a10c8f2922a3f569 *man/nhanes_sample.Rd 29b6feb7a0b67ad073cde5e2bba6adb2 *man/prop.Rd 489cce3aeab9138295c8f9b96bc8b411 *man/r2.Rd df2053d8e790c20ba0628b8e14c6661f *man/reexports.Rd 8b6bca7c1d0ca430e195d7f85957d2b3 *man/samplesize_mixed.Rd 6376e9c3db371cfc7941e2867d848567 *man/se_ybar.Rd d805390222777ca589b1ae881b21b86d *man/svyglm.nb.Rd 08b6aac1d7829ce80aa4a86ec3e32e17 *man/svyglm.zip.Rd bf3b5ec64335984fc48f271d39f15f9a *man/t_test.Rd 1fc30c16002e33aedd18a861df8bb685 *man/table_values.Rd 12f085a686effc5f7215f57dbe9a337a *man/var_pop.Rd ce9b25cf83dceadb0d8312b9ec0ef64b *man/weight.Rd 2e7b20401a5f7788b47893d95ee60a29 *man/weighted_se.Rd fe079ad00575d5017383eadab1fb321c *man/wilcoxon_test.Rd bcdd319f419f3a3c026bd5c6cf6c6606 *tests/testthat.R a7fdaa4db0db6dcb721eda69951ac277 *tests/testthat/_snaps/chi_squared_test.md cdc4c6c33056ce23e1f8ea10c81c6153 *tests/testthat/_snaps/kruskal_wallis_test.md 44f5aa1953dacf2508276030089c1b7d *tests/testthat/_snaps/mann_whitney_test.md 8abd67404361072251463e4ec47310c4 *tests/testthat/_snaps/t_test.md 4dc8c45020c146986def33cf0994b64f *tests/testthat/_snaps/wilcoxon_test.md 79df90b4a1ef0dfa51a14229e13354ec *tests/testthat/test-autoprior.R 289430580685f1f24b5e8b0ae055708a *tests/testthat/test-chi_squared_test.R 26a2f03b97851963289a3b6ea589cf36 *tests/testthat/test-kruskal_wallis_test.R 0fe1150cbf1b8f688c88f6dc6ebfea8f *tests/testthat/test-mann_whitney_test.R 113db343ce6d3454faa6f62cc812a317 *tests/testthat/test-t_test.R b70593e19a84da47a7fd5631e0817458 *tests/testthat/test-wilcoxon_test.R 3ddf1a5493907c2646b615b2aaeb56e4 *tests/testthat/test-wtd.R sjstats/R/0000755000176200001440000000000015022763073012153 5ustar liggesuserssjstats/R/mann_whitney_test.R0000644000176200001440000004232214623373000016031 0ustar liggesusers#' @title Mann-Whitney test #' @name mann_whitney_test #' @description This function performs a Mann-Whitney test (or Wilcoxon rank #' sum test for _unpaired_ samples). Unlike the underlying base R function #' `wilcox.test()`, this function allows for weighted tests and automatically #' calculates effect sizes. For _paired_ (dependent) samples, or for one-sample #' tests, please use the `wilcoxon_test()` function. #' #' A Mann-Whitney test is a non-parametric test for the null hypothesis that two #' _independent_ samples have identical continuous distributions. It can be used #' for ordinal scales or when the two continuous variables are not normally #' distributed. For large samples, or approximately normally distributed variables, #' the `t_test()` function can be used. #' #' @param data A data frame. #' @param select Name(s) of the continuous variable(s) (as character vector) #' to be used as samples for the test. `select` can be one of the following: #' #' - `select` can be used in combination with `by`, in which case `select` is #' the name of the continous variable (and `by` indicates a grouping factor). #' - `select` can also be a character vector of length two or more (more than #' two names only apply to `kruskal_wallis_test()`), in which case the two #' continuous variables are treated as samples to be compared. `by` must be #' `NULL` in this case. #' - If `select` select is of length **two** and `paired = TRUE`, the two samples #' are considered as *dependent* and a paired test is carried out. #' - If `select` specifies **one** variable and `by = NULL`, a one-sample test #' is carried out (only applicable for `t_test()` and `wilcoxon_test()`) #' - For `chi_squared_test()`, if `select` specifies **one** variable and #' both `by` and `probabilities` are `NULL`, a one-sample test against given #' probabilities is automatically conducted, with equal probabilities for #' each level of `select`. #' @param by Name of the variable indicating the groups. Required if `select` #' specifies only one variable that contains all samples to be compared in the #' test. If `by` is not a factor, it will be coerced to a factor. For #' `chi_squared_test()`, if `probabilities` is provided, `by` must be `NULL`. #' @param weights Name of an (optional) weighting variable to be used for the test. #' @param alternative A character string specifying the alternative hypothesis, #' must be one of `"two.sided"` (default), `"greater"` or `"less"`. See `?t.test` #' and `?wilcox.test`. #' @param mu The hypothesized difference in means (for `t_test()`) or location #' shift (for `wilcoxon_test()` and `mann_whitney_test()`). The default is 0. #' @param ... Additional arguments passed to `wilcox.test()` (for unweighted #' tests, i.e. when `weights = NULL`). #' #' @section Which test to use: #' The following table provides an overview of which test to use for different #' types of data. The choice of test depends on the scale of the outcome #' variable and the number of samples to compare. #' #' | **Samples** | **Scale of Outcome** | **Significance Test** | #' |-----------------|------------------------|---------------------------------| #' | 1 | binary / nominal | `chi_squared_test()` | #' | 1 | continuous, not normal | `wilcoxon_test()` | #' | 1 | continuous, normal | `t_test()` | #' | 2, independent | binary / nominal | `chi_squared_test()` | #' | 2, independent | continuous, not normal | `mann_whitney_test()` | #' | 2, independent | continuous, normal | `t_test()` | #' | 2, dependent | binary (only 2x2) | `chi_squared_test(paired=TRUE)` | #' | 2, dependent | continuous, not normal | `wilcoxon_test()` | #' | 2, dependent | continuous, normal | `t_test(paired=TRUE)` | #' | >2, independent | continuous, not normal | `kruskal_wallis_test()` | #' | >2, independent | continuous, normal | `datawizard::means_by_group()` | #' | >2, dependent | continuous, not normal | _not yet implemented_ (1) | #' | >2, dependent | continuous, normal | _not yet implemented_ (2) | #' #' (1) More than two dependent samples are considered as _repeated measurements_. #' For ordinal or not-normally distributed outcomes, these samples are #' usually tested using a [`friedman.test()`], which requires the samples #' in one variable, the groups to compare in another variable, and a third #' variable indicating the repeated measurements (subject IDs). #' #' (2) More than two dependent samples are considered as _repeated measurements_. #' For normally distributed outcomes, these samples are usually tested using #' a ANOVA for repeated measurements. A more sophisticated approach would #' be using a linear mixed model. #' #' @seealso #' - [`t_test()`] for parametric t-tests of dependent and independent samples. #' - [`mann_whitney_test()`] for non-parametric tests of unpaired (independent) #' samples. #' - [`wilcoxon_test()`] for Wilcoxon rank sum tests for non-parametric tests #' of paired (dependent) samples. #' - [`kruskal_wallis_test()`] for non-parametric tests with more than two #' independent samples. #' - [`chi_squared_test()`] for chi-squared tests (two categorical variables, #' dependent and independent). #' #' @return A data frame with test results. The function returns p and Z-values #' as well as effect size r and group-rank-means. #' #' @references #' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., #' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data #' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. #' \doi{10.3390/math11091982} #' #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`] #' (the latter to extract effect sizes). The weighted version of the test is #' based on [`survey::svyranktest()`]. #' #' Interpretation of the effect size **r**, as a rule-of-thumb: #' #' - small effect >= 0.1 #' - medium effect >= 0.3 #' - large effect >= 0.5 #' #' **r** is calcuated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}. #' #' @examplesIf requireNamespace("coin") && requireNamespace("survey") #' data(efc) #' # Mann-Whitney-U tests for elder's age by elder's sex. #' mann_whitney_test(efc, "e17age", by = "e16sex") #' # base R equivalent #' wilcox.test(e17age ~ e16sex, data = efc) #' #' # when data is in wide-format, specify all relevant continuous #' # variables in `select` and omit `by` #' set.seed(123) #' wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20)) #' mann_whitney_test(wide_data, select = c("scale1", "scale2")) #' # base R equivalent #' wilcox.test(wide_data$scale1, wide_data$scale2) #' # same as if we had data in long format, with grouping variable #' long_data <- data.frame( #' scales = c(wide_data$scale1, wide_data$scale2), #' groups = as.factor(rep(c("A", "B"), each = 20)) #' ) #' mann_whitney_test(long_data, select = "scales", by = "groups") #' # base R equivalent #' wilcox.test(scales ~ groups, long_data) #' @export mann_whitney_test <- function(data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ...) { insight::check_if_installed("datawizard") alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater")) # sanity checks .sanitize_htest_input(data, select, by, weights, test = "mann_whitney_test") # alternative only if weights are NULL if (!is.null(weights) && alternative != "two.sided") { insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.") } # does select indicate more than one variable? if (length(select) > 1) { # we convert the data into long format, and create a grouping variable data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale") by <- select[2] select <- select[1] # after converting to long, we have the "grouping" variable first in the data colnames(data) <- c(by, select) } # get data dv <- data[[select]] grp <- data[[by]] # coerce to factor grp <- datawizard::to_factor(grp) # only two groups allowed if (insight::n_unique(grp) > 2) { insight::format_error("Only two groups are allowed for Mann-Whitney test. Please use `kruskal_wallis_test()` for more than two groups.") # nolint } # value labels group_labels <- names(attr(data[[by]], "labels", exact = TRUE)) if (is.null(group_labels)) { group_labels <- levels(droplevels(grp)) } if (is.null(weights)) { .calculate_mwu(dv, grp, alternative, mu, group_labels, ...) } else { .calculate_weighted_mwu(dv, grp, data[[weights]], group_labels) } } # Mann-Whitney-Test for two groups -------------------------------------------- .calculate_mwu <- function(dv, grp, alternative, mu, group_labels, ...) { insight::check_if_installed("coin") # prepare data wcdat <- data.frame(dv, grp) # perfom wilcox test wt <- coin::wilcox_test(dv ~ grp, data = wcdat) # for rank mean group_levels <- levels(grp) # compute statistics u <- as.numeric(coin::statistic(wt, type = "linear")) z <- as.numeric(coin::statistic(wt, type = "standardized")) r <- abs(z / sqrt(length(dv))) htest <- suppressWarnings(stats::wilcox.test( dv ~ grp, data = wcdat, alternative = alternative, mu = mu, ... )) w <- htest$statistic p <- htest$p.value # group means dat_gr1 <- stats::na.omit(dv[grp == group_levels[1]]) dat_gr2 <- stats::na.omit(dv[grp == group_levels[2]]) rank_mean_1 <- mean(rank(dat_gr1)) rank_mean_2 <- mean(rank(dat_gr2)) # compute n for each group n_grp1 <- length(dat_gr1) n_grp2 <- length(dat_gr2) out <- data.frame( group1 = group_levels[1], group2 = group_levels[2], estimate = rank_mean_1 - rank_mean_2, u = u, w = w, z = z, r = r, p = as.numeric(p), mu = mu, alternative = alternative ) attr(out, "rank_means") <- stats::setNames( c(rank_mean_1, rank_mean_2), c("Mean Group 1", "Mean Group 2") ) attr(out, "n_groups") <- stats::setNames( c(n_grp1, n_grp2), c("N Group 1", "N Group 2") ) attr(out, "group_labels") <- group_labels attr(out, "method") <- "wilcoxon" attr(out, "weighted") <- FALSE class(out) <- c("sj_htest_mwu", "data.frame") out } # Weighted Mann-Whitney-Test for two groups ---------------------------------- .calculate_weighted_mwu <- function(dv, grp, weights, group_labels) { # check if pkg survey is available insight::check_if_installed("survey") dat <- stats::na.omit(data.frame(dv, grp, weights)) colnames(dat) <- c("x", "g", "w") design <- survey::svydesign(ids = ~0, data = dat, weights = ~w) result <- survey::svyranktest(formula = x ~ g, design, test = "wilcoxon") # for rank mean group_levels <- levels(droplevels(grp)) # subgroups dat_gr1 <- dat[dat$g == group_levels[1], ] dat_gr2 <- dat[dat$g == group_levels[2], ] dat_gr1$rank_x <- rank(dat_gr1$x) dat_gr2$rank_x <- rank(dat_gr2$x) # rank means design_mean1 <- survey::svydesign( ids = ~0, data = dat_gr1, weights = ~w ) rank_mean_1 <- survey::svymean(~rank_x, design_mean1) design_mean2 <- survey::svydesign( ids = ~0, data = dat_gr2, weights = ~w ) rank_mean_2 <- survey::svymean(~rank_x, design_mean2) # group Ns n_grp1 <- round(sum(dat_gr1$w)) n_grp2 <- round(sum(dat_gr2$w)) # statistics and effect sizes z <- result$statistic r <- abs(z / sqrt(sum(n_grp1, n_grp2))) out <- data_frame( group1 = group_levels[1], group2 = group_levels[2], estimate = result$estimate, z = z, r = r, p = as.numeric(result$p.value), alternative = "two.sided" ) attr(out, "rank_means") <- stats::setNames( c(rank_mean_1, rank_mean_2), c("Mean Group 1", "Mean Group 2") ) attr(out, "n_groups") <- stats::setNames( c(n_grp1, n_grp2), c("N Group 1", "N Group 2") ) attr(out, "group_labels") <- group_labels attr(out, "weighted") <- TRUE class(out) <- c("sj_htest_mwu", "data.frame") out } # helper ---------------------------------------------------------------------- .sanitize_htest_input <- function(data, select, by, weights, test = NULL) { # check if arguments are NULL if (is.null(select)) { insight::format_error("Argument `select` is missing.") } # sanity check - may only specify two variable names if (identical(test, "mann_whitney_test") && length(select) > 2) { insight::format_error("You may only specify two variables for Mann-Whitney test.") } if (identical(test, "mann_whitney_test") && length(select) == 1 && is.null(by)) { insight::format_error("Only one variable provided in `select`, but none in `by`. You need to specify a second continuous variable in `select`, or a grouping variable in `by` for Mann-Whitney test.") # nolint } # sanity check - may only specify two variable names if (identical(test, "t_test") && length(select) > 2) { insight::format_error("You may only specify two variables for Student's t test.") } if ((!is.null(test) && test %in% c("t_test", "kruskal_wallis_test", "mann_whitney_test")) && length(select) > 1 && !is.null(by)) { # nolint insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.") } # check if arguments have correct length or are of correct type if (!is.character(select)) { insight::format_error("Argument `select` must be a character string with the name(s) of the variable(s).") } if (!is.null(by) && (length(by) != 1 || !is.character(by))) { insight::format_error("Argument `by` must be a character string with the name of a single variable.") } if (!is.null(weights) && (length(weights) != 1 || !is.character(weights))) { insight::format_error("Argument `weights` must be a character string with the name of a single variable.") } # check if "select" is in data if (!all(select %in% colnames(data))) { not_found <- setdiff(select, colnames(data))[1] insight::format_error( sprintf("Variable '%s' not found in data frame.", not_found), .misspelled_string(colnames(data), not_found, "Maybe misspelled?") ) } # check if "by" is in data if (!is.null(by) && !by %in% colnames(data)) { insight::format_error( sprintf("Variable '%s' not found in data frame.", by), .misspelled_string(colnames(data), by, "Maybe misspelled?") ) } # check if "weights" is in data if (!is.null(weights) && !weights %in% colnames(data)) { insight::format_error( sprintf("Weighting variable '%s' not found in data frame.", weights), .misspelled_string(colnames(data), weights, "Maybe misspelled?") ) } # select variable type for certain tests if (identical(test, "t_test") && !all(vapply(data[select], is.numeric, logical(1)))) { insight::format_error("Variable provided in `select` must be numeric for Student's t test.") } } # methods --------------------------------------------------------------------- #' @export print.sj_htest_mwu <- function(x, ...) { # fetch attributes group_labels <- attributes(x)$group_labels rank_means <- attributes(x)$rank_means n_groups <- attributes(x)$n_groups weighted <- attributes(x)$weighted if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } # same width group_labels <- format(group_labels) # header insight::print_color(sprintf("# Mann-Whitney test%s\n\n", weight_string), "blue") # group-1-info insight::print_color( sprintf( " Group 1: %s (n = %i, rank mean = %s)\n", group_labels[1], n_groups[1], insight::format_value(rank_means[1], protect_integers = TRUE) ), "cyan" ) # group-2-info insight::print_color( sprintf( " Group 2: %s (n = %i, rank mean = %s)\n", group_labels[2], n_groups[2], insight::format_value(rank_means[2], protect_integers = TRUE) ), "cyan" ) # alternative hypothesis if (!is.null(x$alternative) && !is.null(x$mu)) { alt_string <- switch(x$alternative, two.sided = "not equal to", less = "less than", greater = "greater than" ) alt_string <- paste("true location shift is", alt_string, x$mu) insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") } if (!is.null(x$w)) { w_stat <- paste("W =", insight::format_value(x$w, protect_integers = TRUE), ", ") } else { w_stat <- "" } cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", w_stat, x$r, x$z, insight::format_p(x$p))) } sjstats/R/wtd_cor.R0000644000176200001440000000421614620351262013736 0ustar liggesusers#' @rdname weighted_se #' @export weighted_correlation <- function(data, ...) { UseMethod("weighted_correlation") } #' @rdname weighted_se #' @export weighted_correlation.default <- function(data, x, y, weights, ci.lvl = 0.95, ...) { if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) insight::format_error("'ci.lvl' must be a single number between 0 and 1.") x.name <- deparse(substitute(x)) y.name <- deparse(substitute(y)) w.name <- deparse(substitute(weights)) if (w.name == "NULL") { w.name <- "weights" data$weights <- 1 } # create string with variable names vars <- c(x.name, y.name, w.name) # get data dat <- suppressMessages(data[vars]) dat <- stats::na.omit(dat) xv <- dat[[x.name]] yv <- dat[[y.name]] wv <- dat[[w.name]] weighted_correlation_helper(xv, yv, wv, ci.lvl) } #' @rdname weighted_se #' @export weighted_correlation.formula <- function(formula, data, ci.lvl = 0.95, ...) { if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) insight::format_error("'ci.lvl' must be a single number between 0 and 1.") vars <- all.vars(formula) if (length(vars) < 3) { vars <- c(vars, "weights") data$weights <- 1 } # get data dat <- suppressMessages(data[vars]) dat <- stats::na.omit(dat) xv <- dat[[vars[1]]] yv <- dat[[vars[2]]] wv <- dat[[vars[3]]] weighted_correlation_helper(xv, yv, wv, ci.lvl) } weighted_correlation_helper <- function(xv, yv, wv, ci.lvl) { x <- xv - weighted_mean(xv, weights = wv) y <- yv - weighted_mean(yv, weights = wv) x <- x / weighted_sd(x, weights = wv) y <- y / weighted_sd(y, weights = wv) results <- stats::coef(summary(stats::lm(y ~ x, weights = wv)))[2, ] ci <- ci.lvl - ((1 - ci.lvl) / 2) ci <- results[1] + (stats::qnorm(ci) * c(-1, 1) * results[2]) structure( class = "sj_wcor", list( estimate = results[1], method = "Pearson's Correlation Coefficient", p.value = results[4], ci = ci, ci.lvl = ci.lvl ) ) } sjstats/R/kruskal_wallis_test.R0000644000176200001440000001451115022763073016366 0ustar liggesusers#' @title Kruskal-Wallis test #' @name kruskal_wallis_test #' @description This function performs a Kruskal-Wallis rank sum test, which is #' a non-parametric method to test the null hypothesis that the population median #' of all of the groups are equal. The alternative is that they differ in at #' least one. Unlike the underlying base R function `kruskal.test()`, this #' function allows for weighted tests. #' #' @inheritParams mann_whitney_test #' @inherit mann_whitney_test seealso #' #' @return A data frame with test results. #' #' @inheritSection mann_whitney_test Which test to use #' #' @references #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @details The function simply is a wrapper around [`kruskal.test()`]. The #' weighted version of the Kruskal-Wallis test is based on the **survey** package, #' using [`survey::svyranktest()`]. #' #' @examples #' data(efc) #' # Kruskal-Wallis test for elder's age by education #' kruskal_wallis_test(efc, "e17age", by = "c172code") #' #' # when data is in wide-format, specify all relevant continuous #' # variables in `select` and omit `by` #' set.seed(123) #' wide_data <- data.frame( #' scale1 = runif(20), #' scale2 = runif(20), #' scale3 = runif(20) #' ) #' kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3")) #' #' # same as if we had data in long format, with grouping variable #' long_data <- data.frame( #' scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3), #' groups = rep(c("A", "B", "C"), each = 20) #' ) #' kruskal_wallis_test(long_data, select = "scales", by = "groups") #' # base R equivalent #' kruskal.test(scales ~ groups, data = long_data) #' @export kruskal_wallis_test <- function(data, select = NULL, by = NULL, weights = NULL) { insight::check_if_installed("datawizard") # sanity checks .sanitize_htest_input(data, select, by, weights, test = "kruskal_wallis_test") # does select indicate more than one variable? if (length(select) > 1) { # we convert the data into long format, and create a grouping variable data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale") by <- select[2] select <- select[1] # after converting to long, we have the "grouping" variable first in the data colnames(data) <- c(by, select) } # get data dv <- data[[select]] grp <- data[[by]] # coerce to factor grp <- datawizard::to_factor(grp) # only two groups allowed if (insight::n_unique(grp) < 2) { insight::format_error("At least two groups are required, i.e. data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint } if (is.null(weights)) { .calculate_kw(dv, grp, group_labels = c(select, by)) } else { .calculate_weighted_kw(dv, grp, data[[weights]], group_labels = c(select, by)) } } # Kruskal-Wallis-Test -------------------------------------------- .calculate_kw <- function(dv, grp, paired = FALSE, group_labels = NULL) { # prepare data wcdat <- data.frame(dv, grp) if (paired) { # perfom friedman test for paired data wt <- stats::friedman.test(table(wcdat)) } else { # perfom kruskal wallis test wt <- stats::kruskal.test(dv ~ grp, data = wcdat) } # number of groups n_groups <- vapply( stats::na.omit(unique(grp)), function(g) sum(grp == g, na.rm = TRUE), numeric(1) ) out <- data.frame( data = paste(group_labels[1], "by", group_labels[2]), Chi2 = wt$statistic, df = wt$parameter, p = as.numeric(wt$p.value), stringsAsFactors = FALSE ) attr(out, "n_groups") <- n_groups attr(out, "method") <- ifelse(paired, "friedman", "kruskal") attr(out, "weighted") <- FALSE class(out) <- c("sj_htest_kw", "data.frame") out } # Weighted Mann-Whitney-Test for two groups ---------------------------------- .calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE, group_labels = NULL) { # check if pkg survey is available insight::check_if_installed("survey") dat <- stats::na.omit(data.frame(dv, grp, weights)) colnames(dat) <- c("x", "g", "w") # number of groups n_groups <- vapply(stats::na.omit(unique(grp)), function(i) { round(sum(dat$w[dat$g == i], na.rm = TRUE)) }, numeric(1)) if (paired) { ## TODO: paired no working. should call `friedman.test()` insight::format_error("Paired Kruskal-Wallis test is using weights is not implemented yet.") } else { design <- survey::svydesign(ids = ~0, data = dat, weights = ~w) result <- survey::svyranktest(formula = x ~ g, design, test = "KruskalWallis") } out <- data.frame( data = paste(group_labels[1], "by", group_labels[2]), Chi2 = result$statistic, df = result$parameter, p = as.numeric(result$p.value), stringsAsFactors = FALSE ) attr(out, "n_groups") <- n_groups attr(out, "method") <- ifelse(paired, "friedman", "kruskal") attr(out, "weighted") <- TRUE class(out) <- c("sj_htest_kw", "data.frame") out } # methods --------------------------------------------------------------------- #' @export print.sj_htest_kw <- function(x, ...) { insight::check_if_installed("datawizard") # fetch attributes n_groups <- attributes(x)$n_groups weighted <- attributes(x)$weighted method <- attributes(x)$method if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } # header if (identical(method, "kruskal")) { insight::print_color(sprintf("# Kruskal-Wallis test%s\n\n", weight_string), "blue") } else { insight::print_color(sprintf("# Friedman test%s\n\n", weight_string), "blue") } # data info insight::print_color( sprintf( " Data: %s (%i groups, n = %s)\n", x$data, length(n_groups), datawizard::text_concatenate(n_groups) ), "cyan" ) stat_symbol <- .format_symbols("Chi2") cat(sprintf( "\n %s = %.2f, df = %i, %s\n\n", stat_symbol, x$Chi2, round(x$df), insight::format_p(x$p) )) } sjstats/R/auto_prior.R0000644000176200001440000001157614623373000014464 0ustar liggesusers#' @title Create default priors for brms-models #' @name auto_prior #' #' @description This function creates default priors for brms-regression #' models, based on the same automatic prior-scale adjustment as in #' \pkg{rstanarm}. #' #' @param formula A formula describing the model, which just needs to contain #' the model terms, but no notation of interaction, splines etc. Usually, #' you want only those predictors in the formula, for which automatic #' priors should be generated. Add informative priors afterwards to the #' returned \code{brmsprior}-object. #' @param data The data that will be used to fit the model. #' @param gaussian Logical, if the outcome is gaussian or not. #' @param locations A numeric vector with location values for the priors. If #' \code{locations = NULL}, \code{0} is used as location parameter. #' #' @return A \code{brmsprior}-object. #' #' @details \code{auto_prior()} is a small, convenient function to create #' some default priors for brms-models with automatically adjusted prior #' scales, in a similar way like \pkg{rstanarm} does. The default scale for #' the intercept is 10, for coefficients 2.5. If the outcome is gaussian, #' both scales are multiplied with \code{sd(y)}. Then, for categorical #' variables, nothing more is changed. For numeric variables, the scales #' are divided by the standard deviation of the related variable. #' \cr \cr #' All prior distributions are \emph{normal} distributions. \code{auto_prior()} #' is intended to quickly create default priors with feasible scales. If #' more precise definitions of priors is necessary, this needs to be done #' directly with brms-functions like \code{set_prior()}. #' #' @note As \code{auto_prior()} also sets priors on the intercept, the model #' formula used in \code{brms::brm()} must be rewritten to something like #' \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}. #' #' @examplesIf requireNamespace("brms") #' data(efc) #' efc$c172code <- as.factor(efc$c172code) #' efc$c161sex <- as.factor(efc$c161sex) #' #' mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) #' auto_prior(mf, efc, TRUE) #' #' ## compare to #' # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200) #' # ps <- prior_summary(m) #' # ps$prior_intercept$adjusted_scale #' # ps$prior$adjusted_scale #' #' ## usage #' # ap <- auto_prior(mf, efc, TRUE) #' # brm(mf, data = efc, prior = ap) #' #' # add informative priors #' mf <- formula(neg_c_7 ~ c161sex + c172code) #' #' auto_prior(mf, efc, TRUE) + #' brms::prior(normal(.1554, 40), class = "b", coef = "c160age") #' #' # example with binary response #' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) #' mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) #' auto_prior(mf, efc, FALSE) #' @export auto_prior <- function(formula, data, gaussian, locations = NULL) { insight::check_if_installed("brms") scale.b <- 2.5 scale.y <- 10 pred <- insight::find_predictors(formula, effects = "all", flatten = TRUE) y.name <- insight::find_response(formula, combine = TRUE) data <- stats::na.omit(data[c(y.name, pred)]) y <- data[[y.name]] # check if response is binary if (missing(gaussian) && insight::n_unique(y) == 2) gaussian <- FALSE if (isTRUE(gaussian) && insight::n_unique(y) == 2) insight::format_alert("Priors were calculated based on assumption that the response is Gaussian, however it seems to be binary.") # nolint if (gaussian) { scale.factor <- stats::sd(y, na.rm = TRUE) scale.b <- scale.b * scale.factor scale.y <- scale.y * scale.factor } if (!is.null(locations)) location.y <- locations[1] else location.y <- 0 priors <- brms::set_prior( sprintf("normal(%s, %s)", round(location.y, 2), round(scale.y, 2)), class = "Intercept" ) is.fac <- NULL term.names <- NULL scale.pred <- NULL # we need to check which predictors are categorical and then "mimic" # their coefficient name as it is represented in the model (i.e. variable # name + category name) for (i in pred) { f <- data[[i]] if (is.factor(f)) { i <- sprintf("%s%s", i, levels(f)[2:nlevels(f)]) is.fac <- c(is.fac, rep(TRUE, nlevels(f) - 1)) scale.pred <- c(scale.pred, rep(scale.b, nlevels(f) - 1)) } else { is.fac <- c(is.fac, FALSE) scale.pred <- c(scale.pred, scale.b / stats::sd(f, na.rm = TRUE)) } term.names <- c(term.names, i) } for (i in seq_along(term.names)) { if (!is.null(locations) && length(locations) >= (i + 1)) location.b <- locations[i + 1] else location.b <- 0 priors <- priors + brms::set_prior( sprintf("normal(%s, %s)", round(location.b, 2), round(scale.pred[i], 2)), class = "b", coef = term.names[i] ) } priors } sjstats/R/select_helpers.R0000644000176200001440000000154514620351262015300 0ustar liggesusersstring_starts_with <- function(pattern, x) { pattern <- paste0("^\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_contains <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_ends_with <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E$") grep(pattern, x, perl = TRUE) } string_one_of <- function(pattern, x) { m <- unlist(lapply(pattern, grep, x = x, fixed = TRUE, useBytes = TRUE)) x[m] } rownames_as_column <- function(x, var = "rowname") { rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE) x <- cbind(rn, x) colnames(x)[1] <- var rownames(x) <- NULL x } obj_has_name <- function(x, name) { name %in% names(x) } obj_has_rownames <- function(x) { !identical(as.character(seq_len(nrow(x))), rownames(x)) } sjstats/R/prop.R0000644000176200001440000001530615022763073013263 0ustar liggesusers#' @title Proportions of values in a vector #' @name prop #' #' @description `prop()` calculates the proportion of a value or category #' in a variable. `props()` does the same, but allows for #' multiple logical conditions in one statement. It is similar #' to `mean()` with logical predicates, however, both #' `prop()` and `props()` work with grouped data frames. #' #' @param data A data frame. May also be a grouped data frame (see 'Examples'). #' @param ... One or more value pairs of comparisons (logical predicates). Put #' variable names the left-hand-side and values to match on the #' right hand side. Expressions may be quoted or unquoted. See #' 'Examples'. #' @param weights Vector of weights that will be applied to weight all observations. #' Must be a vector of same length as the input vector. Default is #' `NULL`, so no weights are used. #' @param na.rm Logical, whether to remove NA values from the vector when the #' proportion is calculated. `na.rm = FALSE` gives you the raw #' percentage of a value in a vector, `na.rm = TRUE` the valid #' percentage. #' @param digits Amount of digits for returned values. #' #' @details `prop()` only allows one logical statement per comparison, #' while `props()` allows multiple logical statements per comparison. #' However, `prop()` supports weighting of variables before calculating #' proportions, and comparisons may also be quoted. Hence, `prop()` #' also processes comparisons, which are passed as character vector #' (see 'Examples'). #' #' #' @return For one condition, a numeric value with the proportion of the values #' inside a vector. For more than one condition, a data frame with one column #' of conditions and one column with proportions. For grouped data frames, #' returns a data frame with one column per group with grouping categories, #' followed by one column with proportions per condition. #' #' @examplesIf getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE) #' data(efc) #' #' # proportion of value 1 in e42dep #' prop(efc, e42dep == 1) #' #' # expression may also be completely quoted #' prop(efc, "e42dep == 1") #' #' # use "props()" for multiple logical statements #' props(efc, e17age > 70 & e17age < 80) #' #' # proportion of value 1 in e42dep, and all values greater #' # than 2 in e42dep, including missing values. will return a data frame #' prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) #' #' # for factors or character vectors, use quoted or unquoted values #' library(datawizard) #' # convert numeric to factor, using labels as factor levels #' efc$e16sex <- to_factor(efc$e16sex) #' efc$n4pstu <- to_factor(efc$n4pstu) #' #' # get proportion of female older persons #' prop(efc, e16sex == female) #' #' # get proportion of male older persons #' prop(efc, e16sex == "male") #' #' # "props()" needs quotes around non-numeric factor levels #' props(efc, #' e17age > 70 & e17age < 80, #' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' #' ) #' @export prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } dots <- match.call(expand.dots = FALSE)[["..."]] .proportions(data, dots = dots, weight.by = weights, na.rm, digits, multi_logical = FALSE) } #' @rdname prop #' @export props <- function(data, ..., na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } dots <- match.call(expand.dots = FALSE)[["..."]] .proportions(data, dots = dots, NULL, na.rm, digits, multi_logical = TRUE) } .proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) { # remember comparisons comparisons <- lapply(dots, function(x) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = TRUE) x <- gsub("\"", "", x, fixed = TRUE) x }) if (inherits(data, "grouped_df")) { grps <- attributes(data)$groups result <- lapply(grps[[".rows"]], function(x) { .process_prop(data[x, , drop = FALSE], comparisons, dots, multi_logical, na.rm, digits, weight.by) }) } else { result <- .process_prop(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) } result } .process_prop <- function(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) { # iterate dots (comparing conditions) if (multi_logical) result <- lapply(dots, get_multiple_proportion, data, na.rm, digits) else result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits) # if we have more than one proportion, return a data frame. this allows us # to save more information, the condition and the proportion value if (length(comparisons) > 1) { return(data_frame( condition = as.character(unlist(comparisons)), prop = unlist(result) )) } unlist(result) } get_proportion <- function(x, data, weight.by, na.rm, digits) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = TRUE) x <- gsub("\"", "", x, fixed = TRUE) # split expression at ==, < or > x.parts <- unlist(regmatches(x, gregexpr("[!=]=|[<>]|(?:(?![=!]=)[^<>])+", x, perl = TRUE))) # shorter version, however, does not split variable names with dots # x.parts <- unlist(regmatches(x, regexec("(\\w+)(\\W+)(\\w+)", x)))[-1] # correct == assignment? if (length(x.parts) < 3) { message("?Syntax error in argument. You possibly used `=` instead of `==`.") return(NULL) } # get variable from data and value from equation f <- data[[x.parts[1]]] v <- suppressWarnings(as.numeric(x.parts[3])) # if we have factor, values maybe non-numeric if (is.na(v)) v <- x.parts[3] # weight vector? if (!is.null(weight.by)) f <- weight(f, weights = weight.by) # get proportions dummy <- switch(x.parts[2], "==" = f == v, "!=" = f != v, "<" = f < v, ">" = f > v, f == v ) # remove missings? if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) } get_multiple_proportion <- function(x, data, na.rm, digits) { # evaluate argument dummy <- with(data, eval(parse(text = deparse(x)))) # remove missings? if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) } sjstats/R/svy_median.R0000644000176200001440000000060714620351262014433 0ustar liggesusers#' @rdname weighted_se #' @export survey_median <- function(x, design) { # check if pkg survey is available insight::check_if_installed("survey") # deparse v <- stats::as.formula(paste("~", as.character(substitute(x)))) as.vector( survey::svyquantile( v, design = design, quantiles = 0.5, ci = FALSE, na.rm = TRUE ) ) } sjstats/R/design_effect.R0000644000176200001440000000364213563265750015077 0ustar liggesusers#' @title Design effects for two-level mixed models #' @name design_effect #' #' @description Compute the design effect (also called \emph{Variance Inflation Factor}) #' for mixed models with two-level design. #' #' @param n Average number of observations per grouping cluster (i.e. level-2 unit). #' @param icc Assumed intraclass correlation coefficient for multilevel-model. #' #' @return The design effect (Variance Inflation Factor) for the two-level model. #' #' @references Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} #' \cr \cr #' Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347} #' #' @details The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. #' #' @examples #' # Design effect for two-level model with 30 observations per #' # cluster group (level-2 unit) and an assumed intraclass #' # correlation coefficient of 0.05. #' design_effect(n = 30) #' #' # Design effect for two-level model with 24 observation per cluster #' # group and an assumed intraclass correlation coefficient of 0.2. #' design_effect(n = 24, icc = 0.2) #' #' @export design_effect <- function(n, icc = 0.05) { 1 + (n - 1) * icc } sjstats/R/anova_stats.R0000644000176200001440000001577714620363055014640 0ustar liggesusers#' @title Effect size statistics for anova #' @name anova_stats #' @description Returns the (partial) eta-squared, (partial) omega-squared, #' epsilon-squared statistic or Cohen's F for all terms in an anovas. #' \code{anova_stats()} returns a tidy summary, including all these statistics #' and power for each term. #' #' @param model A fitted anova-model of class \code{aov} or \code{anova}. Other #' models are coerced to \code{\link[stats]{anova}}. #' @param digits Amount of digits for returned values. #' #' @return A data frame with all statistics is returned (excluding confidence intervals). #' #' @references Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. #' \cr \cr #' Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. #' #' @examplesIf requireNamespace("car") #' # load sample data #' data(efc) #' #' # fit linear model #' fit <- aov( #' c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, #' data = efc #' ) #' anova_stats(car::Anova(fit, type = 2)) #' @export anova_stats <- function(model, digits = 3) { # .Deprecated("effectsize::effectsize()", package = "effectsize") # get tidy summary table aov.sum <- aov_stat_summary(model) # compute all model statistics etasq <- aov_stat_core(aov.sum, type = "eta") partial.etasq <- aov_stat_core(aov.sum, type = "peta") omegasq <- aov_stat_core(aov.sum, type = "omega") partial.omegasq <- aov_stat_core(aov.sum, type = "pomega") epsilonsq <- aov_stat_core(aov.sum, type = "epsilon") # compute power for each estimate cohens.f <- sqrt(partial.etasq / (1 - partial.etasq)) # bind as data frame anov_stat <- rbind( data.frame(etasq, partial.etasq, omegasq, partial.omegasq, epsilonsq, cohens.f), data.frame(etasq = NA, partial.etasq = NA, omegasq = NA, partial.omegasq = NA, epsilonsq = NA, cohens.f = NA) ) anov_stat <- cbind(anov_stat, data.frame(aov.sum)) # get nr of terms nt <- nrow(anov_stat) - 1 # finally, compute power as_power <- tryCatch( c(.calculate_power( df1 = anov_stat$df[1:nt], df2 = anov_stat$df[nrow(anov_stat)], effect_size = anov_stat$cohens.f[1:nt]^2 ), NA ), error = function(x) { NA } ) out <- cbind(anov_stat, data.frame(power = as_power)) out[] <- lapply(out, function(i) { if (is.numeric(i)) { round(i, digits) } else { i } }) class(out) <- c("sj_anova_stat", class(out)) out } aov_stat <- function(model, type) { aov.sum <- aov_stat_summary(model) aov.res <- aov_stat_core(aov.sum, type) if (obj_has_name(aov.sum, "stratum")) attr(aov.res, "stratum") <- aov.sum[["stratum"]] aov.res } aov_stat_summary <- function(model) { insight::check_if_installed("parameters") # check if we have a mixed model mm <- is_merMod(model) ori.model <- model # check that model inherits from correct class # else, try to coerce to anova table if (!inherits(model, c("Gam", "aov", "anova", "anova.rms", "aovlist"))) model <- stats::anova(model) # get summary table aov.sum <- insight::standardize_names(as.data.frame(parameters::model_parameters(model)), style = "broom") # for mixed models, add information on residuals if (mm) { res <- stats::residuals(ori.model) aov.sum <- rbind( aov.sum, data_frame( term = "Residuals", df = length(res) - sum(aov.sum[["df"]]), sumsq = sum(res^2, na.rm = TRUE), meansq = mse(ori.model), statistic = NA ) ) } # check if object has sums of square if (!obj_has_name(aov.sum, "sumsq")) { stop("Model object has no sums of squares. Cannot compute effect size statistic.", call. = FALSE) } # need special handling for rms-anova if (inherits(model, "anova.rms")) colnames(aov.sum) <- c("term", "df", "sumsq", "meansq", "statistic", "p.value") # for car::Anova, the meansq-column might be missing, so add it manually if (!obj_has_name(aov.sum, "meansq")) { pos_sumsq <- which(colnames(aov.sum) == "sumsq") aov.sum <- cbind( aov.sum[1:pos_sumsq], data.frame(meansq = aov.sum$sumsq / aov.sum$df), aov.sum[(pos_sumsq + 1):ncol(aov.sum)] ) } intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } aov.sum } aov_stat_core <- function(aov.sum, type) { intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } # get mean squared of residuals meansq.resid <- aov.sum[["meansq"]][nrow(aov.sum)] # get total sum of squares ss.total <- sum(aov.sum[["sumsq"]]) # get sum of squares of residuals ss.resid <- aov.sum[["sumsq"]][nrow(aov.sum)] # number of terms in model n_terms <- nrow(aov.sum) - 1 # number of observations N <- sum(aov.sum[["df"]]) + 1 aovstat <- switch(type, # compute omega squared for each model term omega = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / (ss.total + meansq.resid) })), # compute partial omega squared for each model term pomega = unlist(lapply(1:n_terms, function(x) { df.term <- aov.sum[["df"]][x] meansq.term <- aov.sum[["meansq"]][x] (df.term * (meansq.term - meansq.resid)) / (df.term * meansq.term + (N - df.term) * meansq.resid) })), # compute epsilon squared for each model term epsilon = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / ss.total })), # compute eta squared for each model term eta = unlist(lapply(1:n_terms, function(x) { aov.sum[["sumsq"]][x] / sum(aov.sum[["sumsq"]]) })), # compute partial eta squared for each model term cohens.f = , peta = unlist(lapply(1:n_terms, function(x) { aov.sum[["sumsq"]][x] / (aov.sum[["sumsq"]][x] + ss.resid) })) ) # compute Cohen's F if (type == "cohens.f") aovstat <- sqrt(aovstat / (1 - aovstat)) # give values names of terms names(aovstat) <- aov.sum[["term"]][1:n_terms] aovstat } .which_intercept <- function(x) { which(tolower(x) %in% c("(intercept)_zi", "intercept (zero-inflated)", "intercept", "zi_intercept", "(intercept)", "b_intercept", "b_zi_intercept")) } .calculate_power <- function(df1, df2, effect_size) { if (any(effect_size < 0)) { return(NA) } if (!is.null(df1) && any(df1 < 1)) { return(NA) } if (!is.null(df2) && any(df2 < 1)) { return(NA) } lambda <- effect_size * (df1 + df2 + 1) stats::pf( stats::qf(0.05, df1 = df1, df2 = df2, lower.tail = FALSE), df1 = df1, df2 = df2, ncp = lambda, lower.tail = FALSE ) } sjstats/R/gof.R0000644000176200001440000000727214620351262013055 0ustar liggesusers#' @title Compute model quality #' @name chisq_gof #' #' @description For logistic regression models, performs a Chi-squared #' goodness-of-fit-test. #' #' @param x A numeric vector or a \code{glm}-object. #' @param prob Vector of probabilities (indicating the population probabilities) #' of the same length as \code{x}'s amount of categories / factor levels. #' Use \code{nrow(table(x))} to determine the amount of necessary values #' for \code{prob}. Only used, when \code{x} is a vector, and not a #' \code{glm}-object. #' @param weights Vector with weights, used to weight \code{x}. #' #' @references #' Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. #' #' @details For vectors, this function is a convenient function for the #' \code{chisq.test()}, performing goodness-of-fit test. For #' \code{glm}-objects, this function performs a goodness-of-fit test. #' A well-fitting model shows \emph{no} significant difference between the #' model and the observed data, i.e. the reported p-values should be #' greater than 0.05. #' #' @return For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}. #' For \code{glm}-objects, an object of class \code{chisq_gof} with #' following values: \code{p.value}, the p-value for the goodness-of-fit test; #' \code{z.score}, the standardized z-score for the goodness-of-fit test; #' \code{rss}, the residual sums of squares term and \code{chisq}, the pearson #' chi-squared statistic. #' #' @examples #' data(efc) #' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) #' m <- glm( #' neg_c_7d ~ c161sex + barthtot + c172code, #' data = efc, #' family = binomial(link = "logit") #' ) #' #' # goodness-of-fit test for logistic regression #' chisq_gof(m) #' #' # goodness-of-fit test for vectors against probabilities #' # differing from population #' chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28)) #' #' # equal to population #' chisq_gof(efc$e42dep, prop.table(table(efc$e42dep))) #' #' @export chisq_gof <- function(x, prob = NULL, weights = NULL) { if (inherits(x, "glm")) { # This is an adapted version from the # "binomTools" package. The "X2GOFtest()" # function did not work when model data frame # had missing values. y_hat <- stats::fitted(x) wt <- x$prior.weight vJ <- wt * y_hat * (1 - y_hat) cJ <- (1 - 2 * y_hat) / vJ X2 <- sum(stats::resid(x, type = "pearson")^2) form <- stats::as.formula(x$formula) form[[2]] <- as.name("cJ") # use model matrix instead of data values, # because data may contain more variables # than needed, and due to missing may have # different row length dat <- stats::na.omit(x$model) dat$cJ <- cJ dat$vJ <- vJ RSS <- sum(vJ * stats::resid(stats::lm(form, data = dat, weights = vJ))^2) A <- 2 * (length(y_hat) - sum(1 / wt)) z <- (X2 - x$df.residual) / sqrt(A + RSS) p.value <- 2 * stats::pnorm(abs(z), lower.tail = FALSE) chi2gof <- list( p.value = p.value, z.score = z, rss = RSS, chisq = X2 ) class(chi2gof) <- c("sj_chi2gof", "list") } else { # check if we have probs if (is.null(prob)) { warning("`prob` needs to be specified.", call. = F) return(invisible(NULL)) } # performs a Chi-square goodnes-of-fit-test if (!is.null(weights)) x <- weight(x, weights) dummy <- as.vector(table(x)) # goodness of fit-test. x is one-dimensional and # y not given chi2gof <- stats::chisq.test(dummy, p = prob) } chi2gof } sjstats/R/phi.R0000644000176200001440000000225714620351262013060 0ustar liggesusers#' @rdname crosstable_statistics #' @export phi <- function(tab, ...) { UseMethod("phi") } #' @export phi.table <- function(tab, ...) { .phi(tab) } #' @export phi.ftable <- function(tab, ...) { .phi(tab) } #' @export phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) { formula_terms <- all.vars(formula) tab <- table(data[[formula_terms[1]]], data[[formula_terms[2]]]) method <- match.arg(method) if (is.null(ci.lvl) || is.na(ci.lvl)) { .cramers_v(tab) } else { straps <- sjstats::bootstrap(data[formula_terms], n) tables <- lapply(straps$strap, function(x) { dat <- as.data.frame(x) table(dat[[1]], dat[[2]]) }) phis <- sapply(tables, .phi) ci <- boot_ci(phis, ci.lvl = ci.lvl, method = method) data_frame( phi = .phi(tab), conf.low = ci$conf.low, conf.high = ci$conf.high ) } } .phi <- function(tab) { insight::check_if_installed("MASS") # convert to flat table if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) tb <- summary(MASS::loglm(~1 + 2, tab))$tests sqrt(tb[2, 1] / sum(tab)) } sjstats/R/cv_error.R0000644000176200001440000000521214620351262014113 0ustar liggesusers#' @title Test and training error from model cross-validation #' @name cv_error #' #' @description \code{cv_error()} computes the root mean squared error from a model fitted #' to kfold cross-validated test-training-data. \code{cv_compare()} #' does the same, for multiple formulas at once (by calling \code{cv_error()} #' for each formula). #' #' @param data A data frame. #' @param formula The formula to fit the linear model for the test and training data. #' @param formulas A list of formulas, to fit linear models for the test and training data. #' @param k The number of folds for the kfold-crossvalidation. #' #' @return A data frame with the root mean squared errors for the training and test data. #' #' @details \code{cv_error()} first generates cross-validated test-training pairs, using #' \code{\link[modelr]{crossv_kfold}} and then fits a linear model, which #' is described in \code{formula}, to the training data. Then, predictions #' for the test data are computed, based on the trained models. #' The \emph{training error} is the mean value of the \code{\link{rmse}} for #' all \emph{trained} models; the \emph{test error} is the rmse based on all #' residuals from the test data. #' #' @examples #' data(efc) #' cv_error(efc, neg_c_7 ~ barthtot + c161sex) #' #' cv_compare(efc, formulas = list( #' neg_c_7 ~ barthtot + c161sex, #' neg_c_7 ~ barthtot + c161sex + e42dep, #' neg_c_7 ~ barthtot + c12hour #' )) #' #' @export cv_error <- function(data, formula, k = 5) { insight::check_if_installed("datawizard") # response resp <- insight::find_response(formula) # compute cross validation data cv_data <- lapply(k, function(i) { datawizard::data_partition(data, proportion = 0.8) }) # get train and test datasets train_data <- lapply(cv_data, function(cvdat) cvdat[[1]]) test_data <- lapply(cv_data, function(cvdat) cvdat[[2]]) # fit models on datasets trained_models <- lapply(train_data, function(x) stats::lm(formula, data = x)) test_models <- lapply(test_data, function(x) stats::lm(formula, data = x)) # RMSE train_error <- mean(vapply(trained_models, performance::rmse, numeric(1)), na.rm = TRUE) test_error <- mean(vapply(test_models, performance::rmse, numeric(1)), na.rm = TRUE) data_frame( model = deparse(formula), train.error = round(train_error, 4), test.error = round(test_error, 4) ) } #' @rdname cv_error #' @export cv_compare <- function(data, formulas, k = 5) { do.call(rbind, lapply(formulas, function(f) cv_error(data, formula = f, k = k))) } sjstats/R/cv.R0000644000176200001440000000311314620351262012700 0ustar liggesusers#' @title Compute model quality #' @name cv #' #' @description Compute the coefficient of variation. #' #' @param x Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) #' or \code{lme} (\pkg{nlme}). #' @param ... More fitted model objects, to compute multiple coefficients of #' variation at once. #' #' @details The advantage of the cv is that it is unitless. This allows #' coefficient of variation to be compared to each other in ways #' that other measures, like standard deviations or root mean #' squared residuals, cannot be. #' #' @return Numeric, the coefficient of variation. #' #' @examples #' data(efc) #' fit <- lm(barthtot ~ c160age + c12hour, data = efc) #' cv(fit) #' #' @export cv <- function(x, ...) { # return value cv_ <- cv_helper(x) # check if we have multiple parameters if (nargs() > 1) { # get input list params_ <- list(...) cv_ <- c(cv_, sapply(params_, cv_helper)) } cv_ } cv_helper <- function(x) { insight::check_if_installed("performance") # check if we have a fitted linear model if (inherits(x, c("lm", "lmerMod", "lme", "merModLmerTest")) && !inherits(x, "glm")) { # get response dv <- insight::get_response(x) mw <- mean(dv, na.rm = TRUE) stddev <- performance::rmse(x) } else { mw <- mean(x, na.rm = TRUE) stddev <- stats::sd(x, na.rm = TRUE) } # check if mean is zero? if (mw == 0) stop("Mean of dependent variable is zero. Cannot compute model's coefficient of variation.", call. = F) stddev / mw } sjstats/R/is_prime.R0000644000176200001440000000121414620351262014077 0ustar liggesusers#' @title Find prime numbers #' @name is_prime #' #' @description This functions checks whether a number is, or numbers in a #' vector are prime numbers. #' #' @param x An integer, or a vector of integers. #' #' @return `TRUE` for each prime number in `x`, `FALSE` otherwise. #' #' @examples #' is_prime(89) #' is_prime(15) #' is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) #' #' @export is_prime <- function(x) { if (is.numeric(x) && !all(x %% 1 == 0, na.rm = TRUE)) { insight::format_error("`x` needs to be an integer value.") } vapply(x, function(.x) .x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0), logical(1)) } sjstats/R/confint_ncg.R0000644000176200001440000000743714620351262014574 0ustar liggesusers# This function is a modified version from package MBESS # copied from https://github.com/cran/MBESS/blob/master/R/conf.limits.ncf.R # Author: Ken Kelley # License: GPL-3 confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = NULL) { alpha.lower <- alpha.upper <- (1 - conf.level) / 2 tol <- 1e-09 Jumping.Prop <- 0.1 FAILED <- NULL LL.0 <- stats::qf(p = alpha.lower * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) - (1 - alpha.lower) if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < (1 - alpha.lower)) { FAILED <- if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = 0) < 1 - alpha.lower) LL.0 <- 1e-08 if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < 1 - alpha.lower) FAILED <- TRUE } if (is.null(FAILED)) { LL.1 <- LL.2 <- LL.0 while (Diff > tol) { LL.2 <- LL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.2) - (1 - alpha.lower) LL.1 <- LL.2 } LL.1 <- LL.2 / (1 + Jumping.Prop) LL.Bounds <- c(LL.1, (LL.1 + LL.2) / 2, LL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[1]) - (1 - alpha.lower) > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[3]) - (1 - alpha.lower) > tol if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[2], (LL.Bounds[2] + LL.Bounds[3]) / 2, LL.Bounds[3]) } if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[1], (LL.Bounds[1] + LL.Bounds[2]) / 2, LL.Bounds[2]) } Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) } LL <- LL.Bounds[2] } if (!is.null(FAILED)) LL <- NA FAILED.Up <- NULL UL.0 <- stats::qf(p = 1 - alpha.upper * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) UL.0 <- 1e-08 Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) FAILED.Up <- TRUE if (is.null(FAILED.Up)) { UL.1 <- UL.2 <- UL.0 while (Diff > tol) { UL.2 <- UL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.2) - alpha.upper UL.1 <- UL.2 } UL.1 <- UL.2 / (1 + Jumping.Prop) UL.Bounds <- c(UL.1, (UL.1 + UL.2) / 2, UL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[1]) - alpha.upper > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[3]) - alpha.upper > tol if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) { UL.Bounds <- c(UL.Bounds[2], (UL.Bounds[2] + UL.Bounds[3]) / 2, UL.Bounds[3]) } if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) { UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] + UL.Bounds[2]) / 2, UL.Bounds[2]) } Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper } UL <- UL.Bounds[2] } if (!is.null(FAILED.Up)) UL <- NA list(Lower.Limit = LL, Upper.Limit = UL) } sjstats/R/chi_squared_test.R0000644000176200001440000002474714623373000015633 0ustar liggesusers#' @title Chi-Squared test #' @name chi_squared_test #' @description This function performs a \eqn{\chi^2} test for contingency #' tables or tests for given probabilities. The returned effects sizes are #' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi}) #' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against #' given probabilities (see _Ben-Shachar et al. 2023_). #' #' @param probabilities A numeric vector of probabilities for each cell in the #' contingency table. The length of the vector must match the number of cells #' in the table, i.e. the number of unique levels of the variable specified #' in `select`. If `probabilities` is provided, a chi-squared test for given #' probabilities is conducted. Furthermore, if `probabilities` is given, `by` #' must be `NULL`. The probabilities must sum to 1. #' @param paired Logical, if `TRUE`, a McNemar test is conducted for 2x2 tables. #' Note that `paired` only works for 2x2 tables. #' @param ... Additional arguments passed down to [`chisq.test()`]. #' @inheritParams mann_whitney_test #' #' @inheritSection mann_whitney_test Which test to use #' #' @inherit mann_whitney_test seealso #' #' @return A data frame with test results. The returned effects sizes are #' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi}) #' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against #' given probabilities. #' #' @details The function is a wrapper around [`chisq.test()`] and #' [`fisher.test()`] (for small expected values) for contingency tables, and #' `chisq.test()` for given probabilities. When `probabilities` are provided, #' these are rescaled to sum to 1 (i.e. `rescale.p = TRUE`). When `fisher.test()` #' is called, simulated p-values are returned (i.e. `simulate.p.value = TRUE`, #' see `?fisher.test`). If `paired = TRUE` and a 2x2 table is provided, #' a McNemar test (see [`mcnemar.test()`]) is conducted. #' #' The weighted version of the chi-squared test is based on the a weighted #' table, using [`xtabs()`] as input for `chisq.test()`. #' #' Interpretation of effect sizes are based on rules described in #' [`effectsize::interpret_phi()`], [`effectsize::interpret_cramers_v()`], #' and [`effectsize::interpret_fei()`]. Use these function directly to get other #' interpretations, by providing the returned effect size as argument, e.g. #' `interpret_phi(0.35, rules = "gignac2016")`. #' #' @references #' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., #' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data #' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. #' \doi{10.3390/math11091982} #' #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @examplesIf requireNamespace("effectsize") && requireNamespace("MASS") #' data(efc) #' efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) #' #' # Chi-squared test #' chi_squared_test(efc, "c161sex", by = "e16sex") #' #' # weighted Chi-squared test #' chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight") #' #' # Chi-squared test for given probabilities #' chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7)) #' @export chi_squared_test <- function(data, select = NULL, by = NULL, probabilities = NULL, weights = NULL, paired = FALSE, ...) { # sanity check - if we only have one variable in "select" and "by" and # "probabilities" are NULL, set probalities if (is.null(probabilities) && !is.null(select) && is.null(by) && length(select) == 1) { probabilities <- rep(1 / length(data[[select]]), length(data[[select]])) } if (is.null(probabilities)) { .calculate_chisq(data, select, by, weights, paired, ...) } else { # sanity check - `paired = TRUE` is not available for given probabilities if (paired) { insight::format_error("When `probabilities` are provided, `paired = TRUE` is not available.") # nolint } .calculate_chisq_gof(data, select, probabilities, weights, ...) } } # Mann-Whitney-Test for two groups -------------------------------------------- .calculate_chisq <- function(data, select, by, weights, paired = FALSE, ...) { insight::check_if_installed(c("datawizard", "MASS")) # sanity checks .sanitize_htest_input(data, select, by, weights) # get data grp1 <- data[[select]] grp2 <- data[[by]] # if paired = TRUE, we only allow a 2x2 table if (paired && (length(stats::na.omit(unique(grp1))) != 2 || length(stats::na.omit(unique(grp2))) != 2)) { insight::format_error("When `paired = TRUE`, only 2x2 tables are allowed (i.e. both variables must have exactly two levels).") # nolint } # create data frame for table x <- data.frame( grp1 = datawizard::to_factor(grp1), grp2 = datawizard::to_factor(grp2) ) # add weights if (!is.null(weights)) { x$weights <- data[[weights]] } # remove missings x <- stats::na.omit(x) # contingency table if (is.null(weights)) { tab <- table(x) } else { tab <- as.table(round(stats::xtabs(x[[3]] ~ x[[1]] + x[[2]]))) class(tab) <- "table" } # expected values, to identify whether Fisher's test is needed expected_values <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # nolint # paired? mc-nemar test if (paired) { htest <- suppressWarnings(stats::mcnemar.test(tab, ...)) test_statistic <- htest$statistic } else { # chi-squared test htest <- suppressWarnings(stats::chisq.test(tab, ...)) test_statistic <- htest$statistic # need fisher? if (min(expected_values) < 5 || (min(expected_values) < 10 && htest$parameter == 1)) { htest <- stats::fisher.test(tab, simulate.p.value = TRUE, ...) } } p_value <- htest$p.value # effect size if (nrow(tab) > 2 || ncol(tab) > 2) { effect_size <- stats::setNames(cramer(tab), "Cramer's V") } else { effect_size <- stats::setNames(phi(tab), "Phi") } # return result out <- data.frame( data = paste(select, "by", by), statistic_name = "Chi-squared", statistic = test_statistic, effect_size_name = names(effect_size), effect_size = as.numeric(effect_size), p = p_value, df = (nrow(tab) - 1) * (ncol(tab) - 1), n_obs = sum(tab, na.rm = TRUE), stringsAsFactors = FALSE ) class(out) <- c("sj_htest_chi", "data.frame") attr(out, "weighted") <- !is.null(weights) attr(out, "fisher") <- isTRUE(startsWith(htest$method, "Fisher")) attr(out, "mcnemar") <- isTRUE(paired) attr(out, "caption") <- "contingency tables" out } .calculate_chisq_gof <- function(data, select, probabilities, weights, ...) { insight::check_if_installed("effectsize") # get data x <- data.frame(grp = data[[select]]) # add weights if (!is.null(weights)) { x$weights <- data[[weights]] } # remove missings x <- stats::na.omit(x) # contingency table if (is.null(weights)) { tab <- table(x) } else { tab <- as.table(round(stats::xtabs(x[[2]] ~ x[[1]]))) class(tab) <- "table" } # table dimensions n_rows <- nlevels(droplevels(as.factor(x$grp))) # sanity check if (length(probabilities) != n_rows) { insight::format_error("Length of probabilities must match number of cells in table (i.e. number of levels of input factor).") # nolint } if (!isTRUE(all.equal(sum(probabilities), 1))) { insight::format_error("Probabilities must sum to 1.") } # chi-squared test htest <- suppressWarnings(stats::chisq.test(tab, p = probabilities, rescale.p = TRUE, ...)) test_statistic <- htest$statistic p_value <- htest$p.value effect_size <- effectsize::chisq_to_fei( test_statistic, n = sum(tab), nrow = n_rows, ncol = 1, p = probabilities, alternative = "two.sided" )$Fei # return result out <- data.frame( data = paste( select, "against probabilities", datawizard::text_concatenate(sprintf("%i%%", round(100 * probabilities))) ), statistic_name = "Chi-squared", statistic = test_statistic, effect_size_name = "Fei", effect_size = as.numeric(effect_size), p = p_value, df = n_rows - 1, n_obs = sum(tab, na.rm = TRUE), stringsAsFactors = FALSE ) class(out) <- c("sj_htest_chi", "data.frame") attr(out, "caption") <- "given probabilities" attr(out, "weighted") <- !is.null(weights) out } # methods --------------------------------------------------------------------- #' @export print.sj_htest_chi <- function(x, ...) { weighted <- attributes(x)$weighted if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } fisher <- attributes(x)$fisher mcnemar <- attributes(x)$mcnemar # headline insight::print_color(sprintf( "# Chi-squared test for %s%s\n", attributes(x)$caption, weight_string ), "blue") # Fisher's exact test? if (isTRUE(fisher)) { insight::print_color(" (using Fisher's exact test due to small expected values)\n", "blue") # nolint } else if (isTRUE(mcnemar)) { insight::print_color(" (using McNemar's test for paired data)\n", "blue") # nolint } cat("\n") # data info insight::print_color( sprintf(" Data: %s (n = %i)\n", x$data, round(x$n_obs)), "cyan" ) # prepare and align strings eff_symbol <- .format_symbols(x$effect_size_name) stat_symbol <- .format_symbols(x$statistic_name) # string for effectsizes eff_string <- switch(x$effect_size_name, Fei = sprintf( "%s = %.3f (%s effect)", eff_symbol, x$effect_size, effectsize::interpret_fei(x$effect_size) ), Phi = sprintf( "%s = %.3f (%s effect)", eff_symbol, x$effect_size, effectsize::interpret_phi(x$effect_size) ), sprintf( "Cramer's V = %.3f (%s effect)", x$effect_size, effectsize::interpret_cramers_v(x$effect_size) ) ) cat(sprintf( "\n %s = %.3f, %s, df = %i, %s\n\n", stat_symbol, x$statistic, eff_string, round(x$df), insight::format_p(x$p) )) } sjstats/R/svyglmzip.R0000644000176200001440000000767514620351262014355 0ustar liggesusersutils::globalVariables("scaled.weights") #' @title Survey-weighted zero-inflated Poisson model #' @name svyglm.zip #' @description \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package #' to fit survey-weighted zero-inflated Poisson models. It uses #' \code{\link[survey]{svymle}} to fit sampling-weighted #' maximum likelihood estimates, based on starting values provided #' by \code{\link[pscl]{zeroinfl}}. #' #' #' @param formula An object of class \code{formula}, i.e. a symbolic description #' of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}. #' @param design An object of class \code{\link[survey]{svydesign}}, providing #' a specification of the survey design. #' @param ... Other arguments passed down to \code{\link[pscl]{zeroinfl}}. #' #' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip}, #' with some additional information about the model. #' #' @details Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/. #' #' @examples #' if (require("survey")) { #' data(nhanes_sample) #' set.seed(123) #' nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2) #' nhanes_sample$malepartners[sample(1:2992, 400)] <- 0 #' #' # create survey design #' des <- svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' #' # fit negative binomial regression #' fit <- svyglm.zip( #' malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1), #' des #' ) #' #' # print coefficients and standard errors #' fit #' } #' @export svyglm.zip <- function(formula, design, ...) { insight::check_if_installed(c("survey", "pscl")) # get design weights. we need to scale these weights for the glm.nb() function dw <- stats::weights(design) # update design with scaled weights design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE)) # fit ZIP model, with scaled design weights mod <- suppressWarnings(pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...)) ff <- insight::find_formula(mod) # fit survey model, using maximum likelihood estimation svyfit <- survey::svymle( loglike = sjstats_loglik_zip, grad = sjstats_score_zip, design = design, formulas = list(eta = ff$conditional, logitp = ff$zero_inflated), start = stats::coef(mod), na.action = "na.omit" ) # add additoinal information class(svyfit) <- c("svyglm.zip", class(svyfit)) attr(svyfit, "zip.terms") <- all.vars(formula) attr(svyfit, "zip.formula") <- formula svyfit$deviance <- mod$deviance svyfit$df.residuals <- mod$df.residuals svyfit$df <- length(stats::coef(mod)) + 1 svyfit$aic <- mod$aic svyfit } # log-likelihood function used in "svymle()" sjstats_loglik_zip <- function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) log(p * (y == 0) + (1 - p) * stats::dpois(y, mu)) } sjstats_dlogitp = function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) dexpit <- p / (1 + p) ^ 2 num <- dexpit * (y == 0) - dexpit * stats::dpois(y, mu) denom <- p * (y == 0) + (1 - p) * stats::dpois(y, mu) num / denom } # derivative sjstats_deta_zip <- function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) dmutoy <- 0 * y dmutoy[y > 0] = exp(-mu[y > 0]) * mu[y > 0] ^ (y[y > 0] - 1) / factorial(y[y > 0] - 1) num = (1 - p) * (-stats::dpois(y, mu) + dmutoy) denom = p * (y == 0) + (1 - p) * stats::dpois(y, mu) num / denom } # score function, combines derivatives sjstats_score_zip <- function(y, eta, logitp) { cbind(sjstats_deta_zip(y, eta, logitp), sjstats_dlogitp(y, eta, logitp)) } sjstats/R/S3-methods.R0000644000176200001440000002612114620351262014222 0ustar liggesusers#' @export print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.nb(x, digits, v_se = se)[-1, -2] pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint sprintf("%.*f ", digits, sm$p.value) ) ) ) ) sm$p.value <- pan print(sm, ...) # add dispersion parameter cat(sprintf("\nDispersion parameter Theta: %.*f", digits, attr(x, "nb.theta", exact = TRUE))) cat(sprintf("\n Standard Error of Theta: %.*f", digits, attr(x, "nb.theta.se", exact = TRUE))) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } #' @export print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.zip(x, digits, v_se = se)[-1, ] pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint sprintf("%.*f ", digits, sm$p.value) ) ) ) ) sm$p.value <- pan print(sm, ...) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), irr = round(exp(est), digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } #' @export model.frame.svyglm.nb <- function(formula, ...) { pred <- attr(formula, "nb.terms", exact = TRUE) formula$design$variables[intersect(pred, colnames(formula$design$variables))] } #' @export model.frame.svyglm.zip <- function(formula, ...) { pred <- attr(formula, "zip.terms", exact = TRUE) formula$design$variables[intersect(pred, colnames(formula$design$variables))] } #' @importFrom stats family #' @export family.svyglm.nb <- function(object, ...) { attr(object, "family", exact = TRUE) } #' @export formula.svyglm.nb <- function(x, ...) { attr(x, "nb.formula", exact = TRUE) } #' @export formula.svyglm.zip <- function(x, ...) { attr(x, "zip.formula", exact = TRUE) } #' @export predict.svyglm.nb <- function(object, newdata = NULL, type = c("link", "response", "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, na.action = stats::na.pass, ...) { insight::check_if_installed(c("survey", "MASS")) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) cf <- stats::coef(fnb) names.cf <- names(cf) cf <- stats::coef(object)[-1] cf <- stats::setNames(cf, names.cf) fnb$coefficients <- cf stats::predict.glm( object = fnb, newdata = newdata, type = type, se.fit = se.fit, dispersion = dispersion, terms = terms, na.action = na.action, ... ) } #' @export residuals.svyglm.nb <- function(object, ...) { if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) y <- insight::get_response(fnb) mu <- stats::predict.glm(fnb, type = "response") wts <- fnb$prior.weights (y - mu) * sqrt(wts) / sqrt(fnb$family$variance(mu)) } #' @export terms.svyglm.nb <- function(x, ...) { if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) stats::terms(stats::formula(x), ...) } #' @export AIC.svyglm.nb <- function(object, ...) { ## FIXME this one just returns the AIC of the underlying glm.nb() model aics <- lapply(list(object, ...), getaic) as.data.frame(do.call(rbind, aics)) } getaic <- function(x) { c(df = x$df, AIC = x$aic) } #' @export deviance.svyglm.nb <- function(object, ...) { ## FIXME this one just returns the deviance of the underlying glm.nb() model object$deviance } #' @export as.integer.sj_resample <- function(x, ...) { x$id } #' @export as.data.frame.sj_resample <- function(x, ...) { x$data[x$id, , drop = FALSE] } #' @export print.sj_resample <- function(x, ...) { n <- length(x$id) if (n > 12) id10 <- c(x$id[1:12], "...") else id10 <- x$id cat("<", paste0( "id's of resample [", prettyNum(nrow(x$data), big.mark = ","), " x ", prettyNum(ncol(x$data), big.mark = ","), "]" ), "> ", toString(id10), "\n", sep = "" ) } #' @export plot.sj_inequ_trend <- function(x, ...) { .data <- NULL insight::check_if_installed("ggplot2") # add time indicator x$data$zeit <- seq_len(nrow(x$data)) # get gather column names gather.cols1 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "lo", "hi")] gather.cols2 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "rr", "rd")] # gather data to plot rr and rd dat1 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y") # gather data for raw prevalences dat2 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y") # Proper value names, for facet labels dat1$grp[dat1$grp == "rr"] <- "Rate Ratios" dat1$grp[dat1$grp == "rd"] <- "Rate Differences" # plot prevalences gp1 <- ggplot2::ggplot(dat2, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) + ggplot2::geom_smooth(method = "loess", se = FALSE) + ggplot2::labs(title = "Prevalance Rates for Lower and Higher SES Groups", y = "Prevalances", x = "Time", colour = "") + ggplot2::scale_color_manual(values = c("darkblue", "darkred"), labels = c("High SES", "Low SES")) # plot rr and rd gp2 <- ggplot2::ggplot(dat1, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) + ggplot2::geom_smooth(method = "loess", se = FALSE) + ggplot2::facet_wrap(~grp, ncol = 1, scales = "free") + ggplot2::labs(title = "Proportional Change in Rate Ratios and Rate Differences", colour = NULL, y = NULL, x = "Time") + ggplot2::guides(colour = "none") suppressMessages(graphics::plot(gp1)) suppressMessages(graphics::plot(gp2)) } #' @export print.sj_xtab_stat <- function(x, ...) { # get length of method name, to align output l <- max(nchar(c(x$method, x$stat.name, "p-value", "Observations"))) # headline insight::print_color("\n# Measure of Association for Contingency Tables\n", "blue") # used fisher? if (x$fisher) insight::print_color(" (using Fisher's Exact Test)\n", "blue") cat("\n") # print test statistic cat(sprintf(" %*s: %.4f\n", l, x$stat.name, x$statistic)) cat(sprintf(" %*s: %.4f\n", l, x$method, x$estimate)) cat(sprintf(" %*s: %g\n", l, "df", x$df)) cat(sprintf(" %*s: %s\n", l, "p-value", insight::format_p(x$p.value, stars = TRUE, name = NULL))) cat(sprintf(" %*s: %g\n", l, "Observations", x$n_obs)) } #' @export print.sj_chi2gof <- function(x, ...) { insight::print_color("\n# Chi-squared Goodness-of-Fit Test\n\n", "blue") v1 <- sprintf("%.3f", x$chisq) v2 <- sprintf("%.3f", x$z.score) v3 <- sprintf("%.3f", x$p.value) space <- max(nchar(c(v1, v2, v3))) cat(sprintf(" Chi-squared: %*s\n", space, v1)) cat(sprintf(" z-score: %*s\n", space, v2)) cat(sprintf(" p-value: %*s\n\n", space, v3)) if (x$p.value >= 0.05) message("Summary: model seems to fit well.") else message("Summary: model does not fit well.") } #' @export print.sj_ttest <- function(x, ...) { insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue") group <- attr(x, "group.name", exact = TRUE) xn <- attr(x, "x.name", exact = TRUE) yn <- attr(x, "y.name", exact = TRUE) if (!is.null(group)) verbs <- c("of", "by") else verbs <- c("between", "and") st <- sprintf("# t=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$df), x$p.value) if (!is.null(yn)) { insight::print_color(sprintf("\n# comparison %s %s %s %s\n", verbs[1], xn, verbs[2], yn), "cyan") } insight::print_color(st, "cyan") if (!is.null(yn)) { if (!is.null(group)) { l1 <- sprintf("mean in group %s", group[1]) l2 <- sprintf("mean in group %s", group[2]) } else { l1 <- sprintf("mean of %s", xn) l2 <- sprintf("mean of %s", yn) } l3 <- "difference of mean" slen <- max(nchar(c(l1, l2, l3))) cat(sprintf(" %s: %.3f\n", format(l1, width = slen), x$estimate[1])) cat(sprintf(" %s: %.3f\n", format(l2, width = slen), x$estimate[2])) cat(sprintf(" %s: %.3f [%.3f %.3f]\n", format(l3, width = slen), x$estimate[1] - x$estimate[2], x$ci[1], x$ci[2])) } else { cat(sprintf(" mean of %s: %.3f [%.3f, %.3f]\n", xn, x$estimate[1], x$ci[1], x$ci[2])) } cat("\n") } #' @export print.sj_wcor <- function(x, ...) { insight::print_color(sprintf("\nWeighted %s\n\n", x$method), "blue") if (!is.null(x$ci)) { cilvl <- sprintf("%.2i%%", as.integer(100 * x$ci.lvl)) cat(sprintf(" estimate [%s CI]: %.3f [%.3f %.3f]\n", cilvl, x$estimate, x$ci[1], x$ci[2])) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } else { cat(sprintf(" estimate: %.3f\n", x$estimate)) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } } #' @export print.sj_anova_stat <- function(x, digits = 3, ...) { x$p.value <- insight::format_p(x$p.value, name = NULL) cat(insight::export_table(x, digits = digits, protect_integers = TRUE)) } sjstats/R/find_beta.R0000644000176200001440000001421714620351262014212 0ustar liggesusers#' @title Determining distribution parameters #' @name find_beta #' #' @description `find_beta()`, `find_normal()` and `find_cauchy()` find the #' shape, mean and standard deviation resp. the location and scale parameters #' to describe the beta, normal or cauchy distribution, based on two #' percentiles. `find_beta2()` finds the shape parameters for a Beta #' distribution, based on a probability value and its standard error #' or confidence intervals. #' #' @param x1 Value for the first percentile. #' @param p1 Probability of the first percentile. #' @param x2 Value for the second percentile. #' @param p2 Probability of the second percentile. #' @param x Numeric, a probability value between 0 and 1. Typically indicates #' a prevalence rate of an outcome of interest; Or an integer value #' with the number of observed events. In this case, specify `n` #' to indicate the toral number of observations. #' @param se The standard error of `x`. Either `se` or `ci` must #' be specified. #' @param ci The upper limit of the confidence interval of `x`. Either #' `se` or `ci` must be specified. #' @param n Numeric, number of total observations. Needs to be specified, if #' `x` is an integer (number of observed events), and no #' probability. See 'Examples'. #' #' @return A list of length two, with the two distribution parameters than can #' be used to define the distribution, which (best) describes #' the shape for the given input parameters. #' #' @details These functions can be used to find parameter for various distributions, #' to define prior probabilities for Bayesian analyses. `x1`, `p1`, `x2` and #' `p2` are parameters that describe two quantiles. Given this knowledge, the #' distribution parameters are returned. #' #' Use `find_beta2()`, if the known parameters are, e.g. a prevalence rate or #' similar probability, and its standard deviation or confidence interval. In #' this case. `x` should be a probability, for example a prevalence rate of a #' certain event. `se` then needs to be the standard error for this probability. #' Alternatively, `ci` can be specified, which should indicate the upper limit #' of the confidence interval od the probability (prevalence rate) `x`. If the #' number of events out of a total number of trials is known (e.g. 12 heads out #' of 30 coin tosses), `x` can also be the number of observed events, while `n` #' indicates the total amount of trials (in the above example, the function #' call would be: `find_beta2(x = 12, n = 30)`). #' #' @references Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF}) #' #' @examples #' # example from blogpost: #' # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ #' # 10% of patients respond within 30 days of treatment #' # and 80% respond within 90 days of treatment #' find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' #' parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' curve( #' dnorm(x, mean = parms$mean, sd = parms$sd), #' from = 0, to = 200 #' ) #' #' parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' curve( #' dcauchy(x, location = parms$location, scale = parms$scale), #' from = 0, to = 200 #' ) #' #' #' find_beta2(x = .25, ci = .5) #' #' shapes <- find_beta2(x = .25, ci = .5) #' curve(dbeta(x, shapes[[1]], shapes[[2]])) #' #' # find Beta distribution for 3 events out of 20 observations #' find_beta2(x = 3, n = 20) #' #' shapes <- find_beta2(x = 3, n = 20) #' curve(dbeta(x, shapes[[1]], shapes[[2]])) #' #' @export find_beta <- function(x1, p1, x2, p2) { logK <- seq(-5, 10, length = 200) K <- exp(logK) m <- unlist(lapply(K, betaprior, x = x1, p = p1)) prob2 <- stats::pbeta(x2, K * m, K * (1 - m)) ind <- ((prob2 > 0) & (prob2 < 1)) app <- stats::approx(prob2[ind], logK[ind], p2) K0 <- exp(app$y) m0 <- betaprior(K0, x1, p1) s1 <- K0 * m0 s2 <- K0 * (1 - m0) list(shape1 = s1, shape2 = s2) } betaprior <- function(K, x, p) { m.lo <- 0 m.hi <- 1 flag <- TRUE while (flag) { m0 <- (m.lo + m.hi) / 2 p0 <- stats::pbeta(x, K * m0, K * (1 - m0)) if (p0 < p) m.hi <- m0 else m.lo <- m0 if (abs(p0 - p) < 1e-04) flag <- FALSE } m0 } #' @rdname find_beta #' @export find_beta2 <- function(x, se, ci, n) { # check if all required arguments are given if (missing(se) && missing(ci) && missing(n)) { insight::format_error("Either `se` or `ci`, or `n` must be specified.") } # for number of observations, compute variance of beta distribution if (!missing(n)) { if (!is.integer(x) && x < 1) insight::format_error("If `n` is given, x` must be an integer value greater than 0.") # compute 2 SD from beta variance bvar <- 2 * sqrt((x * n) / ((x + n)^2 * (x + n + 1))) # need to compute proportion x <- x / n p2 <- 0.95 x2 <- x + bvar } # for standard errors, we assume a 68% quantile if (!missing(se)) { p2 <- 0.68 x2 <- x + se } # for CI, we assume a 68% quantile if (!missing(ci)) { p2 <- 0.95 x2 <- ci } # the probability is assumed to be the median p1 <- 0.5 x1 <- x find_beta(x1, p1, x2, p2) } #' @rdname find_beta #' @export find_cauchy <- function(x1, p1, x2, p2) { # find location paramater l <- (x1 * stats::qcauchy(p2) ^ -1 - x2 * stats::qcauchy(p1) ^ -1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1) s <- (x2 - x1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1) list(location = l, scale = s) } #' @rdname find_beta #' @export find_normal <- function(x1, p1, x2, p2) { # find location paramater mw <- (x1 * stats::qnorm(p2) ^ -1 - x2 * stats::qnorm(p1) ^ -1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1) stddev <- (x2 - x1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1) list(mean = mw, sd = stddev) } sjstats/R/bootstrap.R0000644000176200001440000001020414620351262014304 0ustar liggesusers#' @title Generate nonparametric bootstrap replications #' @name bootstrap #' #' @description Generates \code{n} bootstrap samples of \code{data} and #' returns the bootstrapped data frames as list-variable. #' #' @param data A data frame. #' @param n Number of bootstraps to be generated. #' @param size Optional, size of the bootstrap samples. May either be a number #' between 1 and \code{nrow(data)} or a value between 0 and 1 to sample #' a proportion of observations from \code{data} (see 'Examples'). #' #' @return A data frame with one column: a list-variable #' \code{strap}, which contains resample-objects of class \code{sj_resample}. #' These resample-objects are lists with three elements: #' \enumerate{ #' \item the original data frame, \code{data} #' \item the rownmumbers \code{id}, i.e. rownumbers of \code{data}, indicating the resampled rows with replacement #' \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap}) #' } #' #' @details By default, each bootstrap sample has the same number of observations #' as \code{data}. To generate bootstrap samples without resampling #' same observations (i.e. sampling without replacement), use #' \code{size} to get bootstrapped data with a specific number #' of observations. However, specifying the \code{size}-argument is much #' less memory-efficient than the bootstrap with replacement. Hence, #' it is recommended to ignore the \code{size}-argument, if it is #' not really needed. #' #' @note This function applies nonparametric bootstrapping, i.e. the function #' draws samples with replacement. #' \cr \cr #' There is an \code{as.data.frame}- and a \code{print}-method to get or #' print the resampled data frames. See 'Examples'. The \code{as.data.frame}- #' method automatically applies whenever coercion is done because a data #' frame is required as input. See 'Examples' in \code{\link{boot_ci}}. #' #' #' @seealso \code{\link{boot_ci}} to calculate confidence intervals from #' bootstrap samples. #' #' @examples #' data(efc) #' bs <- bootstrap(efc, 5) #' #' # now run models for each bootstrapped sample #' lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x)) #' #' # generate bootstrap samples with 600 observations for each sample #' bs <- bootstrap(efc, 5, 600) #' #' # generate bootstrap samples with 70% observations of the original sample size #' bs <- bootstrap(efc, 5, .7) #' #' # compute standard error for a simple vector from bootstraps #' # use the `as.data.frame()`-method to get the resampled #' # data frame #' bs <- bootstrap(efc, 100) #' bs$c12hour <- unlist(lapply(bs$strap, function(x) { #' mean(as.data.frame(x)$c12hour, na.rm = TRUE) #' })) #' #' # bootstrapped standard error #' boot_se(bs, "c12hour") #' #' # bootstrapped CI #' boot_ci(bs, "c12hour") #' @export bootstrap <- function(data, n, size) { if (!missing(size) && !is.null(size)) { # check for valid range if (size < 0 || size > nrow(data)) stop("`size` must be greater than 0, but not greater than number of rows of `data`.", call. = F) # check if we want proportions if (size < 1) size <- as.integer(nrow(data) * size) # generate bootstraps w/o replacement repl <- FALSE } else { # size = observations size <- nrow(data) # generate bootstraps with replacement repl <- TRUE } # generate bootstrap resamples strap <- replicate(n, resample(data, size, repl), simplify = FALSE) # add resample ID, may be used for other functions (like 'se()' for 'icc()') for (i in seq_len(length(strap))) strap[[i]]$resample.id <- i # return daza frame data.frame(strap = I(strap)) } resample <- function(data, size, replace) { structure( class = "sj_resample", list( data = data, id = sample(nrow(data), size = size, replace = replace) )) } sjstats/R/wtd_variance.R0000644000176200001440000000040114620351262014733 0ustar liggesusersweighted_variance <- function(x, w) { if (is.null(w)) w <- rep(1, length(x)) x[is.na(w)] <- NA w[is.na(x)] <- NA w <- stats::na.omit(w) x <- stats::na.omit(x) xbar <- sum(w * x) / sum(w) sum(w * ((x - xbar)^2)) / (sum(w) - 1) } sjstats/R/re-exports.R0000644000176200001440000000070014620367120014377 0ustar liggesusers#' @importFrom performance mse #' @export performance::mse #' @importFrom performance rmse #' @export performance::rmse #' @importFrom insight link_inverse #' @export insight::link_inverse #' @importFrom datawizard weighted_sd #' @export datawizard::weighted_sd #' @importFrom datawizard weighted_mean #' @export datawizard::weighted_mean #' @importFrom datawizard weighted_median #' @export datawizard::weighted_median sjstats/R/samplesize_mixed.R0000644000176200001440000001133214620351262015634 0ustar liggesusers#' @title Sample size for linear mixed models #' @name samplesize_mixed #' #' @description Compute an approximated sample size for linear mixed models #' (two-level-designs), based on power-calculation for standard #' design and adjusted for design effect for 2-level-designs. #' #' @param eff.size Effect size. #' @param df.n Optional argument for the degrees of freedom for numerator. See 'Details'. #' @param power Power of test (1 minus Type II error probability). #' @param sig.level Significance level (Type I error probability). #' @param k Number of cluster groups (level-2-unit) in multilevel-design. #' @param n Optional, number of observations per cluster groups #' (level-2-unit) in multilevel-design. #' @param icc Expected intraclass correlation coefficient for multilevel-model. #' #' @return A list with two values: The number of subjects per cluster, and the #' total sample size for the linear mixed model. #' #' @references Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. #' #' @details The sample size calculation is based on a power-calculation for the #' standard design. If \code{df.n} is not specified, a power-calculation #' for an unpaired two-sample t-test will be computed (using #' \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package). #' If \code{df.n} is given, a power-calculation for general linear models #' will be computed (using \code{\link[pwr]{pwr.f2.test}} of the #' \pkg{pwr}-package). The sample size of the standard design #' is then adjusted for the design effect of two-level-designs (see #' \code{\link{design_effect}}). Thus, the sample size calculation is appropriate #' in particular for two-level-designs (see \cite{Snijders 2005}). Models that #' additionally include repeated measures (three-level-designs) may work #' as well, however, the computed sample size may be less accurate. #' #' @examplesIf requireNamespace("pwr") #' # Sample size for multilevel model with 30 cluster groups and a small to #' # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and #' # hence a total sample size of about 802 observations is needed. #' samplesize_mixed(eff.size = .3, k = 30) #' #' # Sample size for multilevel model with 20 cluster groups and a medium #' # to large effect size for linear models of 0.2. Five subjects per cluster and #' # hence a total sample size of about 107 observations is needed. #' samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) #' @export samplesize_mixed <- function(eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05) { if (!requireNamespace("pwr", quietly = TRUE)) { stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE) } # compute sample size for standard design if (is.null(df.n)) # if we have no degrees of freedom specified, use t-test obs <- 2 * pwr::pwr.t.test(d = eff.size, sig.level = sig.level, power = power)$n else # we have df, so power-calc for linear models obs <- pwr::pwr.f2.test(u = df.n, f2 = eff.size, sig.level = sig.level, power = power)$v + df.n + 1 # if we have no information on the number of observations per cluster, # compute this number now if (missing(n) || is.null(n)) { n <- (obs * (1 - icc)) / (k - (obs * icc)) if (n < 1) { warning("Minimum required number of subjects per cluster is negative and was adjusted to be positive. You may reduce the requirements for the multi-level structure (i.e. reduce `k` or `icc`), or you can increase the effect-size.", call. = FALSE) n <- 1 } } # adjust standard design by design effect total.n <- obs * design_effect(n = n, icc = icc) # sample size for each group and total n smpsz <- list(round(total.n / k), round(total.n)) # name list names(smpsz) <- c("Subjects per Cluster", "Total Sample Size") smpsz } #' @rdname samplesize_mixed #' @export smpsize_lmm <- samplesize_mixed sjstats/R/t_test.R0000644000176200001440000003200014623373000013564 0ustar liggesusers#' @title Student's t test #' @name t_test #' @description This function performs a Student's t test for two independent #' samples, for paired samples, or for one sample. It's a parametric test for #' the null hypothesis that the means of two independent samples are equal, or #' that the mean of one sample is equal to a specified value. The hypothesis #' can be one- or two-sided. #' #' Unlike the underlying base R function `t.test()`, this function allows for #' weighted tests and automatically calculates effect sizes. Cohen's _d_ is #' returned for larger samples (n > 20), while Hedges' _g_ is returned for #' smaller samples. #' #' @inheritParams mann_whitney_test #' @param paired Logical, whether to compute a paired t-test for dependent #' samples. #' @inherit mann_whitney_test seealso #' #' @inheritSection mann_whitney_test Which test to use #' #' @details Interpretation of effect sizes are based on rules described in #' [`effectsize::interpret_cohens_d()`] and [`effectsize::interpret_hedges_g()`]. #' Use these function directly to get other interpretations, by providing the #' returned effect size (_Cohen's d_ or _Hedges's g_ in this case) as argument, #' e.g. `interpret_cohens_d(0.35, rules = "sawilowsky2009")`. #' #' @return A data frame with test results. Effectsize Cohen's _d_ is returned #' for larger samples (n > 20), while Hedges' _g_ is returned for smaller samples. #' #' @references #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @examplesIf requireNamespace("effectsize") #' data(sleep) #' # one-sample t-test #' t_test(sleep, "extra") #' # base R equivalent #' t.test(extra ~ 1, data = sleep) #' #' # two-sample t-test, by group #' t_test(mtcars, "mpg", by = "am") #' # base R equivalent #' t.test(mpg ~ am, data = mtcars) #' #' # paired t-test #' t_test(mtcars, c("mpg", "hp"), paired = TRUE) #' # base R equivalent #' t.test(mtcars$mpg, mtcars$hp, data = mtcars, paired = TRUE) #' @export t_test <- function(data, select = NULL, by = NULL, weights = NULL, paired = FALSE, mu = 0, alternative = "two.sided") { insight::check_if_installed(c("datawizard", "effectsize")) alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater")) # sanity checks .sanitize_htest_input(data, select, by, weights, test = "t_test") data_name <- NULL # filter and remove NA data <- stats::na.omit(data[c(select, by, weights)]) # does select indicate more than one variable? We than reshape the data # to have one continous scale and one grouping variable if (length(select) > 1) { # paired? if (paired) { # subtract the two variables for paired t-test, and "set" by to NULL data[[select[1]]] <- data[[select[1]]] - data[[select[2]]] data_name <- paste(select[1], "and", select[2]) select <- select[1] by <- NULL } else { # we convert the data into long format, and create a grouping variable data <- datawizard::data_to_long( data[c(select, weights)], select = select, names_to = "group", values_to = "scale" ) by <- select[2] select <- select[1] # after converting to long, we have the "grouping" variable first in the data colnames(data) <- c(weights, by, select) } } # get data dv <- data[[select]] # for two-sample t-test... if (!is.null(by)) { grp <- data[[by]] # coerce to factor grp <- datawizard::to_factor(grp) # only two groups allowed if (insight::n_unique(grp) > 2) { insight::format_error("Only two groups are allowed for Student's t test.") # nolint } # value labels group_labels <- names(attr(data[[by]], "labels", exact = TRUE)) if (is.null(group_labels)) { group_labels <- levels(droplevels(grp)) } data_name <- paste(select, "by", by) } else { # one-sample t-test... grp <- NULL group_labels <- select if (is.null(data_name)) { data_name <- select } } if (is.null(weights)) { .calculate_ttest(dv, grp, mu, paired, alternative, group_labels, data_name) } else { .calculate_weighted_ttest(dv, grp, mu, paired, alternative, data[[weights]], group_labels, data_name) } } # Mann-Whitney-Test for two groups -------------------------------------------- .calculate_ttest <- function(dv, grp, mu, paired, alternative, group_labels, data_name) { insight::check_if_installed("effectsize") # prepare data if (is.null(grp)) { tdat <- data.frame(dv) t_formula <- stats::as.formula("dv ~ 1") } else { tdat <- data.frame(dv, grp) t_formula <- stats::as.formula("dv ~ grp") } # perfom wilcox test htest <- stats::t.test( t_formula, data = tdat, alternative = alternative, mu = mu ) test_statistic <- htest$statistic if (nrow(tdat) > 20) { effect_size <- stats::setNames( effectsize::cohens_d( t_formula, data = tdat, alternative = alternative, mu = mu )$Cohens_d, "Cohens_d" ) } else { effect_size <- stats::setNames( effectsize::hedges_g( t_formula, data = tdat, alternative = alternative, mu = mu )$Hedges_g, "Hedges_g" ) } # return result out <- data.frame( data = data_name, statistic_name = "t", statistic = test_statistic, effect_size_name = names(effect_size), effect_size = as.numeric(effect_size), p = as.numeric(htest$p.value), df = as.numeric(htest$parameter), method = ifelse(paired, "Paired t-test", htest$method), alternative = alternative, mu = mu, stringsAsFactors = FALSE ) class(out) <- c("sj_htest_t", "data.frame") attr(out, "group_labels") <- group_labels attr(out, "means") <- as.numeric(htest$estimate) attr(out, "paired") <- isTRUE(paired) attr(out, "one_sample") <- is.null(grp) attr(out, "weighted") <- FALSE if (!is.null(grp)) { attr(out, "n_groups") <- stats::setNames( c(as.numeric(table(grp))), c("N Group 1", "N Group 2") ) } out } # Weighted Mann-Whitney-Test for two groups ---------------------------------- .calculate_weighted_ttest <- function(dv, grp, mu, paired, alternative, weights, group_labels, data_name) { insight::check_if_installed(c("datawizard", "effectsize")) if (is.null(grp)) { dat <- stats::na.omit(data.frame(dv, weights)) colnames(dat) <- c("y", "w") x_values <- dat$y x_weights <- dat$w y_values <- NULL # group N's n_groups <- stats::setNames(round(sum(x_weights)), "N Group 1") } else { dat <- stats::na.omit(data.frame(dv, grp, weights)) colnames(dat) <- c("y", "g", "w") # unique groups groups <- unique(dat$g) # values for sample 1 x_values <- dat$y[dat$g == groups[1]] x_weights <- dat$w[dat$g == groups[1]] # values for sample 2 y_values <- dat$y[dat$g == groups[2]] y_weights <- dat$w[dat$g == groups[2]] # group N's n_groups <- stats::setNames( c(round(sum(x_weights)), round(sum(y_weights))), c("N Group 1", "N Group 2") ) } mu_x <- stats::weighted.mean(x_values, x_weights, na.rm = TRUE) var_x <- datawizard::weighted_sd(x_values, x_weights)^2 se_x <- sqrt(var_x / length(x_values)) if (paired || is.null(y_values)) { # paired se <- se_x dof <- length(x_values) - 1 test_statistic <- (mu_x - mu) / se estimate <- mu_x method <- if (paired) "Paired t-test" else "One Sample t-test" } else { # unpaired t-test mu_y <- stats::weighted.mean(y_values, y_weights) var_y <- datawizard::weighted_sd(y_values, y_weights)^2 se_y <- sqrt(var_y / length(y_values)) se <- sqrt(se_x^2 + se_y^2) dof <- se^4 / (se_x^4 / (length(x_values) - 1) + se_y^4 / (length(y_values) - 1)) test_statistic <- (mu_x - mu_y - mu) / se estimate <- c(mu_x, mu_y) method <- "Two-Sample t-test" } # p-values if (alternative == "less") { pval <- stats::pt(test_statistic, dof) } else if (alternative == "greater") { pval <- stats::pt(test_statistic, dof, lower.tail = FALSE) } else { pval <- 2 * stats::pt(-abs(test_statistic), dof) } # effect size dat$y <- dat$y * dat$w if (is.null(y_values)) { t_formula <- stats::as.formula("y ~ 1") } else { t_formula <- stats::as.formula("y ~ g") } if (nrow(dat) > 20) { effect_size <- stats::setNames( effectsize::cohens_d( t_formula, data = dat, alternative = alternative, mu = mu, paired = FALSE )$Cohens_d, "Cohens_d" ) } else { effect_size <- stats::setNames( effectsize::hedges_g( t_formula, data = dat, alternative = alternative, mu = mu, paired = FALSE )$Hedges_g, "Hedges_g" ) } # return result out <- data.frame( data = data_name, statistic_name = "t", statistic = test_statistic, effect_size_name = names(effect_size), effect_size = as.numeric(effect_size), p = pval, df = dof, method = method, alternative = alternative, mu = mu, stringsAsFactors = FALSE ) class(out) <- c("sj_htest_t", "data.frame") attr(out, "means") <- estimate attr(out, "n_groups") <- n_groups attr(out, "means") <- estimate attr(out, "group_labels") <- group_labels attr(out, "paired") <- isTRUE(paired) attr(out, "one_sample") <- is.null(y_values) && !isTRUE(paired) attr(out, "weighted") <- TRUE out } # methods --------------------------------------------------------------------- #' @export print.sj_htest_t <- function(x, ...) { insight::check_if_installed("effectsize") # fetch attributes group_labels <- attributes(x)$group_labels means <- attributes(x)$means n_groups <- attributes(x)$n_groups weighted <- attributes(x)$weighted paired <- isTRUE(attributes(x)$paired) one_sample <- isTRUE(attributes(x)$one_sample) if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } # same width group_labels <- format(group_labels) # header insight::print_color(sprintf("# %s%s\n\n", x$method, weight_string), "blue") # print for paired t-test if (paired) { # data insight::print_color(sprintf( " Data: %s (mean difference = %s)\n", x$data, insight::format_value(means[1], protect_integers = TRUE) ), "cyan") } else { # data insight::print_color(sprintf(" Data: %s\n", x$data), "cyan") # group-1-info if (is.null(n_groups)) { insight::print_color( sprintf( " Group 1: %s (mean = %s)\n", group_labels[1], insight::format_value(means[1], protect_integers = TRUE) ), "cyan" ) } else { insight::print_color( sprintf( " Group 1: %s (n = %i, mean = %s)\n", group_labels[1], n_groups[1], insight::format_value(means[1], protect_integers = TRUE) ), "cyan" ) } # group-2-info if (length(group_labels) > 1) { if (is.null(n_groups)) { insight::print_color( sprintf( " Group 2: %s (mean = %s)\n", group_labels[2], insight::format_value(means[2], protect_integers = TRUE) ), "cyan" ) } else { insight::print_color( sprintf( " Group 2: %s (n = %i, mean = %s)\n", group_labels[2], n_groups[2], insight::format_value(means[2], protect_integers = TRUE) ), "cyan" ) } } } # alternative hypothesis alt_string <- switch(x$alternative, two.sided = "not equal to", less = "less than", greater = "greater than" ) if (one_sample) { alt_string <- paste("true mean is", alt_string, x$mu) } else if (paired) { alt_string <- paste("true mean difference is", alt_string, x$mu) } else { alt_string <- paste("true difference in means is", alt_string, x$mu) } insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") # string for effectsizes if (x$effect_size_name == "Cohens_d") { eff_string <- sprintf( "Cohen's d = %.2f (%s effect)", x$effect_size, effectsize::interpret_cohens_d(x$effect_size) ) } else { eff_string <- sprintf( "Hedges' g = %.2f (%s effect)", x$effect_size, effectsize::interpret_hedges_g(x$effect_size) ) } cat(sprintf( "\n t = %.2f, %s, df = %s, %s\n\n", x$statistic, eff_string, insight::format_value(x$df, digits = 1, protect_integers = TRUE), insight::format_p(x$p) )) } sjstats/R/wtd_se.R0000644000176200001440000000473314620351262013566 0ustar liggesusers#' @title Weighted statistics for variables #' @name weighted_se #' @description #' `weighted_se()` computes weighted standard errors of a variable or for #' all variables of a data frame. `survey_median()` computes the median for #' a variable in a survey-design (see [`survey::svydesign()]`). #' `weighted_correlation()` computes a weighted correlation for a two-sided #' alternative hypothesis. #' #' @param x (Numeric) vector or a data frame. For `survey_median()` or `weighted_ttest()`, #' the bare (unquoted) variable name, or a character vector with the variable name. #' @param weights Bare (unquoted) variable name, or a character vector with #' the variable name of the numeric vector of weights. If `weights = NULL`, #' unweighted statistic is reported. #' @param data A data frame. #' @param formula A formula of the form `lhs ~ rhs1 + rhs2` where `lhs` is a #' numeric variable giving the data values and `rhs1` a factor with two #' levels giving the corresponding groups and `rhs2` a variable with weights. #' @param y Optional, bare (unquoted) variable name, or a character vector with #' the variable name. #' @param ci.lvl Confidence level of the interval. #' @param ... Currently not used. #' #' @inheritParams svyglm.nb #' #' @return The weighted (test) statistic. #' #' @examplesIf requireNamespace("survey") #' data(efc) #' weighted_se(efc$c12hour, abs(runif(n = nrow(efc)))) #' #' # survey_median ---- #' # median for variables from weighted survey designs #' data(nhanes_sample) #' #' des <- survey::svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' survey_median(total, des) #' survey_median("total", des) #' @export weighted_se <- function(x, weights = NULL) { UseMethod("weighted_se") } #' @export weighted_se.data.frame <- function(x, weights = NULL) { se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights) names(se_result) <- colnames(x) se_result } #' @export weighted_se.matrix <- function(x, weights = NULL) { se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights) names(se_result) <- colnames(x) se_result } #' @export weighted_se.default <- function(x, weights = NULL) { weighted_se_helper(x, weights) } weighted_se_helper <- function(x, weights) { if (is.null(weights)) weights <- rep(1, length(x)) sqrt(weighted_variance(x, weights) / length(stats::na.omit(x))) } sjstats/R/nhanes_sample.R0000644000176200001440000000150513737567502015126 0ustar liggesusers#' @docType data #' @title Sample dataset from the National Health and Nutrition Examination Survey #' @name nhanes_sample #' @keywords data #' #' @description Selected variables from the National Health and Nutrition Examination #' Survey that are used in the example from Lumley (2010), Appendix E. #' See \code{\link{svyglm.nb}} for examples. #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley NULL #' @docType data #' @title Sample dataset from the EUROFAMCARE project #' @name efc #' @keywords data #' #' @description German data set from the European study on family care of older people. #' #' @references Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008. NULL sjstats/R/Deprecated.R0000644000176200001440000000340514617352174014345 0ustar liggesusers#' @title Deprecated functions #' @name r2 #' @description A list of deprecated functions. #' #' @param x An object. #' @param ... Currently not used. #' #' @return Nothing. #' #' @export r2 <- function(x) { .Defunct("performance::r2()") performance::r2(x) } #' @rdname r2 #' @export cohens_f <- function(x, ...) { .Defunct("effectsize::cohens_f()") effectsize::cohens_f(x) } #' @rdname r2 #' @export eta_sq <- function(x, ...) { .Defunct("effectsize::eta_squared()") effectsize::eta_squared(x) } #' @rdname r2 #' @export epsilon_sq <- function(x, ...) { .Defunct("effectsize::epsilon_squared()") effectsize::epsilon_squared(x) } #' @rdname r2 #' @export omega_sq <- function(x, ...) { .Defunct("effectsize::omega_sqared()") effectsize::omega_squared(x) } #' @rdname r2 #' @export scale_weights <- function(x, ...) { .Defunct("datawizard::rescale_weights()") datawizard::rescale_weights(x, ...) } #' @rdname r2 #' @export robust <- function(x, ...) { .Defunct("parameters::standard_error()") parameters::standard_error(x, ...) } #' @rdname r2 #' @export icc <- function(x) { .Defunct("performance::icc()") performance::icc(x) } #' @rdname r2 #' @export p_value <- function(x, ...) { .Defunct("parameters::p_value()") parameters::p_value(x) } #' @rdname r2 #' @export se <- function(x, ...) { .Defunct("parameters::standard_error()") parameters::standard_error(x) } #' @rdname r2 #' @export means_by_group <- function(x, ...) { .Defunct("datawizard::means_by_group()") datawizard::means_by_group(x, ...) } #' @rdname r2 #' @export mean_n <- function(x, ...) { .Defunct("datawizard::row_means()") datawizard::row_means(x, ...) } sjstats/R/helpfunctions.R0000644000176200001440000000672014620351262015160 0ustar liggesusers# Help-functions data_frame <- function(...) { x <- data.frame(..., stringsAsFactors = FALSE) rownames(x) <- NULL x } is_merMod <- function(fit) { inherits(fit, c("lmerMod", "glmerMod", "nlmerMod", "merModLmerTest")) } .compact_character <- function(x) { x[!sapply(x, function(i) is.null(i) || !nzchar(i, keepNA = TRUE) || is.na(i) || any(i == "NULL", na.rm = TRUE))] } .format_symbols <- function(x) { if (.unicode_symbols()) { x <- gsub("Delta", "\u0394", x, ignore.case = TRUE) x <- gsub("Phi", "\u03D5", x, ignore.case = TRUE) x <- gsub("Eta", "\u03B7", x, ignore.case = TRUE) x <- gsub("Epsilon", "\u03b5", x, ignore.case = TRUE) x <- gsub("Omega", "\u03b5", x, ignore.case = TRUE) x <- gsub("R2", "R\u00b2", x, ignore.case = TRUE) x <- gsub("Chi2", "\u03C7\u00b2", x, ignore.case = TRUE) x <- gsub("Chi-squared", "\u03C7\u00b2", x, ignore.case = TRUE) x <- gsub("Chi", "\u03C7", x, ignore.case = TRUE) x <- gsub("Sigma", "\u03C3", x, ignore.case = TRUE) x <- gsub("Rho", "\u03C1", x, ignore.case = TRUE) x <- gsub("Mu", "\u03BC", x, ignore.case = TRUE) x <- gsub("Theta", "\u03B8", x, ignore.case = TRUE) x <- gsub("Fei", "\u05E4\u200E", x, ignore.case = TRUE) } x } .unicode_symbols <- function() { win_os <- tryCatch( { si <- Sys.info() if (is.null(si["sysname"])) { FALSE } else { si["sysname"] == "Windows" || startsWith(R.version$os, "mingw") } }, error = function(e) { TRUE } ) l10n_info()[["UTF-8"]] && ((win_os && getRversion() >= "4.2") || (!win_os && getRversion() >= "4.0")) } .is_pseudo_numeric <- function(x) { (is.character(x) && !anyNA(suppressWarnings(as.numeric(stats::na.omit(x[nzchar(x, keepNA = TRUE)]))))) || (is.factor(x) && !anyNA(suppressWarnings(as.numeric(levels(x))))) # nolint } .misspelled_string <- function(source, searchterm, default_message = NULL) { if (is.null(searchterm) || length(searchterm) < 1) { return(default_message) } # used for many matches more_found <- "" # init default msg <- "" # remove matching strings same <- intersect(source, searchterm) searchterm <- setdiff(searchterm, same) source <- setdiff(source, same) # guess the misspelled string possible_strings <- unlist(lapply(searchterm, function(s) { source[.fuzzy_grep(source, s)] # nolint }), use.names = FALSE) if (length(possible_strings)) { msg <- "Did you mean " if (length(possible_strings) > 1) { # make sure we don't print dozens of alternatives for larger data frames if (length(possible_strings) > 5) { more_found <- sprintf( " We even found %i more possible matches, not shown here.", length(possible_strings) - 5 ) possible_strings <- possible_strings[1:5] } msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\""))) } else { msg <- paste0(msg, "\"", possible_strings, "\"") } msg <- paste0(msg, "?", more_found) } else { msg <- default_message } # no double white space insight::trim_ws(msg) } .fuzzy_grep <- function(x, pattern, precision = NULL) { if (is.null(precision)) { precision <- round(nchar(pattern) / 3) } if (precision > nchar(pattern)) { return(NULL) } p <- sprintf("(%s){~%i}", pattern, precision) grep(pattern = p, x = x, ignore.case = FALSE) } sjstats/R/cramer.R0000644000176200001440000000235414620351262013547 0ustar liggesusers#' @rdname crosstable_statistics #' @export cramers_v <- function(tab, ...) { UseMethod("cramers_v") } #' @rdname crosstable_statistics #' @export cramer <- cramers_v #' @export cramers_v.table <- function(tab, ...) { .cramers_v(tab) } #' @export cramers_v.ftable <- function(tab, ...) { .cramers_v(tab) } #' @rdname crosstable_statistics #' @export cramers_v.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) { fterms <- all.vars(formula) tab <- table(data[[fterms[1]]], data[[fterms[2]]]) method <- match.arg(method) if (is.null(ci.lvl) || is.na(ci.lvl)) { .cramers_v(tab) } else { straps <- sjstats::bootstrap(data[fterms], n) tables <- lapply(straps$strap, function(x) { dat <- as.data.frame(x) table(dat[[1]], dat[[2]]) }) cramers <- sapply(tables, .cramers_v) ci <- boot_ci(cramers, ci.lvl = ci.lvl, method = method) data_frame( cramer = .cramers_v(tab), conf.low = ci$conf.low, conf.high = ci$conf.high ) } } .cramers_v <- function(tab) { # convert to flat table if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) sqrt(phi(tab)^2 / min(dim(tab) - 1)) } sjstats/R/boot_ci.R0000644000176200001440000001361414620351262013715 0ustar liggesusers#' @title Standard error and confidence intervals for bootstrapped estimates #' @name boot_ci #' #' @description Compute nonparametric bootstrap estimate, standard error, #' confidence intervals and p-value for a vector of bootstrap #' replicate estimates. #' #' @param data A data frame that containts the vector with bootstrapped #' estimates, or directly the vector (see 'Examples'). #' @param ci.lvl Numeric, the level of the confidence intervals. #' @param select Optional, unquoted names of variables (as character vector) #' with bootstrapped estimates. Required, if either `data` is a data frame #' (and no vector), and only selected variables from `data` should be processed. #' @param method Character vector, indicating if confidence intervals should be #' based on bootstrap standard error, multiplied by the value of the quantile #' function of the t-distribution (default), or on sample quantiles of the #' bootstrapped values. See 'Details' in `boot_ci()`. May be abbreviated. #' #' @return A data frame with either bootstrap estimate, standard error, the #' lower and upper confidence intervals or the p-value for all bootstrapped #' estimates. #' #' @details The methods require one or more vectors of bootstrap replicate #' estimates as input. #' #' - `boot_est()`: returns the bootstrapped estimate, simply by computing #' the mean value of all bootstrap estimates. #' - `boot_se()`: computes the nonparametric bootstrap standard error by #' calculating the standard deviation of the input vector. #' - The mean value of the input vector and its standard error is used by #' `boot_ci()` to calculate the lower and upper confidence interval, #' assuming a t-distribution of bootstrap estimate replicates (for #' `method = "dist"`, the default, which is #' `mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)`); for #' `method = "quantile"`, 95\% sample quantiles are used to compute the #' confidence intervals (`quantile(x, probs = c(0.025, 0.975))`). Use #' `ci.lvl` to change the level for the confidence interval. #' - P-values from `boot_p()` are also based on t-statistics, assuming normal #' distribution. #' #' @references Carpenter J, Bithell J. Bootstrap confdence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164 #' #' @seealso []`bootstrap()`] to generate nonparametric bootstrap samples. #' #' @examples #' data(efc) #' bs <- bootstrap(efc, 100) #' #' # now run models for each bootstrapped sample #' bs$models <- lapply( #' bs$strap, #' function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x) #' ) #' #' # extract coefficient "dependency" and "gender" from each model #' bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1)) #' bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1)) #' #' # get bootstrapped confidence intervals #' boot_ci(bs$dependency) #' #' # compare with model fit #' fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc) #' confint(fit)[2, ] #' #' # alternative function calls. #' boot_ci(bs$dependency) #' boot_ci(bs, "dependency") #' boot_ci(bs, c("dependency", "gender")) #' boot_ci(bs, c("dependency", "gender"), method = "q") #' #' #' # compare coefficients #' mean(bs$dependency) #' boot_est(bs$dependency) #' coef(fit)[2] #' @export boot_ci <- function(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) { # match arguments method <- match.arg(method) # evaluate arguments, generate data if (is.null(select)) { .dat <- as.data.frame(data) } else { .dat <- data[select] } # compute confidence intervals for all values transform_boot_result(lapply(.dat, function(x) { # check if method should be based on t-distribution of # bootstrap values or quantiles if (method == "dist") { # get bootstrap standard error bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = TRUE) # lower and upper confidence interval ci <- mean(x, na.rm = TRUE) + c(-bootse, bootse) } else { # CI based on quantiles of bootstrapped values ci <- stats::quantile(x, probs = c((1 - ci.lvl) / 2, (1 + ci.lvl) / 2)) } # give proper names names(ci) <- c("conf.low", "conf.high") ci })) } #' @rdname boot_ci #' @export boot_se <- function(data, select = NULL) { # evaluate arguments, generate data if (is.null(select)) { .dat <- as.data.frame(data) } else { .dat <- data[select] } # compute confidence intervalls for all values transform_boot_result(lapply(.dat, function(x) { # get bootstrap standard error se <- stats::sd(x, na.rm = TRUE) names(se) <- "std.err" se })) } #' @rdname boot_ci #' @export boot_p <- function(data, select = NULL) { # evaluate arguments, generate data if (is.null(select)) { .dat <- as.data.frame(data) } else { .dat <- data[select] } # compute confidence intervalls for all values transform_boot_result(lapply(.dat, function(x) { # compute t-statistic t.stat <- mean(x, na.rm = TRUE) / stats::sd(x, na.rm = TRUE) # compute p-value p <- 2 * stats::pt(abs(t.stat), df = length(x) - 1, lower.tail = FALSE) names(p) <- "p.value" p })) } #' @rdname boot_ci #' @export boot_est <- function(data, select = NULL) { # evaluate arguments, generate data if (is.null(select)) { .dat <- as.data.frame(data) } else { .dat <- data[select] } # compute mean for all values (= bootstrapped estimate) transform_boot_result(lapply(.dat, function(x) { estimate <- mean(x, na.rm = TRUE) names(estimate) <- "estimate" estimate })) } transform_boot_result <- function(res) { # transform a bit, so we have each estimate in a row, and ci's as columns... rownames_as_column(as.data.frame(t(as.data.frame(res))), var = "term") } sjstats/R/weight.R0000644000176200001440000000623514620351262013567 0ustar liggesusers#' @title Weight a variable #' @name weight #' @description These functions weight the variable \code{x} by #' a specific vector of \code{weights}. #' #' @param x (Unweighted) variable. #' @param weights Vector with same length as \code{x}, which #' contains weight factors. Each value of \code{x} has a #' specific assigned weight in \code{weights}. #' @param digits Numeric value indicating the number of decimal places to be #' used for rounding the weighted values. By default, this value is #' \code{0}, i.e. the returned values are integer values. #' #' @return The weighted \code{x}. #' #' @details \code{weight2()} sums up all \code{weights} values of the associated #' categories of \code{x}, whereas \code{weight()} uses a #' \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()} #' may return a vector of different length than \code{x}. #' #' @note The values of the returned vector are in sorted order, whereas the values' #' order of the original \code{x} may be spread randomly. Hence, \code{x} can't be #' used, for instance, for further cross tabulation. In case you want to have #' weighted contingency tables or (grouped) box plots etc., use the \code{weightBy} #' argument of most functions. #' #' @examples #' v <- sample(1:4, 20, TRUE) #' table(v) #' w <- abs(rnorm(20)) #' table(weight(v, w)) #' table(weight2(v, w)) #' #' set.seed(1) #' x <- sample(letters[1:5], size = 20, replace = TRUE) #' w <- runif(n = 20) #' #' table(x) #' table(weight(x, w)) #' #' @export weight <- function(x, weights, digits = 0) { # remember if x is numeric x.is.num <- is.numeric(x) # init values weightedvar <- c() wtab <- round(stats::xtabs(weights ~ x, data = data.frame(weights = weights, x = x), na.action = stats::na.pass, exclude = NULL), digits = digits) # iterate all table values for (w in seq_len(length(wtab))) { # retrieve count of each table cell w_count <- wtab[[w]] # retrieve "cell name" which is identical to the variable value # first check whether values are numeric or not nval_ <- suppressWarnings(as.numeric(names(wtab[w]))) # if value is not numeric, use as is if (is.na(nval_)) w_value <- names(wtab[w]) else # else, use numeric value w_value <- nval_ # append variable value, repeating it "w_count" times. weightedvar <- c(weightedvar, rep(w_value, w_count)) } # if we have NA values, weighted var is coerced to character. # coerce back to numeric then here if (!is.numeric(weightedvar) && x.is.num) { weightedvar <- datawizard::to_numeric(weightedvar, dummy_factors = FALSE) } # return result weightedvar } #' @rdname weight #' @export weight2 <- function(x, weights) { items <- unique(x) newvar <- c() for (i in seq_len(length(items))) { newcount <- round(sum(weights[which(x == items[i])])) newvar <- c(newvar, rep(items[i], newcount)) } newvar } sjstats/R/se_ybar.R0000644000176200001440000000312114620351262013713 0ustar liggesusers#' @title Standard error of sample mean for mixed models #' @name se_ybar #' #' @description Compute the standard error for the sample mean for mixed models, #' regarding the extent to which clustering affects the standard errors. #' May be used as part of the multilevel power calculation for cluster sampling #' (see \cite{Gelman and Hill 2007, 447ff}). #' #' @param fit Fitted mixed effects model (\code{\link[lme4]{merMod}}-class). #' #' @return The standard error of the sample mean of \code{fit}. #' #' @references Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. Cambridge, New York: Cambridge University Press #' #' @examplesIf require("lme4") #' fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy) #' se_ybar(fit) #' @export se_ybar <- function(fit) { # get model icc vars <- insight::get_variance(fit, verbose = FALSE) # get group variances tau.00 <- unname(vars$var.intercept) # total variance tot_var <- sum(tau.00, vars$var.residual) # get number of groups m.cnt <- vapply(fit@flist, nlevels, 1) # compute number of observations per group (level-2-unit) obs <- round(stats::nobs(fit) / m.cnt) # compute simple icc icc <- tau.00 / tot_var # compute standard error of sample mean se <- unlist(lapply(seq_len(length(m.cnt)), function(.x) { sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x])) })) # give names for se, so user sees, which random effect has what impact names(se) <- names(m.cnt) se } sjstats/R/xtab_statistics.R0000644000176200001440000002240214620351262015502 0ustar liggesusers#' @title Measures of association for contingency tables #' @name crosstable_statistics #' #' @description This function calculates various measure of association for #' contingency tables and returns the statistic and p-value. #' Supported measures are Cramer's V, Phi, Spearman's rho, #' Kendall's tau and Pearson's r. #' #' @param data A data frame or a table object. If a table object, `x1` and #' `x2` will be ignored. For Kendall's _tau_, Spearman's _rho_ or Pearson's #' product moment correlation coefficient, `data` needs to be a data frame. #' If `x1` and `x2` are not specified, the first two columns of the data #' frames are used as variables to compute the crosstab. #' @param formula A formula of the form `lhs ~ rhs` where `lhs` is a #' numeric variable giving the data values and `rhs` a factor giving the #' corresponding groups. #' @param tab A [`table()`] or [`ftable()`]. Tables of class [`xtabs()`] and #' other will be coerced to `ftable` objects. #' @param x1 Name of first variable that should be used to compute the #' contingency table. If `data` is a table object, this argument will be #' irgnored. #' @param x2 Name of second variable that should be used to compute the #' contingency table. If `data` is a table object, this argument will be #' irgnored. #' @param statistics Name of measure of association that should be computed. May #' be one of `"auto"`, `"cramer"`, `"phi"`, `"spearman"`, `"kendall"`, #' `"pearson"` or `"fisher"`. See 'Details'. #' @param ci.lvl Scalar between 0 and 1. If not `NULL`, returns a data #' frame including lower and upper confidence intervals. #' @param weights Name of variable in `x` that indicated the vector of weights #' that will be applied to weight all observations. Default is `NULL`, so no #' weights are used. #' @param ... Other arguments, passed down to the statistic functions #' [`chisq.test()`], [`fisher.test()`] or [`cor.test()`]. #' #' @inheritParams bootstrap #' @inheritParams boot_ci #' #' @return For [`phi()`], the table's Phi value. For [`cramers_v()]`, the #' table's Cramer's V. #' #' For `crosstable_statistics()`, a list with following components: #' #' - `estimate`: the value of the estimated measure of association. #' - `p.value`: the p-value for the test. #' - `statistic`: the value of the test statistic. #' - `stat.name`: the name of the test statistic. #' - `stat.html`: if applicable, the name of the test statistic, in HTML-format. #' - `df`: the degrees of freedom for the contingency table. #' - `method`: character string indicating the name of the measure of association. #' - `method.html`: if applicable, the name of the measure of association, in HTML-format. #' - `method.short`: the short form of association measure, equals the `statistics`-argument. #' - `fisher`: logical, if Fisher's exact test was used to calculate the p-value. #' #' @details The p-value for Cramer's V and the Phi coefficient are based #' on `chisq.test()`. If any expected value of a table cell is smaller than 5, #' or smaller than 10 and the df is 1, then `fisher.test()` is used to compute #' the p-value, unless `statistics = "fisher"`; in this case, the use of #' `fisher.test()` is forced to compute the p-value. The test statistic is #' calculated with `cramers_v()` resp. `phi()`. #' #' Both test statistic and p-value for Spearman's rho, Kendall's tau and #' Pearson's r are calculated with `cor.test()`. #' #' When `statistics = "auto"`, only Cramer's V or Phi are calculated, based on #' the dimension of the table (i.e. if the table has more than two rows or #' columns, Cramer's V is calculated, else Phi). #' #' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., #' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data #' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. #' \doi{10.3390/math11091982} #' #' @examples #' # Phi coefficient for 2x2 tables #' tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) #' phi(tab) #' #' # Cramer's V for nominal variables with more than 2 categories #' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) #' cramer(tab) #' #' # formula notation #' data(efc) #' cramer(e16sex ~ c161sex, data = efc) #' #' # bootstrapped confidence intervals #' cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100) #' #' # 2x2 table, compute Phi automatically #' crosstable_statistics(efc, e16sex, c161sex) #' #' # more dimensions than 2x2, compute Cramer's V automatically #' crosstable_statistics(efc, c172code, c161sex) #' #' # ordinal data, use Kendall's tau #' crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall") #' #' # calcilate Spearman's rho, with continuity correction #' crosstable_statistics(efc, #' e42dep, #' quol_5, #' statistics = "spearman", #' exact = FALSE, #' continuity = TRUE #' ) #' @export crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ...) { # match arguments statistics <- match.arg(statistics) # name for test statistics in HTML stat.html <- NULL # check if data is a table if (is.table(data)) { # 'data' is a table - copy to table object tab <- data # check if statistics are possible to compute if (statistics %in% c("spearman", "kendall", "pearson")) { stop( sprintf( "Need arguments `data`, `x1` and `x2` to compute %s-statistics.", statistics ), call. = FALSE ) } } else { # evaluate unquoted names x1 <- deparse(substitute(x1)) x2 <- deparse(substitute(x2)) weights <- deparse(substitute(weights)) # if names were quotes, remove quotes x1 <- gsub("\"", "", x1, fixed = TRUE) x2 <- gsub("\"", "", x2, fixed = TRUE) weights <- gsub("\"", "", weights, fixed = TRUE) if (insight::is_empty_object(weights) || weights == "NULL") weights <- NULL else weights <- data[[weights]] # check for "NULL" and get data if (x1 != "NULL" && x2 != "NULL") data <- data[, c(x1, x2)] else data <- data[, 1:2] if (!is.null(weights)) data <- cbind(data, weights) # make table if (!is.null(weights)) { tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]]))) class(tab) <- "table" } else { tab <- table(data) } } # get expected values tab.val <- table_values(tab) # remember whether fisher's exact test was used or not use.fisher <- FALSE # select statistics automatically, based on number of rows/columns if (statistics %in% c("auto", "cramer", "phi", "fisher")) { # get chisq-statistics, for df and p-value chsq <- suppressWarnings(stats::chisq.test(tab, ...)) pv <- chsq$p.value test <- chsq$statistic # set statistics name names(test) <- "Chi-squared" stat.html <- "χ2" # check row/columns if ((nrow(tab) > 2 || ncol(tab) > 2 || statistics %in% c("cramer", "fisher")) && statistics != "phi") { # get cramer's V s <- cramer(tab) # if minimum expected values below 5, compute fisher's exact test if (statistics == "fisher" || min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) { pv <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)$p.value use.fisher <- TRUE } # set statistics statistics <- "cramer" } else { # get Phi s <- phi(tab) # if minimum expected values below 5 and df=1, compute fisher's exact test if (min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) { pv <- stats::fisher.test(tab, ...)$p.value use.fisher <- TRUE } # set statistics statistics <- "phi" } } else { # compute correlation coefficient cv <- stats::cor.test(x = data[[1]], y = data[[2]], method = statistics, ...) # get statistics and p-value s <- cv$estimate pv <- cv$p.value test <- cv$statistic stat.html <- names(test) } # compute method string method <- ifelse(statistics == "kendall", "Kendall's tau", ifelse(statistics == "spearman", "Spearman's rho", # nolint ifelse(statistics == "pearson", "Pearson's r", # nolint ifelse(statistics == "cramer", "Cramer's V", "Phi") # nolint ) ) ) # compute method string method.html <- ifelse(statistics == "kendall", "Kendall's τ", ifelse(statistics == "spearman", "Spearman's ρ", # nolint ifelse(statistics == "pearson", "Pearson's r", # nolint ifelse(statistics == "cramer", "Cramer's V", "&phi") # nolint ) ) ) # return result structure(class = "sj_xtab_stat", list( estimate = s, p.value = pv, statistic = test, stat.name = names(test), stat.html = stat.html, df = (nrow(tab) - 1) * (ncol(tab) - 1), n_obs = sum(tab, na.rm = TRUE), method = method, method.html = method.html, method.short = statistics, fisher = use.fisher )) } #' @rdname crosstable_statistics #' @export xtab_statistics <- crosstable_statistics sjstats/R/sjStatistics.R0000644000176200001440000000352514402055204014761 0ustar liggesusers#' @title Expected and relative table values #' @name table_values #' @description This function calculates a table's cell, row and column percentages as #' well as expected values and returns all results as lists of tables. #' #' @param tab Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which #' cell, row and column percentages as well as expected values are calculated. #' Tables of class \code{\link[stats]{xtabs}} and other will be coerced to #' \code{ftable} objects. #' @param digits Amount of digits for the table percentage values. #' #' @return (Invisibly) returns a list with four tables: #' \enumerate{ #' \item \code{cell} a table with cell percentages of \code{tab} #' \item \code{row} a table with row percentages of \code{tab} #' \item \code{col} a table with column percentages of \code{tab} #' \item \code{expected} a table with expected values of \code{tab} #' } #' #' @examples #' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) #' # show expected values #' table_values(tab)$expected #' # show cell percentages #' table_values(tab)$cell #' #' @export table_values <- function(tab, digits = 2) { # convert to ftable object if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) tab.cell <- round(100 * prop.table(tab), digits) tab.row <- round(100 * prop.table(tab, 1), digits) tab.col <- round(100 * prop.table(tab, 2), digits) tab.expected <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # return results invisible(structure(class = "sjutablevalues", list(cell = tab.cell, row = tab.row, col = tab.col, expected = tab.expected))) } sjstats/R/inequ_trends.R0000644000176200001440000001002314620351262014766 0ustar liggesusers#' @title Compute trends in status inequalities #' @name inequ_trend #' #' @description This method computes the proportional change of absolute #' (rate differences) and relative (rate ratios) inequalities #' of prevalence rates for two different status groups, as proposed #' by Mackenbach et al. (2015). #' #' @param data A data frame that contains the variables with prevalence rates for both low #' and high status groups (see 'Examples'). #' @param prev.low The name of the variable with the prevalence rates for #' the low status groups. #' @param prev.hi The name of the variable with the prevalence rates for #' the hi status groups. #' #' @return A data frame with the prevalence rates as well as the values for the #' proportional change in absolute (\code{rd}) and relative (\code{rr}) #' ineqqualities. #' #' @references Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018} #' #' @details Given the time trend of prevalence rates of an outcome for two status #' groups (e.g. the mortality rates for people with lower and higher #' socioeconomic status over 40 years), this function computes the #' proportional change of absolute and relative inequalities, expressed #' in changes in rate differences and rate ratios. The function implements #' the algorithm proposed by \emph{Mackenbach et al. 2015}. #' #' @examplesIf requireNamespace("ggplot2") #' # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5 #' #' # 40 simulated time points, with an initial rate ratio of 2 and #' # a rate difference of 100 (i.e. low status group starts with a #' # prevalence rate of 200, the high status group with 100) #' #' # annual decline of prevalence is 1% for the low, and 3% for the #' # high status group #' #' n <- 40 #' time <- seq(1, n, by = 1) #' lo <- rep(200, times = n) #' for (i in 2:n) lo[i] <- lo[i - 1] * .99 #' #' hi <- rep(100, times = n) #' for (i in 2:n) hi[i] <- hi[i - 1] * .97 #' #' prev.data <- data.frame(lo, hi) #' #' # print values #' inequ_trend(prev.data, "lo", "hi") #' #' # plot trends - here we see that the relative inequalities #' # are increasing over time, while the absolute inequalities #' # are first increasing as well, but later are decreasing #' # (while rel. inequ. are still increasing) #' plot(inequ_trend(prev.data, "lo", "hi")) #' #' @export inequ_trend <- function(data, prev.low, prev.hi) { # prepare data for prevalence rates for low and hi status groups if (is.null(data) || missing(data)) { dat <- data.frame(prev.low, prev.hi) } else { dat <- data[c(prev.low, prev.hi)] } # ensure common column names colnames(dat) <- c("lo", "hi") # trends in rate ratios # compute relative inequality for first time point, needed # as reference to compute proportional change over time dat$rr <- dat$lo[1] / dat$hi[1] # compute proportional change of relative inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rr[t] <- dat$rr[t - 1] * ((1 + delta.low) / (1 + delta.hi)) } # trends in rate difference # compute absolute inequality for first time point, needed # as reference to compute proportional change over time dat$rd <- dat$lo[1] - dat$hi[1] # compute proportional change of absolute inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1] * delta.low - dat$hi[t - 1] * delta.hi) } # return structure(class = "sj_inequ_trend", list(data = dat)) } sjstats/R/gmd.R0000644000176200001440000000263614620351262013050 0ustar liggesusers#' @title Gini's Mean Difference #' @name gmd #' @description `gmd()` computes Gini's mean difference for a numeric vector #' or for all numeric vectors in a data frame. #' #' @param x A vector or data frame. #' @param select Optional, names of variables as character vector that should be #' selected for further processing. Required, if `x` is a data frame (and no vector) #' and only selected variables from `x` should be processed. #' #' @return For numeric vectors, Gini's mean difference. For non-numeric vectors #' or vectors of length < 2, returns `NA`. #' #' @note Gini's mean difference is defined as the mean absolute difference between #' any two distinct elements of a vector. Missing values from `x` are silently #' removed. #' #' @references David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575 #' #' @examples #' data(efc) #' gmd(efc$e17age) #' gmd(efc, c("e17age", "c160age", "c12hour")) #' #' @export gmd <- function(x, select = NULL) { if (is.data.frame(x)) { do.call(rbind, lapply(select, function(i) { data.frame( variable = i, gmd = gmd_helper(x[[i]]) ) })) } else { gmd_helper(x) } } gmd_helper <- function(x) { if (!is.numeric(x)) return(NA) x <- stats::na.omit(x) n <- length(x) if (n < 2) return(NA) w <- 4 * ((1:n) - (n - 1) / 2) / n / (n - 1) sum(w * sort(x - mean(x))) } sjstats/R/wilcoxon_test.R0000644000176200001440000001675314620351262015207 0ustar liggesusers#' @title Wilcoxon rank sum test #' @name wilcoxon_test #' @description This function performs Wilcoxon rank sum tests for one sample #' or for two _paired_ (dependent) samples. For _unpaired_ (independent) #' samples, please use the `mann_whitney_test()` function. #' #' A Wilcoxon rank sum test is a non-parametric test for the null hypothesis #' that two samples have identical continuous distributions. The implementation #' in `wilcoxon_test()` is only used for _paired_, i.e. _dependent_ samples. For #' independent (unpaired) samples, use `mann_whitney_test()`. #' #' `wilcoxon_test()` can be used for ordinal scales or when the continuous #' variables are not normally distributed. For large samples, or approximately #' normally distributed variables, the `t_test()` function can be used (with #' `paired = TRUE`). #' #' @inheritParams mann_whitney_test #' @inherit mann_whitney_test seealso #' #' @inheritSection mann_whitney_test Which test to use #' #' @return A data frame with test results. The function returns p and Z-values #' as well as effect size r and group-rank-means. #' #' @references #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @examplesIf requireNamespace("coin") #' data(mtcars) #' # one-sample test #' wilcoxon_test(mtcars, "mpg") #' # base R equivalent, we set exact = FALSE to avoid a warning #' wilcox.test(mtcars$mpg ~ 1, exact = FALSE) #' #' # paired test #' wilcoxon_test(mtcars, c("mpg", "hp")) #' # base R equivalent, we set exact = FALSE to avoid a warning #' wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE) #' #' # when `by` is specified, each group must be of same length #' data(iris) #' d <- iris[iris$Species != "setosa", ] #' wilcoxon_test(d, "Sepal.Width", by = "Species") #' @export wilcoxon_test <- function(data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ...) { insight::check_if_installed("datawizard") alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater")) # sanity checks .sanitize_htest_input(data, select, by, weights, test = "wilcoxon_test") # alternative only if weights are NULL if (!is.null(weights) && alternative != "two.sided") { insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.") } # for paired two-sample, do groups all have same length? if (!is.null(by)) { group_len <- as.numeric(table(as.vector(data[[by]]))) if (!all(group_len == group_len[1])) { insight::format_error("For paired two-sample Wilcoxon test, all groups specified in `by` must have the same length.") # nolint } # convert to wide format out <- split(data[select], as.character(data[[by]])) data <- stats::setNames(do.call(cbind, out), names(out)) select <- colnames(data) } # value labels group_labels <- select x <- data[[select[1]]] if (length(select) > 1) { y <- data[[select[2]]] } else { y <- NULL } if (is.null(weights)) { .calculate_wilcox(x, y, alternative, mu, group_labels, ...) } else { .calculate_weighted_mwu(x, y, data[[weights]], group_labels) } } # Mann-Whitney-Test for two groups -------------------------------------------- .calculate_wilcox <- function(x, y, alternative, mu, group_labels, ...) { insight::check_if_installed("coin") # for paired Wilcoxon test, we have effect sizes if (!is.null(y)) { # prepare data wcdat <- data.frame(x, y) # perfom wilcox test wt <- coin::wilcoxsign_test(x ~ y, data = wcdat) # compute statistics u <- as.numeric(coin::statistic(wt, type = "linear")) z <- as.numeric(coin::statistic(wt, type = "standardized")) r <- abs(z / sqrt(nrow(wcdat))) } else { wt <- u <- z <- r <- NULL } # prepare data if (is.null(y)) { dv <- x } else { dv <- x - y } htest <- suppressWarnings(stats::wilcox.test( dv ~ 1, alternative = alternative, mu = mu, ... )) v <- htest$statistic p <- htest$p.value out <- data.frame( group1 = group_labels[1], v = v, p = as.numeric(p), mu = mu, alternative = alternative ) # two groups? if (length(group_labels) > 1) { out$group2 <- group_labels[2] } # add effectsizes, when we have if (!is.null(wt)) { out$u <- u out$z <- z out$r <- r } attr(out, "group_labels") <- group_labels attr(out, "method") <- "wilcoxon" attr(out, "weighted") <- FALSE attr(out, "one_sample") <- length(group_labels) == 1 class(out) <- c("sj_htest_wilcox", "data.frame") out } # Weighted Mann-Whitney-Test for two groups ---------------------------------- .calculate_weighted_wilcox <- function(x, y, weights, group_labels) { # check if pkg survey is available insight::check_if_installed("survey") # prepare data if (is.null(y)) { dv <- x } else { dv <- x - y } dat <- stats::na.omit(data.frame(dv, weights)) colnames(dat) <- c("y", "w") design <- survey::svydesign(ids = ~0, data = dat, weights = ~w) result <- survey::svyranktest(formula = y ~ 1, design, test = "wilcoxon") # statistics and effect sizes z <- result$statistic r <- abs(z / sqrt(nrow(dat))) out <- data_frame( group1 = group_labels[1], estimate = result$estimate, z = z, r = r, p = as.numeric(result$p.value), mu = 0, alternative = "two.sided" ) # two groups? if (length(group_labels) > 1) { out$group2 <- group_labels[2] } attr(out, "group_labels") <- group_labels attr(out, "weighted") <- TRUE attr(out, "one_sample") <- length(group_labels) == 1 attr(out, "method") <- "wilcoxon" class(out) <- c("sj_htest_wilcox", "data.frame") out } # methods --------------------------------------------------------------------- #' @export print.sj_htest_wilcox <- function(x, ...) { # fetch attributes group_labels <- attributes(x)$group_labels weighted <- attributes(x)$weighted one_sample <- attributes(x)$one_sample if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } if (one_sample) { onesample_string <- "One Sample" } else { onesample_string <- "Paired" } # same width group_labels <- format(group_labels) # header insight::print_color(sprintf( "# %s Wilcoxon signed rank test%s\n\n", onesample_string, weight_string ), "blue") # alternative hypothesis if (!is.null(x$alternative) && !is.null(x$mu)) { alt_string <- switch(x$alternative, two.sided = "not equal to", less = "less than", greater = "greater than" ) alt_string <- paste("true location shift is", alt_string, x$mu) insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") } if (!is.null(x[["v"]])) { v_stat <- sprintf("V = %i, ", round(x$v)) } else { v_stat <- "" } if (!is.null(x[["r"]])) { cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", v_stat, x$r, x$z, insight::format_p(x$p))) } else { cat(sprintf("\n %s%s\n\n", v_stat, insight::format_p(x$p))) } } sjstats/R/var_pop.R0000644000176200001440000000276214620351262013747 0ustar liggesusers#' @title Calculate population variance and standard deviation #' @name var_pop #' @description Calculate the population variance or standard deviation of a vector. #' #' @param x (Numeric) vector. #' #' @return The population variance or standard deviation of \code{x}. #' #' @details Unlike \code{\link[stats]{var}}, which returns the sample variance, #' \code{var_pop()} returns the population variance. \code{sd_pop()} #' returns the standard deviation based on the population variance. #' #' @examples #' data(efc) #' #' # sampling variance #' var(efc$c12hour, na.rm = TRUE) #' # population variance #' var_pop(efc$c12hour) #' #' # sampling sd #' sd(efc$c12hour, na.rm = TRUE) #' # population sd #' sd_pop(efc$c12hour) #' @export var_pop <- function(x) { insight::check_if_installed("datawizard") # check for categorical if (is.factor(x)) { # only allow numeric factors if (!.is_pseudo_numeric(x)) { insight::format_error("`x` must be numeric vector or a factor with numeric levels.") } # convert factor to numeric x <- datawizard::to_numeric(x, dummy_factors = FALSE) } # remove NA x <- stats::na.omit(x) n <- length(x) # population variance stats::var(x) * ((n - 1) / n) } #' @rdname var_pop #' @export sd_pop <- function(x) { # get population variance pv <- var_pop(x) # factors with non-numeric level return NULL if (!is.null(pv) && !is.na(pv)) sqrt(pv) else NA } sjstats/R/svyglmnb.R0000644000176200001440000001161714620351262014141 0ustar liggesusersutils::globalVariables("scaled.weights") #' @title Survey-weighted negative binomial generalised linear model #' @name svyglm.nb #' @description \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package #' to fit survey-weighted negative binomial models. It uses #' \code{\link[survey]{svymle}} to fit sampling-weighted #' maximum likelihood estimates, based on starting values provided #' by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley #' (2010, pp249)}. #' #' #' @param formula An object of class \code{formula}, i.e. a symbolic description #' of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}. #' @param design An object of class \code{\link[survey]{svydesign}}, providing #' a specification of the survey design. #' @param ... Other arguments passed down to \code{\link[MASS]{glm.nb}}. #' #' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, #' with some additional information about the model. #' #' @details For details on the computation method, see Lumley (2010), Appendix E #' (especially 254ff.) #' \cr \cr #' \pkg{sjstats} implements following S3-methods for \code{svyglm.nb}-objects: #' \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, #' \code{predict()} and \code{residuals()}. However, these functions have some #' limitations: #' \itemize{ #' \item{\code{family()} simply returns the family-object from the #' underlying \code{\link[MASS]{glm.nb}}-model.} #' \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model #' with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} #' from this model-object with the coefficients from the returned #' \code{\link[survey]{svymle}}-object and finally calls #' \code{\link[stats]{predict.glm}} to compute the predicted values.} #' \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with #' \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals #' from the \code{glm.nb}-object.} #' } #' #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley #' #' @examples #' # ------------------------------------------ #' # This example reproduces the results from #' # Lumley 2010, figure E.7 (Appendix E, p256) #' # ------------------------------------------ #' if (require("survey")) { #' data(nhanes_sample) #' #' # create survey design #' des <- svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' #' # fit negative binomial regression #' fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) #' #' # print coefficients and standard errors #' fit #' } #' @export svyglm.nb <- function(formula, design, ...) { insight::check_if_installed(c("survey", "MASS")) # get design weights. we need to scale these weights for the glm.nb() function dw <- stats::weights(design) # update design with scaled weights design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE)) # fit negative binomial model, with scaled design weights mod <- MASS::glm.nb(formula, data = stats::model.frame(design), weights = scaled.weights, ...) fam <- stats::family(mod) # fit survey model, using maximum likelihood estimation svyfit <- survey::svymle( loglike = sjstats_loglik, grad = sjstats_score, design = design, formulas = list(theta = ~1, eta = formula), start = c(mod$theta, stats::coef(mod)), na.action = "na.omit" ) # add additoinal information class(svyfit) <- c("svyglm.nb", class(svyfit)) attr(svyfit, "nb.terms") <- all.vars(formula) attr(svyfit, "nb.formula") <- formula attr(svyfit, "family") <- fam attr(svyfit, "nb.theta") <- mod[["theta"]] attr(svyfit, "nb.theta.se") <- mod[["SE.theta"]] svyfit$deviance <- mod$deviance svyfit$df.residuals <- mod$df.residuals svyfit$df <- length(stats::coef(mod)) + 1 svyfit$aic <- mod$aic svyfit } # log-likelihood function used in "svymle()" sjstats_loglik <- function(y, theta, eta) { mu <- exp(eta) return( lgamma(theta + y) - lgamma(theta) - lgamma(y + 1) + theta * log(theta) + y * log(mu + (y == 0)) - (theta + y) * log(theta + mu) ) } # derivative sjstats_deta <- function(y, theta, eta) { mu <- exp(eta) dmu <- y / mu - (theta + y) / (theta + mu) dmu * mu } # derivative sjstats_dtheta <- function(y, theta, eta) { mu <- exp(eta) digamma(theta + y) - digamma(theta) + log(theta) + 1 - log(theta + mu) - (y + theta) / (mu + theta) } # score function, combines derivatives sjstats_score <- function(y, theta, eta) { cbind(sjstats_dtheta(y, theta,eta), sjstats_deta(y, theta, eta)) } sjstats/data/0000755000176200001440000000000013737567502012675 5ustar liggesuserssjstats/data/efc.RData0000644000176200001440000004213413563265750014350 0ustar liggesusers՝]qߏIJ9 1*TRhjIlHI)AIˡ. ÿ( p/gι V_3ܻKw{{򉧺'ϧ?.|rsXq{?9/ pӍ~<=|o11}z G 7~g9Pv^1r<371?Wü+1y5KA FO> }z6̼cz'ZcB?AS|n<_#}DXK?3UӍaEG||] |+w< A#:;wgc="˾9GUҏqƥ_zKBn#ZW?6<_Q/`ޛh ~艾_"e܏OyѮ71ۀE@# :_ A#K q?x9/{s{ u`a܈@z^MSqsK#O1ޫh˟T1_H/z^釶zM?qߒCr)owcݠ/OET_}ճʺ'fK鳀u81ƕW*QrI^': <1X2?'ᾥy)=~3?Yw=v'/bu\puR;_A .\>E)oH?W/E:G[+u~-d?׸W(uR?!}|:W?-swGWP/;U|kS?\RP柕2x.O|y;=q!wCyP5@/> Aj?ެKXKq8к>SYgJWЯ~+qd\K{ ^tq<ȼsYi\tY${J ^J: ΒCr3I?kS1#7|mٝ 9#zzu%_=S?1{s}y?I/8[xCGXǯοf!A~k|n>on>:v6[w;l]W\v`C7^̍L_Զ|kV@Jyj嶰7Vg?/l?fgW`oIwuH;.k!;;%kGx2N0?'puvz6`I^uwߟM7<0VA7rvj7K|&~ .6o&H?v=̷I?ۼ}7Od$/+`~>o#:\?3-7Q>g`nѿB?f2l5nƃf2щtun?ߏۡ?ߣ 4 q)Xv[</;y-{+}26%g7 ?|Y?ܻ:9\morT==; j|zx/o>]k7}w뻷?y֣ova7?ڽ}p2?:\HkwHÓWǷ6n|{\F:_S_ȢnVhvnWbr̕^R0SKuVuprΕ{vY:s6͛ۆu_t\nn0Wu+Wv}N.VV[+G_;\j#"_ ʿz{4ޚOxY~54>KieG߶?.g-VX[_,pf,]ϭ"zrDmG[~\VX{XJmוKvwy헭c-ut|սُl#ڿ9^}9(jt r?<1Py^7޹OūC~s@߱<8_\?7H. o[wc“<?%:])'r@Kø~Sn'9 >/<ѻ*yGE^]I_>nS阯"=eOqihz4^3›Uz8a\~#{79ܢϗ.)@%\c}/4̧h=Dڲ[mtJԺmK[_zNW Oo?1yWX.ҁOIGqvi#vYDK>`=WP?+qq?!bn/yOr֩Oz_'؍aON|$Ax%/Pv,~=oG%7a>^;ҫ D)zЖi=$&bn.P9[.%+~y+%N/'zN0_v+tcyJ>C'&@G1?3r]p?r2%ҁ?RzOc~ܷ5z~Yߍ^% mօ$!'g_X_^zI<گ/}%~yMyWx|`/r7o~'7_<ެ/qֳzL]%<9x~;OdYx{n7SRIu>wB(zЯ3gQX⇐@yq(~cXп̸諮[9o/<%+w uhړs`ǐ1^YeC /\WħY`?Ճ83wg[z +yӑ<ϕ~Wa4\][n,Oo}JI5uԖ+tYG{.vb r%~C?I}MУnL笲Ϟ^8/׼r4cXn=sm?(x#XG_ %L@KuqCx^;cy$nHo3%}q{ ޿yxߡ]M~re=%K?r,|ov?HO#`yd^^Kٻ^ےR'Tfpc vt0^B>NGϭ#؟{)z.dGD wh;?ҋ镍xrK:(m [0._NHHO'wmܸ=Y9H?қrDuRkE9\6Ku#9"ġ+5ym~ y}*gh◜M؁~4߬[m} m'_m5P7{Ooȅ{9c/7o#*g; n4-ڢ>_6xjkf7|\a$g>Y|E$~oժwEszfv~|%ή ڭvM-$lt,Z9:VIq۹v'ed_W?N~VxY/rY9"~%R޺N<],,.h8590]_3Wv{kdY*:yɟx?+_>Ѻf[+F"<G7?/͏=jj_H?U_`mUY+ja<‹dyuvuCϬ]+^\?\}yx{ucٿk6/6յݣͧ{o~o| thg0zxd?/v<ܟݖz9`2~i6r[zce$_em>\yj#W_Q^Ot1ʩJ }3"lg2ڼG+-C_DՋ= =G"G=Vێ֞W|y]|*[;_9uhk]泥9N/|Ζ1vi=':j9P.;wzWf?N8qէwALt}e;9֏sa^6un=lu~N_t|:9<\]|{pnk)>%744ߝ)"]%^oHVkqU+=f)̋Bx1~fE>B>OI{C70+md%lu^J k)[4Wi=HZ6E둎p3M;QOKw6txxpշ?yǵ'}[xR狭#EYgga>x#T{xw޵'{wG~z:ܻbG{Ǐ %x<sQv|zT񫽊Ke֫HW31JZzH/*%R/Ֆh|55m/[$m_F7ˑJq2K?|5o?+Wv^6+;/h'Yfam !˿8ڼ[$-S7yl})oWЬ<曵cz=gr#y'x= I?N?8GV~VN>'W׆o?[g;݈_ϭ/ οnK{mK'p_M~z: Oh8yulɵZ+O8 U>wnqvOFn?;?zʏQse */̛{^((ݓܸv:޺޵id- z}8^I/SR/d[ԥv #k_a Ul<>eyBIjdo'owT_)"vmK@'WVh>kߢZ׃rYX?ݣ^G%wm;M {8 k"ۊʶ||Nɷz|uw"g{_>~ORkw4e3*W{ίl]Z2}^3xAU脪̸.Ym󣂭q=t ?HV9kS+GMS?ǣ7"͒RO~Xק\9֣^󼾍Ow%ǶjG.|{>ϛ_zlIj=sl-[[A]O/OB'[ӳUpi:o=/={G}%WX_ok^j]Gn'|6Wm{\8΋߈g?I۳;=?z<-M[[xقrUjedx{k1{% L~o'6~ꓵSѺ?(/M8^|#wq'/ȷKtuCv_sr̿]5o18:a~ݝ mIʪ%+_z3'x٧=:kS@ڪ:?g#7'r8}Ο뇑fuH~#YldKϵu{jd_FO{3j[y#ɗpOo?FlkWϤu4ċ괰 }W։lgw"o@7ʱSr=~e-ț1Kl,:inuxۨsGlyYhK;f|v=CF=K-n/#[r'yqUS{ѭo|re[Ns`^<`__{tuc_޺ny9zmm }4vƈ o=+y\HE+od0?^tCZäAn }(g7Ͼo?2. qke>`?}:c$'ac=Pwv6,'Fj5k$K406HwN.rTog3ڳô$l>o͏s`:?ϯW]gwnd_tԝ]xZ us2Nxpqum -a/uJxټh7ylU{ya=A:A^ehlN=_$w+ >ub7 }ڀ({5'Gz٩3'njy"pl3սُK۷nl>ruhospGOo~rxh|a1~ana)7,H%)Oh^q7J#g ތGKo4zFG|Xtt+WV\1=>ɋ#[<8fGnz$ED 9l1c=D$(oZ?-ʳQQߏa8ʧ[NroY97Ѹrg}>+7z&%{ߛ~dvcKᅣʼn|K97y/H>=g=[uG:Oh䭮ϻ) pޢ'ߥOU~>@WyO\j-.<,|~O'~\oѹ2~>֥0~E|J<hk=^Ez0>-<Y?};g,ZG# tdwe? ⧼!. 7\e=ŕK֛y!AtOP~Z e7_w:SkZz x%Otxn.yFrgݸ_)ڥ%ɾokOW.|8W呝ؖ]ŷC~JP~% m/NmzJn[|7ɏ8)'d0/DF~ŋݹ>Gޜ|}u/>^Zi='9 PQPq-ܲV~?2.Z_~]۬/~7[8My4&׾U~zZo5Kayߢ/{eOkHѓZ?~7{].{hRgJ'[= /S?CAy03@?_tx ۏ15]G+3~yZoK/Cvơ_o99[>_o/џCT\Zq]ŏyMtyK_A<_wӱ|7Z֡K9d?7!4_<?+~> ޝ_rtc>̃?] 8_B-91/Z|uȯ?HٕH֏Zr^> &;JN_8:*.e_`(Uih(?)?~S^̓Z5T?U. zNPuH9QفuIr)ym~/g~,}.O0./qo}U's=&Gʝ1]y5=,Жh}ӱ>W<%<8w~,z7O7Y tE7 KQ|q/mn-ʹGa[m+/so{Z kV+ݬZ˥٥_f#p)o7KQ?n~}dqn>*y{㳟FǫO77Gr_|bԯw77}-+sZdjA}ӺY?TٺٽniŏZ~m}PI:Y9y#:q\ܖO3H~>W`Y'!n|v0K&+|yܼu'yynߚjoy9jWoe<[럵v\*oF~ZKGs~Mg);ͷ|ZtnܺfnTim[Rȍg8noךg?=OG{g\|b*ڙ_}pד_n{S ~XM oBw6v0߀~/!~C"i==TM6=#='ziOKoRz: }'~}f7_cbo =d#?zn_)!o+/ /2BWj=} q#?xxq["_yA8 _0O|4q$<͗Ԗj3j3S?}3.zCrreGӏh}v1%%y[m o?^C?\ Z_ E z7ySWu#go$旿n||A3%yDr0иGڕu1e7>yc~$%=92_Ň%-y/_rNqyN}InIx̷_iG~)YGބS^~OPzf9(9eWm"q? 3ga~)3?. _AB?a|g''<楫<_п>Ʃח~Tv\<0nW˼+> _֫Oc\s&4yJ|ȼI:AgisySOw'S?%GE#c޽yoc>9nߡYr`=C=9a\:!ƙ;rn1.3o\|P^ek<Żg;c*ƕאzmVo_~7N>70WA76_8P/{\ y(lOm{~#Wr<_\l2.+.oɲ= ~E|_|Ux0_~sZO-\y/a>?C~C;пwxỤm^/ڃs/<~#uz㤻}(w_a5θѸ:b>=2oH~o:r=W/;t5.zcvg\oŗ?EP{'ѡ=o2}N\O̗'\vD~.=޴ x\/;=7ig]M;0|/WL?Au)/ggޣo(%7W/n>Ÿd[8bGH}is˿k3YH^ {S </~:M|Wv"㹚Jwu7Rnݙ8(huO3@un_\dڢCeJ}俬cO9dc>sʸqu'ϛNb]]6S>wwx]ar.Et_dn_y sk>5Ps8yݹٕʸ&ڍqF?qu#v>t 3.p\=-|9{Gv#9tyu0L?$?w>]1E95׍\}ud?ӯ}c]H.|s\MʟÂyB'}gv;zmwl%O2]2ߺ{<^Sy#y9̘~-@!b_|ڑv7v/ :2;0`~K'sv>?}j^wv^.U} _O+9hmw} ϯ㮎wN;gcwp>;qĽgϼGݜ|FyCGyw>tŗ{n LsuoszoQrw~pu9K}Y;yב~9ǣM\1\tujGuGdwrg}ףs~Wq^;;[eqxwoo?xIn<̠T!02_ 3Y|9--9fZ+%rݞOwq7yC'?W=^󝬣i_'kԓ)|=Y^i~2џտc'@k7)]>>M>rM~+1EyK._c3-Ӟ.nWGoOC8aqXkxw_8?+/Vgsybޡ;9t}1{|"|ܿkW߽.y?Wz=)|^FF}ǟwpr3/Ǽf` ?EƺOv:_tײyG={oꗞy+RN=KvdRaŁzJ% hx>ղ3w0y=.aAs.9<oSx<*._~ui]WЇ_'Cɟyc/ o=vr&~7=ɗ3Ty|_c=~x 9<+~6%<%W:Dk_wp|N<ڼ'~O=X2_puwoځwncJxܟd:/Gre\ODGy/FgtG#zԏ<Wx=Eɟtytksu;܌ϣxB]\B'/W7V{76W&3}*݇џtxsJi]~H7u2McՌ'Пfe>ۢms^;?:O.o~h?rhϲٹVߵ_l܏Շ~j]dX K.?y˵zq&y>=w_繥?p80Zow>o͇/&N--\}wEGYz6>y}j]>Zw;>a[^~tO=ãE5ҟm~x~:9Nn3M:du9>9ڻvwx0d?bF'7yyxǿ<66| ?cUOcXCI?k>\r{#wzB??o=|}aO瑾>r"΢?LyîOwrkZƥw=`;ƹgOeW>?Z:wo=Sظi=!]kW6] 8^dZ=|ܼ}֭&~$G6~D6l>'v#|~_~%~Z~SM =G7+'.ټT~D5`m}\Ol{׋ڻZY:s[yqލvuz#:}rMo#?gUi[o9dv7,q~ާm6.w,ݹ뜥Ϛ08B\eWdS6nRugxɺjB|3[ɞgܼzn ߻9^8kHoğ]GH{sc'9O ΍DA<__wm]q|#;dOvϞkΝ=H} '}'_'KJK]Z;z}j7"'[OFi=Om|ϞsO30eT=ۓT#?o[D=':czig g=7+v ?Or|8x._c|kE%eO&{޻~=YxFKgӟv>x*rѶwE>QY {aq{qY?6gO=KC^Fބ[6|QvqvrLrx[zj^/FFGyr\U&v(<nݢuO>Dkտ>O"y9//ʇv6Nz/ҋ|s5RQYg˻|#{8ΞV`}ks-}g.Q=ЪeNX~VʇKom>qtjjK!~]!jo_{;̷3F٫v=<5x=[Ng,kn|q"!ͻN^]-0̇s풥ptn=E@N(.UG@WQ^7k0 #[ŮKwm>ΛOO/ǣ8~Þ>Yt~I菾,tD\{񥛇ӛ_9 oo^|x^tSZ_Uҵ[/>h|҃Ż͗4_ۼ+mkom^om@Wwu <}Jy}+nӟ<99| [}~;@\o.o}|}Y/~ljOkt'/8;_0sjstats/data/nhanes_sample.RData0000644000176200001440000006554013563265750016436 0ustar liggesusers| N1nDi6Wݦ?2 JBXdJ$p{C#CY;϶;W9gkֳ>oPqN~{\{~IK=Eu&\tn NtyJ9y_31TC7gXz[ǻWqdz<|䷤F.נ[,_~/R}/ z]qYKC?P ׁxO){rQ~K5Gn2^~UCazZ-z?`?|~Bo[w9Em6rubI0<ܘ 2^Zx5puNx܆|gK$.Ecѡrx<Kwy̯ a?Izq\r6eq5OB(6֟k~?C4F_i> =}"8cr2I\9|N]W@n#ɿ|TŻۉ%`W}=S_-)9$]d}jD~z 棆ϔ~:zf8.0`xy8*y=Fc0a?g/g~qO#gwx}y?z1\y Or|]}xʰ7!:gʻIW~yks'{8AzlϿ1ahqx2+ǃ}#\_;,q+P?@>JG"pCyw?%pu%8j?ڏqg9qǺӏMJk[=C;"j(Ojzh9TxCR'@1|4<^W4{@S{~9n0iZϼWޜvF\A_+F.;;/b~O_Xg=A?h{眷@\ß/qc<>7y=䲝a]1/w=0/a]αxn,yO#xxߙ! \f?~ 84-/dؿKs|rZ䦸ʺcӸ}5w{ W'iXpźps 猷_ 0NF=tnΆ6ggvґ_Fs^Sw{x>hc\%-=J(G᧛Ż,`a U:"N=g?x\l76=6=>v=ΟsX{ qW+sܒ-ox@<ҰJT%z,mS9!cbǓJy kG=b(8cr7ܯ5~"OeO%bݎu!w}>I<l=浲"Itzvks:w?S_yWa܅x{;x+WS=C~29}p=7<^ߌ~F[Ge; a|PPċr \byxߡz̻XXwF9hDWX+}?J=ǼyxfZXGKzy+~$= 09|W|*\=^'\ܬ~/l XK=lG,3d_ܰSڿg 䈥+/G;My\ Đ<K9Է>| :՟q7y?+-;Z!u{>r^{bqckKR9- ]Do3#GIvn'QA#g?Ĝ[|uOڍ!9ն`q"?]ܢ!7S,{ o'Y4b}/U#yH_DS=8@vxgg pǺ/܂ǣ9[.zQ&}Bw7<=I}ʧhsW>'X1~؍ux<Wy#|4p Y60~\R;,/OKЛ1 ^/ƾFvw?'^orq'eo3hr:w(?%G? 7y[qHwc})k\wpcoy ÷+px'<9'r!/Cs+pa=cq//g#=ɌuS;c_,ݯ'lNw3x~hϡ{˹2k>ܡ+Hگuz~?V\ms!ݑ ;ခ?ߐeCq܄vh?A!2pP~Q|vn3o떓qBxgyr#Nxq)V_XH[~gabf%YOC)la^w/q^7 04/|v/?GɳMu!\Ju9~VkJYK_spنd/Ƌ{?׋LJu9'YG˼]_,N7ցdD#}NDn>b]Un?GN!ngdWF2_XW_.ޣ_v^({^@'/c\hC8c9G=l'o$}|Wt7/xHy]u|_'&EGvyomC+z:zwNYNWkr܍_uw9}-ȅuzH(}^= zb~x.5sc~{/\)1n <1_^o!-N3: 0_K78wc/8 /_Ẉ~/? _ /7IO%\:rm䟘3'{0ǺH>=ޜ.B~{+<31#98<zq$rD/.ǺKK~_ǽyݥ]/S\,׽"ŗF>o>`!'!#(#ց}w`ǐq=D}/GGy{x.x7XW_`^0أiugv->oNǼ!ܴd CS)O@@Nkb.R9ށ^Wf#/"-|K~CnY7GHz=)3fyi]7x0N zg_Cq{NEy.%O|pva/#v$r]_ߊyoSD QBN M$n@\5s v(x%g a}L']s~y {ؗ'%vE5#']Cp3NHS3y;y౾ Or縗s L#~Ƽ4> !6W3 7>;~(zi<碏:A 1< '36q~;>Ob3@䏍 0>[Ox$zy#q$d^8Iq)&3~(|~lq8Oqbxv%IyPQ/qϒab%@Sx v~+$Sփ p\ۮ9o#wטk<۫؉;eP)x8_>g3 '#nd{";0h~% zo %N#?y~Xo"/8).¯Q>Qݝ_2g('8Hq2ޡ]ȇy ޝSgz7Yi\=J_璟pGop_G1c;8yB=+<8?s~?$ oROUۯx?s￰ߕ<{~y|wyNsN⒜sir=oY<2~p:h䛘ǐxoW`!m,yCbB9s6丂TI&1Mƥz7՜hqKg>,yU7M#'ԃSܗ ܿO~fؕA>%όýo#~1$Lqw9!~ڭw%#>ǂ~0gsSFSۈ$C94s?Ǔ܎/S E ^u8܆RɸWpy(qP^ф낗y0<;n0HwG$/b=rۓ'hGA~O Ei zIZ̳Z/ʂ7kxUC+rλisa^[ĵ/ ޟCX_\Q^1%wr|9a>QPc#!<¿Kނ c^0.[x~NDvpBxC'q( #އ^oOx ~??楘?G·ʇ8h̋8zAOf}|y})>7O_(y72w]j1OcQ=Ey!BHY!K?B-Xځܘ3q(w{ |-dW~C;^~پ0O~!/١'ijxƹe{|90XYwaK*qCSE=_${8~RF<qgkX>漟ÐuMv!F\oweK8ۧ:7| >iOSb|Mɱ-yo$ zqr 5W|73_AWK<迶3琞q~ e\S$b<`0~惔?>ļXWSOb$&'y2rQ>֍FUKq^?Y;b7dv>_ug*U/ ُ%}#l^^7hH5Տvm/9gP={֮m ;zH;R9'^ï\~˅oѲhCڏ6Da]E 1[;~C-ǯE?z%?h;_mװ~zaσ)ψo}.u j'^lm ~۾\h7Ho?Hs^ϽGA [Rbk?kX_dڿWyrGk> +O"'l}ut]Ou׫[;ђ#z~8rx-v%wî{-m{s~y{_zn#h[~yX9lmAۯWAj翚hDp47Z-Tx>5Z8b_ }o_YVծ^G˟y ڋDKo%W<]D{"m/R;'}]?h)װo+mG\OnFD\P_xm۟#ny緝e_G?~ED 碍A劶 ھMhWP+W4t}"9~ׯ^䈖z|V_ַokV5R+/G~ {U.,^-Wpuﰫ|~j'\x|ȫU,m:bԮGDWclm9wuٮU=տoq_ܫ/}3ëm=匱9w9k㵮Aݱ顇(<_ gSw7/fgk? ?1k1zя^9wq9]S..^rg~1VymZo%-+:/OCn|~1n" /_\ ̯Y΀ӫ?_=~w^:t>CgKOzȫjR/pO)KCu1ܾ ,g{}ss܁COb,#ƿVXWPzmz? =1^xQ/w|\?x<<_үsο^M߂ԫ=r[ȏn7$n{g/ڳq&^8e+8.}qu?1Ά^rx_m3{^}y籑M{>'ߡ,A]|s?~΃G;ηC^{s/۫_OA]\v<-zc,,Cr'SO{3<ۯ~Xe}W/ɟ,O6y#czXϜoϋXg>݊?aO]rN8+{+ۏz~u񝿋9w=O\~~|Peބ ;Ay("NO{{/m8XyP<? <=ŧy'bk?^xz+rE#p^(xz鉫1ܾKm_}|oӠ8%'y}Gy _h^}y_{}ذ~W7=[9N}Qq9~>z{_:ފ⭧?ݠvծ_KyOu꥟</П=3Ke}3v'p˧^z7 <߸ڳӎ- OqDKqr9ƽM~On/~Tw{Vz~=^qO{HտO9Emɞvj;r=߁yꥥg9^{{}"=6m߯{q~8dC w=q;wW[?\ۯ5)<~{+^㷵cvVޯ\~v_9^ *_¶㷞_yv~m;k"+lF~oa嶕\yP>,gA Z>*m} >h9za0ROFZW{6y䌴M>[hgvտ\ڳq95ZxWHvt-V~_G#m?:-֎+'kx%_P}Ͻۿ~v +_6yr?r~\#Z>l{~*w? ܯk}G[B *yXі3,/zخa'h}h!'qa~x>وʣ>3Sl?STb7$㚢^P;l,798ycA;=ڗyv/?=l7uNd>[1nNQ?lKٿz+ab'#yl~r2|qTy~x|3hfRte~Pxˇa$'0멱4^w)»IN+?x -7P=Y^c|D9ï+/>u9ϋ_IN[|~/1ur=m''C+xޱf9}?VO=Y#0n G/<j5< H"zteoT3ļqqO&}܌#qO`u2?Ty?ϛ՟[ysX^l> <ϠW?}88/n 8.9os^丁yW[q,<rz1Ogb>#+g~vf6^ʸɸo˳2Yy=x^v/'98̣ W~$?0{8<zzo?hGxw}KysާF>^-< 臾~|12~27䴭-zz<r_{qu4rDz9ӆ+F\Jm8q0)>23uWCH.n0/~liΧԷpqy|x_o? --?o?\-8e?f=o.|omy3ϣz8eKhmzwӽ3xO ' =nxnm[ly8Tֿ wm^zlqך~.scMr|<~~9`!o{\6 s#+yct}"ΟpyWOxןΰ?/8/vqNƁr}o'{Wo=WB`<u>G9>ڵW:ȿ6r0$(-nCTc_Lss?Gb?iaIy>ؾ8wطg\9/'=߳K=}ebі}{-~7c??(#޵Cmh|OqгxwO8Obx-O~8}B?6m;Wm3EXn 5޶k;gƼTzFލ㸄A/{s87CQ|~9q}gq<$#@X =yu?kSQ=, [|h;/9>c{B`1=mFE}ǧv9b2PyY7zmvc<`}5?ն/lqM/xFr3{1_5Yyq^填-.χ>9wy[2O|~n=Fr8lqCyVqy-<mzd|۳a<.G-?$;ڷb} ~ٿAQy\S_9O`y[~߶Gnlō_5!.grqT]84}x m889mTދ>9bl1T.Eپlq?7mж?f;OS3c\clAѽqe;1v8di8rR{4noovöc; ~S[l_~WXڡ g;Gl<od'd><7vlo'8q8X. O< kG^Eml1?xwܞro{9Q .oOP0^~OW#?۾Kak[G\ ^qﭿ7Bqo<7-ʖ'0ΙXby58e:+?A98Ɵטwh߆6ghc:}?ewjS$33W 8i|v'lQ[\~y`=.<=[sX_ gbsS =la|XlvΕmxy {;Al7S9+~B>K==ylǩ Ì$<'9?R[v>'} yُ0NN4n汆|T͈a;/f|W_=9z3~ m4Y۾2 #<5pq?g;ud^˸<4[}k36{GjxYm?iCy>l*G @0_L}|0'bxu^3S^L|m|Tv>ϖװOG{F^a\f8>Pto>&xvkݖOm#lW]s9O'?wXj߯xt2Ï +ݤQ\q.vԼТvߗx,O6V|ڨn_%?9 j=+_*8^gTq;ٯSK![T:F2i?zuy]-_暙{U2u_MRs~Uv麴lK%We C=┘ۋ?+)\^jߍnV]5«w6T){9Ub+&'_Gf6;WWMO zUͿZߥ nnkdlT1፼jՎ}Xw٨2[Vs2j Z~rJAZ+> O=kc7՚/Tso}{s}V*ek'lhix7ui~d^}pMȶ^UꏾN$[eM潤nWn޴սԂzn6=Xf\<`Λx8c՞O㵚ltyJkUj>lxTm(XԞ Q:k"uN?]w ۫uʹ:w3K*tz'OYLxCSRI[J/Kh<^wU|ʸUQ:s$u}-9ZZz'gp뵩/S[ 8j]KOdsuP.+VھqSN~~5V{7<%jώkZ7wE?;7 {~CfWg*rʾ^[eUپTZ+5ۙx[QᵢOi6ԑMW+{j[wyf)X'3]p 8o/,F=/w|aJH=02jn˶̨Һ)QF KZ+ S%{*˯[ƻʃû9ThUG'm7VLR7>_|Uj=MxpZЗ]r-岲۹ieV >ꤹEm,ym;7w-SUmXnD+Vo˫}{7ׇTQ#==?WK}-cny=+vvB9 z81"Iۨw4e*—J >/hsZ2&cJJt-SQny/[BMy=mC|Reߤ溦X '.~WM5kTfZIU74('4wZ_N^Ԧi_|N?ia'j^ezN jێ/xOO[H[ܱF 77v|ȳe65R> -{2ǿSw!uUCKVxe_ޠJʪe*k #.';楗#5sMR[S[ҕ)/Tor`ԗk/JR=Īy}(s~&n]%^|7 M~?:]{Ί:*YKmbwv_/'WvU?/}a?qgxȪ_s\~0MJ|.N/x_gȱc#&mܸIg!YpfݷkΏn~mINT}f狫]ûP)߹ƽ%R/-.v >u=O;'4X߱^TʥU:[j3ԑ.{&mϫ.}O{:&6ʌ6;0>ěyՒgxYqW,U{ȇ<9^7rB}OO۝Sh kGmjF5--i /ц눟ݸל y5ҟo{2wqEiǘ.iꏙuXLplw7???K^.r;lHK7!<~{7j9Z, 듾]c7<]|52Pc x'$?0$hZVPۦy:ӖÏ ?9Gi;u߈Q!/ |]eu.gCo/C/o߮hw_? E';ϿA*~_ _Z瞿@ை3Gkڌ_~q$ }~ߋ_Tjy'^A>䇀/+?]È{?~ʁic%w.0*{;!>Q)6ncrf;.7u%NhqK$ͳvjgfkzE3؛ H۠7T;ze~=+F܁}&S]G n=5}C%o)DՈ{`tRs1.pjZ}gӅ7"_}œc.U{^uKW3jkSȾΛ'^2J>pvM/wpwW|ĝ*;Mm/׾JO'i8{:CQ؏c{~|j^s^d ]+-wMQQJt}";7'>!x,#?Q/_t*y/*"+##]?oD<|/B??smǴL䡗{baV?SUW]U̜ԃW_1es/#~|YҌ +I~xxmOBfȕٮ"׶46c⟠߂z'%~|f5\:݆ f|s%Es'qx kvu? @v݊GY-}g/O ̗?*qB~OO>yJ^n=6=g@|s<{Wyw #=tvce˞}`f]'dx[M~zqa57&9Nr`]WY$_#]!_}cʩ~™)(rʘ&/ynx~875hpKg/q~y& /^#=maNF?ȏCOwayMv-;75Ѫ3n.<s ^>yd~?4޺w 5U~8o~t7Cw W!MXj]YQA z< 8 ~ak/Z,=^ FK7oϽYOK9j4OmYEOYԃy@-)v}wg!9<|2+?^I2^{bĒy#|Y'v!weS9 C_G=(@dz<$'Fӈ3n~D·$`m՚l1Ŀ?o^)TN˖<|r@!p> =zZ$V/<9wJӯ\K~3eߡN<ĭ?zG >)u~s teF;X޾mg;S/K+x p/B棫yP`Lj ¹@/yHcB`-3-/#>`r 0SIs4F{~.塝/ZډG]cs:g<᷻ʽqYr/'w:c '@.}<wෑ7Z?~˸a=%㥟ʾư ݲ7|9Apj#3E:55>~ 'A|srhnokvvV_y,в?ԑK3V'„OSe!iz[i9?y;v&g{ckq%s]1Pq˶<!a8\~qV)V9x͌wN98Ļ8zq۾XQ~>W|e?6~筲OF^ 7>om |XC[@ >Oyw_cQ_A.!r\:#KhPy/C =no>ΥO|,j䳱/?ZIگu'}ㆯo@hOJIVxyJ]7K>9q[O;WRBg%CX#ӏ_ib9ѩ~`IGZ1>wCz_C@˯RS?dJ<~ ? }cu8x%⒬E87j]uK!O_΢?o}o"P_8sވǐ<`59Y_ ?9y0Tگ vA]g<$6l&x¥*eg6y\1[gCr>|}(+|\C_-&شf#c^֢wZ%ͻ!< )w|ny sX_?VTCu9:TZ\QS%VyCY-:gt3\z~g? ^?J | sǁ'nPa\}cb59<=ʳGoz6t\Y-kF!O51K;۬9|ÔW=3_zY 7}ďr>Jw-ic?{NW:M3*,(sjJܗ2OC~-ݥͅG]r?%Gy8g<!灵Foui+^ȕ!8Fp|w/K9*ONeO ͻvҳ`g+| 9;~$SXs9|\yV !op`'T/\eJjmnzξS'gү:yi*śgSqW~GDYժN\LJ>"žGgx?;N][pCkWWA^3ثݲ.aev_yl %6?rNL+A |?y]Xʹ7'y=x"xDR/>ڮԃ_ﴑo3ċxׯ׋G$qϲ}kސQ93Aݓ><kuGO^VO.y-O}yQii7MXO~5M7b7{[#>K~M*qN3Nl]$NohE%ڌXH<".949~SuWZ[H\"(Z|? =Xe#xwIv6s-^,usϠ7{a1ƝJQgXg_Ca n8D[wo(~O7ȃ<TI\vܬӷ//-Yt%ռ1wI-ᰟ-yǶ^KԚQ8ďاiy_ 7omM'np=sz_a儧/ >Pg?O4xxO ƾp>ee}S#o>'K"_A<$lһnBijqrG5ǧ/Fy5qӸ w,Yޜw{wqNv@< :w;C{'C~q~=]r;3|w8wwC^k#A@Υ)#~_#O$/j\y 齜||EoEk>wo}5"@r7uU ꇏ_[_8w {A!o~7 Oe _okv?ĩI@r K{g⽏h#u/^};km@ QJ+]k?;|n뾧;3A\ _b]qSƥg>aoXK>jaC"9'!Zkye% ΀C؉官.4'(['sOf=n1ʹĭQWQRN0= Λ!@}K|ǀ+W츳NSGI ?c_k]I W(YؿնOurgMuw/A!Kϥ}r.Ց?N;0@| żh~sOԗz -3x~B</h8e7n_  ;@^y4MFK\%q<m=zwx!Nx{KGA.y[h'Aq1y^ȃsXߍ ,%qDx{||M<"+3?\<6qJ͉'5|%(wg>&/9qscO5CLj2+q_~;hzK Y7|kesʹ`'Ǐ5/qA7ӯٍIķ' ]+ݞ2C#~d1^~?Lc <kR{{|O}{%>˗*yc}0/@^ma{;^ٗ¹l?mFr$_h( }~x~oxʠ 49ϋh ߡ㜄Gȏ}Ǎ |)r~8 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472) Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages. This package aims at providing, **first**, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effict size statistics like Eta or Omega squared), or for which currently no functions are available. **Second**, another focus lies on implementations of common statistical significance tests with a consistent syntax, like t-test, Mann-Whitney test, Chi-squared test, and more. These functions are designed to be more user-friendly and also support weights, i.e. weighted statistics can be calculated. **Finally**, the package includes miscellaneous functions that are either not yet available in R (like `svyglm.nb()` or `svyglm.zip()` to calculate negative binomial or zero-inflated poisson models for survey data) or are just convenient for daily work (like functions for bootstrapping, or ANOVA summary tables). The comprised tools include: * Especially for mixed models: design effect, sample size calculation * Significance tests: Correlation, Chi-squared test, t-test, Mann-Whitney-U test, Wilcoxon rank sum test, Kruskal-Wallis test. Note that most functions that formerly were available in this package have been moved to the [**easystats** project](https://easystats.github.io/easystats/). ## Documentation Please visit [https://strengejacke.github.io/sjstats/](https://strengejacke.github.io/sjstats/) for documentation and vignettes. ## Installation ### Latest development build To install the latest development snapshot (see latest changes below), type following commands into the R console: ```r library(remotes) remotes::install_github("strengejacke/sjstats") ``` ### Officiale, stable release [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sjstats)](https://cran.r-project.org/package=sjstats) To install the latest stable release from CRAN, type following command into the R console: ```r install.packages("sjstats") ``` ## Citation In case you want / have to cite my package, please use `citation('sjstats')` for citation information. [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472) sjstats/build/0000755000176200001440000000000015022763150013045 5ustar liggesuserssjstats/build/partial.rdb0000644000176200001440000000007415022763150015173 0ustar liggesusersb```b`a 00 FN ͚Z d@$/7sjstats/man/0000755000176200001440000000000015022763073012525 5ustar liggesuserssjstats/man/svyglm.zip.Rd0000644000176200001440000000320214617405334015135 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svyglmzip.R \name{svyglm.zip} \alias{svyglm.zip} \title{Survey-weighted zero-inflated Poisson model} \usage{ svyglm.zip(formula, design, ...) } \arguments{ \item{formula}{An object of class \code{formula}, i.e. a symbolic description of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{...}{Other arguments passed down to \code{\link[pscl]{zeroinfl}}.} } \value{ An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip}, with some additional information about the model. } \description{ \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package to fit survey-weighted zero-inflated Poisson models. It uses \code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood estimates, based on starting values provided by \code{\link[pscl]{zeroinfl}}. } \details{ Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/. } \examples{ if (require("survey")) { data(nhanes_sample) set.seed(123) nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2) nhanes_sample$malepartners[sample(1:2992, 400)] <- 0 # create survey design des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) # fit negative binomial regression fit <- svyglm.zip( malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1), des ) # print coefficients and standard errors fit } } sjstats/man/anova_stats.Rd0000644000176200001440000000251614620362737015347 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/anova_stats.R \name{anova_stats} \alias{anova_stats} \title{Effect size statistics for anova} \usage{ anova_stats(model, digits = 3) } \arguments{ \item{model}{A fitted anova-model of class \code{aov} or \code{anova}. Other models are coerced to \code{\link[stats]{anova}}.} \item{digits}{Amount of digits for returned values.} } \value{ A data frame with all statistics is returned (excluding confidence intervals). } \description{ Returns the (partial) eta-squared, (partial) omega-squared, epsilon-squared statistic or Cohen's F for all terms in an anovas. \code{anova_stats()} returns a tidy summary, including all these statistics and power for each term. } \examples{ \dontshow{if (requireNamespace("car")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # load sample data data(efc) # fit linear model fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) anova_stats(car::Anova(fit, type = 2)) \dontshow{\}) # examplesIf} } \references{ Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. \cr \cr Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. } sjstats/man/var_pop.Rd0000644000176200001440000000154714620351262014465 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/var_pop.R \name{var_pop} \alias{var_pop} \alias{sd_pop} \title{Calculate population variance and standard deviation} \usage{ var_pop(x) sd_pop(x) } \arguments{ \item{x}{(Numeric) vector.} } \value{ The population variance or standard deviation of \code{x}. } \description{ Calculate the population variance or standard deviation of a vector. } \details{ Unlike \code{\link[stats]{var}}, which returns the sample variance, \code{var_pop()} returns the population variance. \code{sd_pop()} returns the standard deviation based on the population variance. } \examples{ data(efc) # sampling variance var(efc$c12hour, na.rm = TRUE) # population variance var_pop(efc$c12hour) # sampling sd sd(efc$c12hour, na.rm = TRUE) # population sd sd_pop(efc$c12hour) } sjstats/man/gmd.Rd0000644000176200001440000000207314620351262013561 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gmd.R \name{gmd} \alias{gmd} \title{Gini's Mean Difference} \usage{ gmd(x, select = NULL) } \arguments{ \item{x}{A vector or data frame.} \item{select}{Optional, names of variables as character vector that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed.} } \value{ For numeric vectors, Gini's mean difference. For non-numeric vectors or vectors of length < 2, returns \code{NA}. } \description{ \code{gmd()} computes Gini's mean difference for a numeric vector or for all numeric vectors in a data frame. } \note{ Gini's mean difference is defined as the mean absolute difference between any two distinct elements of a vector. Missing values from \code{x} are silently removed. } \examples{ data(efc) gmd(efc$e17age) gmd(efc, c("e17age", "c160age", "c12hour")) } \references{ David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575 } sjstats/man/inequ_trend.Rd0000644000176200001440000000526714620351262015337 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/inequ_trends.R \name{inequ_trend} \alias{inequ_trend} \title{Compute trends in status inequalities} \usage{ inequ_trend(data, prev.low, prev.hi) } \arguments{ \item{data}{A data frame that contains the variables with prevalence rates for both low and high status groups (see 'Examples').} \item{prev.low}{The name of the variable with the prevalence rates for the low status groups.} \item{prev.hi}{The name of the variable with the prevalence rates for the hi status groups.} } \value{ A data frame with the prevalence rates as well as the values for the proportional change in absolute (\code{rd}) and relative (\code{rr}) ineqqualities. } \description{ This method computes the proportional change of absolute (rate differences) and relative (rate ratios) inequalities of prevalence rates for two different status groups, as proposed by Mackenbach et al. (2015). } \details{ Given the time trend of prevalence rates of an outcome for two status groups (e.g. the mortality rates for people with lower and higher socioeconomic status over 40 years), this function computes the proportional change of absolute and relative inequalities, expressed in changes in rate differences and rate ratios. The function implements the algorithm proposed by \emph{Mackenbach et al. 2015}. } \examples{ \dontshow{if (requireNamespace("ggplot2")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5 # 40 simulated time points, with an initial rate ratio of 2 and # a rate difference of 100 (i.e. low status group starts with a # prevalence rate of 200, the high status group with 100) # annual decline of prevalence is 1\% for the low, and 3\% for the # high status group n <- 40 time <- seq(1, n, by = 1) lo <- rep(200, times = n) for (i in 2:n) lo[i] <- lo[i - 1] * .99 hi <- rep(100, times = n) for (i in 2:n) hi[i] <- hi[i - 1] * .97 prev.data <- data.frame(lo, hi) # print values inequ_trend(prev.data, "lo", "hi") # plot trends - here we see that the relative inequalities # are increasing over time, while the absolute inequalities # are first increasing as well, but later are decreasing # (while rel. inequ. are still increasing) plot(inequ_trend(prev.data, "lo", "hi")) \dontshow{\}) # examplesIf} } \references{ Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018} } sjstats/man/weight.Rd0000644000176200001440000000307214616424266014313 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/weight.R \name{weight} \alias{weight} \alias{weight2} \title{Weight a variable} \usage{ weight(x, weights, digits = 0) weight2(x, weights) } \arguments{ \item{x}{(Unweighted) variable.} \item{weights}{Vector with same length as \code{x}, which contains weight factors. Each value of \code{x} has a specific assigned weight in \code{weights}.} \item{digits}{Numeric value indicating the number of decimal places to be used for rounding the weighted values. By default, this value is \code{0}, i.e. the returned values are integer values.} } \value{ The weighted \code{x}. } \description{ These functions weight the variable \code{x} by a specific vector of \code{weights}. } \details{ \code{weight2()} sums up all \code{weights} values of the associated categories of \code{x}, whereas \code{weight()} uses a \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()} may return a vector of different length than \code{x}. } \note{ The values of the returned vector are in sorted order, whereas the values' order of the original \code{x} may be spread randomly. Hence, \code{x} can't be used, for instance, for further cross tabulation. In case you want to have weighted contingency tables or (grouped) box plots etc., use the \code{weightBy} argument of most functions. } \examples{ v <- sample(1:4, 20, TRUE) table(v) w <- abs(rnorm(20)) table(weight(v, w)) table(weight2(v, w)) set.seed(1) x <- sample(letters[1:5], size = 20, replace = TRUE) w <- runif(n = 20) table(x) table(weight(x, w)) } sjstats/man/weighted_se.Rd0000644000176200001440000000465014620351262015304 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svy_median.R, R/wtd_cor.R, R/wtd_se.R \name{survey_median} \alias{survey_median} \alias{weighted_correlation} \alias{weighted_correlation.default} \alias{weighted_correlation.formula} \alias{weighted_se} \title{Weighted statistics for variables} \usage{ survey_median(x, design) weighted_correlation(data, ...) \method{weighted_correlation}{default}(data, x, y, weights, ci.lvl = 0.95, ...) \method{weighted_correlation}{formula}(formula, data, ci.lvl = 0.95, ...) weighted_se(x, weights = NULL) } \arguments{ \item{x}{(Numeric) vector or a data frame. For \code{survey_median()} or \code{weighted_ttest()}, the bare (unquoted) variable name, or a character vector with the variable name.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{data}{A data frame.} \item{...}{Currently not used.} \item{y}{Optional, bare (unquoted) variable name, or a character vector with the variable name.} \item{weights}{Bare (unquoted) variable name, or a character vector with the variable name of the numeric vector of weights. If \code{weights = NULL}, unweighted statistic is reported.} \item{ci.lvl}{Confidence level of the interval.} \item{formula}{A formula of the form \code{lhs ~ rhs1 + rhs2} where \code{lhs} is a numeric variable giving the data values and \code{rhs1} a factor with two levels giving the corresponding groups and \code{rhs2} a variable with weights.} } \value{ The weighted (test) statistic. } \description{ \code{weighted_se()} computes weighted standard errors of a variable or for all variables of a data frame. \code{survey_median()} computes the median for a variable in a survey-design (see [\verb{survey::svydesign()]}). \code{weighted_correlation()} computes a weighted correlation for a two-sided alternative hypothesis. } \examples{ \dontshow{if (requireNamespace("survey")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) weighted_se(efc$c12hour, abs(runif(n = nrow(efc)))) # survey_median ---- # median for variables from weighted survey designs data(nhanes_sample) des <- survey::svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) survey_median(total, des) survey_median("total", des) \dontshow{\}) # examplesIf} } sjstats/man/boot_ci.Rd0000644000176200001440000000703214620351262014430 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boot_ci.R \name{boot_ci} \alias{boot_ci} \alias{boot_se} \alias{boot_p} \alias{boot_est} \title{Standard error and confidence intervals for bootstrapped estimates} \usage{ boot_ci(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) boot_se(data, select = NULL) boot_p(data, select = NULL) boot_est(data, select = NULL) } \arguments{ \item{data}{A data frame that containts the vector with bootstrapped estimates, or directly the vector (see 'Examples').} \item{select}{Optional, unquoted names of variables (as character vector) with bootstrapped estimates. Required, if either \code{data} is a data frame (and no vector), and only selected variables from \code{data} should be processed.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{ci.lvl}{Numeric, the level of the confidence intervals.} } \value{ A data frame with either bootstrap estimate, standard error, the lower and upper confidence intervals or the p-value for all bootstrapped estimates. } \description{ Compute nonparametric bootstrap estimate, standard error, confidence intervals and p-value for a vector of bootstrap replicate estimates. } \details{ The methods require one or more vectors of bootstrap replicate estimates as input. \itemize{ \item \code{boot_est()}: returns the bootstrapped estimate, simply by computing the mean value of all bootstrap estimates. \item \code{boot_se()}: computes the nonparametric bootstrap standard error by calculating the standard deviation of the input vector. \item The mean value of the input vector and its standard error is used by \code{boot_ci()} to calculate the lower and upper confidence interval, assuming a t-distribution of bootstrap estimate replicates (for \code{method = "dist"}, the default, which is \verb{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for \code{method = "quantile"}, 95\\% sample quantiles are used to compute the confidence intervals (\code{quantile(x, probs = c(0.025, 0.975))}). Use \code{ci.lvl} to change the level for the confidence interval. \item P-values from \code{boot_p()} are also based on t-statistics, assuming normal distribution. } } \examples{ data(efc) bs <- bootstrap(efc, 100) # now run models for each bootstrapped sample bs$models <- lapply( bs$strap, function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x) ) # extract coefficient "dependency" and "gender" from each model bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1)) bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1)) # get bootstrapped confidence intervals boot_ci(bs$dependency) # compare with model fit fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc) confint(fit)[2, ] # alternative function calls. boot_ci(bs$dependency) boot_ci(bs, "dependency") boot_ci(bs, c("dependency", "gender")) boot_ci(bs, c("dependency", "gender"), method = "q") # compare coefficients mean(bs$dependency) boot_est(bs$dependency) coef(fit)[2] } \references{ Carpenter J, Bithell J. Bootstrap confdence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164 } \seealso{ []\code{bootstrap()}] to generate nonparametric bootstrap samples. } sjstats/man/auto_prior.Rd0000644000176200001440000000571014623373000015173 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/auto_prior.R \name{auto_prior} \alias{auto_prior} \title{Create default priors for brms-models} \usage{ auto_prior(formula, data, gaussian, locations = NULL) } \arguments{ \item{formula}{A formula describing the model, which just needs to contain the model terms, but no notation of interaction, splines etc. Usually, you want only those predictors in the formula, for which automatic priors should be generated. Add informative priors afterwards to the returned \code{brmsprior}-object.} \item{data}{The data that will be used to fit the model.} \item{gaussian}{Logical, if the outcome is gaussian or not.} \item{locations}{A numeric vector with location values for the priors. If \code{locations = NULL}, \code{0} is used as location parameter.} } \value{ A \code{brmsprior}-object. } \description{ This function creates default priors for brms-regression models, based on the same automatic prior-scale adjustment as in \pkg{rstanarm}. } \details{ \code{auto_prior()} is a small, convenient function to create some default priors for brms-models with automatically adjusted prior scales, in a similar way like \pkg{rstanarm} does. The default scale for the intercept is 10, for coefficients 2.5. If the outcome is gaussian, both scales are multiplied with \code{sd(y)}. Then, for categorical variables, nothing more is changed. For numeric variables, the scales are divided by the standard deviation of the related variable. \cr \cr All prior distributions are \emph{normal} distributions. \code{auto_prior()} is intended to quickly create default priors with feasible scales. If more precise definitions of priors is necessary, this needs to be done directly with brms-functions like \code{set_prior()}. } \note{ As \code{auto_prior()} also sets priors on the intercept, the model formula used in \code{brms::brm()} must be rewritten to something like \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}. } \examples{ \dontshow{if (requireNamespace("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- as.factor(efc$c161sex) mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) auto_prior(mf, efc, TRUE) ## compare to # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200) # ps <- prior_summary(m) # ps$prior_intercept$adjusted_scale # ps$prior$adjusted_scale ## usage # ap <- auto_prior(mf, efc, TRUE) # brm(mf, data = efc, prior = ap) # add informative priors mf <- formula(neg_c_7 ~ c161sex + c172code) auto_prior(mf, efc, TRUE) + brms::prior(normal(.1554, 40), class = "b", coef = "c160age") # example with binary response efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) auto_prior(mf, efc, FALSE) \dontshow{\}) # examplesIf} } sjstats/man/nhanes_sample.Rd0000644000176200001440000000102014616424265015627 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{nhanes_sample} \alias{nhanes_sample} \title{Sample dataset from the National Health and Nutrition Examination Survey} \description{ Selected variables from the National Health and Nutrition Examination Survey that are used in the example from Lumley (2010), Appendix E. See \code{\link{svyglm.nb}} for examples. } \references{ Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley } \keyword{data} sjstats/man/samplesize_mixed.Rd0000644000176200001440000000643414620351262016361 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/samplesize_mixed.R \name{samplesize_mixed} \alias{samplesize_mixed} \alias{smpsize_lmm} \title{Sample size for linear mixed models} \usage{ samplesize_mixed( eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05 ) smpsize_lmm( eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05 ) } \arguments{ \item{eff.size}{Effect size.} \item{df.n}{Optional argument for the degrees of freedom for numerator. See 'Details'.} \item{power}{Power of test (1 minus Type II error probability).} \item{sig.level}{Significance level (Type I error probability).} \item{k}{Number of cluster groups (level-2-unit) in multilevel-design.} \item{n}{Optional, number of observations per cluster groups (level-2-unit) in multilevel-design.} \item{icc}{Expected intraclass correlation coefficient for multilevel-model.} } \value{ A list with two values: The number of subjects per cluster, and the total sample size for the linear mixed model. } \description{ Compute an approximated sample size for linear mixed models (two-level-designs), based on power-calculation for standard design and adjusted for design effect for 2-level-designs. } \details{ The sample size calculation is based on a power-calculation for the standard design. If \code{df.n} is not specified, a power-calculation for an unpaired two-sample t-test will be computed (using \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package). If \code{df.n} is given, a power-calculation for general linear models will be computed (using \code{\link[pwr]{pwr.f2.test}} of the \pkg{pwr}-package). The sample size of the standard design is then adjusted for the design effect of two-level-designs (see \code{\link{design_effect}}). Thus, the sample size calculation is appropriate in particular for two-level-designs (see \cite{Snijders 2005}). Models that additionally include repeated measures (three-level-designs) may work as well, however, the computed sample size may be less accurate. } \examples{ \dontshow{if (requireNamespace("pwr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # Sample size for multilevel model with 30 cluster groups and a small to # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and # hence a total sample size of about 802 observations is needed. samplesize_mixed(eff.size = .3, k = 30) # Sample size for multilevel model with 20 cluster groups and a medium # to large effect size for linear models of 0.2. Five subjects per cluster and # hence a total sample size of about 107 observations is needed. samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) \dontshow{\}) # examplesIf} } \references{ Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum. \cr \cr Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \cr \cr Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. } sjstats/man/mann_whitney_test.Rd0000644000176200001440000001763414623373000016557 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mann_whitney_test.R \name{mann_whitney_test} \alias{mann_whitney_test} \title{Mann-Whitney test} \usage{ mann_whitney_test( data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continous variable (and \code{by} indicates a grouping factor). \item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} select is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}) \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} \item{mu}{The hypothesized difference in means (for \code{t_test()}) or location shift (for \code{wilcoxon_test()} and \code{mann_whitney_test()}). The default is 0.} \item{alternative}{A character string specifying the alternative hypothesis, must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. See \code{?t.test} and \code{?wilcox.test}.} \item{...}{Additional arguments passed to \code{wilcox.test()} (for unweighted tests, i.e. when \code{weights = NULL}).} } \value{ A data frame with test results. The function returns p and Z-values as well as effect size r and group-rank-means. } \description{ This function performs a Mann-Whitney test (or Wilcoxon rank sum test for \emph{unpaired} samples). Unlike the underlying base R function \code{wilcox.test()}, this function allows for weighted tests and automatically calculates effect sizes. For \emph{paired} (dependent) samples, or for one-sample tests, please use the \code{wilcoxon_test()} function. A Mann-Whitney test is a non-parametric test for the null hypothesis that two \emph{independent} samples have identical continuous distributions. It can be used for ordinal scales or when the two continuous variables are not normally distributed. For large samples, or approximately normally distributed variables, the \code{t_test()} function can be used. } \details{ This function is based on \code{\link[=wilcox.test]{wilcox.test()}} and \code{\link[coin:LocationTests]{coin::wilcox_test()}} (the latter to extract effect sizes). The weighted version of the test is based on \code{\link[survey:svyranktest]{survey::svyranktest()}}. Interpretation of the effect size \strong{r}, as a rule-of-thumb: \itemize{ \item small effect >= 0.1 \item medium effect >= 0.3 \item large effect >= 0.5 } \strong{r} is calcuated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}. } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using a ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ \dontshow{if (requireNamespace("coin") && requireNamespace("survey")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) # Mann-Whitney-U tests for elder's age by elder's sex. mann_whitney_test(efc, "e17age", by = "e16sex") # base R equivalent wilcox.test(e17age ~ e16sex, data = efc) # when data is in wide-format, specify all relevant continuous # variables in `select` and omit `by` set.seed(123) wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20)) mann_whitney_test(wide_data, select = c("scale1", "scale2")) # base R equivalent wilcox.test(wide_data$scale1, wide_data$scale2) # same as if we had data in long format, with grouping variable long_data <- data.frame( scales = c(wide_data$scale1, wide_data$scale2), groups = as.factor(rep(c("A", "B"), each = 20)) ) mann_whitney_test(long_data, select = "scales", by = "groups") # base R equivalent wilcox.test(scales ~ groups, long_data) \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982} \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/man/wilcoxon_test.Rd0000644000176200001440000001562114623373000015713 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/wilcoxon_test.R \name{wilcoxon_test} \alias{wilcoxon_test} \title{Wilcoxon rank sum test} \usage{ wilcoxon_test( data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continous variable (and \code{by} indicates a grouping factor). \item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} select is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}) \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} \item{mu}{The hypothesized difference in means (for \code{t_test()}) or location shift (for \code{wilcoxon_test()} and \code{mann_whitney_test()}). The default is 0.} \item{alternative}{A character string specifying the alternative hypothesis, must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. See \code{?t.test} and \code{?wilcox.test}.} \item{...}{Additional arguments passed to \code{wilcox.test()} (for unweighted tests, i.e. when \code{weights = NULL}).} } \value{ A data frame with test results. The function returns p and Z-values as well as effect size r and group-rank-means. } \description{ This function performs Wilcoxon rank sum tests for one sample or for two \emph{paired} (dependent) samples. For \emph{unpaired} (independent) samples, please use the \code{mann_whitney_test()} function. A Wilcoxon rank sum test is a non-parametric test for the null hypothesis that two samples have identical continuous distributions. The implementation in \code{wilcoxon_test()} is only used for \emph{paired}, i.e. \emph{dependent} samples. For independent (unpaired) samples, use \code{mann_whitney_test()}. \code{wilcoxon_test()} can be used for ordinal scales or when the continuous variables are not normally distributed. For large samples, or approximately normally distributed variables, the \code{t_test()} function can be used (with \code{paired = TRUE}). } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using a ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ \dontshow{if (requireNamespace("coin")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(mtcars) # one-sample test wilcoxon_test(mtcars, "mpg") # base R equivalent, we set exact = FALSE to avoid a warning wilcox.test(mtcars$mpg ~ 1, exact = FALSE) # paired test wilcoxon_test(mtcars, c("mpg", "hp")) # base R equivalent, we set exact = FALSE to avoid a warning wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE) # when `by` is specified, each group must be of same length data(iris) d <- iris[iris$Species != "setosa", ] wilcoxon_test(d, "Sepal.Width", by = "Species") \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/man/reexports.Rd0000644000176200001440000000146314620367146015057 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/re-exports.R \docType{import} \name{reexports} \alias{reexports} \alias{mse} \alias{rmse} \alias{link_inverse} \alias{weighted_sd} \alias{weighted_mean} \alias{weighted_median} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{datawizard}{\code{\link[datawizard]{weighted_mean}}, \code{\link[datawizard:weighted_mean]{weighted_median}}, \code{\link[datawizard:weighted_mean]{weighted_sd}}} \item{insight}{\code{\link[insight]{link_inverse}}} \item{performance}{\code{\link[performance:performance_mse]{mse}}, \code{\link[performance:performance_rmse]{rmse}}} }} sjstats/man/svyglm.nb.Rd0000644000176200001440000000514414617405334014741 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svyglmnb.R \name{svyglm.nb} \alias{svyglm.nb} \title{Survey-weighted negative binomial generalised linear model} \usage{ svyglm.nb(formula, design, ...) } \arguments{ \item{formula}{An object of class \code{formula}, i.e. a symbolic description of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{...}{Other arguments passed down to \code{\link[MASS]{glm.nb}}.} } \value{ An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, with some additional information about the model. } \description{ \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package to fit survey-weighted negative binomial models. It uses \code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood estimates, based on starting values provided by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley (2010, pp249)}. } \details{ For details on the computation method, see Lumley (2010), Appendix E (especially 254ff.) \cr \cr \pkg{sjstats} implements following S3-methods for \code{svyglm.nb}-objects: \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, \code{predict()} and \code{residuals()}. However, these functions have some limitations: \itemize{ \item{\code{family()} simply returns the family-object from the underlying \code{\link[MASS]{glm.nb}}-model.} \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} from this model-object with the coefficients from the returned \code{\link[survey]{svymle}}-object and finally calls \code{\link[stats]{predict.glm}} to compute the predicted values.} \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals from the \code{glm.nb}-object.} } } \examples{ # ------------------------------------------ # This example reproduces the results from # Lumley 2010, figure E.7 (Appendix E, p256) # ------------------------------------------ if (require("survey")) { data(nhanes_sample) # create survey design des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) # fit negative binomial regression fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) # print coefficients and standard errors fit } } \references{ Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley } sjstats/man/bootstrap.Rd0000644000176200001440000000555714620351262015041 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bootstrap.R \name{bootstrap} \alias{bootstrap} \title{Generate nonparametric bootstrap replications} \usage{ bootstrap(data, n, size) } \arguments{ \item{data}{A data frame.} \item{n}{Number of bootstraps to be generated.} \item{size}{Optional, size of the bootstrap samples. May either be a number between 1 and \code{nrow(data)} or a value between 0 and 1 to sample a proportion of observations from \code{data} (see 'Examples').} } \value{ A data frame with one column: a list-variable \code{strap}, which contains resample-objects of class \code{sj_resample}. These resample-objects are lists with three elements: \enumerate{ \item the original data frame, \code{data} \item the rownmumbers \code{id}, i.e. rownumbers of \code{data}, indicating the resampled rows with replacement \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap}) } } \description{ Generates \code{n} bootstrap samples of \code{data} and returns the bootstrapped data frames as list-variable. } \details{ By default, each bootstrap sample has the same number of observations as \code{data}. To generate bootstrap samples without resampling same observations (i.e. sampling without replacement), use \code{size} to get bootstrapped data with a specific number of observations. However, specifying the \code{size}-argument is much less memory-efficient than the bootstrap with replacement. Hence, it is recommended to ignore the \code{size}-argument, if it is not really needed. } \note{ This function applies nonparametric bootstrapping, i.e. the function draws samples with replacement. \cr \cr There is an \code{as.data.frame}- and a \code{print}-method to get or print the resampled data frames. See 'Examples'. The \code{as.data.frame}- method automatically applies whenever coercion is done because a data frame is required as input. See 'Examples' in \code{\link{boot_ci}}. } \examples{ data(efc) bs <- bootstrap(efc, 5) # now run models for each bootstrapped sample lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x)) # generate bootstrap samples with 600 observations for each sample bs <- bootstrap(efc, 5, 600) # generate bootstrap samples with 70\% observations of the original sample size bs <- bootstrap(efc, 5, .7) # compute standard error for a simple vector from bootstraps # use the `as.data.frame()`-method to get the resampled # data frame bs <- bootstrap(efc, 100) bs$c12hour <- unlist(lapply(bs$strap, function(x) { mean(as.data.frame(x)$c12hour, na.rm = TRUE) })) # bootstrapped standard error boot_se(bs, "c12hour") # bootstrapped CI boot_ci(bs, "c12hour") } \seealso{ \code{\link{boot_ci}} to calculate confidence intervals from bootstrap samples. } sjstats/man/chi_squared_test.Rd0000644000176200001440000002004214623373000016331 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/chi_squared_test.R \name{chi_squared_test} \alias{chi_squared_test} \title{Chi-Squared test} \usage{ chi_squared_test( data, select = NULL, by = NULL, probabilities = NULL, weights = NULL, paired = FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continous variable (and \code{by} indicates a grouping factor). \item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} select is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}) \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{probabilities}{A numeric vector of probabilities for each cell in the contingency table. The length of the vector must match the number of cells in the table, i.e. the number of unique levels of the variable specified in \code{select}. If \code{probabilities} is provided, a chi-squared test for given probabilities is conducted. Furthermore, if \code{probabilities} is given, \code{by} must be \code{NULL}. The probabilities must sum to 1.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} \item{paired}{Logical, if \code{TRUE}, a McNemar test is conducted for 2x2 tables. Note that \code{paired} only works for 2x2 tables.} \item{...}{Additional arguments passed down to \code{\link[=chisq.test]{chisq.test()}}.} } \value{ A data frame with test results. The returned effects sizes are Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi}) for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against given probabilities. } \description{ This function performs a \eqn{\chi^2} test for contingency tables or tests for given probabilities. The returned effects sizes are Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi}) for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against given probabilities (see \emph{Ben-Shachar et al. 2023}). } \details{ The function is a wrapper around \code{\link[=chisq.test]{chisq.test()}} and \code{\link[=fisher.test]{fisher.test()}} (for small expected values) for contingency tables, and \code{chisq.test()} for given probabilities. When \code{probabilities} are provided, these are rescaled to sum to 1 (i.e. \code{rescale.p = TRUE}). When \code{fisher.test()} is called, simulated p-values are returned (i.e. \code{simulate.p.value = TRUE}, see \code{?fisher.test}). If \code{paired = TRUE} and a 2x2 table is provided, a McNemar test (see \code{\link[=mcnemar.test]{mcnemar.test()}}) is conducted. The weighted version of the chi-squared test is based on the a weighted table, using \code{\link[=xtabs]{xtabs()}} as input for \code{chisq.test()}. Interpretation of effect sizes are based on rules described in \code{\link[effectsize:interpret_r]{effectsize::interpret_phi()}}, \code{\link[effectsize:interpret_r]{effectsize::interpret_cramers_v()}}, and \code{\link[effectsize:interpret_r]{effectsize::interpret_fei()}}. Use these function directly to get other interpretations, by providing the returned effect size as argument, e.g. \code{interpret_phi(0.35, rules = "gignac2016")}. } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using a ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ \dontshow{if (requireNamespace("effectsize") && requireNamespace("MASS")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) # Chi-squared test chi_squared_test(efc, "c161sex", by = "e16sex") # weighted Chi-squared test chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight") # Chi-squared test for given probabilities chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7)) \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982} \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/man/efc.Rd0000644000176200001440000000070214076243617013555 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{efc} \alias{efc} \title{Sample dataset from the EUROFAMCARE project} \description{ German data set from the European study on family care of older people. } \references{ Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008. } \keyword{data} sjstats/man/figures/0000755000176200001440000000000013563265750014200 5ustar liggesuserssjstats/man/figures/logo.png0000644000176200001440000004411313563265750015651 0ustar liggesusersPNG  IHDRxb]e iCCPICC Profile8U]hU>sg#$Sl4t? % V46nI6"dΘ83OEP|1Ŀ (>/ % (>P苦;3ie|{g蹪X-2s=+WQ+]L6O w[C{_F qb Uvz?Zb1@/zcs>~if,ӈUSjF 1_Mjbuݠpamhmçϙ>a\+5%QKFkm}ۖ?ޚD\!~6,-7SثŜvķ5Z;[rmS5{yDyH}r9|-ăFAJjI.[/]mK 7KRDrYQO-Q||6 (0 MXd(@h2_f<:”_δ*d>e\c?~,7?& ك^2Iq2"y@g|UP`o pHYs  YiTXtXML:com.adobe.xmp 1 L'Y@IDATxW[`)@m0h4$M˪ۜ|v?])]w-@HMY dYc轆a;;0\JfDܹ}S<9-V.Zksm<_LC3jj/,?k? zݷhQy[HyMӬ%% tl-@'Ą@zF56qko^(dYf3W#Xl5?|"~ AɩXK,WA~ǣ---@ (DTU'|o|D8],,/_7ocϽo ,??G~>#3gTɚ#~B`7kUhԢ\+IPo y)P8EMC{[e6$vCy7_wu58 6UNFbV ,Wmyc-QლhĢMwN|GN^Eh}ef9gѕ> Նj B~HvWۚն!bPk:"X| R< d礆-kgdeXwMؤg{;nO9 a'=3Mؚ#Ͱ`mhھ35xmn0<97wR"U+-I٩60dž˴Pb--)Oc̙oyrI'UG2'o=`}e˖<F fcc0JǪl}EՂ`DBp ,DK>YoS$]s>kƜVT{M)߼eee^.[,X6k&bgUޘREm/+K8{nM~)iZ!DZS-ٵ5v9f}4?v/.6H!Kn`S6-/ƃ~x<F3dCV73yVV,m9Uct,;@9䱟o>PoXT22 uMyV#g s3@j&Vr;:8/$!@M/D>ϱ ̪(\dPBм~o޴z s=QVpњX v@ZzfQ$l>1 9p`litTb/Ď;i!Yv m' K14&Nh%ä `WYC}w𓟾}LsnǁۛUT#7>:%%m  n)UqgñJ[{αlr!KXۦJjtmͰ ;M!ii@d=vwZ?n̙klO^{_7 *PnM+P8ʨIE`+lrv;Z$ YqN7@MhPtzFl+)ȱX8r`ZZ575swN^u)rz֬.))os=>_t^ Fbgϩ[ nk\+; |RĪ@Qce%ܞ1zG5HKϰfgX仟WzP%6v|;nNw|jYT6E{~YM'.=;Vl(ó;RY܋+,JΪtžl;a?Of`03NP49(mG>ҷ7V~>vlZ̳ި^*^fMҳa_JMϸg 1BF>7NZ5K>ED٬=H̗SNMmDıLMMu+~{;څ%`]VKτ'RGZyGE'|ͶVajZKBlm8! LcN ٵ2-,9PY9nXԾ`ĵ%/osUޅ ;nZG+'#+_C}͈ c܋km-= 5J,vQm)UZݞ=q}nb>1`Z(F8ܱmw5}ȘX/tݰ-ٳcEh;{Į=SoMs>Ec({}5!Ld8t8%q N1h%5[ߍ[F6i (rv2;vVl`0.Yq/Bb67򚆄{9+b(6 `躬cU(^Ct\l\np |JۆsD&777X,M}'JL8} gBu띆ر6gm_ѱgd6lFȊh}س;p/z}ٳ3MoPg!rUF?FF>qDpZZ3̫g[5ypIҹ$+P%%_ki隼TxojzVA$҄{,8fO wR\rVjU3{ᘨYl[eXia KERˆP("} X{1o (3]:r0MF,ʢ? oKS0crNڠ>6@gc]a4Z_o~j)H:o;nEW|?ixٹeFЌ-㓜݋ٳgCm3Z+Xy`op]ȐƭSSKNoUb?G@H|Uꖅ-)9@`mֹkOp̖UOZ%@[bWa lD*rEk _ c,bbcg"$Z̪&’n5|;XQ_S:aɶĒMwyn@S q~U^$ɍ|9֙"4|%ja\u(fLQF(g{8fPnF)},Q&C.А{?Qv&嶶EHS:@9QU܏Ln% aծ_LB{H|#n!V@}9Bvؒ_)h3weXFj,)Z+cu kze{ 7h'Io5^2ݢ>ւQx6F3?adD:+AK @p-kzIbB3ׄP+{-k7K22TIVe&>d Ăna nԆzD&f AJ&_$ 7SѢ[!:s|'6*SrwpE=?[6Aڌa֞ `u}s:4:c8TrBUOB_1 EO:Dics`ۚ0p6 w;uu^^w!DqŠar .Un㱛:ӡSya7;O -09bNbyWڋb˵8ĭxAtnX$K~tM`oo^cK\%#,/uB9vSzƦs`@*{0}bG[Opsm r o{TJ\?|#T?y?0a޼_}{.RZAUoM.~Cmoپx}L9h]}Ț`'zBdN9綎}dW'HDA8hnmcNUNVRԥq_%2Uz8$m_.GٺG_URX)M'VW4lLRM6֜_ bo}qwIeЏ GM-ԻO_mg`>׸ K~Y.D#~A>#]oO ̥.`E '4 om@v:C?⾂5n%kCm_h !0.C﹩ƲQpVl=bw/j/n9nOy֤kl9}(Zbr%"JXI<_AV<\b'o\+O,l5nb岕[u *7EXR6 )h?Љ?]` 9^ٷ™4j1UPu ^9DC{V.^=C҉3g~[_[lˬ[͑kP˫ &#^zꒄUjlR`Xm"UrE\ 2\uhiP9W$(G &X6H>W* RTЯ@џ8?t]/=>蚊$ܦEA?~{}v}!An_TGƛT&(z1\zD?g"X (i~C62+ʥ}٣2OU7[;gг`PWiEFʅ Wo=jWv4AWф*Hr:UOW.GpNiɃаHsd3<(+ʬM 9DE.{#yƻ{H=o􌮿=^]](RQBdkDJ+h U4$:`ˁrh+GQW(Q]FbۑgrEJ>ßJ' Nt i Ə>{ƪF&'^P/r뛂`E)ņ7Y J/out<נ>z=juO{Ͻ Nbr`|:.Mb1x˷ œ% WTrNk3ۻF9QV]OX2zIsrsz=DL{ XuO.IZXt2-*v<0g{ Y͘t٘s=bINw!b)njhJq%J ܣK/6=.[Bm GJ*nТO2#rxvY'$wq:yF$FHH$]ϲTϐxJn՞ZcׂQUV,nkQK4[|uJWl&,NI{zIBpor׀TUfRv mA$ 4A3D|+yϱ)^vЕ\1jQdCel&{ne~!w&X`VHfIFjv; `*Jl7@+`!BuL S.vvz[βAhÐO;vU鐭vq"^P z]t,WJjYRU[Ru9$#Nn\*,ՠQ~=nPXʲTc璿b hI}EȔb\|كâŖ2Ŷ҆BJ )dr[^$%Kÿ:89%憦\Dr\tt)1",1Φ)VڳV/$st\[J#%ZYԾ҅`LX_dpb5TpLmU'^7گr*FȦ%®:Bt$;_SP Y_g+&SkGAW}Ԗ `hڒ᪃]ӕ(Цgc鱛6*"%w-. QOjYD/!VLG'TxSf;! XEdJyтc&;L }6kRb#@\S䠾tET{I42sE,UR_}v}ݖ PeRҖ .]jRE(~ѵvZ޻n>{㳴2-xWQ2I$wmS R#HNl6RDYBfKdQa{)LR:*P^&#<^qH滎W,V!Eۼyu/w<}&d6Ye0ޖ oTOG=i`fǤAnŁ_֡u$[]ST^Т>v+ oB(SϷǝoV@v;X7אBJ*(A{<18Dj W$!X?uM |sN[ٌiT EWic4].\@*wͲ͎fJHL__e7*q0A̠ &Hvպx]#:{Po=Tv|/zD 5/A `W GA&j"2$#IlO&[̻dmjR1jۈ1g1[\oIԭwM3ZaF;:~6ֶhțHc֫ן''aڭNTEML~.C~~4I*hB:(u-ߥ4=Fr$eI uw6k .h[A5:,RbjK'dmS^3a' B~ q!K$֩OKd;iu=>\oæFy D6Nu<Ħe:٫*$!^ua5KKʄDg=c߭n=, ~CXr(M`$qȜt^&IZS *0$k{5ڣSgK)դ% Ӣ[,vTe3t?҆^ DfZ![_R#v)JרRD-p:N% TxtծybT{iEl<18rN}Mm&.nCG;ud[$w 8Dqȩy;rI{9FMwȿY~<OJg$/%tUOm⳶;,)lk[{RiۘaK+ȵj1sF? *ozmv'L= mmO,*ԩs'Z}éCY`'_aK8Lۊ>QH 54dKەӭ$YDVML&MP2z8 ;"$-Y@o5!Wٙ5x&k#qi9ՇDܢF^E hJN,$^dWZ^e827Vr3m[N_yT#9Fj@Z5&#DTciyphWډxNb \59;Q97aХɑ=K>[;u2bEA!Σ:cd ʵ!yvT=;HZ7=>TdQHlqc_[F$5 c m}O<)qTQXAIQ%ISUgUi١fJ_+-g~!LSc$ej_lWeꔋQ?A> &Zۤ4 (cع`2 p OZ܊@l!Z$EZGbC+)zL8pRЙyS8(s?6F}q?F#hssfs(X9[1᪨cj'Qgut,_v Y}h`B+`J%RlxWTSI_E9PX+Pr&*Zqlt:sn]DŽXJ+\':5CDqjL*Ĉ'K%̘`Ȃ uO̼pC]>7и>>h΄_u#tĜ*Zr pG[ͣ! \WTAnFrsY_N`QRJۜ%|W:xZ(q|셺Jr_5*@]MLȟ14"o"?jD"_0{#6,Ö4Ee_LgG90 rL>*xO)ꋀ/QLY:L/瀬1x]l ot]R{Bl۽|'g> Ӵsݫ~,{qb""dc wںe'|G/ǫgdD8Y!B$Ϲv+|-Kh=hơfysgpB8zE*\:*;%&r|'Id5ƹ| \ 4D?uwH[ 2g\LUvJv,H#PH 7kք-bEuWg7??=$i ;TYkka2ĶUz*>oo (%رSqur~>JG* P0HIMT?}wdi2m? "ػlٲَ͜mo |NKJmh83|~H>.msஆf4I-'D'QX9h@hlh#o,@Ơ`_ Tum|yxo-r8¹/`݊V{h@٬aqWCakl@a!}0k-+mvYF' =ooBY8*,obˎGMO`-Qp]A>&#/c7w«0D|e3gƨL^teeey~K6|ypj=gEfw\ncλErKEwvw\DRtX ;q%x%&&8 A6µ`ğz#](ض8SlAb=!EM d<OM,4EAmfB6 yMf*$U{p{woE-j2k"ׂZ(wN)Ѱ0*tٔ4ښFS5qXlY+Fpg֓KR0)W F>q4f̪uUp~NEsOuEIBرVbIabS L@ah'[|t 槯_|~'sJv%N4-[fϸ==\A)bmÒL\;Btezܞq\@3;>=Sp/Nq/*9\4vV=[ݒ>Ã̞Y^ZȘzksGAƱ^uTJ@^SNn/y+p?FmM%E̾նvܞ {JWoO|EBAXxӑZ݋XqdmvbrZ7_sP?dVX0и۰_~>[VYljB@:XqKT;͠k ͍^[}šDϵ4FUG۶o{J~{sn~4 uF Z{dum>]K-) [[B1F8td'^Tb"܋d&Lx]qAW^iaW]t5\1u]aɹ6 x&>q)/Em=ܞ\6$J%k޽gp/j-Nߘv]YũgM,}'ܹ:GEήxػN!D/_ο;X}q?z=hEs3YPuU KmoF>K~Vr;]ٖҁ%g^/mYj 嶄SR)JKKq!=M7d̜i,Īo*=%-'-5%k'9$=3'T6c?HɬE2~PdWX@쎈sIDATKWJ` rVaQx 鑒*v*@Xi5`g\ߙ?{!ľ2l~뉥#Zsއ}`8|3r#[DWJVaɭΫq{ÕX5+DEڸL'7ered"̈́"!kwY*g7p'^`Dvmyt=PZJr~6=#k,xXpnOC%\|% Xa|VcI^Wa?9lrēJu=&Uo=~,{vT/`.p㭫1̅%5sW XX'#9nڐ={rj]%ۛ{5hiO=jֵU%oCn5ىs]l듣D娀]Y/>X̙𢮫hmϽJjlj^^`oڶVK?2şwQ})7Ǎp0ض)s{[l |J40`El>t?IҒ`%}޷We~pgm'CIENDB`sjstats/man/cv_error.Rd0000644000176200001440000000304714616424264014645 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv_error.R \name{cv_error} \alias{cv_error} \alias{cv_compare} \title{Test and training error from model cross-validation} \usage{ cv_error(data, formula, k = 5) cv_compare(data, formulas, k = 5) } \arguments{ \item{data}{A data frame.} \item{formula}{The formula to fit the linear model for the test and training data.} \item{k}{The number of folds for the kfold-crossvalidation.} \item{formulas}{A list of formulas, to fit linear models for the test and training data.} } \value{ A data frame with the root mean squared errors for the training and test data. } \description{ \code{cv_error()} computes the root mean squared error from a model fitted to kfold cross-validated test-training-data. \code{cv_compare()} does the same, for multiple formulas at once (by calling \code{cv_error()} for each formula). } \details{ \code{cv_error()} first generates cross-validated test-training pairs, using \code{\link[modelr]{crossv_kfold}} and then fits a linear model, which is described in \code{formula}, to the training data. Then, predictions for the test data are computed, based on the trained models. The \emph{training error} is the mean value of the \code{\link{rmse}} for all \emph{trained} models; the \emph{test error} is the rmse based on all residuals from the test data. } \examples{ data(efc) cv_error(efc, neg_c_7 ~ barthtot + c161sex) cv_compare(efc, formulas = list( neg_c_7 ~ barthtot + c161sex, neg_c_7 ~ barthtot + c161sex + e42dep, neg_c_7 ~ barthtot + c12hour )) } sjstats/man/find_beta.Rd0000644000176200001440000000760714617352174014746 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/find_beta.R \name{find_beta} \alias{find_beta} \alias{find_beta2} \alias{find_cauchy} \alias{find_normal} \title{Determining distribution parameters} \usage{ find_beta(x1, p1, x2, p2) find_beta2(x, se, ci, n) find_cauchy(x1, p1, x2, p2) find_normal(x1, p1, x2, p2) } \arguments{ \item{x1}{Value for the first percentile.} \item{p1}{Probability of the first percentile.} \item{x2}{Value for the second percentile.} \item{p2}{Probability of the second percentile.} \item{x}{Numeric, a probability value between 0 and 1. Typically indicates a prevalence rate of an outcome of interest; Or an integer value with the number of observed events. In this case, specify \code{n} to indicate the toral number of observations.} \item{se}{The standard error of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{ci}{The upper limit of the confidence interval of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{n}{Numeric, number of total observations. Needs to be specified, if \code{x} is an integer (number of observed events), and no probability. See 'Examples'.} } \value{ A list of length two, with the two distribution parameters than can be used to define the distribution, which (best) describes the shape for the given input parameters. } \description{ \code{find_beta()}, \code{find_normal()} and \code{find_cauchy()} find the shape, mean and standard deviation resp. the location and scale parameters to describe the beta, normal or cauchy distribution, based on two percentiles. \code{find_beta2()} finds the shape parameters for a Beta distribution, based on a probability value and its standard error or confidence intervals. } \details{ These functions can be used to find parameter for various distributions, to define prior probabilities for Bayesian analyses. \code{x1}, \code{p1}, \code{x2} and \code{p2} are parameters that describe two quantiles. Given this knowledge, the distribution parameters are returned. Use \code{find_beta2()}, if the known parameters are, e.g. a prevalence rate or similar probability, and its standard deviation or confidence interval. In this case. \code{x} should be a probability, for example a prevalence rate of a certain event. \code{se} then needs to be the standard error for this probability. Alternatively, \code{ci} can be specified, which should indicate the upper limit of the confidence interval od the probability (prevalence rate) \code{x}. If the number of events out of a total number of trials is known (e.g. 12 heads out of 30 coin tosses), \code{x} can also be the number of observed events, while \code{n} indicates the total amount of trials (in the above example, the function call would be: \code{find_beta2(x = 12, n = 30)}). } \examples{ # example from blogpost: # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ # 10\% of patients respond within 30 days of treatment # and 80\% respond within 90 days of treatment find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dnorm(x, mean = parms$mean, sd = parms$sd), from = 0, to = 200 ) parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dcauchy(x, location = parms$location, scale = parms$scale), from = 0, to = 200 ) find_beta2(x = .25, ci = .5) shapes <- find_beta2(x = .25, ci = .5) curve(dbeta(x, shapes[[1]], shapes[[2]])) # find Beta distribution for 3 events out of 20 observations find_beta2(x = 3, n = 20) shapes <- find_beta2(x = 3, n = 20) curve(dbeta(x, shapes[[1]], shapes[[2]])) } \references{ Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF}) } sjstats/man/se_ybar.Rd0000644000176200001440000000176514620351262014445 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/se_ybar.R \name{se_ybar} \alias{se_ybar} \title{Standard error of sample mean for mixed models} \usage{ se_ybar(fit) } \arguments{ \item{fit}{Fitted mixed effects model (\code{\link[lme4]{merMod}}-class).} } \value{ The standard error of the sample mean of \code{fit}. } \description{ Compute the standard error for the sample mean for mixed models, regarding the extent to which clustering affects the standard errors. May be used as part of the multilevel power calculation for cluster sampling (see \cite{Gelman and Hill 2007, 447ff}). } \examples{ \dontshow{if (require("lme4")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy) se_ybar(fit) \dontshow{\}) # examplesIf} } \references{ Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. Cambridge, New York: Cambridge University Press } sjstats/man/is_prime.Rd0000644000176200001440000000076114616424264014633 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_prime.R \name{is_prime} \alias{is_prime} \title{Find prime numbers} \usage{ is_prime(x) } \arguments{ \item{x}{An integer, or a vector of integers.} } \value{ \code{TRUE} for each prime number in \code{x}, \code{FALSE} otherwise. } \description{ This functions checks whether a number is, or numbers in a vector are prime numbers. } \examples{ is_prime(89) is_prime(15) is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) } sjstats/man/kruskal_wallis_test.Rd0000644000176200001440000001427214623373000017101 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/kruskal_wallis_test.R \name{kruskal_wallis_test} \alias{kruskal_wallis_test} \title{Kruskal-Wallis test} \usage{ kruskal_wallis_test(data, select = NULL, by = NULL, weights = NULL) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continous variable (and \code{by} indicates a grouping factor). \item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} select is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}) \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} } \value{ A data frame with test results. } \description{ This function performs a Kruskal-Wallis rank sum test, which is a non-parametric method to test the null hypothesis that the population median of all of the groups are equal. The alternative is that they differ in at least one. Unlike the underlying base R function \code{kruskal.test()}, this function allows for weighted tests. } \details{ The function simply is a wrapper around \code{\link[=kruskal.test]{kruskal.test()}}. The weighted version of the Kruskal-Wallis test is based on the \strong{survey} package, using \code{\link[survey:svyranktest]{survey::svyranktest()}}. } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using a ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ data(efc) # Kruskal-Wallis test for elder's age by education kruskal_wallis_test(efc, "e17age", by = "c172code") # when data is in wide-format, specify all relevant continuous # variables in `select` and omit `by` set.seed(123) wide_data <- data.frame( scale1 = runif(20), scale2 = runif(20), scale3 = runif(20) ) kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3")) # same as if we had data in long format, with grouping variable long_data <- data.frame( scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3), groups = rep(c("A", "B", "C"), each = 20) ) kruskal_wallis_test(long_data, select = "scales", by = "groups") # base R equivalent kruskal.test(scales ~ groups, data = long_data) } \references{ \itemize{ \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/man/chisq_gof.Rd0000644000176200001440000000414014616424264014761 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gof.R \name{chisq_gof} \alias{chisq_gof} \title{Compute model quality} \usage{ chisq_gof(x, prob = NULL, weights = NULL) } \arguments{ \item{x}{A numeric vector or a \code{glm}-object.} \item{prob}{Vector of probabilities (indicating the population probabilities) of the same length as \code{x}'s amount of categories / factor levels. Use \code{nrow(table(x))} to determine the amount of necessary values for \code{prob}. Only used, when \code{x} is a vector, and not a \code{glm}-object.} \item{weights}{Vector with weights, used to weight \code{x}.} } \value{ For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}. For \code{glm}-objects, an object of class \code{chisq_gof} with following values: \code{p.value}, the p-value for the goodness-of-fit test; \code{z.score}, the standardized z-score for the goodness-of-fit test; \code{rss}, the residual sums of squares term and \code{chisq}, the pearson chi-squared statistic. } \description{ For logistic regression models, performs a Chi-squared goodness-of-fit-test. } \details{ For vectors, this function is a convenient function for the \code{chisq.test()}, performing goodness-of-fit test. For \code{glm}-objects, this function performs a goodness-of-fit test. A well-fitting model shows \emph{no} significant difference between the model and the observed data, i.e. the reported p-values should be greater than 0.05. } \examples{ data(efc) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) m <- glm( neg_c_7d ~ c161sex + barthtot + c172code, data = efc, family = binomial(link = "logit") ) # goodness-of-fit test for logistic regression chisq_gof(m) # goodness-of-fit test for vectors against probabilities # differing from population chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28)) # equal to population chisq_gof(efc$e42dep, prop.table(table(efc$e42dep))) } \references{ Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. } sjstats/man/crosstable_statistics.Rd0000644000176200001440000001406514620351262017431 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cramer.R, R/phi.R, R/xtab_statistics.R \name{cramers_v} \alias{cramers_v} \alias{cramer} \alias{cramers_v.formula} \alias{phi} \alias{crosstable_statistics} \alias{xtab_statistics} \title{Measures of association for contingency tables} \usage{ cramers_v(tab, ...) cramer(tab, ...) \method{cramers_v}{formula}( formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ... ) phi(tab, ...) crosstable_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) xtab_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) } \arguments{ \item{tab}{A \code{\link[=table]{table()}} or \code{\link[=ftable]{ftable()}}. Tables of class \code{\link[=xtabs]{xtabs()}} and other will be coerced to \code{ftable} objects.} \item{...}{Other arguments, passed down to the statistic functions \code{\link[=chisq.test]{chisq.test()}}, \code{\link[=fisher.test]{fisher.test()}} or \code{\link[=cor.test]{cor.test()}}.} \item{formula}{A formula of the form \code{lhs ~ rhs} where \code{lhs} is a numeric variable giving the data values and \code{rhs} a factor giving the corresponding groups.} \item{data}{A data frame or a table object. If a table object, \code{x1} and \code{x2} will be ignored. For Kendall's \emph{tau}, Spearman's \emph{rho} or Pearson's product moment correlation coefficient, \code{data} needs to be a data frame. If \code{x1} and \code{x2} are not specified, the first two columns of the data frames are used as variables to compute the crosstab.} \item{ci.lvl}{Scalar between 0 and 1. If not \code{NULL}, returns a data frame including lower and upper confidence intervals.} \item{n}{Number of bootstraps to be generated.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{x1}{Name of first variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be irgnored.} \item{x2}{Name of second variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be irgnored.} \item{statistics}{Name of measure of association that should be computed. May be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"}, \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'.} \item{weights}{Name of variable in \code{x} that indicated the vector of weights that will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} } \value{ For \code{\link[=phi]{phi()}}, the table's Phi value. For [\verb{cramers_v()]}, the table's Cramer's V. For \code{crosstable_statistics()}, a list with following components: \itemize{ \item \code{estimate}: the value of the estimated measure of association. \item \code{p.value}: the p-value for the test. \item \code{statistic}: the value of the test statistic. \item \code{stat.name}: the name of the test statistic. \item \code{stat.html}: if applicable, the name of the test statistic, in HTML-format. \item \code{df}: the degrees of freedom for the contingency table. \item \code{method}: character string indicating the name of the measure of association. \item \code{method.html}: if applicable, the name of the measure of association, in HTML-format. \item \code{method.short}: the short form of association measure, equals the \code{statistics}-argument. \item \code{fisher}: logical, if Fisher's exact test was used to calculate the p-value. } } \description{ This function calculates various measure of association for contingency tables and returns the statistic and p-value. Supported measures are Cramer's V, Phi, Spearman's rho, Kendall's tau and Pearson's r. } \details{ The p-value for Cramer's V and the Phi coefficient are based on \code{chisq.test()}. If any expected value of a table cell is smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()} is used to compute the p-value, unless \code{statistics = "fisher"}; in this case, the use of \code{fisher.test()} is forced to compute the p-value. The test statistic is calculated with \code{cramers_v()} resp. \code{phi()}. Both test statistic and p-value for Spearman's rho, Kendall's tau and Pearson's r are calculated with \code{cor.test()}. When \code{statistics = "auto"}, only Cramer's V or Phi are calculated, based on the dimension of the table (i.e. if the table has more than two rows or columns, Cramer's V is calculated, else Phi). } \examples{ # Phi coefficient for 2x2 tables tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) phi(tab) # Cramer's V for nominal variables with more than 2 categories tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) cramer(tab) # formula notation data(efc) cramer(e16sex ~ c161sex, data = efc) # bootstrapped confidence intervals cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100) # 2x2 table, compute Phi automatically crosstable_statistics(efc, e16sex, c161sex) # more dimensions than 2x2, compute Cramer's V automatically crosstable_statistics(efc, c172code, c161sex) # ordinal data, use Kendall's tau crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall") # calcilate Spearman's rho, with continuity correction crosstable_statistics(efc, e42dep, quol_5, statistics = "spearman", exact = FALSE, continuity = TRUE ) } \references{ Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982} } sjstats/man/table_values.Rd0000644000176200001440000000224714616424265015474 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sjStatistics.R \name{table_values} \alias{table_values} \title{Expected and relative table values} \usage{ table_values(tab, digits = 2) } \arguments{ \item{tab}{Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which cell, row and column percentages as well as expected values are calculated. Tables of class \code{\link[stats]{xtabs}} and other will be coerced to \code{ftable} objects.} \item{digits}{Amount of digits for the table percentage values.} } \value{ (Invisibly) returns a list with four tables: \enumerate{ \item \code{cell} a table with cell percentages of \code{tab} \item \code{row} a table with row percentages of \code{tab} \item \code{col} a table with column percentages of \code{tab} \item \code{expected} a table with expected values of \code{tab} } } \description{ This function calculates a table's cell, row and column percentages as well as expected values and returns all results as lists of tables. } \examples{ tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) # show expected values table_values(tab)$expected # show cell percentages table_values(tab)$cell } sjstats/man/r2.Rd0000644000176200001440000000131214617352174013341 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/Deprecated.R \name{r2} \alias{r2} \alias{cohens_f} \alias{eta_sq} \alias{epsilon_sq} \alias{omega_sq} \alias{scale_weights} \alias{robust} \alias{icc} \alias{p_value} \alias{se} \alias{means_by_group} \alias{mean_n} \title{Deprecated functions} \usage{ r2(x) cohens_f(x, ...) eta_sq(x, ...) epsilon_sq(x, ...) omega_sq(x, ...) scale_weights(x, ...) robust(x, ...) icc(x) p_value(x, ...) se(x, ...) means_by_group(x, ...) mean_n(x, ...) } \arguments{ \item{x}{An object.} \item{...}{Currently not used.} } \value{ Nothing. } \description{ A list of deprecated functions. } sjstats/man/cv.Rd0000644000176200001440000000145414616424264013434 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv.R \name{cv} \alias{cv} \title{Compute model quality} \usage{ cv(x, ...) } \arguments{ \item{x}{Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) or \code{lme} (\pkg{nlme}).} \item{...}{More fitted model objects, to compute multiple coefficients of variation at once.} } \value{ Numeric, the coefficient of variation. } \description{ Compute the coefficient of variation. } \details{ The advantage of the cv is that it is unitless. This allows coefficient of variation to be compared to each other in ways that other measures, like standard deviations or root mean squared residuals, cannot be. } \examples{ data(efc) fit <- lm(barthtot ~ c160age + c12hour, data = efc) cv(fit) } sjstats/man/design_effect.Rd0000644000176200001440000000347514616424264015616 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/design_effect.R \name{design_effect} \alias{design_effect} \title{Design effects for two-level mixed models} \usage{ design_effect(n, icc = 0.05) } \arguments{ \item{n}{Average number of observations per grouping cluster (i.e. level-2 unit).} \item{icc}{Assumed intraclass correlation coefficient for multilevel-model.} } \value{ The design effect (Variance Inflation Factor) for the two-level model. } \description{ Compute the design effect (also called \emph{Variance Inflation Factor}) for mixed models with two-level design. } \details{ The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. } \examples{ # Design effect for two-level model with 30 observations per # cluster group (level-2 unit) and an assumed intraclass # correlation coefficient of 0.05. design_effect(n = 30) # Design effect for two-level model with 24 observation per cluster # group and an assumed intraclass correlation coefficient of 0.2. design_effect(n = 24, icc = 0.2) } \references{ Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20. \cr \cr Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} \cr \cr Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} \cr \cr Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347} } sjstats/man/prop.Rd0000644000176200001440000000633515022763073014003 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/prop.R \name{prop} \alias{prop} \alias{props} \title{Proportions of values in a vector} \usage{ prop(data, ..., weights = NULL, na.rm = TRUE, digits = 4) props(data, ..., na.rm = TRUE, digits = 4) } \arguments{ \item{data}{A data frame. May also be a grouped data frame (see 'Examples').} \item{...}{One or more value pairs of comparisons (logical predicates). Put variable names the left-hand-side and values to match on the right hand side. Expressions may be quoted or unquoted. See 'Examples'.} \item{weights}{Vector of weights that will be applied to weight all observations. Must be a vector of same length as the input vector. Default is \code{NULL}, so no weights are used.} \item{na.rm}{Logical, whether to remove NA values from the vector when the proportion is calculated. \code{na.rm = FALSE} gives you the raw percentage of a value in a vector, \code{na.rm = TRUE} the valid percentage.} \item{digits}{Amount of digits for returned values.} } \value{ For one condition, a numeric value with the proportion of the values inside a vector. For more than one condition, a data frame with one column of conditions and one column with proportions. For grouped data frames, returns a data frame with one column per group with grouping categories, followed by one column with proportions per condition. } \description{ \code{prop()} calculates the proportion of a value or category in a variable. \code{props()} does the same, but allows for multiple logical conditions in one statement. It is similar to \code{mean()} with logical predicates, however, both \code{prop()} and \code{props()} work with grouped data frames. } \details{ \code{prop()} only allows one logical statement per comparison, while \code{props()} allows multiple logical statements per comparison. However, \code{prop()} supports weighting of variables before calculating proportions, and comparisons may also be quoted. Hence, \code{prop()} also processes comparisons, which are passed as character vector (see 'Examples'). } \examples{ \dontshow{if (getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) # proportion of value 1 in e42dep prop(efc, e42dep == 1) # expression may also be completely quoted prop(efc, "e42dep == 1") # use "props()" for multiple logical statements props(efc, e17age > 70 & e17age < 80) # proportion of value 1 in e42dep, and all values greater # than 2 in e42dep, including missing values. will return a data frame prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) # for factors or character vectors, use quoted or unquoted values library(datawizard) # convert numeric to factor, using labels as factor levels efc$e16sex <- to_factor(efc$e16sex) efc$n4pstu <- to_factor(efc$n4pstu) # get proportion of female older persons prop(efc, e16sex == female) # get proportion of male older persons prop(efc, e16sex == "male") # "props()" needs quotes around non-numeric factor levels props(efc, e17age > 70 & e17age < 80, n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' ) \dontshow{\}) # examplesIf} } sjstats/man/t_test.Rd0000644000176200001440000001600414623373000014310 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/t_test.R \name{t_test} \alias{t_test} \title{Student's t test} \usage{ t_test( data, select = NULL, by = NULL, weights = NULL, paired = FALSE, mu = 0, alternative = "two.sided" ) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continous variable (and \code{by} indicates a grouping factor). \item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} select is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}) \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} \item{paired}{Logical, whether to compute a paired t-test for dependent samples.} \item{mu}{The hypothesized difference in means (for \code{t_test()}) or location shift (for \code{wilcoxon_test()} and \code{mann_whitney_test()}). The default is 0.} \item{alternative}{A character string specifying the alternative hypothesis, must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. See \code{?t.test} and \code{?wilcox.test}.} } \value{ A data frame with test results. Effectsize Cohen's \emph{d} is returned for larger samples (n > 20), while Hedges' \emph{g} is returned for smaller samples. } \description{ This function performs a Student's t test for two independent samples, for paired samples, or for one sample. It's a parametric test for the null hypothesis that the means of two independent samples are equal, or that the mean of one sample is equal to a specified value. The hypothesis can be one- or two-sided. Unlike the underlying base R function \code{t.test()}, this function allows for weighted tests and automatically calculates effect sizes. Cohen's \emph{d} is returned for larger samples (n > 20), while Hedges' \emph{g} is returned for smaller samples. } \details{ Interpretation of effect sizes are based on rules described in \code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_cohens_d()}} and \code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_hedges_g()}}. Use these function directly to get other interpretations, by providing the returned effect size (\emph{Cohen's d} or \emph{Hedges's g} in this case) as argument, e.g. \code{interpret_cohens_d(0.35, rules = "sawilowsky2009")}. } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using a ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ \dontshow{if (requireNamespace("effectsize")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(sleep) # one-sample t-test t_test(sleep, "extra") # base R equivalent t.test(extra ~ 1, data = sleep) # two-sample t-test, by group t_test(mtcars, "mpg", by = "am") # base R equivalent t.test(mpg ~ am, data = mtcars) # paired t-test t_test(mtcars, c("mpg", "hp"), paired = TRUE) # base R equivalent t.test(mtcars$mpg, mtcars$hp, data = mtcars, paired = TRUE) \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/DESCRIPTION0000644000176200001440000000307715022765352013471 0ustar liggesusersPackage: sjstats Type: Package Encoding: UTF-8 Title: Collection of Convenient Functions for Common Statistical Computations Version: 0.19.1 Authors@R: person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")) Maintainer: Daniel Lüdecke Description: Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages. This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effect size statistics like Eta or Omega squared), or for which currently no functions available. Second, another focus lies on weighted variants of common statistical measures and tests like weighted standard error, mean, t-test, correlation, and more. License: GPL-3 Depends: R (>= 4.0), utils Imports: datawizard, effectsize (>= 0.8.8), insight, parameters, performance, stats Suggests: brms, car, coin, ggplot2, lme4, MASS, pscl, pwr, survey, testthat URL: https://strengejacke.github.io/sjstats/ BugReports: https://github.com/strengejacke/sjstats/issues RoxygenNote: 7.3.2 Config/testthat/edition: 3 Config/testthat/parallel: true NeedsCompilation: no Packaged: 2025-06-13 08:50:49 UTC; mail Author: Daniel Lüdecke [aut, cre] (ORCID: ) Repository: CRAN Date/Publication: 2025-06-13 09:10:02 UTC