sjstats/0000755000176200001440000000000013617050712011747 5ustar liggesuserssjstats/NAMESPACE0000644000176200001440000001531713616771032013201 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(AIC,svyglm.nb) S3method(as.data.frame,sj_resample) S3method(as.integer,sj_resample) S3method(cramer,formula) S3method(cramer,ftable) S3method(cramer,table) S3method(deviance,svyglm.nb) S3method(family,svyglm.nb) S3method(formula,svyglm.nb) S3method(formula,svyglm.zip) S3method(mediation,brmsfit) S3method(model.frame,svyglm.nb) S3method(model.frame,svyglm.zip) S3method(model.matrix,gls) S3method(phi,formula) S3method(phi,ftable) S3method(phi,table) S3method(plot,sj_inequ_trend) S3method(predict,svyglm.nb) S3method(print,sj_anova_stat) S3method(print,sj_check_assump) S3method(print,sj_chi2gof) S3method(print,sj_grpmean) S3method(print,sj_grpmeans) S3method(print,sj_mediation) S3method(print,sj_mwu) S3method(print,sj_outliers) S3method(print,sj_pval) S3method(print,sj_resample) S3method(print,sj_ttest) S3method(print,sj_wcor) S3method(print,sj_wmwu) S3method(print,sj_xtab_stat) S3method(print,svyglm.nb) S3method(print,svyglm.zip) S3method(print,tidy_stan) S3method(residuals,svyglm.nb) S3method(summary,sj_pval) S3method(terms,svyglm.nb) S3method(weighted_chisqtest,default) S3method(weighted_chisqtest,formula) S3method(weighted_correlation,default) S3method(weighted_correlation,formula) S3method(weighted_mannwhitney,default) S3method(weighted_mannwhitney,formula) S3method(weighted_mean,data.frame) S3method(weighted_mean,default) S3method(weighted_median,data.frame) S3method(weighted_median,default) S3method(weighted_sd,data.frame) S3method(weighted_sd,default) S3method(weighted_sd,matrix) S3method(weighted_se,data.frame) S3method(weighted_se,default) S3method(weighted_se,matrix) S3method(weighted_ttest,default) S3method(weighted_ttest,formula) export("%>%") export(anova_stats) export(auto_prior) export(boot_ci) export(boot_est) export(boot_p) export(boot_se) export(bootstrap) export(chisq_gof) export(ci) export(cohens_f) export(cramer) export(crosstable_statistics) export(cv) export(cv_compare) export(cv_error) export(design_effect) export(epsilon_sq) export(equivalence_test) export(eta_sq) export(find_beta) export(find_beta2) export(find_cauchy) export(find_normal) export(gmd) export(grpmean) export(icc) export(inequ_trend) export(is_prime) export(link_inverse) export(mannwhitney) export(mean_n) export(means_by_group) export(mediation) export(mse) export(mwu) export(odds_to_rr) export(omega_sq) export(or_to_rr) export(p_value) export(phi) export(prop) export(props) export(r2) export(rmse) export(robust) export(samplesize_mixed) export(scale_weights) export(sd_pop) export(se) export(se_ybar) export(smpsize_lmm) export(std_beta) export(survey_median) export(svyglm.nb) export(svyglm.zip) export(table_values) export(tidy_stan) export(typical_value) export(var_pop) export(weight) export(weight2) export(weighted_chisqtest) export(weighted_correlation) export(weighted_mannwhitney) export(weighted_mean) export(weighted_median) export(weighted_sd) export(weighted_se) export(weighted_ttest) export(wtd_sd) export(xtab_statistics) importFrom(MASS,glm.nb) importFrom(MASS,loglm) importFrom(bayestestR,ci) importFrom(bayestestR,effective_sample) importFrom(bayestestR,equivalence_test) importFrom(bayestestR,hdi) importFrom(bayestestR,mcse) importFrom(bayestestR,pd) importFrom(broom,augment) importFrom(broom,tidy) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) 
importFrom(dplyr,enquo) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_vars) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,n_distinct) importFrom(dplyr,pull) importFrom(dplyr,quo_name) importFrom(dplyr,quos) importFrom(dplyr,right_join) importFrom(dplyr,select) importFrom(dplyr,select_if) importFrom(dplyr,slice) importFrom(dplyr,summarise) importFrom(effectsize,cohens_f) importFrom(effectsize,standardize_parameters) importFrom(emmeans,contrast) importFrom(emmeans,emmeans) importFrom(insight,find_formula) importFrom(insight,find_predictors) importFrom(insight,find_response) importFrom(insight,format_table) importFrom(insight,get_data) importFrom(insight,get_parameters) importFrom(insight,get_response) importFrom(insight,is_multivariate) importFrom(insight,link_inverse) importFrom(insight,model_info) importFrom(insight,print_color) importFrom(insight,print_parameters) importFrom(lme4,ngrps) importFrom(magrittr,"%>%") importFrom(modelr,crossv_kfold) importFrom(parameters,p_value) importFrom(parameters,standard_error) importFrom(parameters,standard_error_robust) importFrom(performance,icc) importFrom(performance,mse) importFrom(performance,r2) importFrom(performance,rmse) importFrom(purrr,flatten_df) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_chr) importFrom(purrr,map_dbl) importFrom(purrr,map_df) importFrom(purrr,map_lgl) importFrom(purrr,walk) importFrom(rlang,.data) importFrom(rlang,enquo) importFrom(rlang,quo_name) importFrom(sjlabelled,as_numeric) importFrom(sjlabelled,drop_labels) importFrom(sjlabelled,get_label) importFrom(sjlabelled,get_labels) importFrom(sjmisc,add_columns) importFrom(sjmisc,add_variables) importFrom(sjmisc,is_empty) importFrom(sjmisc,is_float) importFrom(sjmisc,is_num_fac) importFrom(sjmisc,recode_to) importFrom(sjmisc,rotate_df) importFrom(sjmisc,round_num) importFrom(sjmisc,seq_col) importFrom(sjmisc,str_contains) importFrom(sjmisc,to_value) importFrom(sjmisc,trim) importFrom(sjmisc,typical_value) importFrom(stats,anova) importFrom(stats,aov) importFrom(stats,approx) importFrom(stats,as.formula) importFrom(stats,chisq.test) importFrom(stats,coef) importFrom(stats,complete.cases) importFrom(stats,confint) importFrom(stats,cor.test) importFrom(stats,dpois) importFrom(stats,family) importFrom(stats,fisher.test) importFrom(stats,fitted) importFrom(stats,formula) importFrom(stats,ftable) importFrom(stats,kruskal.test) importFrom(stats,lm) importFrom(stats,mad) importFrom(stats,model.frame) importFrom(stats,na.omit) importFrom(stats,na.pass) importFrom(stats,nobs) importFrom(stats,pbeta) importFrom(stats,pf) importFrom(stats,pnorm) importFrom(stats,predict.glm) importFrom(stats,pt) importFrom(stats,qcauchy) importFrom(stats,qf) importFrom(stats,qnorm) importFrom(stats,qt) importFrom(stats,quantile) importFrom(stats,resid) importFrom(stats,residuals) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,terms) importFrom(stats,update) importFrom(stats,var) importFrom(stats,vcov) importFrom(stats,weighted.mean) importFrom(stats,weights) importFrom(stats,wilcox.test) importFrom(stats,xtabs) importFrom(tidyr,gather) importFrom(tidyr,nest) importFrom(tidyr,unnest) sjstats/README.md0000644000176200001440000000447713563265750013254 0ustar liggesusers# sjstats - Collection of Convenient Functions for Common Statistical Computations [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472) Collection of convenient functions for common statistical 
computations, which are not directly provided by R's base or stats packages.

This package aims at providing, **first**, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effect size statistics like Eta or Omega squared), or for which no functions are currently available. **Second**, another focus lies on weighted variants of common statistical measures and tests like weighted standard error, mean, t-test, correlation, and more.

The comprised tools include:

* Especially for mixed models: design effect, sample size calculation
* Especially for Bayesian models: mediation analysis
* For anova-tables: Eta-squared, Partial Eta-squared, Omega-squared, Partial Omega-squared and Epsilon-squared statistics
* Weighted statistics and tests for: mean, median, standard error, standard deviation, correlation, Chi-squared test, t-test, Mann-Whitney-U-test

## Documentation

Please visit [https://strengejacke.github.io/sjstats/](https://strengejacke.github.io/sjstats/) for documentation and vignettes.

## Installation

### Latest development build

To install the latest development snapshot, type the following commands into the R console:

```r
library(devtools)
devtools::install_github("strengejacke/sjstats")
```

### Official, stable release

[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sjstats)](https://cran.r-project.org/package=sjstats) [![downloads](http://cranlogs.r-pkg.org/badges/sjstats)](http://cranlogs.r-pkg.org/) [![total](http://cranlogs.r-pkg.org/badges/grand-total/sjstats)](http://cranlogs.r-pkg.org/)

To install the latest stable release from CRAN, type the following command into the R console:

```r
install.packages("sjstats")
```

## Citation

If you want or need to cite the package, please use `citation('sjstats')` for citation information.

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472)
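## Usage

A minimal sketch of two of the bundled helpers; the calls mirror the documented examples in the package's `man/` pages (included further below):

```r
library(sjstats)

# design effect for a two-level model with 30 observations per
# cluster group and an assumed intraclass correlation of 0.05
design_effect(n = 30, icc = 0.05)

# row means that require at least two non-missing values per row
dat <- data.frame(c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5))
mean_n(dat, 2)
```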
sjstats/data/0000755000176200001440000000000013563265750012672 5ustar liggesusers
sjstats/data/fish.RData0000644000176200001440000000640613563265750014546 0ustar liggesusers [binary RData payload omitted]
sjstats/data/nhanes_sample.RData [tar header garbled in binary residue; binary RData payload omitted]
sjstats/man/[Rd file — apparently means_by_group.Rd — whose tar header and opening sections were lost in binary residue; the surviving tail of its examples follows] ... \% group_by(c172code) \%>\% means_by_group(c12hour, e42dep) } # weighting efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) means_by_group(efc, c12hour, e42dep, weights = weight) } sjstats/man/samplesize_mixed.Rd0000644000176200001440000000646513616514473016366 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/samplesize_mixed.R \name{samplesize_mixed} \alias{samplesize_mixed} \alias{smpsize_lmm} \title{Sample size for linear mixed models} \usage{ samplesize_mixed( eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05 ) smpsize_lmm( eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05 ) } \arguments{ \item{eff.size}{Effect size.} \item{df.n}{Optional argument for the degrees of freedom for numerator. See 'Details'.} \item{power}{Power of test (1 minus Type II error probability).} \item{sig.level}{Significance level (Type I error probability).} \item{k}{Number of cluster groups (level-2-unit) in multilevel-design.} \item{n}{Optional, number of observations per cluster groups (level-2-unit) in multilevel-design.} \item{icc}{Expected intraclass correlation coefficient for multilevel-model.} } \value{ A list with two values: The number of subjects per cluster, and the total sample size for the linear mixed model. } \description{ Compute an approximated sample size for linear mixed models (two-level-designs), based on power-calculation for standard design and adjusted for design effect for 2-level-designs. } \details{ The sample size calculation is based on a power-calculation for the standard design. If \code{df.n} is not specified, a power-calculation for an unpaired two-sample t-test will be computed (using \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package). If \code{df.n} is given, a power-calculation for general linear models will be computed (using \code{\link[pwr]{pwr.f2.test}} of the \pkg{pwr}-package). The sample size of the standard design is then adjusted for the design effect of two-level-designs (see \code{\link{design_effect}}). Thus, the sample size calculation is appropriate in particular for two-level-designs (see \cite{Snijders 2005}). Models that additionally include repeated measures (three-level-designs) may work as well, however, the computed sample size may be less accurate. } \examples{ # Sample size for multilevel model with 30 cluster groups and a small to # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and # hence a total sample size of about 802 observations is needed. samplesize_mixed(eff.size = .3, k = 30) # Sample size for multilevel model with 20 cluster groups and a medium # to large effect size for linear models of 0.2. Five subjects per cluster and # hence a total sample size of about 107 observations is needed. samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) } \references{ Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.).
Hillsdale, NJ: Lawrence Erlbaum. \cr \cr Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} \cr \cr Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Eds.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} } sjstats/man/fish.Rd0000644000176200001440000000043313616771032013746 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{fish} \alias{fish} \title{Sample dataset} \description{ Sample data from the UCLA idre website. } \references{ https://stats.idre.ucla.edu/r/dae/zip/ } \keyword{data} sjstats/man/sjstats-package.Rd0000644000176200001440000000636713563265750016113 0ustar liggesusers\encoding{UTF-8} \name{sjstats-package} \alias{sjstats-package} \alias{sjstats} \docType{package} \title{Collection of Convenient Functions for Common Statistical Computations} \description{Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages. This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like standard errors or root mean squared errors). Second, these shortcut functions are generic (if appropriate), and can be applied not only to vectors, but also to other objects (e.g., the Coefficient of Variation can be computed for vectors, linear models, or linear mixed models; the \code{r2()}-function returns the r-squared value for \code{lm}, \code{glm}, \code{merMod}, \code{glmmTMB}, or \code{lme} and other objects). Most functions of this package are designed as \emph{summary functions}, i.e. they do not transform the input vector; rather, they return a summary, which is sometimes a vector and sometimes a \href{https://cran.r-project.org/package=broom/vignettes/broom.html}{tidy data frame}. The focus of most functions lies on summary statistics or fit measures for regression models, including generalized linear models, mixed effects models or Bayesian models. However, some of the functions deal with other statistical measures, like Cronbach's Alpha, Cramer's V, Phi etc.
The comprised tools include: \itemize{ \item For regression and mixed models: Coefficient of Variation, Root Mean Squared Error, Residual Standard Error, Coefficient of Discrimination, R-squared and pseudo-R-squared values, standardized beta values \item Especially for mixed models: Design effect, ICC, sample size calculation and convergence tests \item Especially for Bayesian models: Highest Density Interval, region of practical equivalence (rope), Monte Carlo Standard Errors, ratio of number of effective samples, mediation analysis, Test for Practical Equivalence \item Fit and accuracy measures for regression models: Overdispersion tests, accuracy of predictions, test/training-error comparisons, error rate and binned residual plots for logistic regression models \item For anova-tables: Eta-squared, Partial Eta-squared, Omega-squared and Partial Omega-squared statistics } Furthermore, \strong{sjstats} has functions to access information from model objects, which either support more model objects than their \strong{stats} counterparts, or provide easy access to model attributes, like: \itemize{ \item \code{model_frame()} to get the model frame \item \code{model_family()} to get information about the model family, link functions etc. \item \code{link_inverse()} to get the link-inverse function \item \code{pred_vars()} and \code{resp_var()} to get the names of either the dependent or independent variables, or \item \code{var_names()} to get the "cleaned" variable names from a model object (\dQuote{cleaned} means that things like \code{s()} or \code{log()} are removed from the returned character vector of variable names) } Other statistics: \itemize{ \item Cramer's V, Cronbach's Alpha, Mean Inter-Item-Correlation, Mann-Whitney-U-Test, Item-scale reliability tests } } sjstats/man/svyglm.zip.Rd0000644000176200001440000000331013616514473015140 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svyglmzip.R \name{svyglm.zip} \alias{svyglm.zip} \title{Survey-weighted zero-inflated Poisson model} \usage{ svyglm.zip(formula, design, ...) } \arguments{ \item{formula}{An object of class \code{formula}, i.e. a symbolic description of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{...}{Other arguments passed down to \code{\link[pscl]{zeroinfl}}.} } \value{ An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip}, with some additional information about the model. } \description{ \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package to fit survey-weighted zero-inflated Poisson models. It uses \code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood estimates, based on starting values provided by \code{\link[pscl]{zeroinfl}}. } \details{ Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/.
} \examples{ if (require("survey")) { data(nhanes_sample) set.seed(123) nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2) nhanes_sample$malepartners[sample(1:2992, 400)] <- 0 # create survey design des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) # fit zero-inflated poisson regression fit <- svyglm.zip( malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1), des ) # print coefficients and standard errors fit } } sjstats/man/bootstrap.Rd0000644000176200001440000000624113616514531015034 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bootstrap.R \name{bootstrap} \alias{bootstrap} \title{Generate nonparametric bootstrap replications} \usage{ bootstrap(data, n, size) } \arguments{ \item{data}{A data frame.} \item{n}{Number of bootstraps to be generated.} \item{size}{Optional, size of the bootstrap samples. May either be a number between 1 and \code{nrow(data)} or a value between 0 and 1 to sample a proportion of observations from \code{data} (see 'Examples').} } \value{ A data frame with one column: a list-variable \code{strap}, which contains resample-objects of class \code{sj_resample}. These resample-objects are lists with three elements: \enumerate{ \item the original data frame, \code{data} \item the rownumbers \code{id}, i.e. rownumbers of \code{data}, indicating the resampled rows with replacement \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap}) } } \description{ Generates \code{n} bootstrap samples of \code{data} and returns the bootstrapped data frames as list-variable. } \details{ By default, each bootstrap sample has the same number of observations as \code{data}. To generate bootstrap samples without resampling same observations (i.e. sampling without replacement), use \code{size} to get bootstrapped data with a specific number of observations. However, specifying the \code{size}-argument is much less memory-efficient than the bootstrap with replacement. Hence, it is recommended to ignore the \code{size}-argument, if it is not really needed. } \note{ This function applies nonparametric bootstrapping, i.e. the function draws samples with replacement. \cr \cr There is an \code{as.data.frame}- and a \code{print}-method to get or print the resampled data frames. See 'Examples'. The \code{as.data.frame}-method automatically applies whenever coercion is done because a data frame is required as input. See 'Examples' in \code{\link{boot_ci}}.
} \examples{ data(efc) bs <- bootstrap(efc, 5) # now run models for each bootstrapped sample lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x)) # generate bootstrap samples with 600 observations for each sample bs <- bootstrap(efc, 5, 600) # generate bootstrap samples with 70\% observations of the original sample size bs <- bootstrap(efc, 5, .7) # compute standard error for a simple vector from bootstraps # use the `as.data.frame()`-method to get the resampled # data frame bs <- bootstrap(efc, 100) bs$c12hour <- unlist(lapply(bs$strap, function(x) { mean(as.data.frame(x)$c12hour, na.rm = TRUE) })) # or as tidyverse-approach if (require("dplyr") && require("purrr")) { bs <- efc \%>\% bootstrap(100) \%>\% mutate( c12hour = map_dbl(strap, ~mean(as.data.frame(.x)$c12hour, na.rm = TRUE)) ) # bootstrapped standard error boot_se(bs, c12hour) } } \seealso{ \code{\link{boot_ci}} to calculate confidence intervals from bootstrap samples. } sjstats/man/mediation.Rd0000644000176200001440000000513713565517445014775 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mediation.R \name{mediation} \alias{mediation} \alias{mediation.brmsfit} \title{Summary of Bayesian multivariate-response mediation-models} \usage{ mediation(x, ...) \method{mediation}{brmsfit}(x, treatment, mediator, prob = 0.9, typical = "median", ...) } \arguments{ \item{x}{A \code{stanreg}, \code{stanfit}, or \code{brmsfit} object.} \item{...}{Not used.} \item{treatment}{Character, name of the treatment variable (or direct effect) in a (multivariate response) mediator-model. If missing, \code{mediation()} tries to find the treatment variable automatically, however, this may fail.} \item{mediator}{Character, name of the mediator variable in a (multivariate response) mediator-model. If missing, \code{mediation()} tries to find the mediator variable automatically, however, this may fail.} \item{prob}{Vector of scalars between 0 and 1, indicating the mass within the credible interval that is to be estimated.} \item{typical}{The typical value that will represent the Bayesian point estimate. By default, the posterior median is returned. See \code{\link[sjmisc]{typical_value}} for possible values for this argument.} } \value{ A data frame with direct, indirect, mediator and total effect of a multivariate-response mediation-model, as well as the proportion mediated. The effect sizes are mean values of the posterior samples. } \description{ \code{mediation()} is a short summary for multivariate-response mediation-models. } \details{ \code{mediation()} returns a data frame with information on the \emph{direct effect} (mean value of posterior samples from \code{treatment} of the outcome model), \emph{mediator effect} (mean value of posterior samples from \code{mediator} of the outcome model), \emph{indirect effect} (mean value of the multiplication of the posterior samples from \code{mediator} of the outcome model and the posterior samples from \code{treatment} of the mediation model) and the total effect (mean value of sums of posterior samples used for the direct and indirect effect). The \emph{proportion mediated} is the indirect effect divided by the total effect. \cr \cr For all values, the 90\% HDIs are calculated by default. Use \code{prob} to calculate a different interval. \cr \cr The arguments \code{treatment} and \code{mediator} do not necessarily need to be specified. If missing, \code{mediation()} tries to find the treatment and mediator variable automatically.
If this does not work, specify these variables. } sjstats/man/is_prime.Rd0000644000176200001440000000076413565517445014634 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_prime.R \name{is_prime} \alias{is_prime} \title{Find prime numbers} \usage{ is_prime(x) } \arguments{ \item{x}{An integer, or a vector of integers.} } \value{ \code{TRUE} for each prime number in \code{x}, \code{FALSE} otherwise. } \description{ This function checks whether a number (or the numbers in a vector) is a prime number. } \examples{ is_prime(89) is_prime(15) is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) } sjstats/man/design_effect.Rd0000644000176200001440000000362213565517445015606 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/design_effect.R \name{design_effect} \alias{design_effect} \title{Design effects for two-level mixed models} \usage{ design_effect(n, icc = 0.05) } \arguments{ \item{n}{Average number of observations per grouping cluster (i.e. level-2 unit).} \item{icc}{Assumed intraclass correlation coefficient for multilevel-model.} } \value{ The design effect (Variance Inflation Factor) for the two-level model. } \description{ Compute the design effect (also called \emph{Variance Inflation Factor}) for mixed models with two-level design. } \details{ The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. } \examples{ # Design effect for two-level model with 30 observations per # cluster group (level-2 unit) and an assumed intraclass # correlation coefficient of 0.05. design_effect(n = 30) # Design effect for two-level model with 24 observation per cluster # group and an assumed intraclass correlation coefficient of 0.2. design_effect(n = 24, icc = 0.2) } \references{ Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20. \cr \cr Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} \cr \cr Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Eds.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} \cr \cr Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347} } sjstats/man/mean_n.Rd0000644000176200001440000000516213565517445014267 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mean_n.R \name{mean_n} \alias{mean_n} \title{Row means with min amount of valid values} \usage{ mean_n(dat, n, digits = 2) } \arguments{ \item{dat}{A data frame with at least two columns, where row means are applied.} \item{n}{May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to calculate the row mean; \item or a value between 0 and 1, indicating a proportion of valid values per row to calculate the row mean (see 'Details'). } If a row's sum of valid values is less than \code{n}, \code{NA} will be returned as row mean value.} \item{digits}{Numeric value indicating the number of decimal places to be used for rounding the mean value.
Negative values are allowed (see 'Details').} } \value{ A vector with row mean values of \code{dat} for those rows with at least \code{n} valid values. Else, \code{NA} is returned. } \description{ This function is similar to the SPSS \code{MEAN.n} function and computes row means from a \code{data.frame} or \code{matrix} if at least \code{n} values of a row are valid (and not \code{NA}). } \details{ Rounding to a negative number of \code{digits} means rounding to a power of ten, so for example mean_n(dat, 3, digits = -2) rounds to the nearest hundred. \cr \cr \code{n} must be a numeric value from \code{0} to \code{ncol(dat)}. If a \emph{row} in \code{dat} has at least \code{n} non-missing values, the row mean is returned. If \code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the proportion of necessary non-missing values per row. E.g., if \code{n = .75}, a row must have at least \code{ncol(dat) * n} non-missing values for the row mean to be calculated. See 'Examples'. } \examples{ dat <- data.frame(c1 = c(1,2,NA,4), c2 = c(NA,2,NA,5), c3 = c(NA,4,NA,NA), c4 = c(2,3,7,8)) # needs at least 4 non-missing values per row mean_n(dat, 4) # 1 valid return value # needs at least 3 non-missing values per row mean_n(dat, 3) # 2 valid return values # needs at least 2 non-missing values per row mean_n(dat, 2) # needs at least 1 non-missing value per row mean_n(dat, 1) # all means are shown # needs at least 50\% of non-missing values per row mean_n(dat, .5) # 3 valid return values # needs at least 75\% of non-missing values per row mean_n(dat, .75) # 2 valid return values } \references{ \href{http://r4stats.com/2014/09/03/adding-the-spss-mean-n-function-to-r/}{r4stats.com} } sjstats/man/nhanes_sample.Rd0000644000176200001440000000105213565517445015641 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{nhanes_sample} \alias{nhanes_sample} \title{Sample dataset from the National Health and Nutrition Examination Survey} \description{ Selected variables from the National Health and Nutrition Examination Survey that are used in the example from Lumley (2010), Appendix E. See \code{\link{svyglm.nb}} for examples. } \references{ Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley } \keyword{data} sjstats/man/se_ybar.Rd0000644000176200001440000000163713616514473014454 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/se_ybar.R \name{se_ybar} \alias{se_ybar} \title{Standard error of sample mean for mixed models} \usage{ se_ybar(fit) } \arguments{ \item{fit}{Fitted mixed effects model (\code{\link[lme4]{merMod}}-class).} } \value{ The standard error of the sample mean of \code{fit}. } \description{ Compute the standard error for the sample mean for mixed models, regarding the extent to which clustering affects the standard errors. May be used as part of the multilevel power calculation for cluster sampling (see \cite{Gelman and Hill 2007, 447ff}). } \examples{ if (require("lme4")) { fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy) se_ybar(fit) } } \references{ Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models.
Cambridge, New York: Cambridge University Press. } sjstats/man/auto_prior.Rd0000644000176200001440000000567513565517445015212 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/auto_prior.R \name{auto_prior} \alias{auto_prior} \title{Create default priors for brms-models} \usage{ auto_prior(formula, data, gaussian, locations = NULL) } \arguments{ \item{formula}{A formula describing the model, which just needs to contain the model terms, but no notation of interaction, splines etc. Usually, you want only those predictors in the formula, for which automatic priors should be generated. Add informative priors afterwards to the returned \code{brmsprior}-object.} \item{data}{The data that will be used to fit the model.} \item{gaussian}{Logical, if the outcome is gaussian or not.} \item{locations}{A numeric vector with location values for the priors. If \code{locations = NULL}, \code{0} is used as location parameter.} } \value{ A \code{brmsprior}-object. } \description{ This function creates default priors for brms-regression models, based on the same automatic prior-scale adjustment as in \pkg{rstanarm}. } \details{ \code{auto_prior()} is a small, convenient function to create some default priors for brms-models with automatically adjusted prior scales, in a similar way as \pkg{rstanarm} does. The default scale for the intercept is 10, for coefficients 2.5. If the outcome is gaussian, both scales are multiplied with \code{sd(y)}. Then, for categorical variables, nothing more is changed. For numeric variables, the scales are divided by the standard deviation of the related variable. \cr \cr All prior distributions are \emph{normal} distributions. \code{auto_prior()} is intended to quickly create default priors with feasible scales. If a more precise definition of priors is necessary, this needs to be done directly with brms-functions like \code{set_prior()}. } \note{ As \code{auto_prior()} also sets priors on the intercept, the model formula used in \code{brms::brm()} must be rewritten to something like \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}.
} \examples{ library(sjmisc) data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- to_label(efc$c161sex) mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) if (requireNamespace("brms", quietly = TRUE)) auto_prior(mf, efc, TRUE) ## compare to # library(rstanarm) # m <- stan_glm(mf, data = efc, chains = 2, iter = 200) # ps <- prior_summary(m) # ps$prior_intercept$adjusted_scale # ps$prior$adjusted_scale ## usage # ap <- auto_prior(mf, efc, TRUE) # brm(mf, data = efc, priors = ap) # add informative priors mf <- formula(neg_c_7 ~ c161sex + c172code) if (requireNamespace("brms", quietly = TRUE)) { auto_prior(mf, efc, TRUE) + brms::prior(normal(.1554, 40), class = "b", coef = "c160age") } # example with binary response efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) if (requireNamespace("brms", quietly = TRUE)) auto_prior(mf, efc, FALSE) } sjstats/man/chisq_gof.Rd0000644000176200001440000000417113615753133014763 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gof.R \name{chisq_gof} \alias{chisq_gof} \title{Compute model quality} \usage{ chisq_gof(x, prob = NULL, weights = NULL) } \arguments{ \item{x}{A numeric vector or a \code{glm}-object.} \item{prob}{Vector of probabilities (indicating the population probabilities) of the same length as \code{x}'s amount of categories / factor levels. Use \code{nrow(table(x))} to determine the amount of necessary values for \code{prob}. Only used, when \code{x} is a vector, and not a \code{glm}-object.} \item{weights}{Vector with weights, used to weight \code{x}.} } \value{ For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}. For \code{glm}-objects, an object of class \code{chisq_gof} with following values: \code{p.value}, the p-value for the goodness-of-fit test; \code{z.score}, the standardized z-score for the goodness-of-fit test; \code{rss}, the residual sums of squares term and \code{chisq}, the pearson chi-squared statistic. } \description{ For logistic regression models, performs a Chi-squared goodness-of-fit-test. } \details{ For vectors, this function is a convenient wrapper around \code{chisq.test()}, performing a goodness-of-fit test. For \code{glm}-objects, this function performs a goodness-of-fit test. A well-fitting model shows \emph{no} significant difference between the model and the observed data, i.e. the reported p-values should be greater than 0.05. } \examples{ data(efc) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) m <- glm( neg_c_7d ~ c161sex + barthtot + c172code, data = efc, family = binomial(link = "logit") ) # goodness-of-fit test for logistic regression chisq_gof(m) # goodness-of-fit test for vectors against probabilities # differing from population chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28)) # equal to population chisq_gof(efc$e42dep, prop.table(table(efc$e42dep))) } \references{ Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc.
\doi{10.1002/0471722146} } sjstats/man/tidy_stan.Rd0000644000176200001440000001067113565517445015025 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tidy_stan.R \name{tidy_stan} \alias{tidy_stan} \title{Tidy summary output for stan models} \usage{ tidy_stan( x, prob = 0.89, typical = "median", trans = NULL, effects = c("all", "fixed", "random"), component = c("all", "conditional", "zero_inflated", "zi"), digits = 2 ) } \arguments{ \item{x}{A \code{stanreg}, \code{stanfit} or \code{brmsfit} object.} \item{prob}{Vector of scalars between 0 and 1, indicating the mass within the credible interval that is to be estimated.} \item{typical}{The typical value that will represent the Bayesian point estimate. By default, the posterior median is returned. See \code{\link[sjmisc]{typical_value}} for possible values for this argument.} \item{trans}{Name of a function or character vector naming a function, used to apply transformations on the estimates and uncertainty intervals. The values for standard errors are \emph{not} transformed! If \code{trans} is not \code{NULL}, \emph{credible intervals} instead of \emph{HDI} are computed, due to the possible asymmetry of the HDI.} \item{effects}{Should results for fixed effects, random effects or both be returned? Only applies to mixed models. May be abbreviated.} \item{component}{Should results for all parameters, parameters for the conditional model or the zero-inflated part of the model be returned? May be abbreviated. Only applies to \pkg{brms}-models.} \item{digits}{Amount of digits to round numerical values in the output.} } \value{ A data frame, summarizing \code{x}, with consistent column names. To distinguish multiple HDI values, column names for the HDI get a suffix when \code{prob} has more than one element. } \description{ Returns a tidy summary output for stan models. } \details{ The returned data frame has an additional class-attribute, \code{tidy_stan}, to pass the result to its own \code{print()}-method. The \code{print()}-method creates a cleaner output, especially for multilevel, zero-inflated or multivariate response models, where - for instance - the conditional part of a model is printed separately from the zero-inflated part, or random and fixed effects are printed separately. \cr \cr The returned data frame gives information on: \itemize{ \item{The Bayesian point estimate (column \emph{estimate}, which is by default the posterior median; other statistics are also possible, see argument \code{typical}).} \item{ The standard error (which is actually the \emph{median absolute deviation}). } \item{ The HDI. Computation for HDI is based on the code from Kruschke 2015, pp. 727f. } \item{ The Probability of Direction (pd), which is an index for "effect significance" (see \cite{Makowski et al. 2019}). A value of 95\% or higher indicates a "significant" (i.e. statistically clear) effect. } \item{ The effective number of samples, \emph{ESS}. } \item{ The Rhat statistics. When Rhat is above 1, it usually indicates that the chain has not yet converged, indicating that the drawn samples might not be trustworthy. Drawing more iteration may solve this issue. } \item{ The Monte Carlo standard error (see \code{\link{mcse}}). It is defined as standard deviation of the chains divided by their effective sample size and \dQuote{provides a quantitative suggestion of how big the estimation noise is} (\emph{Kruschke 2015, p.187}).
} } } \examples{ \dontrun{ if (require("rstanarm")) { fit <- stan_glm(mpg ~ wt + am, data = mtcars, chains = 1) tidy_stan(fit) tidy_stan(fit, prob = c(.89, .5)) }} } \references{ Kruschke JK. \emph{Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan} 2nd edition. Academic Press, 2015 \cr \cr Gelman A, Carlin JB, Stern HS, Dunson DB, Vehtari A, Rubin DB. \emph{Bayesian data analysis} 3rd ed. Boca Raton: Chapman and Hall/CRC, 2013 \cr \cr Gelman A, Rubin DB. \emph{Inference from iterative simulation using multiple sequences} Statistical Science 1992;7: 457-511 \cr \cr Makowski D, Ben-Shachar MS, Lüdecke D. bayestestR: Describing Effects and their Uncertainty, Existence and Significance within the Bayesian Framework. Journal of Open Source Software 2019;4:1541. \doi{10.21105/joss.01541} \cr \cr McElreath R. \emph{Statistical Rethinking. A Bayesian Course with Examples in R and Stan} Chapman and Hall, 2015 } sjstats/man/table_values.Rd0000644000176200001440000000235213565517445015476 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sjStatistics.R \name{table_values} \alias{table_values} \title{Expected and relative table values} \usage{ table_values(tab, digits = 2) } \arguments{ \item{tab}{Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which cell, row and column percentages as well as expected values are calculated. Tables of class \code{\link[stats]{xtabs}} and others will be coerced to \code{ftable} objects.} \item{digits}{Amount of digits for the table percentage values.} } \value{ (Invisibly) returns a list with four tables: \enumerate{ \item \code{cell} a table with cell percentages of \code{tab} \item \code{row} a table with row percentages of \code{tab} \item \code{col} a table with column percentages of \code{tab} \item \code{expected} a table with expected values of \code{tab} } } \description{ This function calculates a table's cell, row and column percentages as well as expected values and returns all results as lists of tables. } \examples{ tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) # show expected values table_values(tab)$expected # show cell percentages table_values(tab)$cell } sjstats/man/cv_error.Rd0000644000176200001440000000317013565517445014650 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv_error.R \name{cv_error} \alias{cv_error} \alias{cv_compare} \title{Test and training error from model cross-validation} \usage{ cv_error(data, formula, k = 5) cv_compare(data, formulas, k = 5) } \arguments{ \item{data}{A data frame.} \item{formula}{The formula to fit the linear model for the test and training data.} \item{k}{The number of folds for the kfold-crossvalidation.} \item{formulas}{A list of formulas to fit linear models for the test and training data.} } \value{ A data frame with the root mean squared errors for the training and test data. } \description{ \code{cv_error()} computes the root mean squared error from a model fitted to kfold cross-validated test-training-data. \code{cv_compare()} does the same, for multiple formulas at once (by calling \code{cv_error()} for each formula). } \details{ \code{cv_error()} first generates cross-validated test-training pairs, using \code{\link[modelr]{crossv_kfold}} and then fits a linear model, which is described in \code{formula}, to the training data. Then, predictions for the test data are computed, based on the trained models.
The \emph{training error} is the mean value of the \code{\link{rmse}} for all \emph{trained} models; the \emph{test error} is the rmse based on all residuals from the test data. } \examples{ data(efc) cv_error(efc, neg_c_7 ~ barthtot + c161sex) cv_compare(efc, formulas = list( neg_c_7 ~ barthtot + c161sex, neg_c_7 ~ barthtot + c161sex + e42dep, neg_c_7 ~ barthtot + c12hour )) } sjstats/man/scale_weights.Rd0000644000176200001440000000627413616514473015653 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svy_scale_weights.R \name{scale_weights} \alias{scale_weights} \title{Rescale design weights for multilevel analysis} \usage{ scale_weights(x, cluster.id, pweight) } \arguments{ \item{x}{A data frame.} \item{cluster.id}{Variable indicating the grouping structure (strata) of the survey data (level-2-cluster variable).} \item{pweight}{Variable indicating the probability (design or sampling) weights of the survey data (level-1-weight).} } \value{ \code{x}, with two new variables: \code{svywght_a} and \code{svywght_b}, which represent the rescaled design weights to use in multilevel models (use these variables for the \code{weights} argument). } \description{ Most functions to fit multilevel and mixed effects models only allow frequency weights to be specified, but not design (i.e. sampling or probability) weights, which should be used when analyzing complex samples and survey data. \code{scale_weights()} implements an algorithm proposed by Aaparouhov (2006) and Carle (2009) to rescale design weights in survey data to account for the grouping structure of multilevel models, which then can be used for multilevel modelling. } \details{ Rescaling is based on two methods: For \code{svywght_a}, the sample weights \code{pweight} are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for \code{svywght_b} is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see Carle (2009), Appendix B). \cr \cr Regarding the choice between scaling methods A and B, Carle suggests that "analysts who wish to discuss point estimates should report results based on weighting method A. For analysts more interested in residual between-cluster variance, method B may generally provide the least biased estimates". In general, it is recommended to fit a non-weighted model and weighted models with both scaling methods and when comparing the models, see whether the "inferential decisions converge", to gain confidence in the results. \cr \cr Though the bias of scaled weights decreases with increasing cluster size, method A is preferred when insufficient or low cluster size is a concern. \cr \cr The cluster ID (and possibly the PSU) may be used as random effects (e.g. nested design, or cluster and PSU as varying intercepts), depending on the survey design that should be mimicked. } \examples{ data(nhanes_sample) scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) if (require("lme4")) { nhanes_sample <- scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) glmer( total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU), family = poisson(), data = nhanes_sample, weights = svywght_a ) } } \references{ Carle AC. \emph{Fitting multilevel models in complex survey data with design weights: Recommendations} BMC Medical Research Methodology 2009, 9(49): 1-13 \cr \cr Asparouhov T.
\emph{General Multi-Level Modeling with Sampling Weights} Communications in Statistics - Theory and Methods 2006, 35: 439-460 }
sjstats/man/figures/0000755000176200001440000000000013563265750014200 5ustar liggesuserssjstats/man/figures/logo.png0000644000176200001440000004411313563265750015651 0ustar liggesusers[binary PNG image data omitted: package logo]
sjstats/man/reexports.Rd0000644000176200001440000000143213565517445015061 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/re-exports.R \docType{import} \name{reexports} \alias{reexports} \alias{\%>\%} \alias{typical_value} \alias{mse} \alias{rmse} \alias{ci} \alias{equivalence_test} \alias{link_inverse} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{bayestestR}{\code{\link[bayestestR]{ci}}, \code{\link[bayestestR]{equivalence_test}}} \item{insight}{\code{\link[insight]{link_inverse}}} \item{magrittr}{\code{\link[magrittr]{\%>\%}}} \item{performance}{\code{\link[performance]{mse}}, \code{\link[performance]{rmse}}} \item{sjmisc}{\code{\link[sjmisc]{typical_value}}} }}
sjstats/man/efc.Rd0000644000176200001440000000066613616771032013562 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{efc} \alias{efc} \title{Sample dataset from the EUROFAMCARE project} \description{ German data set from the European study on family care of older people. } \references{ Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008.
} \keyword{data}
sjstats/man/find_beta.Rd0000644000176200001440000000777313565517445014743 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/find_beta.R \name{find_beta} \alias{find_beta} \alias{find_beta2} \alias{find_cauchy} \alias{find_normal} \title{Determining distribution parameters} \usage{ find_beta(x1, p1, x2, p2) find_beta2(x, se, ci, n) find_cauchy(x1, p1, x2, p2) find_normal(x1, p1, x2, p2) } \arguments{ \item{x1}{Value for the first percentile.} \item{p1}{Probability of the first percentile.} \item{x2}{Value for the second percentile.} \item{p2}{Probability of the second percentile.} \item{x}{Numeric, a probability value between 0 and 1. Typically indicates a prevalence rate of an outcome of interest; or an integer value with the number of observed events. In this case, specify \code{n} to indicate the total number of observations.} \item{se}{The standard error of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{ci}{The upper limit of the confidence interval of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{n}{Numeric, number of total observations. Needs to be specified if \code{x} is an integer (number of observed events), and not a probability. See 'Examples'.} } \value{ A list of length two, with the two distribution parameters that can be used to define the distribution, which (best) describes the shape for the given input parameters. } \description{ \code{find_beta()}, \code{find_normal()} and \code{find_cauchy()} find the shape parameters, the mean and standard deviation, or the location and scale parameters, respectively, to describe the beta, normal or Cauchy distribution, based on two percentiles. \code{find_beta2()} finds the shape parameters for a Beta distribution, based on a probability value and its standard error or confidence interval. } \details{ These functions can be used to find parameters for various distributions, to define prior probabilities for Bayesian analyses. \code{x1}, \code{p1}, \code{x2} and \code{p2} are parameters that describe two quantiles. Given this knowledge, the distribution parameters are returned. \cr \cr Use \code{find_beta2()}, if the known parameters are, e.g., a prevalence rate or similar probability, and its standard error or confidence interval. In this case, \code{x} should be a probability, for example a prevalence rate of a certain event. \code{se} then needs to be the standard error for this probability. Alternatively, \code{ci} can be specified, which should indicate the upper limit of the confidence interval of the probability (prevalence rate) \code{x}. If the number of events out of a total number of trials is known (e.g. 12 heads out of 30 coin tosses), \code{x} can also be the number of observed events, while \code{n} indicates the total amount of trials (in the above example, the function call would be: \code{find_beta2(x = 12, n = 30)}).
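A quick plausibility check for this last case (a sketch only; the exact shape values depend on the implementation): the mean of a Beta(a, b) distribution is a / (a + b), which should be close to the observed proportion 12 / 30 = 0.4.

shapes <- find_beta2(x = 12, n = 30)
# mean of the implied Beta distribution; should be near 0.4
shapes[[1]] / (shapes[[1]] + shapes[[2]])
# visual check of the implied distribution
curve(dbeta(x, shapes[[1]], shapes[[2]]), from = 0, to = 1)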
} \examples{ # example from blogpost: # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ # 10\% of patients respond within 30 days of treatment # and 80\% respond within 90 days of treatment find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dnorm(x, mean = parms$mean, sd = parms$sd), from = 0, to = 200 ) parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dcauchy(x, location = parms$location, scale = parms$scale), from = 0, to = 200 ) find_beta2(x = .25, ci = .5) shapes <- find_beta2(x = .25, ci = .5) curve(dbeta(x, shapes[[1]], shapes[[2]])) # find Beta distribution for 3 events out of 20 observations find_beta2(x = 3, n = 20) shapes <- find_beta2(x = 3, n = 20) curve(dbeta(x, shapes[[1]], shapes[[2]])) } \references{ Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF}) }
sjstats/man/boot_ci.Rd0000644000176200001440000001177413574162025014442 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boot_ci.R \name{boot_ci} \alias{boot_ci} \alias{boot_se} \alias{boot_p} \alias{boot_est} \title{Standard error and confidence intervals for bootstrapped estimates} \usage{ boot_ci(data, ..., method = c("dist", "quantile"), ci.lvl = 0.95) boot_se(data, ...) boot_p(data, ...) boot_est(data, ...) } \arguments{ \item{data}{A data frame that contains the vector with bootstrapped estimates, or directly the vector (see 'Examples').} \item{...}{Optional, unquoted names of variables with bootstrapped estimates. Required if \code{data} is a data frame (and not a vector) and only selected variables from \code{data} should be processed. You may also use functions like \code{:} or tidyselect's \code{\link[tidyselect]{select_helpers}}.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{ci.lvl}{Numeric, the level of the confidence intervals.} } \value{ A \code{\link[tibble]{tibble}} with either the bootstrap estimate, standard error, the lower and upper confidence intervals or the p-value for all bootstrapped estimates. } \description{ Compute nonparametric bootstrap estimate, standard error, confidence intervals and p-value for a vector of bootstrap replicate estimates. } \details{ The methods require one or more vectors of bootstrap replicate estimates as input. \itemize{ \item{ \code{boot_est()} returns the bootstrapped estimate, simply by computing the mean value of all bootstrap estimates. } \item{ \code{boot_se()} computes the nonparametric bootstrap standard error by calculating the standard deviation of the input vector. } \item{ The mean value of the input vector and its standard error are used by \code{boot_ci()} to calculate the lower and upper confidence interval, assuming a t-distribution of bootstrap estimate replicates (for \code{method = "dist"}, the default, which is \code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for \code{method = "quantile"}, 95\% sample quantiles are used to compute the confidence intervals (\code{quantile(x, probs = c(.025, .975))}). Use \code{ci.lvl} to change the level for the confidence interval.
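As a by-hand sketch of the default \code{method = "dist"} (illustration only; \code{est} is a stand-in for any vector of bootstrapped estimates):

est <- rnorm(1000, mean = 2, sd = 0.5)
lower <- mean(est) - qt(.975, df = length(est) - 1) * sd(est)
upper <- mean(est) + qt(.975, df = length(est) - 1) * sd(est)
# should agree with boot_ci(est) for the default ci.lvl = .95
c(lower, upper)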
} \item{ P-values from \code{boot_p()} are also based on t-statistics, assuming normal distribution. } } } \examples{ library(dplyr) library(purrr) data(efc) bs <- bootstrap(efc, 100) # now run models for each bootstrapped sample bs$models <- map(bs$strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)) # extract coefficient "dependency" and "gender" from each model bs$dependency <- map_dbl(bs$models, ~coef(.x)[2]) bs$gender <- map_dbl(bs$models, ~coef(.x)[3]) # get bootstrapped confidence intervals boot_ci(bs$dependency) # compare with model fit fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc) confint(fit)[2, ] # alternative function calls. boot_ci(bs$dependency) boot_ci(bs, dependency) boot_ci(bs, dependency, gender) boot_ci(bs, dependency, gender, method = "q") # compare coefficients mean(bs$dependency) boot_est(bs$dependency) coef(fit)[2] # bootstrap() and boot_ci() work fine within pipe-chains efc \%>\% bootstrap(100) \%>\% mutate( models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)), dependency = map_dbl(models, ~coef(.x)[2]) ) \%>\% boot_ci(dependency) # check p-value boot_p(bs$gender) summary(fit)$coefficients[3, ] \dontrun{ # 'spread_coef()' from the 'sjmisc'-package makes it easy to generate # bootstrapped statistics like confidence intervals or p-values library(dplyr) library(sjmisc) efc \%>\% # generate bootstrap replicates bootstrap(100) \%>\% # apply lm to all bootstrapped data sets mutate( models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x)) ) \%>\% # spread model coefficient for all 100 models spread_coef(models) \%>\% # compute the CI for all bootstrapped model coefficients boot_ci(e42dep, c161sex, c172code) # or... efc \%>\% # generate bootstrap replicates bootstrap(100) \%>\% # apply lm to all bootstrapped data sets mutate( models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x)) ) \%>\% # spread model coefficient for all 100 models spread_coef(models, append = FALSE) \%>\% # compute the CI for all bootstrapped model coefficients boot_ci()} } \references{ Carpenter J, Bithell J. Bootstrap confidence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164 } \seealso{ \code{\link{bootstrap}} to generate nonparametric bootstrap samples. }
sjstats/man/odds_to_rr.Rd0000644000176200001440000000626213565517445015162 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/odds_to_rr.R \name{odds_to_rr} \alias{odds_to_rr} \alias{or_to_rr} \title{Get relative risk estimates from logistic regressions or odds ratio values} \usage{ odds_to_rr(fit) or_to_rr(or, p0) } \arguments{ \item{fit}{A fitted binomial generalized linear (mixed) model with logit-link function (logistic (multilevel) regression model).} \item{or}{Numeric, an odds ratio estimate.} \item{p0}{Numeric, the risk of having a positive outcome in the control or unexposed group (reference group), i.e. the number of outcomes or "successes" in the control group divided by the total number of observations in the control group.} } \value{ A data frame with relative risks and lower/upper confidence interval for the relative risk estimates; for \code{or_to_rr()}, the risk ratio estimate. } \description{ \code{odds_to_rr()} converts odds ratios from a logistic regression model (including mixed models) into relative risks; \code{or_to_rr()} converts a single odds ratio estimate into a relative risk estimate.
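As a numeric sketch of this conversion (the formula is given in 'Details'; the values are taken from the 'Examples' below, where p0 is roughly 0.44):

or <- 1.914
p0 <- 0.44
# relative risk implied by the odds ratio; noticeably smaller than the OR
or / (1 - p0 + p0 * or)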
} \details{ This function extracts the odds ratios (exponentiated model coefficients) from logistic regressions (fitted with \code{glm} or \code{glmer}) and their related confidence intervals, and transforms these values into relative risks (and their related confidence intervals). \cr \cr The formula for transformation is based on Zhang and Yu (1998), Wang (2013) and Grant (2014): \code{RR <- OR / (1 - P0 + (P0 * OR))}, where \code{OR} is the odds ratio and \code{P0} indicates the incidence proportion (baseline risk) of the outcome in the control group (reference group). } \examples{ library(sjmisc) library(lme4) # create binary response sleepstudy$Reaction.dicho <- dicho(sleepstudy$Reaction, dich.by = "median") # fit model fit <- glmer(Reaction.dicho ~ Days + (Days | Subject), data = sleepstudy, family = binomial("logit")) # convert to relative risks odds_to_rr(fit) data(efc) # create binary response y <- ifelse(efc$neg_c_7 < median(na.omit(efc$neg_c_7)), 0, 1) # create data frame for fitted model mydf <- data.frame( y = as.factor(y), sex = to_factor(efc$c161sex), dep = to_factor(efc$e42dep), barthel = efc$barthtot, education = to_factor(efc$c172code) ) # fit model fit <- glm(y ~., data = mydf, family = binomial(link = "logit")) # convert to relative risks odds_to_rr(fit) # replicate OR/RR for coefficient "sex" from above regression # p0 ~ .44, or ~ 1.914 prop.table(table(mydf$y, mydf$sex)) or_to_rr(1.914, 0.1055 / (.1324 + .1055)) } \references{ Grant RL. 2014. Converting an odds ratio to a range of plausible relative risks for better communication of research findings. BMJ 348:f7450. \doi{10.1136/bmj.f7450} \cr \cr Wang Z. 2013. Converting Odds Ratio to Relative Risk in Cohort Studies with Partial Data Information. J Stat Soft 2013;55. \doi{10.18637/jss.v055.i05} \cr \cr Zhang J, Yu KF. 1998. What's the Relative Risk? A Method of Correcting the Odds Ratio in Cohort Studies of Common Outcomes. JAMA; 280(19): 1690-1. \doi{10.1001/jama.280.19.1690} }
sjstats/man/crosstable_statistics.Rd0000644000176200001440000001357513565714127017440 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cramer.R, R/phi.R, R/xtab_statistics.R \name{cramer} \alias{cramer} \alias{cramer.formula} \alias{phi} \alias{crosstable_statistics} \alias{xtab_statistics} \title{Measures of association for contingency tables} \usage{ cramer(tab, ...) \method{cramer}{formula}( formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ... ) phi(tab, ...) crosstable_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) xtab_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) } \arguments{ \item{tab}{A \code{\link{table}} or \code{\link[stats]{ftable}}. Tables of class \code{\link[stats]{xtabs}} and others will be coerced to \code{ftable} objects.} \item{...}{Other arguments, passed down to the statistic functions \code{\link[stats]{chisq.test}}, \code{\link[stats]{fisher.test}} or \code{\link[stats]{cor.test}}.} \item{formula}{A formula of the form \code{lhs ~ rhs} where \code{lhs} is a numeric variable giving the data values and \code{rhs} a factor giving the corresponding groups.} \item{data}{A data frame or a table object. If a table object, \code{x1} and \code{x2} will be ignored.
For Kendall's \emph{tau}, Spearman's \emph{rho} or Pearson's product moment correlation coefficient, \code{data} needs to be a data frame. If \code{x1} and \code{x2} are not specified, the first two columns of the data frame are used as variables to compute the crosstab.} \item{ci.lvl}{Scalar between 0 and 1. If not \code{NULL}, returns a data frame including lower and upper confidence intervals.} \item{n}{Number of bootstraps to be generated.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{x1}{Name of first variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be ignored.} \item{x2}{Name of second variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be ignored.} \item{statistics}{Name of measure of association that should be computed. May be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"}, \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'.} \item{weights}{Name of variable in \code{x} that indicates the vector of weights that will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} } \value{ For \code{phi()}, the table's Phi value. For \code{cramer()}, the table's Cramer's V. \cr \cr For \code{crosstable_statistics()}, a list with the following components: \describe{ \item{\code{estimate}}{the value of the estimated measure of association.} \item{\code{p.value}}{the p-value for the test.} \item{\code{statistic}}{the value of the test statistic.} \item{\code{stat.name}}{the name of the test statistic.} \item{\code{stat.html}}{if applicable, the name of the test statistic, in HTML-format.} \item{\code{df}}{the degrees of freedom for the contingency table.} \item{\code{method}}{character string indicating the name of the measure of association.} \item{\code{method.html}}{if applicable, the name of the measure of association, in HTML-format.} \item{\code{method.short}}{the short form of association measure, equals the \code{statistics}-argument.} \item{\code{fisher}}{logical, if Fisher's exact test was used to calculate the p-value.} } } \description{ This function calculates various measures of association for contingency tables and returns the statistic and p-value. Supported measures are Cramer's V, Phi, Spearman's rho, Kendall's tau and Pearson's r. } \details{ The p-values for Cramer's V and the Phi coefficient are based on \code{chisq.test()}. If any expected value of a table cell is smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()} is used to compute the p-value, unless \code{statistics = "fisher"}; in this case, the use of \code{fisher.test()} is forced to compute the p-value. The test statistic is calculated with \code{cramer()} or \code{phi()}, respectively. \cr \cr Both test statistic and p-value for Spearman's rho, Kendall's tau and Pearson's r are calculated with \code{cor.test()}. \cr \cr When \code{statistics = "auto"}, only Cramer's V or Phi are calculated, based on the dimension of the table (i.e. if the table has more than two rows or columns, Cramer's V is calculated, else Phi).
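As a by-hand sketch of the chi-squared based measures (illustration only):

tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE))
chi2 <- suppressWarnings(chisq.test(tab, correct = FALSE))$statistic
n <- sum(tab)
# Cramer's V; for a 2x2 table, min(dim(tab)) - 1 is 1 and this reduces to Phi
sqrt(chi2 / (n * (min(dim(tab)) - 1)))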
} \examples{ # Phi coefficient for 2x2 tables tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) phi(tab) # Cramer's V for nominal variables with more than 2 categories tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) cramer(tab) # formula notation data(efc) cramer(e16sex ~ c161sex, data = efc) # bootstrapped confidence intervals cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100) # 2x2 table, compute Phi automatically crosstable_statistics(efc, e16sex, c161sex) # more dimensions than 2x2, compute Cramer's V automatically crosstable_statistics(efc, c172code, c161sex) # ordinal data, use Kendall's tau crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall") # calculate Spearman's rho, with continuity correction crosstable_statistics(efc, e42dep, quol_5, statistics = "spearman", exact = FALSE, continuity = TRUE ) }
sjstats/man/svyglm.nb.Rd0000644000176200001440000000536113616514473014745 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svyglmnb.R \name{svyglm.nb} \alias{svyglm.nb} \title{Survey-weighted negative binomial generalised linear model} \usage{ svyglm.nb(formula, design, ...) } \arguments{ \item{formula}{An object of class \code{formula}, i.e. a symbolic description of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{...}{Other arguments passed down to \code{\link[MASS]{glm.nb}}.} } \value{ An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, with some additional information about the model. } \description{ \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package to fit survey-weighted negative binomial models. It uses \code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood estimates, based on starting values provided by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley (2010, pp. 249)}. } \details{ For details on the computation method, see Lumley (2010), Appendix E (especially 254ff.) \cr \cr \pkg{sjstats} implements the following S3-methods for \code{svyglm.nb}-objects: \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, \code{predict()} and \code{residuals()}. However, these functions have some limitations: \itemize{ \item{\code{family()} simply returns the family-object from the underlying \code{\link[MASS]{glm.nb}}-model.} \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} from this model-object with the coefficients from the returned \code{\link[survey]{svymle}}-object and finally calls \code{\link[stats]{predict.glm}} to compute the predicted values.} \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals from the \code{glm.nb}-object.} } } \examples{ # ------------------------------------------ # This example reproduces the results from # Lumley 2010, figure E.7 (Appendix E, p256) # ------------------------------------------ if (require("survey")) { data(nhanes_sample) # create survey design des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) # fit negative binomial regression fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) # print coefficients and standard errors fit } } \references{ Lumley T (2010).
Complex Surveys: a guide to analysis using R. Wiley }
sjstats/man/cv.Rd0000644000176200001440000000253213565517445013440 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv.R \name{cv} \alias{cv} \title{Compute model quality} \usage{ cv(x, ...) } \arguments{ \item{x}{Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) or \code{lme} (\pkg{nlme}).} \item{...}{More fitted model objects, to compute multiple coefficients of variation at once.} } \value{ Numeric, the coefficient of variation. } \description{ Compute the coefficient of variation. } \details{ The advantage of the cv is that it is unitless. This allows coefficients of variation to be compared to each other in ways that other measures, like standard deviations or root mean squared residuals, cannot be. \cr \cr \dQuote{It is interesting to note the differences between a model's CV and R-squared values. Both are unitless measures that are indicative of model fit, but they define model fit in two different ways: CV evaluates the relative closeness of the predictions to the actual values while R-squared evaluates how much of the variability in the actual values is explained by the model.} \cite{(\href{http://www.ats.ucla.edu/stat/mult_pkg/faq/general/coefficient_of_variation.htm}{source: UCLA-FAQ})} } \examples{ data(efc) fit <- lm(barthtot ~ c160age + c12hour, data = efc) cv(fit) }
sjstats/man/r2.Rd0000644000176200001440000000072513610331646013341 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/Deprecated.R \name{r2} \alias{r2} \alias{icc} \alias{p_value} \alias{se} \alias{cohens_f} \alias{std_beta} \alias{robust} \title{Deprecated functions} \usage{ r2(x) icc(x) p_value(x, ...) se(x, ...) cohens_f(x, ...) std_beta(x, ...) robust(x, ...) } \arguments{ \item{x}{An object.} \item{...}{Currently not used.} } \value{ Nothing. } \description{ A list of deprecated functions. }
sjstats/man/mwu.Rd0000644000176200001440000000543613565521701013634 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mwu.R \name{mwu} \alias{mwu} \alias{mannwhitney} \title{Mann-Whitney-U-Test} \usage{ mwu( data, x, grp, distribution = "asymptotic", out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL ) mannwhitney( data, x, grp, distribution = "asymptotic", out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL ) } \arguments{ \item{data}{A data frame.} \item{x}{Bare (unquoted) variable name, or a character vector with the variable name.} \item{grp}{Bare (unquoted) name of the cross-classifying variable, where \code{x} is grouped into the categories represented by \code{grp}, or a character vector with the variable name.} \item{distribution}{Indicates how the null distribution of the test statistic should be computed. May be one of \code{"exact"}, \code{"approximate"} or \code{"asymptotic"} (default). See \code{\link[coin]{wilcox_test}} for details.} \item{out}{Character vector, indicating whether the results should be printed to console (\code{out = "txt"}) or as HTML-table in the viewer-pane (\code{out = "viewer"}) or browser (\code{out = "browser"}), or if the results should be plotted (\code{out = "plot"}, only applies to certain functions). May be abbreviated.} \item{encoding}{Character vector, indicating the charset encoding used for variable and value labels. Default is \code{"UTF-8"}.
Only used when \code{out} is not \code{"txt"}.} \item{file}{Destination file, if the output should be saved as file. Only used when \code{out} is not \code{"txt"}.} } \value{ (Invisibly) returns a data frame with U, p and Z-values for each group-comparison as well as effect-size r; additionally, group-labels and groups' n's are also included. } \description{ This function performs a Mann-Whitney-U-Test (or Wilcoxon rank sum test, see \code{\link[stats]{wilcox.test}} and \code{\link[coin]{wilcox_test}}) for \code{x}, for each group indicated by \code{grp}. If \code{grp} has more than two categories, a comparison between each combination of two groups is performed. \cr \cr The function reports U, p and Z-values as well as effect size r and group-rank-means. } \note{ This function calls the \code{\link[coin]{wilcox_test}} with formula. If \code{grp} has more than two groups, additionally a Kruskal-Wallis-Test (see \code{\link{kruskal.test}}) is performed. \cr \cr Interpretation of effect sizes, as a rule-of-thumb: \itemize{ \item small effect >= 0.1 \item medium effect >= 0.3 \item large effect >= 0.5 } } \examples{ data(efc) # Mann-Whitney-U-Tests for elder's age by elder's dependency. mwu(efc, e17age, e42dep) }
sjstats/man/weight.Rd0000644000176200001440000000320613565517445014316 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/weight.R \name{weight} \alias{weight} \alias{weight2} \title{Weight a variable} \usage{ weight(x, weights, digits = 0) weight2(x, weights) } \arguments{ \item{x}{(Unweighted) variable.} \item{weights}{Vector with same length as \code{x}, which contains weight factors. Each value of \code{x} has a specific assigned weight in \code{weights}.} \item{digits}{Numeric value indicating the number of decimal places to be used for rounding the weighted values. By default, this value is \code{0}, i.e. the returned values are integer values.} } \value{ The weighted \code{x}. } \description{ These functions weight the variable \code{x} by a specific vector of \code{weights}. } \details{ \code{weight2()} sums up all \code{weights} values of the associated categories of \code{x}, whereas \code{weight()} uses an \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()} may return a vector of a different length than \code{x}. } \note{ The values of the returned vector are in sorted order, whereas the values of the original \code{x} may be in random order. Hence, \code{x} can't be used, for instance, for further cross tabulation. In case you want to have weighted contingency tables or (grouped) box plots etc., use the \code{weightBy} argument of most functions. } \examples{ v <- sample(1:4, 20, TRUE) table(v) w <- abs(rnorm(20)) table(weight(v, w)) table(weight2(v, w)) set.seed(1) x <- sample(letters[1:5], size = 20, replace = TRUE) w <- runif(n = 20) table(x) table(weight(x, w)) }
sjstats/man/prop.Rd0000644000176200001440000000716313565517445014005 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/prop.R \name{prop} \alias{prop} \alias{props} \title{Proportions of values in a vector} \usage{ prop(data, ..., weights = NULL, na.rm = TRUE, digits = 4) props(data, ..., na.rm = TRUE, digits = 4) } \arguments{ \item{data}{A data frame. May also be a grouped data frame (see 'Examples').} \item{...}{One or more value pairs of comparisons (logical predicates). Put variable names on the left-hand side and values to match on the right-hand side. Expressions may be quoted or unquoted.
See 'Examples'.} \item{weights}{Vector of weights that will be applied to weight all observations. Must be a vector of same length as the input vector. Default is \code{NULL}, so no weights are used.} \item{na.rm}{Logical, whether to remove NA values from the vector when the proportion is calculated. \code{na.rm = FALSE} gives you the raw percentage of a value in a vector, \code{na.rm = TRUE} the valid percentage.} \item{digits}{Amount of digits for returned values.} } \value{ For one condition, a numeric value with the proportion of the values inside a vector. For more than one condition, a tibble with one column of conditions and one column with proportions. For grouped data frames, returns a tibble with one column per group with grouping categories, followed by one column with proportions per condition. } \description{ \code{prop()} calculates the proportion of a value or category in a variable. \code{props()} does the same, but allows for multiple logical conditions in one statement. It is similar to \code{mean()} with logical predicates, however, both \code{prop()} and \code{props()} work with grouped data frames. } \details{ \code{prop()} only allows one logical statement per comparison, while \code{props()} allows multiple logical statements per comparison. However, \code{prop()} supports weighting of variables before calculating proportions, and comparisons may also be quoted. Hence, \code{prop()} also processes comparisons, which are passed as character vector (see 'Examples'). } \examples{ data(efc) # proportion of value 1 in e42dep prop(efc, e42dep == 1) # expression may also be completely quoted prop(efc, "e42dep == 1") # use "props()" for multiple logical statements props(efc, e17age > 70 & e17age < 80) # proportion of value 1 in e42dep, and all values greater # than 2 in e42dep, including missing values. 
will return a tibble prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) # for factors or character vectors, use quoted or unquoted values library(sjmisc) # convert numeric to factor, using labels as factor levels efc$e16sex <- to_label(efc$e16sex) efc$n4pstu <- to_label(efc$n4pstu) # get proportion of female older persons prop(efc, e16sex == female) # get proportion of male older persons prop(efc, e16sex == "male") # "props()" needs quotes around non-numeric factor levels props(efc, e17age > 70 & e17age < 80, n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' ) # also works with pipe-chains library(dplyr) efc \%>\% prop(e17age > 70) efc \%>\% prop(e17age > 70, e16sex == 1) # and with group_by efc \%>\% group_by(e16sex) \%>\% prop(e42dep > 2) efc \%>\% select(e42dep, c161sex, c172code, e16sex) \%>\% group_by(c161sex, c172code) \%>\% prop(e42dep > 2, e16sex == 1) # same for "props()" efc \%>\% select(e42dep, c161sex, c172code, c12hour, n4pstu) \%>\% group_by(c161sex, c172code) \%>\% props( e42dep > 2, c12hour > 20 & c12hour < 40, n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' ) } sjstats/man/weighted_sd.Rd0000644000176200001440000001315613616514473015315 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svy_median.R, R/wtd_chisqtest.R, R/wtd_cor.R, % R/wtd_mean.R, R/wtd_median.R, R/wtd_mwu.R, R/wtd_sd.R, R/wtd_se.R, % R/wtd_ttest.R \name{survey_median} \alias{survey_median} \alias{weighted_chisqtest} \alias{weighted_chisqtest.default} \alias{weighted_chisqtest.formula} \alias{weighted_correlation} \alias{weighted_correlation.default} \alias{weighted_correlation.formula} \alias{weighted_mean} \alias{weighted_median} \alias{weighted_mannwhitney} \alias{weighted_mannwhitney.default} \alias{weighted_mannwhitney.formula} \alias{weighted_sd} \alias{wtd_sd} \alias{weighted_se} \alias{weighted_ttest} \alias{weighted_ttest.default} \alias{weighted_ttest.formula} \title{Weighted statistics for tests and variables} \usage{ survey_median(x, design) weighted_chisqtest(data, ...) \method{weighted_chisqtest}{default}(data, x, y, weights, ...) \method{weighted_chisqtest}{formula}(formula, data, ...) weighted_correlation(data, ...) \method{weighted_correlation}{default}(data, x, y, weights, ci.lvl = 0.95, ...) \method{weighted_correlation}{formula}(formula, data, ci.lvl = 0.95, ...) weighted_mean(x, weights = NULL) weighted_median(x, weights = NULL) weighted_mannwhitney(data, ...) \method{weighted_mannwhitney}{default}(data, x, grp, weights, ...) \method{weighted_mannwhitney}{formula}(formula, data, ...) weighted_sd(x, weights = NULL) wtd_sd(x, weights = NULL) weighted_se(x, weights = NULL) weighted_ttest(data, ...) \method{weighted_ttest}{default}( data, x, y = NULL, weights, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ... ) \method{weighted_ttest}{formula}( formula, data, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ... ) } \arguments{ \item{x}{(Numeric) vector or a data frame. For \code{survey_median()}, \code{weighted_ttest()}, \code{weighted_mannwhitney()} and \code{weighted_chisqtest()} the bare (unquoted) variable name, or a character vector with the variable name.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{data}{A data frame.} \item{...}{For \code{weighted_ttest()} and \code{weighted_mannwhitney()}, currently not used. 
For \code{weighted_chisqtest()}, further arguments passed down to \code{\link[stats]{chisq.test}}.} \item{y}{Optional, bare (unquoted) variable name, or a character vector with the variable name.} \item{weights}{Bare (unquoted) variable name, or a character vector with the variable name of the numeric vector of weights. If \code{weights = NULL}, the unweighted statistic is reported.} \item{formula}{A formula of the form \code{lhs ~ rhs1 + rhs2} where \code{lhs} is a numeric variable giving the data values and \code{rhs1} a factor with two levels giving the corresponding groups and \code{rhs2} a variable with weights.} \item{ci.lvl}{Confidence level of the interval.} \item{grp}{Bare (unquoted) name of the cross-classifying variable, where \code{x} is grouped into the categories represented by \code{grp}, or a character vector with the variable name.} \item{mu}{A number indicating the true value of the mean (or difference in means if you are performing a two sample test).} \item{paired}{Logical, whether to compute a paired t-test.} \item{alternative}{A character string specifying the alternative hypothesis, must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. You can specify just the initial letter.} } \value{ The weighted (test) statistic. } \description{ \strong{Weighted statistics for variables} \cr \cr \code{weighted_sd()}, \code{weighted_se()}, \code{weighted_mean()} and \code{weighted_median()} compute weighted standard deviation, standard error, mean or median for a variable or for all variables of a data frame. \code{survey_median()} computes the median for a variable in a survey-design (see \code{\link[survey]{svydesign}}). \code{weighted_correlation()} computes a weighted correlation for a two-sided alternative hypothesis. \cr \cr \strong{Weighted tests} \cr \cr \code{weighted_ttest()} computes a weighted t-test, while \code{weighted_mannwhitney()} computes a weighted Mann-Whitney-U test or a Kruskal-Wallis test (for more than two groups). \code{weighted_chisqtest()} computes a weighted Chi-squared test for contingency tables. } \note{ \code{weighted_chisqtest()} is a convenient wrapper for \code{\link{crosstable_statistics}}. For a weighted one-way Anova, use \code{means_by_group()} with the \code{weights}-argument. \cr \cr \code{weighted_ttest()} assumes unequal variance between the two groups.
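As a by-hand sketch of the weighted mean and standard deviation (one common frequency-weights definition; implementations differ in the variance denominator):

x <- c(2, 4, 4, 5, 7)
w <- c(1, 2, 1, 1, 2)
wm <- sum(w * x) / sum(w)  # identical to stats::weighted.mean(x, w)
wv <- sum(w * (x - wm)^2) / (sum(w) - 1)
sqrt(wv)  # compare with weighted_sd(x, w)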
} \examples{ # weighted sd and se ---- weighted_sd(rnorm(n = 100, mean = 3), runif(n = 100)) data(efc) weighted_sd(efc[, 1:3], runif(n = nrow(efc))) weighted_se(efc[, 1:3], runif(n = nrow(efc))) # survey_median ---- # median for variables from weighted survey designs if (require("survey")) { data(nhanes_sample) des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) survey_median(total, des) survey_median("total", des) } # weighted t-test ---- efc$weight <- abs(rnorm(nrow(efc), 1, .3)) weighted_ttest(efc, e17age, weights = weight) weighted_ttest(efc, e17age, c160age, weights = weight) weighted_ttest(e17age ~ e16sex + weight, efc) # weighted Mann-Whitney-U-test ---- weighted_mannwhitney(c12hour ~ c161sex + weight, efc) # weighted Chi-squared-test ---- weighted_chisqtest(efc, c161sex, e16sex, weights = weight, correct = FALSE) weighted_chisqtest(c172code ~ c161sex + weight, efc) }
sjstats/man/eta_sq.Rd0000644000176200001440000000642113565517445014305 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/anova_stats.R, R/epsilon_sq.R, R/eta_sq.R, % R/omega_sq.R \name{anova_stats} \alias{anova_stats} \alias{epsilon_sq} \alias{eta_sq} \alias{omega_sq} \title{Effect size statistics for anova} \usage{ anova_stats(model, digits = 3) epsilon_sq(model, ci.lvl = NULL, n = 1000, method = c("dist", "quantile")) eta_sq( model, partial = FALSE, ci.lvl = NULL, n = 1000, method = c("dist", "quantile") ) omega_sq( model, partial = FALSE, ci.lvl = NULL, n = 1000, method = c("dist", "quantile") ) } \arguments{ \item{model}{A fitted anova-model of class \code{aov} or \code{anova}. Other models are coerced to \code{\link[stats]{anova}}.} \item{digits}{Number of decimal points in the returned data frame.} \item{ci.lvl}{Scalar between 0 and 1. If not \code{NULL}, returns a data frame with effect sizes including lower and upper confidence intervals.} \item{n}{Number of bootstraps to be generated.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{partial}{Logical, if \code{TRUE}, the partial eta-squared is returned.} } \value{ A data frame with the term name(s) and effect size statistics; if \code{ci.lvl} is not \code{NULL}, a data frame including lower and upper confidence intervals is returned. For \code{anova_stats()}, a tidy data frame with all statistics is returned (excluding confidence intervals). } \description{ Returns the (partial) eta-squared, (partial) omega-squared, epsilon-squared statistic or Cohen's F for all terms in an anova. \code{anova_stats()} returns a tidy summary, including all these statistics and power for each term. } \details{ For \code{eta_sq()} (with \code{partial = FALSE}), due to non-symmetry, confidence intervals are based on bootstrap methods. In this case, \code{n} indicates the number of bootstrap samples to be drawn to compute the confidence intervals. Confidence intervals for partial omega-squared and epsilon-squared are also based on bootstrapping. \cr \cr Since bootstrapped confidence intervals are based on the bootstrap standard error (i.e. \code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x))}, bounds of the confidence interval may be negative.
Use \code{method = "quantile"} to make sure that the confidence intervals are always positive. } \examples{ # load sample data data(efc) # fit linear model fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) eta_sq(fit) omega_sq(fit) eta_sq(fit, partial = TRUE) eta_sq(fit, partial = TRUE, ci.lvl = .8) anova_stats(car::Anova(fit, type = 2)) } \references{ Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research (\href{https://www.msu.edu/~levinet/eta\%20squared\%20hcr.pdf}{pdf}) \cr \cr Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. (\href{http://www.scsug.org/wp-content/uploads/2016/11/Ad-Hoc-Method-for-Computing-Effect-Size-for-Mixed-Models_PROCEEDINGS-UPDATE-1.pdf}{pdf}) } sjstats/DESCRIPTION0000644000176200001440000000353713617050712013465 0ustar liggesusersPackage: sjstats Type: Package Encoding: UTF-8 Title: Collection of Convenient Functions for Common Statistical Computations Version: 0.17.9 Authors@R: person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")) Maintainer: Daniel Lüdecke Description: Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages. This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effect size statistics like Eta or Omega squared), or for which currently no functions available. Second, another focus lies on weighted variants of common statistical measures and tests like weighted standard error, mean, t-test, correlation, and more. License: GPL-3 Depends: R (>= 3.2), utils Imports: bayestestR (>= 0.4.0), broom, dplyr (>= 0.8.1), effectsize, emmeans, insight (>= 0.8.0), lme4, magrittr, MASS, modelr, parameters (>= 0.4.0), performance (>= 0.4.0), purrr, rlang, sjlabelled (>= 1.1.1), sjmisc (>= 2.8.2), stats, tidyr Suggests: brms, car, coin, ggplot2, graphics, httr, knitr, mediation, nlme, pbkrtest (>= 0.4-7), pscl, pwr, sandwich, sjPlot, survey, rstan, rstanarm, VGAM, Zelig, testthat URL: https://github.com/strengejacke/sjstats, https://strengejacke.github.io/sjstats BugReports: https://github.com/strengejacke/sjstats/issues RoxygenNote: 7.0.2 VignetteBuilder: knitr NeedsCompilation: no Packaged: 2020-02-06 11:02:12 UTC; mail Author: Daniel Lüdecke [aut, cre] () Repository: CRAN Date/Publication: 2020-02-06 17:50:02 UTC sjstats/build/0000755000176200001440000000000013616771064013057 5ustar liggesuserssjstats/build/vignette.rds0000644000176200001440000000050213616771064015413 0ustar liggesusersRAK0:@(l] ^ 4˜ǵKk2l!_=^:$x{7jH@f`R};Lspv7(񟚄sA|གྷC? qY+隣êheݤ `X&1L=ƑnYeiv[ >l+ΑdFb~[a+Ծ @{')PS@ @ZuBK= ͉(b!+P(( VRuuRzp8Y^ҿRCM 9u/Ge/`1#,Wk7.E*Ing/>M:In"}UTm;L~{ ְ ^C|,;qX9( ϖ&v.l@vkلO#<:*Keޡ\FL PvNȂb6薫Y prDsH纯^)6ǰW!89׏hA2Qlyi;$aE@M:s;:Xjɷ^OgϤ; Pts!C8yĞ<e2rRԏħ/[2&8D+[Uio[6$ijH Z]6Lu28GvRUwx X q %yڢ"0 pGt)F(:Í"i1Fn%d4MV:zZ;)C'/ }S`ntbi%Ġs<)Eo3GUyu/#AHGCEzTa#DX>ǙFbn8;hQH0V[Z `ʛ `u@8y i1{V||1O 7xͫa[ i 36ҷ?; ⣨G74{ ۟:)VI"NxZ ,Q%/ujg2AHM+NOcfH0vI qd/k_Yp5բpW6;C;7Ǒ|<(Q!55*eMq&-%+Z%5פ9>ɧ(<=ߧes ob]/>QY_,jvETxj='=(Ig;"Um'j)QgQIXm'ݡ쒄/QŪV"N yS %ﭸGPNe!HʭKg0U5 $eꦡn \%-vZHk1Px H aϐk(H Mo -q2f +L#-qKr.qј*l%٨8Tx >jK%`;_rv~Ջ(dOMK_"Bq˕b'UGHc4FۺYɦuSd&א^q Y(MW:Kmj)b4ZˆK9 |RV>P#X>nk$tPL7ՌR؞=Үz~e^˯_O Pcf9 "з 77N=\4WW3Erk[n)EC3Sv&/)I,N{!GP Ip3%2VNRkߑBzߑ/ɦf;R*%29sxv0u~&! 
sjstats/tests/0000755000176200001440000000000013563265750013123 5ustar liggesuserssjstats/tests/testthat/0000755000176200001440000000000013617050712014751 5ustar liggesuserssjstats/tests/testthat/test-wtd.R0000644000176200001440000000340213565713271016656 0ustar liggesusersif (require("testthat") && require("sjstats")) { context("sjstats, weighted statistics") data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, .3)) test_that("wtd", { expect_equal(weighted_sd(efc$c12hour, weights = efc$weight), 51.18224, tolerance = 1e-5) expect_equal(weighted_sd(efc$c12hour, weights = NULL), 50.80504, tolerance = 1e-5) expect_equal(weighted_mean(efc$c12hour, weights = efc$weight), 42.80723, tolerance = 1e-5) expect_equal(weighted_mean(efc$c12hour, weights = NULL), 42.39911, tolerance = 1e-5) expect_equal(weighted_se(efc$c12hour, weights = efc$weight), 1.704182, tolerance = 1e-5) expect_equal(weighted_se(efc$c12hour, weights = NULL), 1.691623, tolerance = 1e-5) expect_equal(weighted_median(efc$c12hour, weights = efc$weight), 20, tolerance = 1e-5) expect_equal(weighted_median(efc$c12hour, weights = NULL), 20, tolerance = 1e-5) }) test_that("weighted_chisqtest", { w <- weighted_chisqtest(efc, c161sex, c172code, weights = weight) expect_equal(w$estimate, 0.06668895, tolerance = 1e-5) expect_equal(w$p.value, 0.1517221, tolerance = 1e-5) w <- weighted_chisqtest(c161sex ~ c172code + weight, efc) expect_equal(w$estimate, 0.06668895, tolerance = 1e-5) expect_equal(w$p.value, 0.1517221, tolerance = 1e-5) }) test_that("weighted_mannwhitney", { weighted_mannwhitney(efc, c12hour, c161sex, weights = weight) weighted_mannwhitney(c12hour ~ c161sex + weight, efc) }) test_that("weighted_ttest", { weighted_ttest(efc, e17age, weights = weight) weighted_ttest(efc, e17age, c160age, weights = weight) weighted_ttest(e17age ~ e16sex + weight, efc) weighted_ttest(efc, e17age, c160age, weights = weight, ci.lvl = .8) }) }
sjstats/tests/testthat/test-anova_stats.R0000644000176200001440000000332713563265750020402 0ustar liggesusers.runThisTest <- Sys.getenv("RunAllsjstatsTests") == "yes" if (.runThisTest) { if (require("testthat") && require("sjstats")) { context("sjstats, anova_stats") # fit linear model data(efc) m <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) test_that("eta_sq", { eta_sq(m, partial = FALSE) eta_sq(m, partial = TRUE) eta_sq(m, partial = FALSE, ci.lvl = .5, n = 50) eta_sq(m, partial = TRUE, ci.lvl = .6, n = 50) }) test_that("omega_sq", { omega_sq(m, partial = FALSE) omega_sq(m, partial = TRUE) omega_sq(m, partial = FALSE, ci.lvl = .5, n = 50) omega_sq(m, partial = TRUE, ci.lvl = .6, n = 50) })
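    # An additional consistency check one could add here (a sketch, not part
    # of the original suite; the "etasq" column name is assumed from the
    # return value of eta_sq()): for a model with a single term, eta-squared
    # equals the R-squared of the corresponding linear model.
    test_that("eta_sq equals R2 for a single term", {
      m1 <- aov(c12hour ~ c160age, data = efc)
      m2 <- lm(c12hour ~ c160age, data = efc)
      expect_equal(eta_sq(m1)$etasq[1], summary(m2)$r.squared, tolerance = 1e-4)
    })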
test_that("cohens_f", { cohens_f(m) }) test_that("anova_stats", { anova_stats(m, digits = 3) anova_stats(m, digits = 5) anova_stats(car::Anova(m, type = 2)) anova_stats(car::Anova(m, type = 3)) }) set.seed(123) fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) test_that("omega_sq", { omega_sq(fit, partial = TRUE, ci.lvl = 0.95, n = 50) }) set.seed(123) data(mtcars) m <- stats::aov( formula = mpg ~ wt + qsec + Error(disp / am), data = mtcars ) test_that("anova_stats", { anova_stats(m, digits = 3) anova_stats(m, digits = 5) eta_sq(m, partial = TRUE, ci.lvl = 0.95, n = 10) eta_sq(m, partial = FALSE, ci.lvl = 0.95, n = 10) omega_sq(m, partial = TRUE, ci.lvl = 0.95, n = 10) omega_sq(m, partial = FALSE, ci.lvl = 0.95, n = 10) }) } } sjstats/tests/testthat/test-autoprior.R0000644000176200001440000000177313563265750020117 0ustar liggesusers.runThisTest <- Sys.getenv("RunAllsjstatsTests") == "yes" if (.runThisTest) { if (suppressWarnings( require("testthat") && require("sjstats") && require("sjmisc") && require("brms") )) { context("sjstats, autoprior") data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- to_label(efc$c161sex) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) test_that("auto_prior", { mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) expect_is(auto_prior(mf, efc, TRUE), "brmsprior") mf <- formula(neg_c_7 ~ c161sex + c160age + c172code + c12hour + e17age) expect_is(auto_prior(mf, efc, TRUE), "brmsprior") expect_error(auto_prior(mf, efc)) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) expect_is(auto_prior(mf, efc, FALSE), "brmsprior") expect_is(auto_prior(mf, efc), "brmsprior") expect_warning(auto_prior(mf, efc, TRUE)) }) } } sjstats/tests/testthat/test-grpmean.R0000644000176200001440000000161013610332406017474 0ustar liggesusersif (require("testthat") && require("sjstats") && require("dplyr")) { data(efc) efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) efc_grouped <- group_by(efc, c172code) test_that("means_by_group", { means_by_group(efc, c12hour, e42dep) }) test_that("means_by_group, weighting", { w <- "weight" means_by_group(efc, c12hour, e42dep, weights = weight) means_by_group(efc, c12hour, e42dep, weights = "weight") means_by_group(efc, c12hour, e42dep, weights = w) }) test_that("means_by_group, grouping", { means_by_group(efc_grouped, c12hour, e42dep) }) test_that("means_by_group, grouped weighting", { w <- "weight" means_by_group(efc_grouped, c12hour, e42dep, weights = weight) means_by_group(efc_grouped, c12hour, e42dep, weights = "weight") means_by_group(efc_grouped, c12hour, e42dep, weights = w) }) } sjstats/tests/testthat.R0000644000176200001440000000036413563265750015111 0ustar liggesuserslibrary(testthat) library(sjstats) if (length(strsplit(packageDescription("sjstats")$Version, "\\.")[[1]]) > 3) { Sys.setenv("RunAllsjstatsTests" = "yes") } else { Sys.setenv("RunAllsjstatsTests" = "no") } test_check("sjstats") sjstats/vignettes/0000755000176200001440000000000013616771064013770 5ustar liggesuserssjstats/vignettes/mixedmodels-statistics.Rmd0000644000176200001440000001340713616476363021147 0ustar liggesusers--- title: "Statistics for Mixed Effects Models" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Statistics for Mixed Effects Models} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r set-options, echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", 
fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE)
options(width = 800)

if (!requireNamespace("lme4", quietly = TRUE)) {
  knitr::opts_chunk$set(eval = FALSE)
}
```

# Statistics and Measures for Mixed Effects Models

This vignette demonstrates those functions of the *sjstats*-package that deal especially with mixed effects models. *sjstats* provides the following functions:

* `design_effect()` and `samplesize_mixed()`
* `scale_weights()`

Before we start, we fit a simple linear mixed model:

```{r}
library(sjstats)
library(lme4)

# load sample data
data(sleepstudy)

# fit linear mixed model
m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)

set.seed(2018)
sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE)
m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)
```

## Sample Size Calculation for Mixed Models

The first two functions, `design_effect()` and `samplesize_mixed()`, can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward; for (linear) mixed models, however, statistical power is affected by the change in the variance of the test statistics. This is what _Hsieh et al. (2003)_ call a _design effect_ (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly.

### Design Effect for Two-Level Mixed Models

`design_effect()` computes this design effect for linear mixed models with a two-level design. It requires the approximated average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel-model. Typically, the minimum assumed value for the ICC is _0.05_.

```{r}
# Design effect for two-level model with 30 observations per
# cluster group (level-2 unit) and an assumed intraclass
# correlation coefficient of 0.05.
design_effect(n = 30)

# Design effect for two-level model with 24 observations per cluster
# group and an assumed intraclass correlation coefficient of 0.2.
design_effect(n = 24, icc = 0.2)
```

### Calculating the Sample Size for Linear Mixed Models

`samplesize_mixed()` combines the functions for power calculation from the **pwr**-package with the design effect from `design_effect()`. It computes an approximated sample size for linear mixed models (two-level-designs), based on power-calculation for standard designs, adjusted for the design effect of 2-level-designs.

```{r}
# Sample size for multilevel model with 30 cluster groups and a small to
# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
# hence a total sample size of about 802 observations is needed.
samplesize_mixed(eff.size = .3, k = 30)

# Sample size for multilevel model with 20 cluster groups and a medium
# to large effect size for linear models of 0.2. Five subjects per cluster and
# hence a total sample size of about 107 observations is needed.
samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)
```

There are more ways to perform power calculations for multilevel models; however, most of these require very detailed knowledge about the sample characteristics and performing simulation studies. `samplesize_mixed()` is a more pragmatic alternative to these approaches.

## Rescale model weights for complex samples

Most functions to fit multilevel and mixed effects models only allow the specification of frequency weights, but not design (i.e.
_sampling_ or _probability_) weights, which should be used when analyzing complex samples and survey data.

`scale_weights()` implements an algorithm proposed by _Asparouhov (2006)_ and _Carle (2009)_ to rescale design weights in survey data to account for the grouping structure of multilevel models, which can then be used for multilevel modelling.

To calculate a weight-vector that can be used in multilevel models, `scale_weights()` needs the data frame with survey data as `x`-argument. This data frame should contain 1) a _cluster ID_ (argument `cluster.id`), which represents the _strata_ of the survey data (the level-2-cluster variable) and 2) the probability weights (argument `pweight`), which represent the design or sampling weights of the survey data (level-1-weight).

`scale_weights()` then returns the original data frame, including two new variables: `svywght_a`, where the sample weights `pweight` are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for `svywght_b` is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see _Carle (2009)_, Appendix B, for details).

```{r}
data(nhanes_sample)
scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR)
```

# References

Asparouhov T. 2006. _General Multi-Level Modeling with Sampling Weights._ Communications in Statistics—Theory and Methods (35): 439–460

Carle AC. 2009. _Fitting multilevel models in complex survey data with design weights: Recommendations._ BMC Medical Research Methodology 9(49): 1-13

Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. _An Overview of Variance Inflation Factors for Sample-Size Calculation._ Evaluation & the Health Professions 26: 239–257. doi: [10.1177/0163278703255230](http://doi.org/10.1177/0163278703255230)
sjstats/vignettes/bayesian-statistics.Rmd0000644000176200001440000000746613616476363020430 0ustar liggesusers---
title: "Summary of Mediation Analysis using Bayesian Regression Models"
author: "Daniel Lüdecke"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
---

```{r, SETTINGS-knitr, echo = FALSE, warning = FALSE, message = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  dev = "png",
  fig.width = 7,
  fig.height = 5,
  message = FALSE,
  warning = FALSE
)

options(width = 800)

if (!requireNamespace("mediation", quietly = TRUE) ||
    !requireNamespace("httr", quietly = TRUE) ||
    !requireNamespace("brms", quietly = TRUE) ||
    !requireNamespace("insight", quietly = TRUE)) {
  knitr::opts_chunk$set(eval = FALSE)
}
```

This vignette demonstrates the `mediation()`-function in **sjstats**. Before we start, we fit some models, including a mediation-object from the _mediation_-package, which we use for comparison with _brms_.
```{r}
library(sjstats)
library(mediation)
library(brms)

# load sample data
data(jobs)
set.seed(123)

# linear models, for mediation analysis
b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs)
b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs)

# mediation analysis, for comparison with brms
m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek")
```

```{r eval=FALSE}
# Fit Bayesian mediation model
f1 <- bf(job_seek ~ treat + econ_hard + sex + age)
f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age)

m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)
```

```{r echo=FALSE}
m2 <- insight::download_model("brms_mv_6")
```

`mediation()` is a summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects.

In the model _m2_, _treat_ is the treatment effect, *job_seek* is the mediator effect, _f1_ describes the mediator model and _f2_ describes the outcome model.

`mediation()` returns a data frame with information on the _direct effect_ (median value of posterior samples from treatment of the outcome model), _mediator effect_ (median value of posterior samples from mediator of the outcome model), _indirect effect_ (median value of the multiplication of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the _total effect_ (median value of sums of posterior samples used for the direct and indirect effect). The _proportion mediated_ is the indirect effect divided by the total effect.

The simplest call just needs the model-object.

```{r, message=TRUE}
mediation(m2)
```

Typically, `mediation()` finds the treatment and mediator variables automatically. If this does not work, use the `treatment` and `mediator` arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use `prob` to calculate a different interval.

Here is a comparison with the _mediation_ package. Note that the `summary()`-output of the _mediation_ package shows the indirect effect first, followed by the direct effect.

```{r}
summary(m1)

mediation(m2, prob = .95)
```

If you want to calculate mean instead of median values from the posterior samples, use the `typical`-argument. Furthermore, there is a `print()`-method, which allows printing more digits.

```{r, message=TRUE}
mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4)
```

As you can see, the results are similar to what the _mediation_ package produces for non-Bayesian models.

# References

Bürkner, P. C. (2017). brms: An R package for Bayesian multilevel models using Stan. Journal of Statistical Software, 80(1), 1-28
sjstats/vignettes/anova-statistics.Rmd0000644000176200001440000001221313565522642017726 0ustar liggesusers---
title: "Statistics for Anova Tables"
author: "Daniel Lüdecke"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Statistics for Anova Tables}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r set-options, echo = FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE)
options(width = 800)
```

# Effect Size Statistics for Anova Tables

This vignette demonstrates those functions of the *sjstats*-package that deal with Anova tables.
These functions report different effect size measures, which are useful beyond significance tests (p-values), because they estimate the magnitude of effects, independent of sample size. *sjstats* provides the following functions:

* `eta_sq()`
* `omega_sq()`
* `epsilon_sq()`
* `anova_stats()`

Before we start, we fit a simple model:

```{r}
library(sjstats)

# load sample data
data(efc)

# fit linear model
fit <- aov(
  c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
  data = efc
)
```

All functions accept objects of class `aov` or `anova`, so you can also use model fits from the *car* package, which allows fitting Anovas with different types of sum of squares. Other objects, like `lm`, will be coerced to `anova` internally.

The following functions return the effect size statistic as a named numeric vector, using the model's term names.

## Eta-Squared

The eta-squared is the proportion of the total variability in the dependent variable that is accounted for by the variation in the independent variable. It is the ratio of the sum of squares for each group level to the total sum of squares. It can be interpreted as percentage of variance accounted for by a variable.

For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient _r_. For variables with more than 1 degree of freedom, eta-squared equals _R2_. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses.

Eta-squared can be computed simply with:

```{r}
eta_sq(fit)
```

## Partial Eta-Squared

The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared.

Use the `partial`-argument to compute partial eta-squared values:

```{r}
eta_sq(fit, partial = TRUE)
```

## Omega-Squared

While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias.

Omega-squared can be simply computed with:

```{r}
omega_sq(fit)
```

## Partial Omega-Squared

`omega_sq()` also has a `partial`-argument to compute partial omega-squared values. Computing the partial omega-squared statistics is based on bootstrapping. In this case, use `n` to define the number of samples (1000 by default).

```{r}
omega_sq(fit, partial = TRUE, n = 100)
```

## Epsilon-Squared

Epsilon-squared is a less common measure of effect size. It is sometimes considered an "adjusted r-squared" value. You can compute this effect size using `epsilon_sq()`.

```{r}
epsilon_sq(fit)
```

When the `ci.lvl`-argument is defined, bootstrapping is used to compute the confidence intervals.
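Note that these intervals are based on random bootstrap samples, so the exact bounds will vary from run to run unless a seed is set beforehand (e.g. with `set.seed()`).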
```{r}
epsilon_sq(fit, ci.lvl = .95, n = 100)
```

# Complete Statistical Table Output

The `anova_stats()` function takes a model input and computes a comprehensive summary, including the above effect size measures, returned as a tidy data frame:

```{r}
anova_stats(fit)
```

Like the other functions, the input may also be an object of class `anova`, so you can also use model fits from the *car* package, which allows fitting Anovas with different types of sum of squares:

```{r}
anova_stats(car::Anova(fit, type = 3))
```

# Confidence Intervals

`eta_sq()` and `omega_sq()` have a `ci.lvl`-argument, which - if not `NULL` - also computes a confidence interval.

For eta-squared, i.e. `eta_sq()` with `partial = FALSE`, due to non-symmetry, confidence intervals are based on bootstrap-methods. Confidence intervals for partial omega-squared, i.e. `omega_sq()` with `partial = TRUE`, are also based on bootstrapping. In these cases, `n` indicates the number of bootstrap samples to be drawn to compute the confidence intervals.

```{r}
eta_sq(fit, partial = TRUE, ci.lvl = .8)

# uses bootstrapping - here, for speed, just 100 samples
omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100)
```

# References

Levine TR, Hullett CR. Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. Human Communication Research 28(4); 2002: 612-625
sjstats/R/0000755000176200001440000000000013616476363012165 5ustar liggesuserssjstats/R/epsilon_sq.R0000644000176200001440000000125613563265750014465 0ustar liggesusers#' @rdname eta_sq
#' @importFrom dplyr bind_cols mutate
#' @export
epsilon_sq <- function(model, ci.lvl = NULL, n = 1000, method = c("dist", "quantile")) {
  method <- match.arg(method)

  es <- aov_stat(model, type = "epsilon")

  x <- data_frame(
    term = names(es),
    es = es
  )

  if (!is.null(ci.lvl) && !is.na(ci.lvl)) {
    x <- es_boot_fun(
      model = model,
      type = "epsilon",
      ci.lvl = ci.lvl,
      n = n,
      boot.method = method
    )
  }

  colnames(x)[2] <- "epsilonsq"

  if (!is.null(attr(es, "stratum")))
    x$stratum <- attr(es, "stratum")[1:nrow(x)]

  class(x) <- c("sj_anova_stat", class(x))
  x
}
sjstats/R/nhanes_sample.R0000644000176200001440000000202513563265750015121 0ustar liggesusers#' @docType data
#' @title Sample dataset from the National Health and Nutrition Examination Survey
#' @name nhanes_sample
#' @keywords data
#'
#' @description Selected variables from the National Health and Nutrition Examination
#' Survey that are used in the example from Lumley (2010), Appendix E.
#' See \code{\link{svyglm.nb}} for examples.
#'
#' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
NULL

#' @docType data
#' @title Sample dataset from the EUROFAMCARE project
#' @name efc
#' @keywords data
#'
#' @description German data set from the European study on family care of older people.
#'
#' @references Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008.
NULL

#' @docType data
#' @title Sample dataset
#' @name fish
#' @keywords data
#'
#' @description Sample data from the UCLA idre website.
#' #' @references https://stats.idre.ucla.edu/r/dae/zip/ NULL sjstats/R/odds_to_rr.R0000644000176200001440000001300113563265750014436 0ustar liggesusers#' @title Get relative risks estimates from logistic regressions or odds ratio values #' @name odds_to_rr #' #' @description \code{odds_to_rr()} converts odds ratios from a logistic regression #' model (including mixed models) into relative risks; \code{or_to_rr()} #' converts a single odds ratio estimate into a relative risk estimate. #' #' @param fit A fitted binomial generalized linear (mixed) model with logit-link function #' (logistic (multilevel) regression model). #' @param or Numeric, an odds ratio estimate. #' @param p0 Numeric, the risk of having a positive outcome in the control or #' unexposed group (reference group), i.e. the number of outcome or "successes" #' in the control divided by the total number of observations in the control #' group. #' #' @return A data frame with relative risks and lower/upper confidence interval for #' the relative risks estimates; for \code{or_to_rr()}, the risk ratio #' estimate. #' #' @references #' Grant RL. 2014. Converting an odds ratio to a range of plausible relative risks for better communication of research findings. BMJ 348:f7450. \doi{10.1136/bmj.f7450} #' \cr \cr #' Wang Z. 2013. Converting Odds Ratio to Relative Risk in Cohort Studies with Partial Data Information. J Stat Soft 2013;55. \doi{10.18637/jss.v055.i05} #' \cr \cr #' Zhang J, Yu KF. 1998. What's the Relative Risk? A Method of Correcting the Odds Ratio in Cohort Studies of Common Outcomes. JAMA; 280(19): 1690-1. \doi{10.1001/jama.280.19.1690} #' #' #' @details This function extracts the odds ratios (exponentiated model coefficients) #' from logistic regressions (fitted with \code{glm} or \code{glmer}) #' and their related confidence intervals, and transforms these values #' into relative risks (and their related confidence intervals). #' \cr \cr #' The formula for transformation is based on Zhang and Yu (1998), #' Wang (2013) and Grant (2014): #' \code{RR <- OR / (1 - P0 + (P0 * OR))}, where \code{OR} is the odds #' ratio and \code{P0} indicates the proportion of the incidence in #' the outcome variable for the control group (reference group). #' #' @examples #' library(sjmisc) #' library(lme4) #' # create binary response #' sleepstudy$Reaction.dicho <- dicho(sleepstudy$Reaction, dich.by = "median") #' # fit model #' fit <- glmer(Reaction.dicho ~ Days + (Days | Subject), #' data = sleepstudy, family = binomial("logit")) #' # convert to relative risks #' odds_to_rr(fit) #' #' #' data(efc) #' # create binary response #' y <- ifelse(efc$neg_c_7 < median(na.omit(efc$neg_c_7)), 0, 1) #' # create data frame for fitted model #' mydf <- data.frame( #' y = as.factor(y), #' sex = to_factor(efc$c161sex), #' dep = to_factor(efc$e42dep), #' barthel = efc$barthtot, #' education = to_factor(efc$c172code) #' ) #' # fit model #' fit <- glm(y ~., data = mydf, family = binomial(link = "logit")) #' # convert to relative risks #' odds_to_rr(fit) #' #' # replicate OR/RR for coefficient "sex" from above regression #' # p0 ~ .44, or ~ 1.914 #' prop.table(table(mydf$y, mydf$sex)) #' or_to_rr(1.914, 0.1055 / (.1324 + .1055)) #' #' @importFrom stats coef confint model.frame #' @importFrom sjlabelled as_numeric #' @export odds_to_rr <- function(fit) { # check model family fitinfo <- get_glm_family(fit) # no binomial model with logit-link? 
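  # (the conversion formula from the 'Details' above, RR = OR / (1 - P0 + (P0 * OR)),
  # is only defined for odds ratios from logit-models, so other families must stop here)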
if (!fitinfo$is_bin && !fitinfo$is_logit) stop("`fit` must be a binomial model with logit-link (logistic regression).", call. = F) # get model estimates est <- insight::get_parameters(fit) est[[2]] <- exp(est[[2]]) # get confidence intervals if (is_merMod(fit)) ci <- stats::confint(fit, method = "Wald", parm = "beta_") else ci <- stats::confint(fit) # bind to data frame or.dat <- data.frame(est, exp(ci)) colnames(or.dat) <- c("Parameter", "OR", "CI_low", "CI_high") # get P0, i.e. the incidence ratio of the outcome for the # non-exposed group modfram <- insight::get_data(fit) # make sure that outcome is 0/1-numeric, so we can simply # compute the mean to get the ratio outcome <- sjmisc::recode_to(sjlabelled::as_numeric(insight::get_response(fit))) P0 <- c() for (i in 1:nrow(est)) { P0 <- c(P0, .baseline_risk_for_predictor(modfram, outcome, est[[1]][i])) } # compute relative risks for estimate and confidence intervals rr.dat <- or.dat[, 2:4] / ((1 - P0) + (P0 * or.dat[, 2:4])) rr.dat <- cbind(or.dat$Parameter, or.dat$OR, rr.dat) colnames(rr.dat) <- c("Parameter", "Odds Ratio", "Risk Ratio", "CI_low", "CI_high") rownames(rr.dat) <- NULL rr.dat } .baseline_risk_for_predictor <- function(data, outcome, parameter) { if (parameter == "(Intercept)") return(mean(outcome)) if (!(parameter %in% colnames(data))) { find.factors <- lapply(colnames(data), function(.i) { v <- data[[.i]] if (is.factor(v)) { return(paste0(.i, levels(v))) } return(.i) }) names(find.factors) <- colnames(data) parameter <- names(find.factors)[which(sapply(find.factors, function(.i) { parameter %in% .i }))] } if (is.numeric(data[[parameter]])) { mean(outcome) } else { p <- prop.table(table(data[[parameter]], outcome)) p[1, 2] / sum(p[1, ]) } } #' @rdname odds_to_rr #' @export or_to_rr <- function(or, p0) { or / (1 - p0 + (p0 * or)) } sjstats/R/wtd_median.R0000644000176200001440000000161013565520623014411 0ustar liggesusers#' @rdname weighted_sd #' @export weighted_median <- function(x, weights = NULL) { UseMethod("weighted_median") } #' @export weighted_median.default <- function(x, weights = NULL) { weighted_md_helper(x, w = weights, p = 0.5) } #' @importFrom purrr map_dbl #' @importFrom dplyr select_if #' @export weighted_median.data.frame <- function(x, weights = NULL) { dplyr::select_if(x, is.numeric) %>% purrr::map_dbl(~ weighted_md_helper(.x, w = weights, p = 0.5)) } weighted_md_helper <- function(x, w, p = .5) { if (is.null(w)) w <- rep(1, length(x)) x[is.na(w)] <- NA w[is.na(x)] <- NA w <- na.omit(w) x <- na.omit(x) order <- order(x) x <- x[order] w <- w[order] rw <- cumsum(w) / sum(w) md.values <- min(which(rw >= p)) if (rw[md.values] == p) q <- mean(x[md.values:(md.values + 1)]) else q <- x[md.values] q } sjstats/R/find_beta.R0000644000176200001440000001500513563265750014221 0ustar liggesusers#' @title Determining distribution parameters #' @name find_beta #' #' @description \code{find_beta()}, \code{find_normal()} and \code{find_cauchy()} find the #' shape, mean and standard deviation resp. the location and scale parameters #' to describe the beta, normal or cauchy distribution, based on two #' percentiles. \code{find_beta2()} finds the shape parameters for a Beta #' distribution, based on a probability value and its standard error #' or confidence intervals. #' #' @param x1 Value for the first percentile. #' @param p1 Probability of the first percentile. #' @param x2 Value for the second percentile. #' @param p2 Probability of the second percentile. #' @param x Numeric, a probability value between 0 and 1. 
#' Typically indicates a prevalence rate of an outcome of interest; or an integer value
#' with the number of observed events. In this case, specify \code{n}
#' to indicate the total number of observations.
#' @param se The standard error of \code{x}. Either \code{se} or \code{ci} must
#'   be specified.
#' @param ci The upper limit of the confidence interval of \code{x}. Either
#'   \code{se} or \code{ci} must be specified.
#' @param n Numeric, number of total observations. Needs to be specified, if
#'   \code{x} is an integer (number of observed events), and no
#'   probability. See 'Examples'.
#'
#' @return A list of length two, with the two distribution parameters that can
#'   be used to define the distribution, which (best) describes
#'   the shape for the given input parameters.
#'
#' @details These functions can be used to find parameters for various distributions,
#'   to define prior probabilities for Bayesian analyses. \code{x1},
#'   \code{p1}, \code{x2} and \code{p2} are parameters that describe two
#'   quantiles. Given this knowledge, the distribution parameters are
#'   returned. \cr \cr
#'   Use \code{find_beta2()}, if the known parameters are, e.g. a prevalence
#'   rate or similar probability, and its standard deviation or confidence
#'   interval. In this case, \code{x} should be a probability,
#'   for example a prevalence rate of a certain event. \code{se} then
#'   needs to be the standard error for this probability. Alternatively,
#'   \code{ci} can be specified, which should indicate the upper limit
#'   of the confidence interval of the probability (prevalence rate) \code{x}.
#'   If the number of events out of a total number of trials is known
#'   (e.g. 12 heads out of 30 coin tosses), \code{x} can also be the number
#'   of observed events, while \code{n} indicates the total amount of trials
#'   (in the above example, the function call would be: \code{find_beta2(x = 12, n = 30)}).
#'
#' @references Cook JD. Determining distribution parameters from quantiles.
#' 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF})
#'
#' @examples
#' # example from blogpost:
#' # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/
#' # 10% of patients respond within 30 days of treatment
#' # and 80% respond within 90 days of treatment
#' find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#'
#' parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' curve(
#'   dnorm(x, mean = parms$mean, sd = parms$sd),
#'   from = 0, to = 200
#' )
#'
#' parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8)
#' curve(
#'   dcauchy(x, location = parms$location, scale = parms$scale),
#'   from = 0, to = 200
#' )
#'
#'
#' find_beta2(x = .25, ci = .5)
#'
#' shapes <- find_beta2(x = .25, ci = .5)
#' curve(dbeta(x, shapes[[1]], shapes[[2]]))
#'
#' # find Beta distribution for 3 events out of 20 observations
#' find_beta2(x = 3, n = 20)
#'
#' shapes <- find_beta2(x = 3, n = 20)
#' curve(dbeta(x, shapes[[1]], shapes[[2]]))
#'
#' @importFrom stats pbeta approx
#' @importFrom purrr map_dbl
#' @export
find_beta <- function(x1, p1, x2, p2) {
  logK <- seq(-5, 10, length = 200)
  K <- exp(logK)

  m <- purrr::map_dbl(K, ~ betaprior(.x, x1, p1))

  prob2 <- stats::pbeta(x2, K * m, K * (1 - m))
  ind <- ((prob2 > 0) & (prob2 < 1))
  app <- stats::approx(prob2[ind], logK[ind], p2)
  K0 <- exp(app$y)
  m0 <- betaprior(K0, x1, p1)

  s1 <- K0 * m0
  s2 <- K0 * (1 - m0)

  list(shape1 = s1, shape2 = s2)
}


betaprior <- function(K, x, p) {
  m.lo <- 0
  m.hi <- 1
  flag <- TRUE
  while (flag) {
    m0 <- (m.lo + m.hi) / 2
    p0 <- stats::pbeta(x, K * m0, K * (1 - m0))
    if (p0 < p)
      m.hi <- m0
    else
      m.lo <- m0
    if (abs(p0 - p) < 1e-04) flag <- FALSE
  }
  m0
}


#' @rdname find_beta
#' @export
find_beta2 <- function(x, se, ci, n) {
  # check if all required arguments are given
  if (missing(se) && missing(ci) && missing(n)) {
    stop("Either `se` or `ci`, or `n` must be specified.", call. = F)
  }

  # for number of observations, compute variance of beta distribution
  if (!missing(n)) {
    if (!is.integer(x) && x < 1)
      stop("If `n` is given, `x` must be an integer value greater than 0.", call. = F)

    # compute 2 SD from beta variance
    bvar <- 2 * sqrt((x * n) / ((x + n)^2 * (x + n + 1)))

    # need to compute proportion
    x <- x / n

    p2 <- .95
    x2 <- x + bvar
  }

  # for standard errors, we assume a 68% quantile
  if (!missing(se)) {
    p2 <- .68
    x2 <- x + se
  }

  # for CI, we assume a 68% quantile
  if (!missing(ci)) {
    p2 <- .95
    x2 <- ci
  }

  # the probability is assumed to be the median
  p1 <- .5
  x1 <- x

  find_beta(x1, p1, x2, p2)
}


#' @importFrom stats qcauchy
#' @rdname find_beta
#' @export
find_cauchy <- function(x1, p1, x2, p2) {
  # find location parameter
  l <- (x1 * stats::qcauchy(p2) ^ -1 - x2 * stats::qcauchy(p1) ^ -1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1)
  s <- (x2 - x1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1)

  list(location = l, scale = s)
}


#' @importFrom stats qnorm
#' @rdname find_beta
#' @export
find_normal <- function(x1, p1, x2, p2) {
  # find location parameter
  mw <- (x1 * stats::qnorm(p2) ^ -1 - x2 * stats::qnorm(p1) ^ -1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1)
  stddev <- (x2 - x1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1)

  list(mean = mw, sd = stddev)
}
sjstats/R/partial_eta_sq_ci.R0000644000176200001440000000410413563265750015747 0ustar liggesusers# The function partial_eta_sq_ci() is licensed under MIT
# Original Author: David Stanley
# Code copied from https://github.com/dstanley4/apaTables
# (https://github.com/dstanley4/apaTables/blob/master/R/etaSquaredCI.R)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

partial_eta_sq_ci <- function(F.value, df1, df2, conf.level = .90) {
  F_value <- F.value
  conf_level <- conf.level

  F_limits <- confint_ncg(
    F.value = F_value,
    df.1 = df1,
    df.2 = df2,
    conf.level = conf_level
  )

  LL_lambda <- F_limits$Lower.Limit
  UL_lambda <- F_limits$Upper.Limit

  LL_partial_eta2 <- get_partial_eta2_from_lambda(lambda = LL_lambda, df1 = df1, df2 = df2)
  UL_partial_eta2 <- get_partial_eta2_from_lambda(lambda = UL_lambda, df1 = df1, df2 = df2)

  if (is.na(LL_partial_eta2)) {
    LL_partial_eta2 <- 0
  }

  if (is.na(UL_partial_eta2)) {
    UL_partial_eta2 <- 1
  }

  list(LL = LL_partial_eta2, UL = UL_partial_eta2)
}

get_partial_eta2_from_lambda <- function(lambda, df1, df2) {
  lambda / (lambda + df1 + df2 + 1)
}
sjstats/R/gof.R0000644000176200001440000000744013615753121013055 0ustar liggesusers#' @title Compute model quality
#' @name chisq_gof
#'
#' @description For logistic regression models, performs a Chi-squared
#'   goodness-of-fit-test.
#'
#' @param x A numeric vector or a \code{glm}-object.
#' @param prob Vector of probabilities (indicating the population probabilities)
#'   of the same length as \code{x}'s amount of categories / factor levels.
#'   Use \code{nrow(table(x))} to determine the amount of necessary values
#'   for \code{prob}. Only used, when \code{x} is a vector, and not a
#'   \code{glm}-object.
#' @param weights Vector with weights, used to weight \code{x}.
#'
#' @references
#' Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. \doi{10.1002/0471722146}
#'
#' @details For vectors, this function is a convenient wrapper for
#'   \code{chisq.test()}, performing a goodness-of-fit test. For
#'   \code{glm}-objects, this function performs a goodness-of-fit test.
#'   A well-fitting model shows \emph{no} significant difference between the
#'   model and the observed data, i.e. the reported p-values should be
#'   greater than 0.05.
#'
#' @return For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}.
#'   For \code{glm}-objects, an object of class \code{chisq_gof} with
#'   following values: \code{p.value}, the p-value for the goodness-of-fit test;
#'   \code{z.score}, the standardized z-score for the goodness-of-fit test;
#'   \code{rss}, the residual sums of squares term and \code{chisq}, the pearson
#'   chi-squared statistic.
#'
#' @examples
#' data(efc)
#' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
#' m <- glm(
#'   neg_c_7d ~ c161sex + barthtot + c172code,
#'   data = efc,
#'   family = binomial(link = "logit")
#' )
#'
#' # goodness-of-fit test for logistic regression
#' chisq_gof(m)
#'
#' # goodness-of-fit test for vectors against probabilities
#' # differing from population
#' chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28))
#'
#' # equal to population
#' chisq_gof(efc$e42dep, prop.table(table(efc$e42dep)))
#'
#' @importFrom stats na.omit fitted resid formula as.formula lm pnorm chisq.test
#' @export
chisq_gof <- function(x, prob = NULL, weights = NULL) {
  if (inherits(x, "glm")) {

    # This is an adapted version from the
    # "binomTools" package. The "X2GOFtest()"
    # function did not work when model data frame
    # had missing values.

    y_hat <- stats::fitted(x)
    wt <- x$prior.weight
    vJ <- wt * y_hat * (1 - y_hat)
    cJ <- (1 - 2 * y_hat) / vJ
    X2 <- sum(stats::resid(x, type = "pearson")^2)

    form <- stats::as.formula(x$formula)
    form[[2]] <- as.name("cJ")

    # use model matrix instead of data values,
    # because data may contain more variables
    # than needed, and due to missing may have
    # different row length

    dat <- stats::na.omit(x$model)
    dat$cJ <- cJ
    dat$vJ <- vJ

    RSS <- sum(stats::resid(stats::lm(form, data = dat, weights = vJ))^2)
    A <- 2 * (length(y_hat) - sum(1 / wt))
    z <- (X2 - x$df.residual) / sqrt(A + RSS)
    p.value <- 2 * stats::pnorm(abs(z), lower.tail = FALSE)

    chi2gof <- list(
      p.value = p.value,
      z.score = z,
      rss = RSS,
      chisq = X2
    )

    class(chi2gof) <- c("sj_chi2gof", "list")
  } else {
    # check if we have probs
    if (is.null(prob)) {
      warning("`prob` needs to be specified.", call. = F)
      return(invisible(NULL))
    }

    # performs a Chi-square goodness-of-fit-test
    if (!is.null(weights)) x <- weight(x, weights)
    dummy <- as.vector(table(x))

    # goodness of fit-test. x is one-dimensional and
x is one-dimensional and # y not given chi2gof <- stats::chisq.test(dummy, p = prob) } chi2gof } sjstats/R/boot_ci.R0000644000176200001440000001763313563265750013735 0ustar liggesusers#' @title Standard error and confidence intervals for bootstrapped estimates #' @name boot_ci #' #' @description Compute nonparametric bootstrap estimate, standard error, #' confidence intervals and p-value for a vector of bootstrap #' replicate estimates. #' #' @param data A data frame that containts the vector with bootstrapped #' estimates, or directly the vector (see 'Examples'). #' @param ci.lvl Numeric, the level of the confidence intervals. #' @param ... Optional, unquoted names of variables with bootstrapped estimates. #' Required, if either \code{data} is a data frame (and no vector), #' and only selected variables from \code{data} should be processed. #' You may also use functions like \code{:} or tidyselect's #' \code{\link[tidyselect]{select_helpers}}. #' @param method Character vector, indicating if confidence intervals should be #' based on bootstrap standard error, multiplied by the value of the #' quantile function of the t-distribution (default), or on sample #' quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. #' May be abbreviated. #' #' @return A \code{\link[tibble]{tibble}} with either bootstrap estimate, #' standard error, the lower and upper confidence intervals or the #' p-value for all bootstrapped estimates. #' #' @details The methods require one or more vectors of bootstrap replicate estimates #' as input. #' \itemize{ #' \item{ #' \code{boot_est()} returns the bootstrapped estimate, simply by #' computing the mean value of all bootstrap estimates. #' } #' \item{ #' \code{boot_se()} computes the nonparametric bootstrap standard #' error by calculating the standard deviation of the input vector. #' } #' \item{ #' The mean value of the input vector and its standard error is used #' by \code{boot_ci()} to calculate the lower and upper confidence #' interval, assuming a t-distribution of bootstrap estimate replicates #' (for \code{method = "dist"}, the default, which is #' \code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for #' \code{method = "quantile"}, 95\% sample quantiles are used to compute #' the confidence intervals (\code{quantile(x, probs = c(.025, .975))}). #' Use \code{ci.lvl} to change the level for the confidence interval. #' } #' \item{ #' P-values from \code{boot_p()} are also based on t-statistics, #' assuming normal distribution. #' } #' } #' #' @references Carpenter J, Bithell J. Bootstrap confdence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164 #' #' @seealso \code{\link{bootstrap}} to generate nonparametric bootstrap samples. #' #' @examples #' library(dplyr) #' library(purrr) #' data(efc) #' bs <- bootstrap(efc, 100) #' #' # now run models for each bootstrapped sample #' bs$models <- map(bs$strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)) #' #' # extract coefficient "dependency" and "gender" from each model #' bs$dependency <- map_dbl(bs$models, ~coef(.x)[2]) #' bs$gender <- map_dbl(bs$models, ~coef(.x)[3]) #' #' # get bootstrapped confidence intervals #' boot_ci(bs$dependency) #' #' # compare with model fit #' fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc) #' confint(fit)[2, ] #' #' # alternative function calls. 
#' boot_ci(bs$dependency)
#' boot_ci(bs, dependency)
#' boot_ci(bs, dependency, gender)
#' boot_ci(bs, dependency, gender, method = "q")
#'
#'
#' # compare coefficients
#' mean(bs$dependency)
#' boot_est(bs$dependency)
#' coef(fit)[2]
#'
#'
#' # bootstrap() and boot_ci() work fine within pipe-chains
#' efc %>%
#'   bootstrap(100) %>%
#'   mutate(
#'     models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)),
#'     dependency = map_dbl(models, ~coef(.x)[2])
#'   ) %>%
#'   boot_ci(dependency)
#'
#' # check p-value
#' boot_p(bs$gender)
#' summary(fit)$coefficients[3, ]
#'
#' \dontrun{
#' # 'spread_coef()' from the 'sjmisc'-package makes it easy to generate
#' # bootstrapped statistics like confidence intervals or p-values
#' library(dplyr)
#' library(sjmisc)
#' efc %>%
#'   # generate bootstrap replicates
#'   bootstrap(100) %>%
#'   # apply lm to all bootstrapped data sets
#'   mutate(
#'     models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x))
#'   ) %>%
#'   # spread model coefficient for all 100 models
#'   spread_coef(models) %>%
#'   # compute the CI for all bootstrapped model coefficients
#'   boot_ci(e42dep, c161sex, c172code)
#'
#' # or...
#' efc %>%
#'   # generate bootstrap replicates
#'   bootstrap(100) %>%
#'   # apply lm to all bootstrapped data sets
#'   mutate(
#'     models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x))
#'   ) %>%
#'   # spread model coefficient for all 100 models
#'   spread_coef(models, append = FALSE) %>%
#'   # compute the CI for all bootstrapped model coefficients
#'   boot_ci()}
#'
#' @importFrom stats qt quantile
#' @importFrom dplyr quos
#' @importFrom rlang .data
#' @export
boot_ci <- function(data, ..., method = c("dist", "quantile"), ci.lvl = .95) {
  # match arguments
  method <- match.arg(method)

  # evaluate arguments, generate data
  .dat <- get_dot_data(data, dplyr::quos(...))

  # compute confidence intervals for all values
  transform_boot_result(lapply(.dat, function(x) {
    # check if method should be based on t-distribution of
    # bootstrap values or quantiles
    if (method == "dist") {
      # get bootstrap standard error
      bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = T)
      # lower and upper confidence interval
      ci <- mean(x, na.rm = T) + c(-bootse, bootse)
    } else {
      # CI based on quantiles of bootstrapped values
      ci <- stats::quantile(x, probs = c((1 - ci.lvl) / 2, (1 + ci.lvl) / 2))
    }
    # give proper names
    names(ci) <- c("conf.low", "conf.high")
    ci
  }))
}


#' @rdname boot_ci
#' @importFrom stats sd
#' @export
boot_se <- function(data, ...) {
  # evaluate arguments, generate data
  .dat <- get_dot_data(data, dplyr::quos(...))

  # compute standard errors for all values
  transform_boot_result(lapply(.dat, function(x) {
    # get bootstrap standard error
    se <- stats::sd(x, na.rm = T)
    names(se) <- "std.err"
    se
  }))
}


#' @rdname boot_ci
#' @importFrom stats sd pt
#' @export
boot_p <- function(data, ...) {
  # evaluate arguments, generate data
  .dat <- get_dot_data(data, dplyr::quos(...))

  # compute p-values for all values
  transform_boot_result(lapply(.dat, function(x) {
    # compute t-statistic
    t.stat <- mean(x, na.rm = T) / stats::sd(x, na.rm = T)
    # compute p-value
    p <- 2 * stats::pt(abs(t.stat), df = length(x) - 1, lower.tail = FALSE)
    names(p) <- "p.value"
    p
  }))
}


#' @rdname boot_ci
#' @export
boot_est <- function(data, ...)
{ # evaluate arguments, generate data .dat <- get_dot_data(data, dplyr::quos(...)) # compute mean for all values (= bootstrapped estimate) transform_boot_result(lapply(.dat, function(x) { estimate <- mean(x, na.rm = T) names(estimate) <- "estimate" estimate })) } transform_boot_result <- function(res) { # transform a bit, so we have each estimate in a row, and ci's as columns... res %>% as.data.frame() %>% t() %>% as.data.frame() %>% rownames_as_column(var = "term") } #' @importFrom dplyr select get_dot_data <- function(x, qs) { if (sjmisc::is_empty(qs)) as.data.frame(x) else suppressWarnings(dplyr::select(x, !!!qs)) } sjstats/R/svyglmnb.R0000644000176200001440000001220513616514147014142 0ustar liggesusersutils::globalVariables("scaled.weights") #' @title Survey-weighted negative binomial generalised linear model #' @name svyglm.nb #' @description \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package #' to fit survey-weighted negative binomial models. It uses #' \code{\link[survey]{svymle}} to fit sampling-weighted #' maximum likelihood estimates, based on starting values provided #' by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley #' (2010, pp249)}. #' #' #' @param formula An object of class \code{formula}, i.e. a symbolic description #' of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}. #' @param design An object of class \code{\link[survey]{svydesign}}, providing #' a specification of the survey design. #' @param ... Other arguments passed down to \code{\link[MASS]{glm.nb}}. #' #' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, #' with some additional information about the model. #' #' @details For details on the computation method, see Lumley (2010), Appendix E #' (especially 254ff.) #' \cr \cr #' \pkg{sjstats} implements following S3-methods for \code{svyglm.nb}-objects: #' \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, #' \code{predict()} and \code{residuals()}. However, these functions have some #' limitations: #' \itemize{ #' \item{\code{family()} simply returns the family-object from the #' underlying \code{\link[MASS]{glm.nb}}-model.} #' \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model #' with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} #' from this model-object with the coefficients from the returned #' \code{\link[survey]{svymle}}-object and finally calls #' \code{\link[stats]{predict.glm}} to compute the predicted values.} #' \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with #' \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals #' from the \code{glm.nb}-object.} #' } #' #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley #' #' @examples #' # ------------------------------------------ #' # This example reproduces the results from #' # Lumley 2010, figure E.7 (Appendix E, p256) #' # ------------------------------------------ #' if (require("survey")) { #' data(nhanes_sample) #' #' # create survey design #' des <- svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' #' # fit negative binomial regression #' fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) #' #' # print coefficients and standard errors #' fit #' } #' @importFrom MASS glm.nb #' @importFrom stats weights update model.frame coef as.formula family #' @export svyglm.nb <- function(formula, design, ...) 
{
  # check if pkg survey is available
  if (!requireNamespace("survey", quietly = TRUE)) {
    stop("Package `survey` needed for this function to work. Please install it.", call. = FALSE)
  }

  # get design weights. we need to scale these weights for the glm.nb() function
  dw <- stats::weights(design)

  # update design with scaled weights
  design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))

  # fit negative binomial model, with scaled design weights
  mod <- MASS::glm.nb(formula, data = stats::model.frame(design), weights = scaled.weights, ...)
  fam <- stats::family(mod)

  # fit survey model, using maximum likelihood estimation
  svyfit <- survey::svymle(
    loglike = sjstats_loglik,
    grad = sjstats_score,
    design = design,
    formulas = list(theta = ~1, eta = formula),
    start = c(mod$theta, stats::coef(mod)),
    na.action = "na.omit"
  )

  # add additional information
  class(svyfit) <- c("svyglm.nb", class(svyfit))
  attr(svyfit, "nb.terms") <- all.vars(formula)
  attr(svyfit, "nb.formula") <- formula
  attr(svyfit, "family") <- fam
  attr(svyfit, "nb.theta") <- mod[["theta"]]
  attr(svyfit, "nb.theta.se") <- mod[["SE.theta"]]

  svyfit$deviance <- mod$deviance
  svyfit$df.residuals <- mod$df.residuals
  svyfit$df <- length(stats::coef(mod)) + 1
  svyfit$aic <- mod$aic

  svyfit
}


# log-likelihood function used in "svymle()"
sjstats_loglik <- function(y, theta, eta) {
  mu <- exp(eta)
  return(
    lgamma(theta + y) - lgamma(theta) - lgamma(y + 1) + theta * log(theta) +
      y * log(mu + (y == 0)) - (theta + y) * log(theta + mu)
  )
}

# derivative with respect to eta
sjstats_deta <- function(y, theta, eta) {
  mu <- exp(eta)
  dmu <- y / mu - (theta + y) / (theta + mu)
  dmu * mu
}

# derivative with respect to theta
sjstats_dtheta <- function(y, theta, eta) {
  mu <- exp(eta)
  digamma(theta + y) - digamma(theta) + log(theta) + 1 - log(theta + mu) - (y + theta) / (mu + theta)
}

# score function, combines derivatives
sjstats_score <- function(y, theta, eta) {
  cbind(sjstats_dtheta(y, theta, eta), sjstats_deta(y, theta, eta))
}
sjstats/R/eta_sq.R0000644000176200001440000001626413563265750013566 0ustar liggesusers#' @title Effect size statistics for anova
#' @name eta_sq
#' @description Returns the (partial) eta-squared, (partial) omega-squared,
#'   epsilon-squared statistic or Cohen's F for all terms in an anova.
#'   \code{anova_stats()} returns a tidy summary, including all these statistics
#'   and power for each term.
#'
#' @param model A fitted anova-model of class \code{aov} or \code{anova}. Other
#'   models are coerced to \code{\link[stats]{anova}}.
#' @param partial Logical, if \code{TRUE}, the partial eta-squared is returned.
#' @param digits Number of decimal points in the returned data frame.
#' @param ci.lvl Scalar between 0 and 1. If not \code{NULL}, returns a data
#'   frame with effect sizes including lower and upper confidence intervals.
#'
#' @inheritParams bootstrap
#' @inheritParams boot_ci
#'
#' @return A data frame with the term name(s) and effect size statistics; if
#'   \code{ci.lvl} is not \code{NULL}, a data frame including lower and
#'   upper confidence intervals is returned. For \code{anova_stats()}, a tidy
#'   data frame with all statistics is returned (excluding confidence intervals).
#'
#' @details For \code{eta_sq()} (with \code{partial = FALSE}), due to
#'   non-symmetry, confidence intervals are based on bootstrap-methods. In this
#'   case, \code{n} indicates the number of bootstrap samples to be drawn to
#'   compute the confidence intervals. Confidence intervals for partial
#'   omega-squared and epsilon-squared are also based on bootstrapping.
#' \cr \cr #' Since bootstrapped confidence intervals are based on the bootstrap standard error #' (i.e. \code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x))}, bounds of #' the confidence interval may be negative. Use \code{method = "quantile"} to #' make sure that the confidence intervals are always positive. #' #' @references Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research (\href{https://www.msu.edu/~levinet/eta\%20squared\%20hcr.pdf}{pdf}) #' \cr \cr #' Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. (\href{http://www.scsug.org/wp-content/uploads/2016/11/Ad-Hoc-Method-for-Computing-Effect-Size-for-Mixed-Models_PROCEEDINGS-UPDATE-1.pdf}{pdf}) #' #' @examples #' # load sample data #' data(efc) #' #' # fit linear model #' fit <- aov( #' c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, #' data = efc #' ) #' #' eta_sq(fit) #' omega_sq(fit) #' eta_sq(fit, partial = TRUE) #' eta_sq(fit, partial = TRUE, ci.lvl = .8) #' #' anova_stats(car::Anova(fit, type = 2)) #' #' @export eta_sq <- function(model, partial = FALSE, ci.lvl = NULL, n = 1000, method = c("dist", "quantile")) { method <- match.arg(method) if (partial) type <- "peta" else type <- "eta" es <- aov_stat(model, type = type) x <- data_frame( term = names(es), es = es ) if (partial) { if (!is.null(ci.lvl) && !is.na(ci.lvl)) { x <- dplyr::bind_cols(x, peta_sq_ci(aov.sum = aov_stat_summary(model), ci.lvl = ci.lvl)) } } else { if (!is.null(ci.lvl) && !is.na(ci.lvl)) { x <- es_boot_fun( model = model, type = "eta", ci.lvl = ci.lvl, n = n, boot.method = method ) } } colnames(x)[2] <- dplyr::case_when( type == "eta" ~ "etasq", type == "peta" ~ "partial.etasq", TRUE ~ "effect.size" ) if (!is.null(attr(es, "stratum"))) x$stratum <- attr(es, "stratum")[1:nrow(x)] class(x) <- c("sj_anova_stat", class(x)) x } #' @importFrom purrr map_df peta_sq_ci <- function(aov.sum, ci.lvl = .95) { rows <- nrow(aov.sum) - 1 df.den <- aov.sum[["df"]][rows + 1] purrr::map_df( 1:rows, function(.x) { df.num = aov.sum[.x, "df"] test.stat <- aov.sum[.x, "statistic"] if (!is.na(test.stat)) { ci <- partial_eta_sq_ci( F.value = test.stat, df1 = df.num, df2 = df.den, conf.level = ci.lvl ) data.frame( conf.low = ci$LL, conf.high = ci$UL ) } else { data.frame( conf.low = NA, conf.high = NA ) } } ) } #' @importFrom purrr map map_df #' @importFrom dplyr bind_cols mutate case_when pull #' @importFrom stats anova formula aov #' @importFrom sjmisc rotate_df #' @importFrom insight get_data es_boot_fun <- function(model, type, ci.lvl, n, boot.method = "dist") { if (inherits(model, "anova") || is.data.frame(model)) { if (type == "pomega") stop("Objects of class `anova` or `data.frame` not supported for partial Omega squared statistics.", call. = FALSE) else if (type == "eta") stop("Objects of class `anova` or `data.frame` not supported for Eta squared statistics.", call. = FALSE) else stop("Objects of class `anova` or `data.frame` not supported.", call. = FALSE) } es <- aov_stat(model = model, type = type) x <- data_frame( term = names(es), es = es ) # need special handling for repeated measure anova here if (inherits(model, "aovlist")) { mdata <- insight::get_data(model) mformula <- stats::formula(attr(model, "terms")) # this is a bit sloppy, but I need to catch all exceptions here # if we have a 1-way-anova, map() could return a column with # one value per row (a vector). 
However, if the model has more # covariates/factors, map() returns a list-colum with 3 values # per row, which need to be spread into a 3 columns data frame. es <- mdata %>% bootstrap(n = n) %>% dplyr::mutate(es = purrr::map( .data$strap, function(i) { m <- stats::aov(mformula, data = i) dat <- aov_stat(m, type = type) sjmisc::rotate_df(as.data.frame(dat)) } )) %>% dplyr::pull(2) %>% purrr::map_df(~ .x) %>% boot_ci(ci.lvl = ci.lvl, method = boot.method) } else { mdata <- insight::get_data(model) mformula <- stats::formula(model) # this is a bit sloppy, but I need to catch all exceptions here # if we have a 1-way-anova, map() could return a column with # one value per row (a vector). However, if the model has more # covariates/factors, map() returns a list-colum with 3 values # per row, which need to be spread into a 3 columns data frame. es <- mdata %>% bootstrap(n = n) %>% dplyr::mutate(es = purrr::map( .data$strap, function(i) { m <- lm(mformula, data = i) dat <- aov_stat(m, type = type) sjmisc::rotate_df(as.data.frame(dat)) } )) %>% dplyr::pull(2) %>% purrr::map_df(~ .x) %>% boot_ci(ci.lvl = ci.lvl, method = boot.method) } x <- dplyr::bind_cols(x, es[1:nrow(x), -1, drop = FALSE]) colnames(x)[2] <- dplyr::case_when( type == "eta" ~ "etasq", type == "epsilon" ~ "epsilonsq", type == "peta" ~ "partial.etasq", type == "omega" ~ "omegasq", type == "pomega" ~ "partial.omegasq", TRUE ~ "effect.size" ) x } sjstats/R/grpmean.R0000644000176200001440000002520713616514432013735 0ustar liggesusers#' @title Summary of mean values by group #' @name means_by_group #' #' @description Computes mean, sd and se for each sub-group (indicated by \code{grp}) #' of \code{dv}. #' #' @param x A (grouped) data frame. #' @param dv Name of the dependent variable, for which the mean value, grouped #' by \code{grp}, is computed. #' @param grp Factor with the cross-classifying variable, where \code{dv} is #' grouped into the categories represented by \code{grp}. Numeric vectors #' are coerced to factors. #' @param weights Name of variable in \code{x} that indicated the vector of #' weights that will be applied to weight all observations. Default is #' \code{NULL}, so no weights are used. #' @param digits Numeric, amount of digits after decimal point when rounding #' estimates and values. #' @param file Destination file, if the output should be saved as file. #' Only used when \code{out} is not \code{"txt"}. #' @param encoding Character vector, indicating the charset encoding used #' for variable and value labels. Default is \code{"UTF-8"}. Only used #' when \code{out} is not \code{"txt"}. #' @param out Character vector, indicating whether the results should be printed #' to console (\code{out = "txt"}) or as HTML-table in the viewer-pane #' (\code{out = "viewer"}) or browser (\code{out = "browser"}), of if the #' results should be plotted (\code{out = "plot"}, only applies to certain #' functions). May be abbreviated. #' #' @return For non-grouped data frames, \code{means_by_group()} returns a data frame with #' following columns: \code{term}, \code{mean}, \code{N}, \code{std.dev}, #' \code{std.error} and \code{p.value}. For grouped data frames, returns #' a list of such data frames. #' #' @details This function performs a One-Way-Anova with \code{dv} as dependent #' and \code{grp} as independent variable, by calling #' \code{lm(count ~ as.factor(grp))}. Then \code{\link[emmeans]{contrast}} #' is called to get p-values for each sub-group. 
P-values indicate whether #' each group-mean is significantly different from the total mean. #' #' @examples #' data(efc) #' means_by_group(efc, c12hour, e42dep) #' #' data(iris) #' means_by_group(iris, Sepal.Width, Species) #' #' # also works for grouped data frames #' if (require("dplyr")) { #' efc %>% #' group_by(c172code) %>% #' means_by_group(c12hour, e42dep) #' } #' #' # weighting #' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) #' means_by_group(efc, c12hour, e42dep, weights = weight) #' @importFrom sjlabelled get_label drop_labels get_labels #' @importFrom stats lm na.omit sd weighted.mean #' @importFrom purrr map_chr map_df #' @importFrom sjmisc to_value is_empty #' @importFrom rlang enquo .data quo_name #' @export means_by_group <- function(x, dv, grp, weights = NULL, digits = 2, out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL) { out <- match.arg(out) if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) { message("Package `sjPlot` needs to be loaded to print HTML tables.") out <- "txt" } # create quosures grp.name <- rlang::quo_name(rlang::enquo(grp)) dv.name <- rlang::quo_name(rlang::enquo(dv)) # weights need extra checking, might be NULL if (!missing(weights)) { .weights <- try(rlang::quo_name(rlang::enquo(weights)), silent = TRUE) if (inherits(.weights, "try-error")) .weights <- NULL w.string <- try(eval(weights), silent = TRUE) if (!inherits(w.string, "try-error") && !is.null(w.string) && is.character(w.string)) .weights <- w.string if (sjmisc::is_empty(.weights) || .weights == "NULL") .weights <- NULL } else .weights <- NULL # create string with variable names vars <- c(grp.name, dv.name, .weights) # get data x <- suppressMessages(dplyr::select(x, !! vars)) # set value and row labels varGrpLabel <- sjlabelled::get_label(x[[grp.name]], def.value = grp.name) varCountLabel <- sjlabelled::get_label(x[[dv.name]], def.value = dv.name) # first, drop unused labels x[[grp.name]] <- sjlabelled::drop_labels(x[[grp.name]], drop.na = TRUE) # now get valid value labels value.labels <- sjlabelled::get_labels( x[[grp.name]], attr.only = F, values = "n", non.labelled = TRUE ) # return values dataframes <- list() # do we have a grouped data frame? 
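  # if so, the helper below is called once per subgroup, and the resulting
  # summary tables are collected in the `dataframes` list; for non-grouped
  # data, a single summary data frame is returned instead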
if (inherits(x, "grouped_df")) { # get grouped data grps <- get_grouped_data(x) # now plot everything for (i in seq_len(nrow(grps))) { # copy back labels to grouped data frame tmp <- sjlabelled::copy_labels(grps$data[[i]], x) # get grouped means table dummy <- means_by_group_helper( x = tmp, dv = dv.name, grp = grp.name, weight.by = .weights, digits = digits, value.labels = value.labels, varCountLabel = varCountLabel, varGrpLabel = varGrpLabel ) attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = "\n") # save data frame for return value dataframes[[length(dataframes) + 1]] <- dummy } # add class-attr for print-method() if (out == "txt") class(dataframes) <- c("sj_grpmeans", "list") else class(dataframes) <- c("sjt_grpmeans", "list") } else { dataframes <- means_by_group_helper( x = x, dv = dv.name, grp = grp.name, weight.by = .weights, digits = digits, value.labels = value.labels, varCountLabel = varCountLabel, varGrpLabel = varGrpLabel ) # add class-attr for print-method() if (out == "txt") class(dataframes) <- c("sj_grpmean", class(dataframes)) else class(dataframes) <- c("sjt_grpmean", class(dataframes)) } # save how to print output attr(dataframes, "print") <- out attr(dataframes, "encoding") <- encoding attr(dataframes, "file") <- file dataframes } #' @importFrom stats pf lm weighted.mean na.omit sd #' @importFrom sjmisc to_value add_variables #' @importFrom emmeans emmeans contrast #' @importFrom dplyr pull select n_distinct #' @importFrom purrr map_chr #' @importFrom rlang .data means_by_group_helper <- function(x, dv, grp, weight.by, digits, value.labels, varCountLabel, varGrpLabel) { # copy vectors from data frame dv <- x[[dv]] grp <- x[[grp]] if (!is.null(weight.by)) weight.by <- x[[weight.by]] else weight.by <- 1 # convert values to numeric dv <- sjmisc::to_value(dv) # create data frame, for emmeans mydf <- stats::na.omit(data.frame( dv = dv, grp = as.factor(grp), weight.by = weight.by )) # compute anova statistics for mean table fit <- stats::lm(dv ~ grp, weights = weight.by, data = mydf) # p-values of contrast-means means.p <- fit %>% emmeans::emmeans(specs = "grp") %>% emmeans::contrast(method = "eff") %>% summary() %>% dplyr::pull("p.value") # create string with p-values pval <- purrr::map_chr(seq_len(length(means.p)), function(i) { if (means.p[i] < 0.001) { "<0.001" } else { sprintf("%.*f", digits, means.p[i]) } }) ## TODO # efc %>% # group_by(c172code, c161sex) %>% # means_by_group(c12hour, e42dep) # check if value labels length matches group count if (dplyr::n_distinct(mydf$grp) != length(value.labels)) { # get unique factor levels and check if these are numeric. 
# if so, we match the values from value labels and the remaining # factor levels, so we get the correct value labels for printing nl <- unique(mydf$grp) if (sjmisc::is_num_fac(nl)) value.labels <- value.labels[names(value.labels) %in% levels(nl)] else value.labels <- nl } # create summary dat <- mydf %>% dplyr::group_by(.data$grp) %>% summarise( mean = sprintf("%.*f", digits, stats::weighted.mean(.data$dv, w = .data$weight.by, na.rm = TRUE)), N = round(sum(.data$weight.by)), std.dev = sprintf("%.*f", digits, weighted_sd(.data$dv, .data$weight.by)), std.error = sprintf("%.*f", digits, weighted_se(.data$dv, .data$weight.by)) ) %>% mutate(p.value = pval) %>% dplyr::select(-.data$grp) # finally, add total-row dat <- dplyr::bind_rows( dat, data_frame( mean = sprintf("%.*f", digits, stats::weighted.mean(mydf$dv, w = mydf$weight.by, na.rm = TRUE)), N = nrow(mydf), std.dev = sprintf("%.*f", digits, weighted_sd(mydf$dv, mydf$weight.by)), std.error = sprintf("%.*f", digits, weighted_se(mydf$dv, mydf$weight.by)), p.value = "" ) ) # add row labels dat <- sjmisc::add_variables( dat, term = c(unname(value.labels), "Total"), .after = -1 ) # get anova statistics for mean table sum.fit <- summary(fit) # r-squared values r2 <- sum.fit$r.squared r2.adj <- sum.fit$adj.r.squared # F-statistics fstat <- sum.fit$fstatistic pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = F) # copy as attributes attr(dat, "r2") <- r2 attr(dat, "adj.r2") <- r2.adj attr(dat, "fstat") <- fstat[1] attr(dat, "p.value") <- pval attr(dat, "dv.label") <- varCountLabel attr(dat, "grp.label") <- varGrpLabel dat } get_grouped_title <- function(x, grps, i, sep = "\n") { # create title for first grouping level tp <- get_title_part(x, grps, 1, i) title <- sprintf("%s: %s", tp[1], tp[2]) # do we have another groupng variable? if (length(dplyr::group_vars(x)) > 1) { tp <- get_title_part(x, grps, 2, i) title <- sprintf("%s%s%s: %s", title, sep, tp[1], tp[2]) } # return title title } get_title_part <- function(x, grps, level, i) { # prepare title for group var.name <- colnames(grps)[level] # get values from value labels vals <- sjlabelled::get_values(x[[var.name]]) # if we have no value labels, get values directly if (is.null(vals)) { vals <- unique(x[[var.name]]) lab.pos <- i } else { # find position of value labels for current group lab.pos <- which(vals == grps[[var.name]][i]) } # get variable and value labels t1 <- sjlabelled::get_label(x[[var.name]], def.value = var.name) t2 <- sjlabelled::get_labels(x[[var.name]])[lab.pos] # if we have no value label, use value instead if (is.null(t2)) t2 <- vals[lab.pos] # generate title c(t1, t2) } #' @rdname means_by_group #' @export grpmean <- means_by_group sjstats/R/S3-methods.R0000644000176200001440000005024113610333346014223 0ustar liggesusers#' @importFrom stats formula #' @export model.matrix.gls <- function(object, ...) { if (!requireNamespace("nlme")) stop("Package `nlme` is required, please install it first.", call. = FALSE) cbind( `(Intercept)` = 1, nlme::getData(object)[, all.vars(stats::formula(object))] ) } #' @importFrom stats coef vcov pnorm #' @importFrom dplyr case_when #' @export print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.nb(x, digits, v_se = se)[-1, -2] pan <- dplyr::case_when( sm$p.value < 0.001 ~ "<0.001 ***", sm$p.value < 0.01 ~ sprintf("%.*f ** ", digits, sm$p.value), sm$p.value < 0.05 ~ sprintf("%.*f * ", digits, sm$p.value), sm$p.value < 0.1 ~ sprintf("%.*f . 
", digits, sm$p.value), TRUE ~ sprintf("%.*f ", digits, sm$p.value) ) sm$p.value <- pan print(sm, ...) # add dispersion parameter cat(sprintf("\nDispersion parameter Theta: %.*f", digits, attr(x, "nb.theta", exact = TRUE))) cat(sprintf("\n Standard Error of Theta: %.*f", digits, attr(x, "nb.theta.se", exact = TRUE))) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } #' @importFrom stats coef vcov pnorm #' @importFrom dplyr case_when #' @export print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.zip(x, digits, v_se = se)[-1, ] pan <- dplyr::case_when( sm$p.value < 0.001 ~ "<0.001 ***", sm$p.value < 0.01 ~ sprintf("%.*f ** ", digits, sm$p.value), sm$p.value < 0.05 ~ sprintf("%.*f * ", digits, sm$p.value), sm$p.value < 0.1 ~ sprintf("%.*f . ", digits, sm$p.value), TRUE ~ sprintf("%.*f ", digits, sm$p.value) ) sm$p.value <- pan print(sm, ...) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } #' @importFrom stats qnorm coef pnorm vcov tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), irr = round(exp(est), digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(.975) * se), digits), conf.high = round(exp(est + stats::qnorm(.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } #' @importFrom stats qnorm coef pnorm vcov tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(.975) * se), digits), conf.high = round(exp(est + stats::qnorm(.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } #' @importFrom dplyr select #' @export model.frame.svyglm.nb <- function(formula, ...) { pred <- attr(formula, "nb.terms", exact = T) dplyr::select(formula$design$variables, string_one_of(pattern = pred, x = colnames(formula$design$variables))) } #' @importFrom dplyr select #' @export model.frame.svyglm.zip <- function(formula, ...) { pred <- attr(formula, "zip.terms", exact = T) dplyr::select(formula$design$variables, string_one_of(pattern = pred, x = colnames(formula$design$variables))) } #' @export family.svyglm.nb <- function(object, ...) { attr(object, "family", exact = TRUE) } #' @export formula.svyglm.nb <- function(x, ...) { attr(x, "nb.formula", exact = TRUE) } #' @export formula.svyglm.zip <- function(x, ...) { attr(x, "zip.formula", exact = TRUE) } #' @importFrom MASS glm.nb #' @importFrom stats coef setNames predict.glm #' @export predict.svyglm.nb <- function(object, newdata = NULL, type = c("link", "response", "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, na.action = na.pass, ...) 
{ if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) cf <- stats::coef(fnb) names.cf <- names(cf) cf <- stats::coef(object)[-1] cf <- stats::setNames(cf, names.cf) fnb$coefficients <- cf stats::predict.glm( object = fnb, newdata = newdata, type = type, se.fit = se.fit, dispersion = dispersion, terms = terms, na.action = na.action, ... ) } #' @importFrom MASS glm.nb #' @importFrom stats coef setNames predict.glm #' @importFrom insight get_response #' @export residuals.svyglm.nb <- function(object, ...) { if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) y <- insight::get_response(fnb) mu <- stats::predict.glm(fnb, type = "response") wts <- fnb$prior.weights (y - mu) * sqrt(wts) / sqrt(fnb$family$variance(mu)) } #' @importFrom stats terms formula #' @export terms.svyglm.nb <- function(x, ...) { if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) stats::terms(stats::formula(x), ...) } #' @importFrom purrr map flatten_df #' @export AIC.svyglm.nb <- function(object, ...) { ## FIXME this one just returns the AIC of the underlying glm.nb() model list(object, ...) %>% purrr::map(~ getaic(.x)) %>% purrr::flatten_df() %>% as.data.frame() } getaic <- function(x) { c(df = x$df, AIC = x$aic) } #' @export deviance.svyglm.nb <- function(object, ...) { ## FIXME this one just returns the deviance of the underlying glm.nb() model object$deviance } #' @importFrom insight print_color #' @export print.tidy_stan <- function(x, ...) { insight::print_color("\nSummary Statistics of Stan-Model\n\n", "blue") digits <- attr(x, "digits") for (i in x) { insight::print_color(paste0("# ", attr(i, "main_title")), "blue") cat(" ") insight::print_color(attr(i, "sub_title"), "red") cat("\n\n") rem <- which(colnames(i) %in% c("Parameter", "Group", "Response", "Function")) i <- i[, -rem] colnames(i)[1] <- "Parameter" i$ESS <- as.character(i$ESS) i$pd <- sprintf("%.1f%%", 100 * i$pd) i[] <- lapply(i, function(.j) { if (is.numeric(.j)) .j <- sprintf("%.*f", digits, .j) .j }) print.data.frame(i, quote = FALSE, row.names = FALSE) cat("\n\n") } } #' @importFrom sjmisc trim clean_term_name <- function(x) { x <- sjmisc::trim(x) format(x, width = max(nchar(x))) } #' @export as.integer.sj_resample <- function(x, ...) { x$id } #' @export as.data.frame.sj_resample <- function(x, ...) { x$data[x$id, , drop = FALSE] } #' @export print.sj_resample <- function(x, ...) { n <- length(x$id) if (n > 12) id10 <- c(x$id[1:12], "...") else id10 <- x$id cat("<", paste0("id's of resample [", prettyNum(nrow(x$data), big.mark = ","), " x ", prettyNum(ncol(x$data), big.mark = ","), "]"), "> ", paste(id10, collapse = ", "), "\n", sep = "") } #' @importFrom tidyr gather #' @importFrom rlang .data #' @export plot.sj_inequ_trend <- function(x, ...) { if (!requireNamespace("ggplot2", quietly = TRUE)) { stop("Package `ggplot2` required for plotting inequalities trends.", call. = F) } # add time indicator x$data$zeit <- seq_len(nrow(x$data)) # get gather column names gather.cols1 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "lo", "hi")] gather.cols2 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "rr", "rd")] key_col <- "grp" value_col <- "y" # gather data to plot rr and rd dat1 <- tidyr::gather(x$data, !! key_col, !! value_col, !! 
gather.cols1) # gather data for raw prevalences dat2 <- tidyr::gather(x$data, !! key_col, !! value_col, !! gather.cols2) # Proper value names, for facet labels dat1$grp[dat1$grp == "rr"] <- "Rate Ratios" dat1$grp[dat1$grp == "rd"] <- "Rate Differences" # plot prevalences gp1 <- ggplot2::ggplot(dat2, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) + ggplot2::geom_smooth(method = "loess", se = F) + ggplot2::labs(title = "Prevalance Rates for Lower and Higher SES Groups", y = "Prevalances", x = "Time", colour = "") + ggplot2::scale_color_manual(values = c("darkblue", "darkred"), labels = c("High SES", "Low SES")) # plot rr and rd gp2 <- ggplot2::ggplot(dat1, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) + ggplot2::geom_smooth(method = "loess", se = F) + ggplot2::facet_wrap(~grp, ncol = 1, scales = "free") + ggplot2::labs(title = "Proportional Change in Rate Ratios and Rate Differences", colour = NULL, y = NULL, x = "Time") + ggplot2::guides(colour = FALSE) suppressMessages(graphics::plot(gp1)) suppressMessages(graphics::plot(gp2)) } #' @importFrom stats kruskal.test na.omit #' @export print.sj_mwu <- function(x, ...) { insight::print_color("\n# Mann-Whitney-U-Test\n\n", "blue") # get data .dat <- x$df # print to console for (i in seq_len(nrow(.dat))) { # get value labels l1 <- .dat[i, "grp1.label"] l2 <- .dat[i, "grp2.label"] # do we have value labels? if (!is.null(l1) && !is.na(l1) %% !is.null(l2) && !is.na(l2)) { insight::print_color( sprintf( "Groups %i = %s (n = %i) | %i = %s (n = %i):\n", .dat[i, "grp1"], l1, .dat[i, "grp1.n"], .dat[i, "grp2"], l2, .dat[i, "grp2.n"] ), "cyan" ) } else { insight::print_color( sprintf("Groups (%i|%i), n = %i/%i:\n", .dat[i, "grp1"], .dat[i, "grp2"], .dat[i, "grp1.n"], .dat[i, "grp2.n"]), "cyan" ) } pval <- .dat[i, "p"] if (pval < 0.001) { pval <- 0.001 p.string <- "<" } else { p.string <- "=" } cat(sprintf( " U = %.3f, W = %.3f, p %s %.3f, Z = %.3f\n", .dat[i, "u"], .dat[i, "w"], p.string, pval, .dat[i, "z"] )) string_es <- "effect-size r" string_r <- sprintf("%.3f", .dat[i, "r"]) string_group1 <- sprintf("rank-mean(%i)", .dat[i, "grp1"]) string_group2 <- sprintf("rank-mean(%i)", .dat[i, "grp2"]) string_rm1 <- sprintf("%.2f", .dat[i, "rank.mean.grp1"]) string_rm2 <- sprintf("%.2f", .dat[i, "rank.mean.grp2"]) space1 <- max(nchar(c(string_es, string_group1, string_group2))) space2 <- max(nchar(c(string_r, string_rm1, string_rm2))) cat( sprintf(" %*s = %*s\n", space1, string_es, space2 + 1, string_r), sprintf(" %*s = %*s\n", space1, string_group1, space2, string_rm1), sprintf(" %*s = %*s\n\n", space1, string_group2, space2, string_rm2) ) } # if we have more than 2 groups, also perfom kruskal-wallis-test if (length(unique(stats::na.omit(x$data$grp))) > 2) { insight::print_color("# Kruskal-Wallis-Test\n\n", "blue") kw <- stats::kruskal.test(x$data$dv, x$data$grp) cat(sprintf("chi-squared = %.3f\n", kw$statistic)) cat(sprintf("df = %i\n", kw$parameter)) if (kw$p.value < 0.001) { p <- 0.001 p.string <- "<" } else { p <- kw$p.value p.string <- "=" } cat(sprintf("p %s %.3f\n", p.string, p)) } } #' @export print.sj_outliers <- function(x, ...) { print(x$result, ...) } #' @export print.sj_xtab_stat <- function(x, ...) { # get length of method name, to align output l <- max(nchar(c(x$method, x$stat.name, "p-value"))) # headline insight::print_color("\n# Measure of Association for Contingency Tables\n", "blue") # used fisher? 
if (x$fisher) insight::print_color(" (using Fisher's Exact Test)\n", "blue") cat("\n") # print test statistic cat(sprintf(" %*s: %.4f\n", l, x$stat.name, x$statistic)) cat(sprintf(" %*s: %.4f\n", l, x$method, x$estimate)) # check if p <.001 if (x$p.value < 0.001) cat(sprintf(" %*s: <0.001\n", l, "p-value", x$p.value)) else cat(sprintf(" %*s: %.4f\n", l, "p-value", x$p.value)) } #' @export print.sj_grpmean <- function(x, ...) { cat("\n") print_grpmean(x, ...) } #' @importFrom insight format_table print_color print_grpmean <- function(x, ...) { # headline insight::print_color(sprintf( "# Grouped Means for %s by %s\n\n", attr(x, "dv.label", exact = TRUE), attr(x, "grp.label", exact = TRUE) ), "blue") colnames(x) <- c("Category", "Mean", "N", "SD", "SE", "p") cat(insight::format_table(x)) # statistics cat(sprintf( "\nAnova: R2=%.3f; adj.R2=%.3f; F=%.3f; p=%.3f\n", attr(x, "r2", exact = TRUE), attr(x, "adj.r2", exact = TRUE), attr(x, "fstat", exact = TRUE), attr(x, "p.value", exact = TRUE) )) } #' @importFrom purrr walk #' @export print.sj_grpmeans <- function(x, ...) { cat("\n") purrr::walk(x, function(dat) { # get grouping title label grp <- attr(dat, "group", exact = T) # print title for grouping insight::print_color(sprintf("Grouped by:\n%s\n\n", grp), "cyan") # print grpmean-table print_grpmean(dat, ...) cat("\n\n") }) } #' @export print.sj_mediation <- function(x, digits = 2, ...) { insight::print_color("\n# Causal Mediation Analysis for Stan Model\n\n", "blue") insight::print_color(sprintf( " Treatment: %s\n Mediator: %s\n Response: %s\n", attr(x, "treatment", exact = TRUE), attr(x, "mediator", exact = TRUE), attr(x, "response", exact = TRUE) ), "cyan") cat("\n") prop.med <- 100 * x[5, 2:4] x <- x[c(1, 2, 4), ] x$value <- format(round(x$value, digits = digits)) x$hdi.low <- format(round(x$hdi.low, digits = digits)) x$hdi.high <- format(round(x$hdi.high, digits = digits)) prop.med <- format(round(prop.med, digits = digits)) # ensure minimum width for column header if (max(nchar(x$value)) < 8) x$value <- format(x$value, width = 8, justify = "right") indent.width1 <- max(nchar(x$value)) + 17 indent.width2 <- max(nchar(x$hdi.low)) + max(nchar(x$hdi.high)) + 4 cat(sprintf( "%s%s\n", format("Estimate", width = indent.width1, justify = "right"), format(sprintf("HDI (%i%%)", as.integer(100 * attr(x, "prob", exact = TRUE))), width = indent.width2, justify = "right") )) cat(sprintf(" Direct effect: %s [%s %s]\n", x$value[1], x$hdi.low[1], x$hdi.high[1])) cat(sprintf("Indirect effect: %s [%s %s]\n", x$value[2], x$hdi.low[2], x$hdi.high[2])) cat(sprintf(" Total effect: %s [%s %s]\n", x$value[3], x$hdi.low[3], x$hdi.high[3])) insight::print_color( sprintf( "\nProportion mediated: %s%% [%s%% %s%%]\n", prop.med[1], prop.med[2], prop.med[3]) , "red") if (prop.med[1] < 0) message("\nDirect and indirect effects have opposite directions. The proportion mediated is not meaningful.") } #' @export print.sj_pval <- function(x, digits = 3, summary = FALSE, ...) { if (summary) { df.kr <- attr(x, "df.kr", exact = TRUE) t.kr <- attr(x, "t.kr", exact = TRUE) if (!is.null(df.kr)) x$df <- df.kr if (!is.null(t.kr)) x$statistic <- t.kr } x <- purrr::map_if(x, is.numeric, round, digits = digits) print.data.frame(as.data.frame(x), ..., row.names = TRUE) } #' @export summary.sj_pval <- function(object, digits = 3, summary = FALSE, ...) { print(object, digits, summary = TRUE) } #' @export print.sj_chi2gof <- function(x, ...) 
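# prints the chi-squared statistic, z-score and p-value of the
# goodness-of-fit test; p-values >= 0.05 are summarised as indicating
# acceptable model fit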
{ insight::print_color("\n# Chi-squared Goodness-of-Fit Test\n\n", "blue") v1 <- sprintf("%.3f", x$chisq) v2 <- sprintf("%.3f", x$z.score) v3 <- sprintf("%.3f", x$p.value) space <- max(nchar(c(v1, v2, v3))) cat(sprintf(" Chi-squared: %*s\n", space, v1)) cat(sprintf(" z-score: %*s\n", space, v2)) cat(sprintf(" p-value: %*s\n\n", space, v3)) if (x$p.value >= 0.05) message("Summary: model seems to fit well.") else message("Summary: model does not fit well.") } #' @export print.sj_check_assump <- function(x, ...) { insight::print_color("\n# Checking Model-Assumptions\n\n", "blue") cat(sprintf(" Model: %s", attr(x, "formula", exact = TRUE))) insight::print_color("\n\n violated statistic\n", "red") v1 <- ifelse(x$heteroskedasticity < 0.05, "yes", "no") v2 <- ifelse(x$multicollinearity > 4, "yes", "no") v3 <- ifelse(x$non.normal.resid < 0.05, "yes", "no") v4 <- ifelse(x$autocorrelation < 0.05, "yes", "no") s1 <- sprintf("p = %.3f", x$heteroskedasticity) s2 <- sprintf("vif = %.3f", x$multicollinearity) s3 <- sprintf("p = %.3f", x$non.normal.resid) s4 <- sprintf("p = %.3f", x$autocorrelation) cat(sprintf(" Heteroskedasticity %8s %11s\n", v1, s1)) cat(sprintf(" Non-normal residuals %8s %11s\n", v3, s3)) cat(sprintf(" Autocorrelated residuals%8s %11s\n", v4, s4)) cat(sprintf(" Multicollinearity %8s %11s\n", v2, s2)) } #' @export print.sj_ttest <- function(x, ...) { insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue") group <- attr(x, "group.name", exact = TRUE) xn <- attr(x, "x.name", exact = TRUE) yn <- attr(x, "y.name", exact = TRUE) if (!is.null(group)) verbs <- c("of", "by") else verbs <- c("between", "and") st <- sprintf("# t=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$df), x$p.value) if (!is.null(yn)) { insight::print_color(sprintf("\n# comparison %s %s %s %s\n", verbs[1], xn, verbs[2], yn), "cyan") } insight::print_color(st, "cyan") if (!is.null(yn)) { if (!is.null(group)) { l1 <- sprintf("mean in group %s", group[1]) l2 <- sprintf("mean in group %s", group[2]) } else { l1 <- sprintf("mean of %s", xn) l2 <- sprintf("mean of %s", yn) } l3 <- "difference of mean" slen <- max(nchar(c(l1, l2, l3))) cat(sprintf(" %s: %.3f\n", format(l1, width = slen), x$estimate[1])) cat(sprintf(" %s: %.3f\n", format(l2, width = slen), x$estimate[2])) cat(sprintf(" %s: %.3f [%.3f %.3f]\n", format(l3, width = slen), x$estimate[1] - x$estimate[2], x$ci[1], x$ci[2])) } else { cat(sprintf(" mean of %s: %.3f [%.3f, %.3f]\n", xn, x$estimate[1], x$ci[1], x$ci[2])) } cat("\n") } #' @export print.sj_wmwu <- function(x, ...) { insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue") group <- attr(x, "group.name", exact = TRUE) xn <- attr(x, "x.name", exact = TRUE) insight::print_color(sprintf("\n# comparison of %s by %s\n", xn, group), "cyan") insight::print_color(sprintf("# Chisq=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$parameter), x$p.value), "cyan") cat(sprintf(" difference in mean rank score: %.3f\n\n", x$estimate)) } #' @export print.sj_wcor <- function(x, ...) { insight::print_color(sprintf("\nWeighted %s\n\n", x$method), "blue") if (!is.null(x$ci)) { cilvl <- sprintf("%.2i%%", as.integer(100 * x$ci.lvl)) cat(sprintf(" estimate [%s CI]: %.3f [%.3f %.3f]\n", cilvl, x$estimate, x$ci[1], x$ci[2])) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } else { cat(sprintf(" estimate: %.3f\n", x$estimate)) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } } #' @importFrom sjmisc round_num #' @export print.sj_anova_stat <- function(x, digits = 3, ...) 
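# rounds all numeric columns to `digits` decimal places before printing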
{ print.data.frame(sjmisc::round_num(x, digits), ..., row.names = TRUE) } sjstats/R/design_effect.R0000644000176200001440000000364213563265750015077 0ustar liggesusers#' @title Design effects for two-level mixed models #' @name design_effect #' #' @description Compute the design effect (also called \emph{Variance Inflation Factor}) #' for mixed models with two-level design. #' #' @param n Average number of observations per grouping cluster (i.e. level-2 unit). #' @param icc Assumed intraclass correlation coefficient for multilevel-model. #' #' @return The design effect (Variance Inflation Factor) for the two-level model. #' #' @references Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} #' \cr \cr #' Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347} #' #' @details The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. #' #' @examples #' # Design effect for two-level model with 30 observations per #' # cluster group (level-2 unit) and an assumed intraclass #' # correlation coefficient of 0.05. #' design_effect(n = 30) #' #' # Design effect for two-level model with 24 observation per cluster #' # group and an assumed intraclass correlation coefficient of 0.2. #' design_effect(n = 24, icc = 0.2) #' #' @export design_effect <- function(n, icc = 0.05) { 1 + (n - 1) * icc } sjstats/R/cv.R0000644000176200001440000000432313563265750012717 0ustar liggesusers#' @title Compute model quality #' @name cv #' #' @description Compute the coefficient of variation. #' #' @param x Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) #' or \code{lme} (\pkg{nlme}). #' @param ... More fitted model objects, to compute multiple coefficients of #' variation at once. #' #' @details The advantage of the cv is that it is unitless. This allows #' coefficient of variation to be compared to each other in ways #' that other measures, like standard deviations or root mean #' squared residuals, cannot be. #' \cr \cr #' \dQuote{It is interesting to note the differences between a model's CV #' and R-squared values. Both are unitless measures that are indicative #' of model fit, but they define model fit in two different ways: CV #' evaluates the relative closeness of the predictions to the actual #' values while R-squared evaluates how much of the variability in the #' actual values is explained by the model.} #' \cite{(\href{http://www.ats.ucla.edu/stat/mult_pkg/faq/general/coefficient_of_variation.htm}{source: UCLA-FAQ})} #' #' @return Numeric, the coefficient of variation. #' #' @examples #' data(efc) #' fit <- lm(barthtot ~ c160age + c12hour, data = efc) #' cv(fit) #' #' @importFrom stats sd #' @export cv <- function(x, ...) { # return value cv_ <- cv_helper(x) # check if we have multiple parameters if (nargs() > 1) { # get input list params_ <- list(...) 
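    # compute the coefficient of variation for each additional model object
    # and append the results to the return value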
cv_ <- c(cv_, sapply(params_, cv_helper)) } cv_ } #' @importFrom performance rmse #' @importFrom insight get_response cv_helper <- function(x) { # check if we have a fitted linear model if (inherits(x, c("lm", "lmerMod", "lme", "merModLmerTest")) && !inherits(x, "glm")) { # get response dv <- insight::get_response(x) mw <- mean(dv, na.rm = TRUE) stddev <- performance::rmse(x) } else { mw <- mean(x, na.rm = TRUE) stddev <- stats::sd(x, na.rm = TRUE) } # check if mean is zero? if (mw == 0) stop("Mean of dependent variable is zero. Cannot compute model's coefficient of variation.", call. = F) stddev / mw } sjstats/R/omega_sq.R0000644000176200001440000000352113563265750014101 0ustar liggesusers#' @rdname eta_sq #' @importFrom dplyr bind_cols mutate #' @export omega_sq <- function(model, partial = FALSE, ci.lvl = NULL, n = 1000, method = c("dist", "quantile")) { method <- match.arg(method) if (partial) type <- "pomega" else type <- "omega" es <- aov_stat(model, type = type) x <- data_frame( term = names(es), es = es ) if (partial) { if (!is.null(ci.lvl) && !is.na(ci.lvl)) { x <- es_boot_fun( model = model, type = "pomega", ci.lvl = ci.lvl, n = n, boot.method = method ) } } else { if (!is.null(ci.lvl) && !is.na(ci.lvl)) { x <- dplyr::bind_cols(x, omega_sq_ci(aov.sum = aov_stat_summary(model), ci.lvl = ci.lvl)) } } colnames(x)[2] <- dplyr::case_when( type == "omega" ~ "omegasq", type == "pomega" ~ "partial.omegasq", TRUE ~ "effect.size" ) if (!is.null(attr(es, "stratum"))) x$stratum <- attr(es, "stratum")[1:nrow(x)] class(x) <- c("sj_anova_stat", class(x)) x } #' @importFrom purrr map_df omega_sq_ci <- function(aov.sum, ci.lvl = .95) { rows <- nrow(aov.sum) - 1 df.den <- aov.sum[["df"]][rows + 1] N <- sum(aov.sum[["df"]]) + 1 purrr::map_df( 1:rows, function(.x) { df.num = aov.sum[.x, "df"] test.stat <- aov.sum[.x, "statistic"] if (!is.na(test.stat)) { ci <- confint_ncg( F.value = test.stat, conf.level = ci.lvl, df.1 = df.num, df.2 = df.den ) ci.low <- ci$Lower.Limit / (ci$Lower.Limit + N) ci.high <- ci$Upper.Limit / (ci$Upper.Limit + N) } else { ci.low <- ci.high <- NA } data.frame( conf.low = ci.low, conf.high = ci.high ) } ) } sjstats/R/mwu.R0000644000176200001440000001636113565714104013116 0ustar liggesusers#' @title Mann-Whitney-U-Test #' @name mwu #' @description This function performs a Mann-Whitney-U-Test (or Wilcoxon rank sum test, #' see \code{\link[stats]{wilcox.test}} and \code{\link[coin]{wilcox_test}}) #' for \code{x}, for each group indicated by \code{grp}. If \code{grp} #' has more than two categories, a comparison between each combination of #' two groups is performed. \cr \cr #' The function reports U, p and Z-values as well as effect size r #' and group-rank-means. #' #' @param x Bare (unquoted) variable name, or a character vector with the variable name. #' @param distribution Indicates how the null distribution of the test statistic should be computed. #' May be one of \code{"exact"}, \code{"approximate"} or \code{"asymptotic"} #' (default). See \code{\link[coin]{wilcox_test}} for details. #' #' @inheritParams weighted_sd #' @inheritParams means_by_group #' #' @return (Invisibly) returns a data frame with U, p and Z-values for each group-comparison #' as well as effect-size r; additionally, group-labels and groups' n's are #' also included. #' #' @note This function calls the \code{\link[coin]{wilcox_test}} with formula. If \code{grp} #' has more than two groups, additionally a Kruskal-Wallis-Test (see \code{\link{kruskal.test}}) #' is performed. 
\cr \cr #' Interpretation of effect sizes, as a rule-of-thumb: #' \itemize{ #' \item small effect >= 0.1 #' \item medium effect >= 0.3 #' \item large effect >= 0.5 #' } #' #' @examples #' data(efc) #' # Mann-Whitney-U-Tests for elder's age by elder's dependency. #' mwu(efc, e17age, e42dep) #' #' @importFrom stats na.omit wilcox.test kruskal.test #' @importFrom sjmisc recode_to is_empty #' @importFrom sjlabelled get_labels as_numeric #' @importFrom rlang quo_name enquo #' @export mwu <- function(data, x, grp, distribution = "asymptotic", out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL) { out <- match.arg(out) if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) { message("Package `sjPlot` needs to be loaded to print HTML tables.") out <- "txt" } if (!requireNamespace("coin", quietly = TRUE)) { stop("Package `coin` needs to be installed to compute the Mann-Whitney-U test.", call. = FALSE) } # create quosures grp.name <- rlang::quo_name(rlang::enquo(grp)) dv.name <- rlang::quo_name(rlang::enquo(x)) # create string with variable names vars <- c(grp.name, dv.name) # get data data <- suppressMessages(dplyr::select(data, !! vars)) grp <- data[[grp.name]] dv <- data[[dv.name]] # coerce factor and character to numeric if (is.factor(grp) || is.character(grp)) grp <- sjlabelled::as_numeric(grp) # group "counter" (index) should start with 1, not 0 if (min(grp, na.rm = TRUE) < 1) grp <- sjmisc::recode_to(grp, lowest = 1, append = FALSE) # retrieve unique group values. need to iterate all values grp_values <- sort(unique(stats::na.omit(grp))) # length of value range cnt <- length(grp_values) labels <- sjlabelled::get_labels( grp, attr.only = F, values = NULL, non.labelled = T ) df <- data.frame() for (i in seq_len(cnt)) { for (j in i:cnt) { if (i != j) { # retrieve cases (rows) of subgroups xsub <- dv[which(grp == grp_values[i] | grp == grp_values[j])] ysub <- grp[which(grp == grp_values[i] | grp == grp_values[j])] # this is for unpaired wilcox.test() xsub_2 <- stats::na.omit(dv[which(grp == grp_values[i])]) ysub_2 <- stats::na.omit(dv[which(grp == grp_values[j])]) # only use rows with non-missings ysub <- ysub[which(!is.na(xsub))] # remove missings xsub <- as.numeric(stats::na.omit(xsub)) ysub.n <- stats::na.omit(ysub) # grouping variable is a factor ysub <- as.factor(ysub.n) wcdat <- data.frame( x = xsub, y = ysub ) # perfom wilcox test wt <- coin::wilcox_test(x ~ y, data = wcdat, distribution = distribution) # compute statistics u <- as.numeric(coin::statistic(wt, type = "linear")) z <- as.numeric(coin::statistic(wt, type = "standardized")) p <- coin::pvalue(wt) r <- abs(z / sqrt(length(ysub))) w <- stats::wilcox.test(xsub_2, ysub_2, paired = FALSE)$statistic rkm.i <- mean(rank(xsub)[which(ysub.n == grp_values[i])], na.rm = TRUE) rkm.j <- mean(rank(xsub)[which(ysub.n == grp_values[j])], na.rm = TRUE) # compute n for each group n_grp1 <- length(xsub[which(ysub.n == grp_values[i])]) n_grp2 <- length(xsub[which(ysub.n == grp_values[j])]) # generate result data frame df <- rbind( df, cbind( grp1 = grp_values[i], grp1.label = labels[i], grp1.n = n_grp1, grp2 = grp_values[j], grp2.label = labels[j], grp2.n = n_grp2, u = u, w = w, p = p, z = z, r = r, rank.mean.grp1 = rkm.i, rank.mean.grp2 = rkm.j ) ) } } } # convert variables df[["grp1"]] <- as.numeric(as.character(df[["grp1"]])) df[["grp2"]] <- as.numeric(as.character(df[["grp2"]])) df[["grp1.n"]] <- as.numeric(as.character(df[["grp1.n"]])) df[["grp2.n"]] <- as.numeric(as.character(df[["grp2.n"]])) df[["grp1.label"]] <- 
as.character(df[["grp1.label"]]) df[["grp2.label"]] <- as.character(df[["grp2.label"]]) df[["u"]] <- as.numeric(as.character(df[["u"]])) df[["w"]] <- as.numeric(as.character(df[["w"]])) df[["p"]] <- as.numeric(as.character(df[["p"]])) df[["z"]] <- as.numeric(as.character(df[["z"]])) df[["r"]] <- as.numeric(as.character(df[["r"]])) df[["rank.mean.grp1"]] <- as.numeric(as.character(df[["rank.mean.grp1"]])) df[["rank.mean.grp2"]] <- as.numeric(as.character(df[["rank.mean.grp2"]])) # prepare a data frame that can be used for 'sjt.df'. tab.df <- data_frame( Groups = sprintf("%s
%s", df$grp1.label, df$grp2.label), N = sprintf("%s
%s", df$grp1.n, df$grp2.n), 'Mean Rank' = sprintf("%.2f
%.2f", df$rank.mean.grp1, df$rank.mean.grp2), 'Mann-Whitney-U' = as.character(df$u), 'Wilcoxon-W' = as.character(df$w), Z = sprintf("%.3f", df$z), 'Effect Size' = sprintf("%.3f", df$r), p = sprintf("%.3f", df$p) ) # replace 0.001 with <0.001 tab.df$p[which(tab.df$p == "0.001")] <- "<0.001" ret.df <- list(df = df, tab.df = tab.df, data = data.frame(dv, grp)) # save how to print output attr(ret.df, "print") <- out attr(ret.df, "encoding") <- encoding attr(ret.df, "file") <- file if (out %in% c("viewer", "browser")) class(ret.df) <- c("mwu", "sjt_mwu") else class(ret.df) <- c("mwu", "sj_mwu") ret.df } #' @rdname mwu #' @export mannwhitney <- mwu sjstats/R/is_prime.R0000644000176200001440000000124513563265750014116 0ustar liggesusers#' @title Find prime numbers #' @name is_prime #' #' @description This functions checks whether a number is, or numbers in a #' vector are prime numbers. #' #' @param x An integer, or a vector of integers. #' #' @return \code{TRUE} for each prime number in \code{x}, \code{FALSE} otherwise. #' #' @examples #' is_prime(89) #' is_prime(15) #' is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) #' #' @importFrom purrr map_lgl #' @importFrom sjmisc is_float #' @export is_prime <- function(x) { if (sjmisc::is_float(x)) stop("`x` needs to be an integer value.", call. = F) purrr::map_lgl(x, ~ .x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0)) } sjstats/R/gmd.R0000644000176200001440000000321213565522357013053 0ustar liggesusers#' @title Gini's Mean Difference #' @name gmd #' @description \code{gmd()} computes Gini's mean difference for a numeric vector #' or for all numeric vectors in a data frame. #' #' @param x A vector or data frame. #' @param ... Optional, unquoted names of variables that should be selected for #' further processing. Required, if \code{x} is a data frame (and no vector) #' and only selected variables from \code{x} should be processed. You may also #' use functions like \code{:} or tidyselect's \code{\link[tidyselect]{select_helpers}}. #' #' @return For numeric vectors, Gini's mean difference. For non-numeric vectors #' or vectors of length < 2, returns \code{NA}. #' #' @note Gini's mean difference is defined as the mean absolute difference between #' any two distinct elements of a vector. Missing values from \code{x} are #' silently removed. #' #' @references David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575 #' #' @examples #' data(efc) #' gmd(efc$e17age) #' gmd(efc, e17age, c160age, c12hour) #' #' @importFrom dplyr quos select #' @importFrom purrr map_df #' @importFrom sjmisc is_empty #' @export gmd <- function(x, ...) { # evaluate dots qs <- dplyr::quos(...) if (!sjmisc::is_empty(qs)) x <- suppressMessages(dplyr::select(x, !!!qs)) if (is.data.frame(x)) purrr::map_df(x, gmd_helper) else gmd_helper(x) } #' @importFrom stats na.omit gmd_helper <- function(x) { if (!is.numeric(x)) return(NA) x <- stats::na.omit(x) n <- length(x) if (n < 2) return(NA) w <- 4 * ((1:n) - (n - 1) / 2) / n / (n - 1) sum(w * sort(x - mean(x))) } sjstats/R/cramer.R0000644000176200001440000000235713565713731013564 0ustar liggesusers#' @rdname crosstable_statistics #' @export cramer <- function(tab, ...) { UseMethod("cramer") } #' @export cramer.table <- function(tab, ...) { .cramer(tab) } #' @export cramer.ftable <- function(tab, ...) { .cramer(tab) } #' @rdname crosstable_statistics #' @export cramer.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) 
{ terms <- all.vars(formula) tab <- table(data[[terms[1]]], data[[terms[2]]]) method <- match.arg(method) if (is.null(ci.lvl) || is.na(ci.lvl)) { .cramer(tab) } else { ci <- data[, terms] %>% sjstats::bootstrap(n) %>% dplyr::mutate( tables = lapply(.data$strap, function(x) { dat <- as.data.frame(x) table(dat[[1]], dat[[2]]) }), cramers = sapply(.data$tables, function(x) .cramer(x)) ) %>% dplyr::pull("cramers") %>% boot_ci(ci.lvl = ci.lvl, method = method) data_frame( cramer = .cramer(tab), conf.low = ci$conf.low, conf.high = ci$conf.high ) } } .cramer <- function(tab) { # convert to flat table if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) sqrt(phi(tab)^2 / min(dim(tab) - 1)) } sjstats/R/wtd_se.R0000644000176200001440000000143213565520672013571 0ustar liggesusers#' @rdname weighted_sd #' @export weighted_se <- function(x, weights = NULL) { UseMethod("weighted_se") } #' @export weighted_se.data.frame <- function(x, weights = NULL) { se_result <- purrr::map_dbl(x, ~ weighted_se_helper(.x, weights = weights)) names(se_result) <- colnames(x) se_result } #' @export weighted_se.matrix <- function(x, weights = NULL) { se_result <- purrr::map_dbl(x, ~ weighted_se_helper(.x, weights = weights)) names(se_result) <- colnames(x) se_result } #' @export weighted_se.default <- function(x, weights = NULL) { weighted_se_helper(x, weights) } weighted_se_helper <- function(x, weights) { if (is.null(weights)) weights <- rep(1, length(x)) sqrt(weighted_variance(x, weights) / length(stats::na.omit(x))) } sjstats/R/wtd_chisqtest.R0000644000176200001440000000226013565713744015175 0ustar liggesusers#' @rdname weighted_sd #' @export weighted_chisqtest <- function(data, ...) { UseMethod("weighted_chisqtest") } #' @importFrom dplyr select #' @rdname weighted_sd #' @export weighted_chisqtest.default <- function(data, x, y, weights, ...) { x.name <- deparse(substitute(x)) y.name <- deparse(substitute(y)) w.name <- deparse(substitute(weights)) if (w.name == "NULL") { w.name <- "weights" data$weights <- 1 } # create string with variable names vars <- c(x.name, y.name, w.name) # get data dat <- suppressMessages(dplyr::select(data, !! vars)) dat <- na.omit(dat) colnames(dat)[3] <- ".weights" crosstable_statistics(data = dat, statistics = "auto", weights = ".weights", ...) } #' @importFrom stats xtabs #' @rdname weighted_sd #' @export weighted_chisqtest.formula <- function(formula, data, ...) { vars <- all.vars(formula) if (length(vars) < 3) { vars <- c(vars, ".weights") data$.weights <- 1 } tab <- as.table(round(stats::xtabs(data[[vars[3]]] ~ data[[vars[1]]] + data[[vars[2]]]))) class(tab) <- "table" crosstable_statistics(data = tab, statistics = "auto", weights = NULL, ...) } sjstats/R/var_pop.R0000644000176200001440000000311313563265750013751 0ustar liggesusers#' @title Calculate population variance and standard deviation #' @name var_pop #' @description Calculate the population variance or standard deviation of a vector. #' #' @param x (Numeric) vector. #' #' @return The population variance or standard deviation of \code{x}. #' #' @details Unlike \code{\link[stats]{var}}, which returns the sample variance, #' \code{var_pop()} returns the population variance. \code{sd_pop()} #' returns the standard deviation based on the population variance. 
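#' In other words, \code{var_pop(x)} is equivalent to
#' \code{var(x) * ((n - 1) / n)}, where \code{n} is the number of
#' non-missing values in \code{x}, and \code{sd_pop(x)} equals
#' \code{sqrt(var_pop(x))}.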
#' #' @examples #' data(efc) #' #' # sampling variance #' var(efc$c12hour, na.rm = TRUE) #' # population variance #' var_pop(efc$c12hour) #' #' # sampling sd #' sd(efc$c12hour, na.rm = TRUE) #' # population sd #' sd_pop(efc$c12hour) #' #' @importFrom stats na.omit var #' @importFrom sjmisc is_num_fac #' @importFrom sjlabelled as_numeric #' @export var_pop <- function(x) { # check for categorical if (is.factor(x)) { # only allow numeric factors if (!sjmisc::is_num_fac(x)) { warning("`x` must be numeric vector or a factor with numeric levels.", call. = F) return(NA) } # convert factor to numeric x <- sjlabelled::as_numeric(x) } # remove NA x <- stats::na.omit(x) n <- length(x) # population variance stats::var(x) * ((n - 1) / n) } #' @rdname var_pop #' @importFrom stats na.omit var #' @export sd_pop <- function(x) { # get population variance pv <- var_pop(x) # factors with non-numeric level return NULL if (!is.null(pv) && !is.na(pv)) sqrt(pv) else NA } sjstats/R/re-exports.R0000644000176200001440000000074613563265750014424 0ustar liggesusers#' @importFrom magrittr %>% #' @export magrittr::`%>%` #' @importFrom sjmisc typical_value #' @export sjmisc::typical_value #' @importFrom performance mse #' @export performance::mse #' @importFrom performance rmse #' @export performance::rmse #' @importFrom bayestestR ci #' @export bayestestR::ci #' @importFrom bayestestR equivalence_test #' @export bayestestR::equivalence_test #' @importFrom insight link_inverse #' @export insight::link_inverse sjstats/R/Deprecated.R0000644000176200001440000000257613610331557014347 0ustar liggesusers#' @title Deprecated functions #' @name r2 #' @description A list of deprecated functions. #' #' @param x An object. #' @param ... Currently not used. #' #' @return Nothing. #' #' @importFrom performance r2 #' @export r2 <- function(x) { .Deprecated("performance::r2()") performance::r2(x) } #' @importFrom performance icc #' @rdname r2 #' @export icc <- function(x) { .Deprecated("performance::icc()") performance::icc(x) } #' @importFrom parameters p_value #' @rdname r2 #' @export p_value <- function(x, ...) { .Deprecated("parameters::p_value()") parameters::p_value(x) } #' @importFrom parameters standard_error #' @rdname r2 #' @export se <- function(x, ...) { .Deprecated("parameters::standard_error()") parameters::standard_error(x) } #' @importFrom effectsize cohens_f #' @rdname r2 #' @export cohens_f <- function(x, ...) { .Deprecated("effectsize::cohens_f()") effectsize::cohens_f(x) } #' @importFrom effectsize standardize_parameters #' @rdname r2 #' @export std_beta <- function(x, ...) { .Deprecated("effectsize::standardize_parameters()") effectsize::standardize_parameters(x, ...) } #' @importFrom parameters standard_error_robust #' @rdname r2 #' @export robust <- function(x, ...) { .Deprecated("parameters::standard_error_robust()") parameters::standard_error_robust(x, ...) 
} sjstats/R/select_helpers.R0000644000176200001440000000157513563265750015316 0ustar liggesusersstring_starts_with <- function(pattern, x) { pattern <- paste0("^\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_contains <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_ends_with <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E$") grep(pattern, x, perl = TRUE) } #' @importFrom purrr map string_one_of <- function(pattern, x) { m <- unlist(purrr::map(pattern, ~ grep(., x, fixed = TRUE, useBytes = TRUE))) x[m] } rownames_as_column <- function(x, var = "rowname") { rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE) x <- cbind(rn, x) colnames(x)[1] <- var rownames(x) <- NULL x } obj_has_name <- function(x, name) { name %in% names(x) } obj_has_rownames <- function(x) { !identical(as.character(1:nrow(x)), rownames(x)) } sjstats/R/svyglmzip.R0000644000176200001440000001054613616514107014347 0ustar liggesusersutils::globalVariables("scaled.weights") #' @title Survey-weighted zero-inflated Poisson model #' @name svyglm.zip #' @description \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package #' to fit survey-weighted zero-inflated Poisson models. It uses #' \code{\link[survey]{svymle}} to fit sampling-weighted #' maximum likelihood estimates, based on starting values provided #' by \code{\link[pscl]{zeroinfl}}. #' #' #' @param formula An object of class \code{formula}, i.e. a symbolic description #' of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}. #' @param design An object of class \code{\link[survey]{svydesign}}, providing #' a specification of the survey design. #' @param ... Other arguments passed down to \code{\link[pscl]{zeroinfl}}. #' #' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip}, #' with some additional information about the model. #' #' @details Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/. #' #' @examples #' if (require("survey")) { #' data(nhanes_sample) #' set.seed(123) #' nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2) #' nhanes_sample$malepartners[sample(1:2992, 400)] <- 0 #' #' # create survey design #' des <- svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' #' # fit negative binomial regression #' fit <- svyglm.zip( #' malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1), #' des #' ) #' #' # print coefficients and standard errors #' fit #' } #' @importFrom insight find_formula #' @importFrom stats weights update model.frame coef as.formula family #' @export svyglm.zip <- function(formula, design, ...) { # check if pkg survey is available if (!requireNamespace("survey", quietly = TRUE)) { stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE) } if (!requireNamespace("pscl", quietly = TRUE)) { stop("Package `pscl` needed to for this function to work. Please install it.", call. = FALSE) } # get design weights. we need to scale these weights for the glm.nb() function dw <- stats::weights(design) # update design with scaled weights design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE)) # fit ZIP model, with scaled design weights mod <- pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...) 
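  # the zeroinfl() fit above only provides the model structure and starting
  # values; the actual survey-weighted estimates are computed by svymle()
  # below, using the count ("eta") and zero-inflation ("logitp") formula parts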
ff <- insight::find_formula(mod) # fit survey model, using maximum likelihood estimation svyfit <- survey::svymle( loglike = sjstats_loglik_zip, grad = sjstats_score_zip, design = design, formulas = list(eta = ff$conditional, logitp = ff$zero_inflated), start = stats::coef(mod), na.action = "na.omit" ) # add additoinal information class(svyfit) <- c("svyglm.zip", class(svyfit)) attr(svyfit, "zip.terms") <- all.vars(formula) attr(svyfit, "zip.formula") <- formula svyfit$deviance <- mod$deviance svyfit$df.residuals <- mod$df.residuals svyfit$df <- length(stats::coef(mod)) + 1 svyfit$aic <- mod$aic svyfit } #' @importFrom stats dpois # log-likelihood function used in "svymle()" sjstats_loglik_zip <- function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) log(p * (y == 0) + (1 - p) * stats::dpois(y, mu)) } sjstats_dlogitp = function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) dexpit <- p / (1 + p) ^ 2 num <- dexpit * (y == 0) - dexpit * stats::dpois(y, mu) denom <- p * (y == 0) + (1 - p) * stats::dpois(y, mu) num / denom } # derivative sjstats_deta_zip <- function(y, eta, logitp) { mu <- exp(eta) p <- exp(logitp) / (1 + exp(logitp)) dmutoy <- 0 * y dmutoy[y > 0] = exp(-mu[y > 0]) * mu[y > 0] ^ (y[y > 0] - 1) / factorial(y[y > 0] - 1) num = (1 - p) * (-stats::dpois(y, mu) + dmutoy) denom = p * (y == 0) + (1 - p) * stats::dpois(y, mu) num / denom } # score function, combines derivatives sjstats_score_zip <- function(y, eta, logitp) { cbind(sjstats_deta_zip(y, eta, logitp), sjstats_dlogitp(y, eta, logitp)) } sjstats/R/wtd_ttest.R0000644000176200001440000001076513565520335014332 0ustar liggesusers#' @rdname weighted_sd #' @importFrom stats pt qt weighted.mean setNames #' @importFrom sjmisc is_empty #' @export weighted_ttest <- function(data, ...) { UseMethod("weighted_ttest") } #' @rdname weighted_sd #' @export weighted_ttest.default <- function(data, x, y = NULL, weights, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ...) { if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) stop("'ci.lvl' must be a single number between 0 and 1") alternative <- match.arg(alternative) x.name <- deparse(substitute(x)) y.name <- deparse(substitute(y)) w.name <- deparse(substitute(weights)) if (y.name == "NULL") y.name <- NULL if (w.name == "NULL") { w.name <- "weights" data$weights <- 1 } # create string with variable names vars <- c(x.name, y.name, w.name) # get data dat <- suppressMessages(dplyr::select(data, !! vars)) dat <- na.omit(dat) if (sjmisc::is_empty(dat) || nrow(dat) == 1) { warning("Too less data to compute t-test.") return(NULL) } xv <- dat[[x.name]] wx <- wy <- dat[[w.name]] if (!is.null(y.name)) yv <- dat[[y.name]] else yv <- NULL nx <- ny <- nrow(dat) weighted_ttest_helper(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, x.name, y.name, NULL) } #' @rdname weighted_sd #' @export weighted_ttest.formula <- function(formula, data, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ...) 
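# the outcome on the left-hand side is split by the two levels of the
# grouping factor on the right-hand side; an optional third variable in
# the formula is used as weights (defaulting to 1 for each observation).
# usage sketch (assuming a numeric weight column in `efc`):
#   weighted_ttest(e17age ~ e16sex + weight, efc)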
{ if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) stop("'ci.lvl' must be a single number between 0 and 1") alternative <- match.arg(alternative) vars <- all.vars(formula) g <- data[[vars[2]]] if (is.factor(g)) grps <- levels(g) else grps <- na.omit(sort(unique(g))) if (length(grps) > 2) stop("Grouping factor has more than two levels.") if (length(vars) < 3) { vars <- c(vars, "weights") data$weights <- 1 } x <- data[[vars[1]]] y <- data[[vars[2]]] w <- data[[vars[3]]] xv <- x[y == grps[1]] yv <- x[y == grps[2]] wx <- w[y == grps[1]] wy <- w[y == grps[2]] mxv <- is.na(xv) xv <- xv[!mxv] wx <- wx[!mxv] myv <- is.na(yv) yv <- yv[!myv] wy <- wy[!myv] nx <- length(xv) ny <- length(yv) labs <- sjlabelled::get_labels( data[[vars[2]]], attr.only = FALSE, values = "p", drop.na = TRUE, drop.unused = TRUE ) weighted_ttest_helper(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, vars[1], vars[2], labs) } weighted_ttest_helper <- function(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, x.name, y.name, group.name) { if (paired) { xv <- xv - yv yv <- NULL } mu.x.w <- stats::weighted.mean(xv, wx) var.x.w <- weighted_sd(xv, wx)^2 se.x <- sqrt(var.x.w / nx) if (!is.null(yv)) { mu.y.w <- stats::weighted.mean(yv, wy) var.y.w <- weighted_sd(yv, wy)^2 se.y <- sqrt(var.y.w / ny) se <- sqrt(se.x^2 + se.y^2) df <- se^4 / (se.x^4 / (nx - 1) + se.y^4 / (ny - 1)) tstat <- (mu.x.w - mu.y.w - mu) / se estimate <- c(mu.x.w, mu.y.w) names(estimate) <- c("mean of x", "mean of y") method <- "Two-Sample t-test" } else { se <- se.x df <- nx - 1 tstat <- (mu.x.w - mu) / se estimate <- stats::setNames(mu.x.w, if (paired) "mean of the differences" else "mean of x") method <- if (paired) "Paired t-test" else "One Sample t-test" } if (alternative == "less") { pval <- stats::pt(tstat, df) cint <- c(-Inf, tstat + stats::qt(ci.lvl, df)) } else if (alternative == "greater") { pval <- stats::pt(tstat, df, lower.tail = FALSE) cint <- c(tstat - stats::qt(ci.lvl, df), Inf) } else { pval <- 2 * stats::pt(-abs(tstat), df) alpha <- 1 - ci.lvl cint <- stats::qt(1 - alpha / 2, df) cint <- tstat + c(-cint, cint) } cint <- mu + cint * se names(tstat) <- "t" names(df) <- "df" names(mu) <- if (paired || !is.null(yv)) "difference in means" else "mean" tt <- structure( class = "sj_ttest", list( estimate = estimate, statistic = tstat, df = df, p.value = pval, ci = cint, alternative = alternative, method = method ) ) attr(tt, "x.name") <- x.name attr(tt, "y.name") <- y.name attr(tt, "group.name") <- group.name tt } sjstats/R/cv_error.R0000644000176200001440000000565513563265750014141 0ustar liggesusers#' @title Test and training error from model cross-validation #' @name cv_error #' #' @description \code{cv_error()} computes the root mean squared error from a model fitted #' to kfold cross-validated test-training-data. \code{cv_compare()} #' does the same, for multiple formulas at once (by calling \code{cv_error()} #' for each formula). #' #' @param data A data frame. #' @param formula The formula to fit the linear model for the test and training data. #' @param formulas A list of formulas, to fit linear models for the test and training data. #' @param k The number of folds for the kfold-crossvalidation. #' #' @return A data frame with the root mean squared errors for the training and test data. 
#' #' @details \code{cv_error()} first generates cross-validated test-training pairs, using #' \code{\link[modelr]{crossv_kfold}} and then fits a linear model, which #' is described in \code{formula}, to the training data. Then, predictions #' for the test data are computed, based on the trained models. #' The \emph{training error} is the mean value of the \code{\link{rmse}} for #' all \emph{trained} models; the \emph{test error} is the rmse based on all #' residuals from the test data. #' #' @examples #' data(efc) #' cv_error(efc, neg_c_7 ~ barthtot + c161sex) #' #' cv_compare(efc, formulas = list( #' neg_c_7 ~ barthtot + c161sex, #' neg_c_7 ~ barthtot + c161sex + e42dep, #' neg_c_7 ~ barthtot + c12hour #' )) #' #' @importFrom modelr crossv_kfold #' @importFrom dplyr mutate summarise #' @importFrom purrr map map2 map_dbl map_df #' @importFrom broom augment #' @importFrom tidyr unnest #' @importFrom rlang .data #' @importFrom insight find_response #' @importFrom performance rmse #' @export cv_error <- function(data, formula, k = 5) { # compute cross validation data cv_data <- data %>% modelr::crossv_kfold(k = k) %>% dplyr::mutate( trained.models = purrr::map(.data$train, ~ stats::lm(formula, data = .x)), predicted = purrr::map2(.data$trained.models, .data$test, ~ broom::augment(.x, newdata = .y)), residuals = purrr::map(.data$predicted, ~.x[[insight::find_response(formula)]] - .x$.fitted), rmse.train = purrr::map_dbl(.data$trained.models, ~ performance::rmse(.x)) ) # Training error train.error <- dplyr::summarise(cv_data, train.error = mean(.data$rmse.train, na.rm = TRUE)) # Test error test.error <- cv_data %>% tidyr::unnest(.data$predicted, .data$residuals) %>% dplyr::summarise(test.error = sqrt(mean(.data$residuals^2, na.rm = TRUE))) data_frame( model = deparse(formula), train.error = round(train.error[[1]], 4), test.error = round(test.error[[1]], 4) ) } #' @rdname cv_error #' @export cv_compare <- function(data, formulas, k = 5) { purrr::map_df(formulas, ~ cv_error(data, formula = .x, k = k)) } sjstats/R/prop.R0000644000176200001440000002201413563265750013264 0ustar liggesusers#' @title Proportions of values in a vector #' @name prop #' #' @description \code{prop()} calculates the proportion of a value or category #' in a variable. \code{props()} does the same, but allows for #' multiple logical conditions in one statement. It is similar #' to \code{mean()} with logical predicates, however, both #' \code{prop()} and \code{props()} work with grouped data frames. #' #' @param data A data frame. May also be a grouped data frame (see 'Examples'). #' @param ... One or more value pairs of comparisons (logical predicates). Put #' variable names the left-hand-side and values to match on the #' right hand side. Expressions may be quoted or unquoted. See #' 'Examples'. #' @param weights Vector of weights that will be applied to weight all observations. #' Must be a vector of same length as the input vector. Default is #' \code{NULL}, so no weights are used. #' @param na.rm Logical, whether to remove NA values from the vector when the #' proportion is calculated. \code{na.rm = FALSE} gives you the raw #' percentage of a value in a vector, \code{na.rm = TRUE} the valid #' percentage. #' @param digits Amount of digits for returned values. #' #' @details \code{prop()} only allows one logical statement per comparison, #' while \code{props()} allows multiple logical statements per comparison. 
#' However, \code{prop()} supports weighting of variables before calculating #' proportions, and comparisons may also be quoted. Hence, \code{prop()} #' also processes comparisons, which are passed as character vector #' (see 'Examples'). #' #' #' @return For one condition, a numeric value with the proportion of the values #' inside a vector. For more than one condition, a tibble with one column #' of conditions and one column with proportions. For grouped data frames, #' returns a tibble with one column per group with grouping categories, #' followed by one column with proportions per condition. #' #' @examples #' data(efc) #' #' # proportion of value 1 in e42dep #' prop(efc, e42dep == 1) #' #' # expression may also be completely quoted #' prop(efc, "e42dep == 1") #' #' # use "props()" for multiple logical statements #' props(efc, e17age > 70 & e17age < 80) #' #' # proportion of value 1 in e42dep, and all values greater #' # than 2 in e42dep, including missing values. will return a tibble #' prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) #' #' # for factors or character vectors, use quoted or unquoted values #' library(sjmisc) #' # convert numeric to factor, using labels as factor levels #' efc$e16sex <- to_label(efc$e16sex) #' efc$n4pstu <- to_label(efc$n4pstu) #' #' # get proportion of female older persons #' prop(efc, e16sex == female) #' #' # get proportion of male older persons #' prop(efc, e16sex == "male") #' #' # "props()" needs quotes around non-numeric factor levels #' props(efc, #' e17age > 70 & e17age < 80, #' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' #' ) #' #' # also works with pipe-chains #' library(dplyr) #' efc %>% prop(e17age > 70) #' efc %>% prop(e17age > 70, e16sex == 1) #' #' # and with group_by #' efc %>% #' group_by(e16sex) %>% #' prop(e42dep > 2) #' #' efc %>% #' select(e42dep, c161sex, c172code, e16sex) %>% #' group_by(c161sex, c172code) %>% #' prop(e42dep > 2, e16sex == 1) #' #' # same for "props()" #' efc %>% #' select(e42dep, c161sex, c172code, c12hour, n4pstu) %>% #' group_by(c161sex, c172code) %>% #' props( #' e42dep > 2, #' c12hour > 20 & c12hour < 40, #' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' #' ) #' #' @importFrom dplyr bind_cols bind_rows #' @importFrom sjlabelled get_label get_labels as_numeric #' @export prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) stop("`data` needs to be a data frame.", call. = F) # get dots dots <- match.call(expand.dots = FALSE)$`...` proportions(data, dots, weight.by = weights, na.rm, digits, multi_logical = FALSE) } #' @rdname prop #' @export props <- function(data, ..., na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) stop("`data` needs to be a data frame.", call. = F) # get dots dots <- match.call(expand.dots = FALSE)$`...` proportions(data, dots, NULL, na.rm, digits, multi_logical = TRUE) } #' @importFrom purrr map_df proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) { # remember comparisons comparisons <- lapply(dots, function(x) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = T) x <- gsub("\"", "", x, fixed = TRUE) x }) # do we have a grouped data frame? 
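# note: the comparisons in `dots` are still unevaluated calls at this point;
# below, they are either parsed manually (get_proportion) or evaluated via
# with() on the data (get_multiple_proportion). For grouped data frames this
# happens per subgroup, and the grouping values are bound back as columns.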
if (inherits(data, "grouped_df")) { # remember order of values reihenfolge <- NULL # get grouped data grps <- get_grouped_data(data) # now get proportions for each subset fr <- purrr::map_df( seq_len(nrow(grps)), function(i) { # get data from grouped data frame .d <- grps$data[[i]] # iterate dots (comparing conditions) if (multi_logical) result <- lapply(dots, get_multiple_proportion, .d, na.rm, digits) else result <- lapply(dots, get_proportion, .d, weight.by, na.rm, digits) as.data.frame(t(unlist(result))) } ) # now we need the values from the groups of the grouped data frame for (i in (ncol(grps) - 1):1) { # get value label var.name <- colnames(grps)[i] val.labels <- suppressWarnings( rep(sjlabelled::get_labels(data[[var.name]]), length.out = nrow(fr)) ) # if we have no value labels, use values instead if (is.null(val.labels)) { val.labels <- rep(unique(sort(data[[var.name]])), length.out = nrow(fr)) } # add row order, based on values of grouping variables reihenfolge <- rep(sort(unique(sjlabelled::as_numeric(data[[var.name]]))), length.out = nrow(fr)) %>% as.data.frame() %>% dplyr::bind_cols(reihenfolge) # bind values as column fr <- dplyr::bind_cols(data.frame(val.labels, stringsAsFactors = FALSE), fr) } # get column names. we need variable labels as column names var.names <- colnames(grps)[seq_len(ncol(grps) - 1)] var.labels <- sjlabelled::get_label(data[, var.names], def.value = var.names) # set variable labels and comparisons as colum names colnames(fr) <- c(var.labels, comparisons) # order rows by values of grouping variables fr <- fr[do.call(order, reihenfolge), ] return(fr) } else { # iterate dots (comparing conditions) if (multi_logical) result <- lapply(dots, get_multiple_proportion, data, na.rm, digits) else result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits) # if we have more than one proportion, return a tibble. this allows us # to save more information, the condition and the proportion value if (length(comparisons) > 1) { return(data_frame( condition = as.character(unlist(comparisons)), prop = unlist(result) )) } return(unlist(result)) } } get_proportion <- function(x, data, weight.by, na.rm, digits) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = T) x <- gsub("\"", "", x, fixed = TRUE) # split expression at ==, < or > x.parts <- unlist(regmatches(x, gregexpr("[!=]=|[<>]|(?:(?![=!]=)[^<>])+", x, perl = TRUE))) # shorter version, however, does not split variable names with dots # x.parts <- unlist(regmatches(x, regexec("(\\w+)(\\W+)(\\w+)", x)))[-1] # correct == assignment? if (length(x.parts) < 3) { message("?Syntax error in argument. You possibly used `=` instead of `==`.") return(NULL) } # get variable from data and value from equation f <- data[[x.parts[1]]] v <- suppressWarnings(as.numeric(x.parts[3])) # if we have factor, values maybe non-numeric if (is.na(v)) v <- x.parts[3] # weight vector? if (!is.null(weight.by)) f <- weight(f, weights = weight.by) # get proportions if (x.parts[2] == "==") dummy <- f == v else if (x.parts[2] == "!=") dummy <- f != v else if (x.parts[2] == "<") dummy <- f < v else if (x.parts[2] == ">") dummy <- f > v else dummy <- f == v # remove missings? if (na.rm) dummy <- na.omit(dummy) # get proportion round(sum(dummy, na.rm = T) / length(dummy), digits = digits) } get_multiple_proportion <- function(x, data, na.rm, digits) { # evaluate argument dummy <- with(data, eval(parse(text = deparse(x)))) # remove missings? 
if (na.rm) dummy <- na.omit(dummy) # get proportion round(sum(dummy, na.rm = T) / length(dummy), digits = digits) } sjstats/R/anova_stats.R0000644000176200001440000001273313616476363014640 0ustar liggesusers#' @importFrom sjmisc add_columns round_num #' @importFrom stats anova #' @rdname eta_sq #' @export anova_stats <- function(model, digits = 3) { if (!requireNamespace("pwr", quietly = TRUE)) { stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE) } # get tidy summary table aov.sum <- aov_stat_summary(model) # compute all model statistics etasq <- aov_stat_core(aov.sum, type = "eta") partial.etasq <- aov_stat_core(aov.sum, type = "peta") omegasq <- aov_stat_core(aov.sum, type = "omega") partial.omegasq <- aov_stat_core(aov.sum, type = "pomega") epsilonsq <- aov_stat_core(aov.sum, type = "epsilon") # compute power for each estimate cohens.f <- sqrt(partial.etasq / (1 - partial.etasq)) # bind as data frame as <- dplyr::bind_rows( data.frame(etasq, partial.etasq, omegasq, partial.omegasq, epsilonsq, cohens.f), data.frame(etasq = NA, partial.etasq = NA, omegasq = NA, partial.omegasq = NA, epsilonsq = NA, cohens.f = NA) ) %>% sjmisc::add_columns(aov.sum) # get nr of terms nt <- nrow(as) - 1 # finally, compute power power <- tryCatch( { c( pwr::pwr.f2.test(u = as$df[1:nt], v = as$df[nrow(as)], f2 = as$cohens.f[1:nt]^2)[["power"]], NA ) }, error = function(x) { NA } ) sjmisc::add_variables(as, power = power) %>% sjmisc::round_num(digits = digits) %>% as.data.frame() } #' @importFrom dplyr mutate #' @importFrom rlang .data aov_stat <- function(model, type) { aov.sum <- aov_stat_summary(model) aov.res <- aov_stat_core(aov.sum, type) if (obj_has_name(aov.sum, "stratum")) attr(aov.res, "stratum") <- aov.sum[["stratum"]] aov.res } #' @importFrom stats anova residuals #' @importFrom broom tidy aov_stat_summary <- function(model) { # check if we have a mixed model mm <- is_merMod(model) ori.model <- model # check that model inherits from correct class # else, try to coerce to anova table if (!inherits(model, c("Gam", "aov", "anova", "anova.rms", "aovlist"))) model <- stats::anova(model) # get summary table aov.sum <- as.data.frame(broom::tidy(model)) # for mixed models, add information on residuals if (mm) { res <- stats::residuals(ori.model) aov.sum <- dplyr::bind_rows( aov.sum, data_frame( term = "Residuals", df = length(res) - sum(aov.sum[["df"]]), sumsq = sum(res^2, na.rm = TRUE), meansq = mse(ori.model), statistic = NA ) ) } # check if object has sums of square if (!obj_has_name(aov.sum, "sumsq")) { stop("Model object has no sums of squares. Cannot compute effect size statistic.", call. 
= FALSE) } # need special handling for rms-anova if (inherits(model, "anova.rms")) colnames(aov.sum) <- c("term", "df", "sumsq", "meansq", "statistic", "p.value") # for car::Anova, the meansq-column might be missing, so add it manually if (!obj_has_name(aov.sum, "meansq")) aov.sum <- sjmisc::add_variables(aov.sum, meansq = aov.sum$sumsq / aov.sum$df, .after = "sumsq") intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } aov.sum } aov_stat_core <- function(aov.sum, type) { intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } # get mean squared of residuals meansq.resid <- aov.sum[["meansq"]][nrow(aov.sum)] # get total sum of squares ss.total <- sum(aov.sum[["sumsq"]]) # get sum of squares of residuals ss.resid <- aov.sum[["sumsq"]][nrow(aov.sum)] # number of terms in model n_terms <- nrow(aov.sum) - 1 # number of observations N <- sum(aov.sum[["df"]]) + 1 if (type == "omega") { # compute omega squared for each model term aovstat <- purrr::map_dbl(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / (ss.total + meansq.resid) }) } else if (type == "pomega") { # compute partial omega squared for each model term aovstat <- purrr::map_dbl(1:n_terms, function(x) { df.term <- aov.sum[["df"]][x] meansq.term <- aov.sum[["meansq"]][x] (df.term * (meansq.term - meansq.resid)) / (df.term * meansq.term + (N - df.term) * meansq.resid) }) } else if (type == "epsilon") { # compute epsilon squared for each model term aovstat <- purrr::map_dbl(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / ss.total }) } else if (type == "eta") { # compute eta squared for each model term aovstat <- purrr::map_dbl(1:n_terms, ~ aov.sum[["sumsq"]][.x] / sum(aov.sum[["sumsq"]])) } else if (type %in% c("cohens.f", "peta")) { # compute partial eta squared for each model term aovstat <- purrr::map_dbl(1:n_terms, ~ aov.sum[["sumsq"]][.x] / (aov.sum[["sumsq"]][.x] + ss.resid)) } # compute Cohen's F if (type == "cohens.f") aovstat <- sqrt(aovstat / (1 - aovstat)) # give values names of terms names(aovstat) <- aov.sum[["term"]][1:n_terms] aovstat } .which_intercept <- function(x) { which(tolower(x) %in% c("(intercept)_zi", "intercept (zero-inflated)", "intercept", "zi_intercept", "(intercept)", "b_intercept", "b_zi_intercept")) } sjstats/R/phi.R0000644000176200001440000000232313565713734013067 0ustar liggesusers#' @rdname crosstable_statistics #' @export phi <- function(tab, ...) { UseMethod("phi") } #' @export phi.table <- function(tab, ...) { .phi(tab) } #' @export phi.ftable <- function(tab, ...) { .phi(tab) } #' @export phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) 
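# usage sketch (not run; variable names are illustrative) -- assuming a data
# frame `d` with two categorical variables `x` and `y`:
#   phi(y ~ x, data = d)                  # phi coefficient only
#   phi(y ~ x, data = d, ci.lvl = 0.95)   # adds a bootstrapped confidence interval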
{
  terms <- all.vars(formula)
  tab <- table(data[[terms[1]]], data[[terms[2]]])
  method <- match.arg(method)

  if (is.null(ci.lvl) || is.na(ci.lvl)) {
    .phi(tab)
  } else {
    ci <- data[, terms] %>%
      sjstats::bootstrap(n) %>%
      dplyr::mutate(
        tables = lapply(.data$strap, function(x) {
          dat <- as.data.frame(x)
          table(dat[[1]], dat[[2]])
        }),
        phis = sapply(.data$tables, function(x) .phi(x))
      ) %>%
      dplyr::pull("phis") %>%
      boot_ci(ci.lvl = ci.lvl, method = method)

    data_frame(
      phi = .phi(tab),
      conf.low = ci$conf.low,
      conf.high = ci$conf.high
    )
  }
}

.phi <- function(tab) {
  # convert to flat table
  if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)
  tb <- summary(MASS::loglm(~1 + 2, tab))$tests
  sqrt(tb[2, 1] / sum(tab))
}
sjstats/R/helpfunctions.R0000644000176200001440000000327513563265750015175 0ustar liggesusers# Help-functions
data_frame <- function(...) {
  x <- data.frame(..., stringsAsFactors = FALSE)
  rownames(x) <- NULL
  x
}

is_merMod <- function(fit) {
  inherits(fit, c("lmerMod", "glmerMod", "nlmerMod", "merModLmerTest"))
}

is_stan_model <- function(fit) {
  inherits(fit, c("stanreg", "stanfit", "brmsfit"))
}

#' @importFrom sjmisc str_contains
get_glm_family <- function(fit) {
  c.f <- class(fit)

  # do we have glm? if so, get link family. make exceptions
  # for specific models that don't have family function
  if (any(c.f %in% c("lme", "plm"))) {
    fitfam <- ""
    logit_link <- FALSE
  } else {
    fitfam <- stats::family(fit)$family
    logit_link <- stats::family(fit)$link == "logit"
  }

  # create logical for family
  binom_fam <- fitfam %in% c("binomial", "quasibinomial")
  poisson_fam <- fitfam %in% c("poisson", "quasipoisson") ||
    sjmisc::str_contains(fitfam, "negative binomial", ignore.case = T)

  list(is_bin = binom_fam, is_pois = poisson_fam, is_logit = logit_link)
}

# return names of objects passed as ellipses argument
dot_names <- function(dots) unname(unlist(lapply(dots, as.character)))

#' @importFrom tidyr nest
#' @importFrom dplyr select filter group_vars
#' @importFrom stats complete.cases
#' @importFrom rlang .data
get_grouped_data <- function(x) {
  # retain observations that are complete wrt grouping vars, then nest
  grps <- x %>%
    dplyr::group_modify(~ dplyr::filter(.x, stats::complete.cases(.y))) %>%
    tidyr::nest()

  # arrange data
  if (length(dplyr::group_vars(x)) == 1)
    reihe <- order(grps[[1]])
  else
    reihe <- order(grps[[1]], grps[[2]])
  grps <- grps[reihe, ]

  grps
}
sjstats/R/mean_n.R0000644000176200001440000000667713563265750013552 0ustar liggesusers#' @title Row means with min amount of valid values
#' @name mean_n
#' @description This function is similar to the SPSS \code{MEAN.n} function and computes
#' row means from a \code{data.frame} or \code{matrix} if at least \code{n}
#' values of a row are valid (and not \code{NA}).
#'
#' @param dat A data frame with at least two columns, where row means are applied.
#' @param n May either be
#' \itemize{
#' \item a numeric value that indicates the amount of valid values per row to calculate the row mean;
#' \item or a value between 0 and 1, indicating a proportion of valid values per row to calculate the row mean (see 'Details').
#' }
#' If a row's sum of valid values is less than \code{n}, \code{NA} will be returned as row mean value.
#' @param digits Numeric value indicating the number of decimal places to be used for rounding mean
#' value. Negative values are allowed (see 'Details').
#'
#' @return A vector with row mean values of \code{dat} for those rows with at least \code{n}
#' valid values. Else, \code{NA} is returned.
#'
#' @details Rounding to a negative number of \code{digits} means rounding to a power of
#' ten, so for example mean_n(dat, 3, digits = -2) rounds to the
#' nearest hundred. \cr \cr
#' \code{n} must be a numeric value from \code{0} to \code{ncol(dat)}. If
#' a \emph{row} in \code{dat} has at least \code{n} non-missing values, the
#' row mean is returned. If \code{n} is a non-integer value from 0 to 1,
#' \code{n} is considered to indicate the proportion of necessary non-missing
#' values per row. E.g., if \code{n = .75}, a row must have at least \code{ncol(dat) * n}
#' non-missing values for the row mean to be calculated. See 'Examples'.
#'
#' @references \href{http://r4stats.com/2014/09/03/adding-the-spss-mean-n-function-to-r/}{r4stats.com}
#'
#' @examples
#' dat <- data.frame(c1 = c(1,2,NA,4),
#'                   c2 = c(NA,2,NA,5),
#'                   c3 = c(NA,4,NA,NA),
#'                   c4 = c(2,3,7,8))
#'
#' # needs at least 4 non-missing values per row
#' mean_n(dat, 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' mean_n(dat, 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' mean_n(dat, 2)
#'
#' # needs at least 1 non-missing value per row
#' mean_n(dat, 1) # all means are shown
#'
#' # needs at least 50% of non-missing values per row
#' mean_n(dat, .5) # 3 valid return values
#'
#' # needs at least 75% of non-missing values per row
#' mean_n(dat, .75) # 2 valid return values
#'
#' @export
mean_n <- function(dat, n, digits = 2) {
  # is 'n' indicating a proportion?
  digs <- n %% 1
  if (digs != 0) n <- round(ncol(dat) * digs)

  # coerce matrix to data frame
  if (is.matrix(dat)) dat <- as.data.frame(dat)

  # check if we have a data frame with at least two columns
  if (!is.data.frame(dat) || ncol(dat) < 2) {
    warning("`dat` must be a data frame with at least two columns.", call. = TRUE)
    return(NA)
  }

  # n may not be larger than the number of columns in the data frame
  if (ncol(dat) < n) {
    warning("`n` must be smaller than or equal to the number of columns in the data frame.", call. = TRUE)
    return(NA)
  }

  round(apply(dat, 1, function(x) ifelse(sum(!is.na(x)) >= n, mean(x, na.rm = TRUE), NA)), digits)
}
sjstats/R/weight.R0000644000176200001440000000630413563265750013577 0ustar liggesusers#' @title Weight a variable
#' @name weight
#' @description These functions weight the variable \code{x} by
#' a specific vector of \code{weights}.
#'
#' @param x (Unweighted) variable.
#' @param weights Vector with same length as \code{x}, which
#' contains weight factors. Each value of \code{x} has a
#' specific assigned weight in \code{weights}.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding the weighted values. By default, this value is
#' \code{0}, i.e. the returned values are integer values.
#'
#' @return The weighted \code{x}.
#'
#' @details \code{weight2()} sums up all \code{weights} values of the associated
#' categories of \code{x}, whereas \code{weight()} uses a
#' \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()}
#' may return a vector of different length than \code{x}.
#'
#' @note The values of the returned vector are in sorted order, whereas the values'
#' order of the original \code{x} may be spread randomly. Hence, \code{x} can't be
#' used, for instance, for further cross tabulation. In case you want to have
#' weighted contingency tables or (grouped) box plots etc., use the \code{weightBy}
#' argument of most functions.
#' #' @examples #' v <- sample(1:4, 20, TRUE) #' table(v) #' w <- abs(rnorm(20)) #' table(weight(v, w)) #' table(weight2(v, w)) #' #' set.seed(1) #' x <- sample(letters[1:5], size = 20, replace = TRUE) #' w <- runif(n = 20) #' #' table(x) #' table(weight(x, w)) #' #' @importFrom stats na.pass xtabs #' @importFrom sjlabelled as_numeric #' @export weight <- function(x, weights, digits = 0) { # remember if x is numeric x.is.num <- is.numeric(x) # init values weightedvar <- c() wtab <- round(stats::xtabs(weights ~ x, data = data.frame(weights = weights, x = x), na.action = stats::na.pass, exclude = NULL), digits = digits) # iterate all table values for (w in seq_len(length(wtab))) { # retrieve count of each table cell w_count <- wtab[[w]] # retrieve "cell name" which is identical to the variable value # first check whether values are numeric or not nval_ <- suppressWarnings(as.numeric(names(wtab[w]))) # if value is not numeric, use as is if (is.na(nval_)) w_value <- names(wtab[w]) else # else, use numeric value w_value <- nval_ # append variable value, repeating it "w_count" times. weightedvar <- c(weightedvar, rep(w_value, w_count)) } # if we have NA values, weighted var is coerced to character. # coerce back to numeric then here if (!is.numeric(weightedvar) && x.is.num) weightedvar <- sjlabelled::as_numeric(weightedvar) # return result weightedvar } #' @rdname weight #' @export weight2 <- function(x, weights) { items <- unique(x) newvar <- c() for (i in seq_len(length(items))) { newcount <- round(sum(weights[which(x == items[i])])) newvar <- c(newvar, rep(items[i], newcount)) } newvar } sjstats/R/confint_ncg.R0000644000176200001440000000753213563265750014603 0ustar liggesusers# This function is a modified version from package MBESS # copied from https://github.com/cran/MBESS/blob/master/R/conf.limits.ncf.R # Author: Ken Kelley # License: GPL-3 #' @importFrom stats pf qf confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = NULL) { alpha.lower <- alpha.upper <- (1 - conf.level) / 2 tol <- 1e-09 Jumping.Prop <- 0.1 FAILED <- NULL LL.0 <- stats::qf(p = alpha.lower * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) - (1 - alpha.lower) if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < (1 - alpha.lower)) { FAILED <- if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = 0) < 1 - alpha.lower) LL.0 <- 1e-08 if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < 1 - alpha.lower) FAILED <- TRUE } if (is.null(FAILED)) { LL.1 <- LL.2 <- LL.0 while (Diff > tol) { LL.2 <- LL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.2) - (1 - alpha.lower) LL.1 <- LL.2 } LL.1 <- LL.2 / (1 + Jumping.Prop) LL.Bounds <- c(LL.1, (LL.1 + LL.2) / 2, LL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[1]) - (1 - alpha.lower) > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[3]) - (1 - alpha.lower) > tol if (isTRUE(Diff.1) & isTRUE(Diff.2) & !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[2], (LL.Bounds[2] + LL.Bounds[3]) / 2, LL.Bounds[3]) } if (isTRUE(Diff.1) & !isTRUE(Diff.2) & !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[1], (LL.Bounds[1] + LL.Bounds[2]) / 2, LL.Bounds[2]) } Diff <- stats::pf(q = F.value, df1 = 
df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) } LL <- LL.Bounds[2] } if (!is.null(FAILED)) LL <- NA FAILED.Up <- NULL UL.0 <- stats::qf(p = 1 - alpha.upper * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) UL.0 <- 1e-08 Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) FAILED.Up <- TRUE if (is.null(FAILED.Up)) { UL.1 <- UL.2 <- UL.0 while (Diff > tol) { UL.2 <- UL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.2) - alpha.upper UL.1 <- UL.2 } UL.1 <- UL.2 / (1 + Jumping.Prop) UL.Bounds <- c(UL.1, (UL.1 + UL.2) / 2, UL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[1]) - alpha.upper > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[3]) - alpha.upper > tol if (isTRUE(Diff.1) & isTRUE(Diff.2) & !isTRUE(Diff.3)) { UL.Bounds <- c(UL.Bounds[2], (UL.Bounds[2] + UL.Bounds[3]) / 2, UL.Bounds[3]) } if (isTRUE(Diff.1) & !isTRUE(Diff.2) & !isTRUE(Diff.3)) { UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] + UL.Bounds[2])/2, UL.Bounds[2]) } Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper } UL <- UL.Bounds[2] } if (!is.null(FAILED.Up)) UL <- NA list(Lower.Limit = LL, Upper.Limit = UL) } sjstats/R/mediation.R0000644000176200001440000001616413563265750014266 0ustar liggesusers#' @title Summary of Bayesian multivariate-response mediation-models #' @name mediation #' #' @description \code{mediation()} is a short summary for multivariate-response #' mediation-models. #' #' @param x A \code{stanreg}, \code{stanfit}, or \code{brmsfit} object. #' @param prob Vector of scalars between 0 and 1, indicating the mass within #' the credible interval that is to be estimated. #' @param treatment Character, name of the treatment variable (or direct effect) #' in a (multivariate response) mediator-model. If missing, \code{mediation()} #' tries to find the treatment variable automatically, however, this may fail. #' @param mediator Character, name of the mediator variable in a (multivariate #' response) mediator-model. If missing, \code{mediation()} tries to find the #' treatment variable automatically, however, this may fail. #' @param typical The typical value that will represent the Bayesian point estimate. #' By default, the posterior median is returned. See \code{\link[sjmisc]{typical_value}} #' for possible values for this argument. #' @param ... Not used. #' #' @return A data frame with direct, indirect, mediator and #' total effect of a multivariate-response mediation-model, as well as the #' proportion mediated. The effect sizes are mean values of the posterior #' samples. 
#'
#' @details \code{mediation()} returns a data frame with information on the
#' \emph{direct effect} (mean value of posterior samples from \code{treatment}
#' of the outcome model), \emph{mediator effect} (mean value of posterior
#' samples from \code{mediator} of the outcome model), \emph{indirect effect}
#' (mean value of the multiplication of the posterior samples from
#' \code{mediator} of the outcome model and the posterior samples from
#' \code{treatment} of the mediation model) and the total effect (mean
#' value of sums of posterior samples used for the direct and indirect
#' effect). The \emph{proportion mediated} is the indirect effect divided
#' by the total effect.
#' \cr \cr
#' For all values, the 90\% HDIs are calculated by default. Use \code{prob}
#' to calculate a different interval.
#' \cr \cr
#' The arguments \code{treatment} and \code{mediator} do not necessarily
#' need to be specified. If missing, \code{mediation()} tries to find the
#' treatment and mediator variable automatically. If this does not work,
#' specify these variables.
#'
#'
#' @export
mediation <- function(x, ...) {
  UseMethod("mediation")
}

#' @rdname mediation
#' @importFrom purrr map
#' @importFrom stats formula
#' @importFrom dplyr pull bind_cols
#' @importFrom sjmisc typical_value
#' @importFrom insight model_info
#' @importFrom bayestestR hdi
#' @export
mediation.brmsfit <- function(x, treatment, mediator, prob = .9, typical = "median", ...) {
  # check for pkg availability, else function might fail
  if (!requireNamespace("brms", quietly = TRUE))
    stop("Please install and load package `brms` first.")

  # only one HDI interval
  if (length(prob) > 1) prob <- prob[1]

  # check for binary response. In this case, the user should rescale variables
  fitinfo <- insight::model_info(x)

  if (any(purrr::map_lgl(fitinfo, ~ .x$is_bin))) {
    message("One of mediator or outcome is binary, so direct and indirect effects may be on different scales. Consider rescaling model predictors, e.g.
with `sjmisc::std()`.") } dv <- insight::find_response(x, combine = TRUE) fixm <- FALSE if (missing(mediator)) { pv <- insight::find_predictors(x, flatten = TRUE) mediator <- pv[pv %in% dv] fixm <- TRUE } if (missing(treatment)) { pvs <- purrr::map( x$formula$forms, ~ stats::formula(.x)[[3L]] %>% all.vars() ) treatment <- pvs[[1]][pvs[[1]] %in% pvs[[2]]][1] treatment <- fix_factor_name(x, treatment) } mediator.model <- which(dv == mediator) treatment.model <- which(dv != mediator) if (fixm) mediator <- fix_factor_name(x, mediator) # brms removes underscores from variable names when naming estimates # so we need to fix variable names here dv <- names(dv) # Direct effect: coef(treatment) from model_y_treatment coef_treatment <- sprintf("b_%s_%s", dv[treatment.model], treatment) eff.direct <- x %>% brms::posterior_samples(pars = coef_treatment, exact_match = TRUE) %>% dplyr::pull(1) # Mediator effect: coef(mediator) from model_y_treatment coef_mediator <- sprintf("b_%s_%s", dv[treatment.model], mediator) eff.mediator <- x %>% brms::posterior_samples(pars = coef_mediator, exact_match = TRUE) %>% dplyr::pull(1) # Indirect effect: coef(treament) from model_m_mediator * coef(mediator) from model_y_treatment coef_indirect <- sprintf("b_%s_%s", dv[mediator.model], treatment) tmp.indirect <- brms::posterior_samples(x, pars = c(coef_indirect, coef_mediator), exact_match = TRUE) eff.indirect <- tmp.indirect[[coef_indirect]] * tmp.indirect[[coef_mediator]] # Total effect eff.total <- eff.indirect + eff.direct # proportion mediated: indirect effect / total effect prop.mediated <- sjmisc::typical_value(eff.indirect, fun = typical) / sjmisc::typical_value(eff.total, fun = typical) hdi_eff <- bayestestR::hdi(eff.indirect / eff.total, ci = prob) prop.se <- (hdi_eff$CI_high - hdi_eff$CI_low) / 2 prop.hdi <- prop.mediated + c(-1, 1) * prop.se res <- data_frame( effect = c("direct", "indirect", "mediator", "total", "proportion mediated"), value = c( sjmisc::typical_value(eff.direct, fun = typical), sjmisc::typical_value(eff.indirect, fun = typical), sjmisc::typical_value(eff.mediator, fun = typical), sjmisc::typical_value(eff.total, fun = typical), prop.mediated ) ) %>% dplyr::bind_cols( as.data.frame(rbind( bayestestR::hdi(eff.direct, ci = prob)[, -1], bayestestR::hdi(eff.indirect, ci = prob)[, -1], bayestestR::hdi(eff.mediator, ci = prob)[, -1], bayestestR::hdi(eff.total, ci = prob)[, -1], prop.hdi )) ) colnames(res) <- c("effect", "value", "hdi.low", "hdi.high") attr(res, "prob") <- prob attr(res, "treatment") <- treatment attr(res, "mediator") <- mediator attr(res, "response") <- dv[treatment.model] attr(res, "formulas") <- lapply(x$formula$forms, function(x) as.character(x[1])) class(res) <- c("sj_mediation", class(res)) res } #' @importFrom insight get_data fix_factor_name <- function(model, variable) { # check for categorical. if user has not specified a treatment variable # and this variable is categorical, the posterior samples contain the # samples from each category of the treatment variable - so we need to # fix the variable name mf <- insight::get_data(model) if (obj_has_name(mf, variable)) { check_fac <- mf[[variable]] if (is.factor(check_fac)) { variable <- sprintf("%s%s", variable, levels(check_fac)[nlevels(check_fac)]) } else if (is.logical(check_fac)) { variable <- sprintf("%sTRUE", variable) } } variable } sjstats/R/wtd_mwu.R0000644000176200001440000000317513565521504013773 0ustar liggesusers#' @rdname weighted_sd #' @export weighted_mannwhitney <- function(data, ...) 
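# usage sketch (not run; variable names are illustrative) -- both methods
# prepare the data and call the helper below, which builds a survey design
# and runs survey::svyranktest(). Assuming a data frame `d` with a numeric
# `x`, a two-level group `g` and sampling weights `w`:
#   weighted_mannwhitney(x ~ g + w, data = d)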
{
  UseMethod("weighted_mannwhitney")
}

#' @importFrom dplyr select
#' @rdname weighted_sd
#' @export
weighted_mannwhitney.default <- function(data, x, grp, weights, ...) {
  x.name <- deparse(substitute(x))
  g.name <- deparse(substitute(grp))
  w.name <- deparse(substitute(weights))

  # create string with variable names
  vars <- c(x.name, g.name, w.name)

  # get data
  dat <- suppressMessages(dplyr::select(data, !! vars))
  dat <- na.omit(dat)

  weighted_mannwhitney_helper(dat)
}

#' @importFrom dplyr select
#' @rdname weighted_sd
#' @export
weighted_mannwhitney.formula <- function(formula, data, ...) {
  vars <- all.vars(formula)

  # get data
  dat <- suppressMessages(dplyr::select(data, !! vars))
  dat <- na.omit(dat)

  weighted_mannwhitney_helper(dat)
}

weighted_mannwhitney_helper <- function(dat) {
  # check if pkg survey is available
  if (!requireNamespace("survey", quietly = TRUE)) {
    stop("Package `survey` needed for this function to work. Please install it.", call. = FALSE)
  }

  x.name <- colnames(dat)[1]
  group.name <- colnames(dat)[2]

  colnames(dat) <- c("x", "g", "w")

  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
  mw <- survey::svyranktest(formula = x ~ g, design)

  attr(mw, "x.name") <- x.name
  attr(mw, "group.name") <- group.name
  class(mw) <- c("sj_wmwu", "list")

  if (dplyr::n_distinct(dat$g, na.rm = TRUE) > 2)
    m <- "Weighted Kruskal-Wallis test"
  else
    m <- "Weighted Mann-Whitney-U test"

  mw$method <- m

  mw
}
sjstats/R/inequ_trends.R0000644000176200001440000001033113563265750015003 0ustar liggesusers#' @title Compute trends in status inequalities
#' @name inequ_trend
#'
#' @description This method computes the proportional change of absolute
#' (rate differences) and relative (rate ratios) inequalities
#' of prevalence rates for two different status groups, as proposed
#' by Mackenbach et al. (2015).
#'
#' @param data A data frame that contains the variables with prevalence rates for both low
#' and high status groups (see 'Examples').
#' @param prev.low The name of the variable with the prevalence rates for
#' the low status groups.
#' @param prev.hi The name of the variable with the prevalence rates for
#' the high status groups.
#'
#' @return A data frame with the prevalence rates as well as the values for the
#' proportional change in absolute (\code{rd}) and relative (\code{rr})
#' inequalities.
#'
#' @references Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018}
#'
#' @details Given the time trend of prevalence rates of an outcome for two status
#' groups (e.g. the mortality rates for people with lower and higher
#' socioeconomic status over 40 years), this function computes the
#' proportional change of absolute and relative inequalities, expressed
#' in changes in rate differences and rate ratios. The function implements
#' the algorithm proposed by \emph{Mackenbach et al. 2015}.
#'
#' @examples
#' # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5
#'
#' # 40 simulated time points, with an initial rate ratio of 2 and
#' # a rate difference of 100 (i.e.
low status group starts with a #' # prevalence rate of 200, the high status group with 100) #' #' # annual decline of prevalence is 1% for the low, and 3% for the #' # high status group #' #' n <- 40 #' time <- seq(1, n, by = 1) #' lo <- rep(200, times = n) #' for (i in 2:n) lo[i] <- lo[i - 1] * .99 #' #' hi <- rep(100, times = n) #' for (i in 2:n) hi[i] <- hi[i - 1] * .97 #' #' prev.data <- data.frame(lo, hi) #' #' # print values #' inequ_trend(prev.data, lo, hi) #' #' # plot trends - here we see that the relative inequalities #' # are increasing over time, while the absolute inequalities #' # are first increasing as well, but later are decreasing #' # (while rel. inequ. are still increasing) #' plot(inequ_trend(prev.data, lo, hi)) #' #' @importFrom dplyr select #' @importFrom rlang quo_name enquo .data #' @export inequ_trend <- function(data, prev.low, prev.hi) { # prepare data for prevalence rates for low and hi status groups if (is.null(data) || missing(data)) { dat <- data.frame(prev.low, prev.hi) } else { # get variable names # create quosures low <- rlang::quo_name(rlang::enquo(prev.low)) high <- rlang::quo_name(rlang::enquo(prev.hi)) dat <- dplyr::select(data, !! low, !! high) } # ensure common column names colnames(dat) <- c("lo", "hi") # trends in rate ratios # compute relative inequality for first time point, needed # as reference to compute proportional change over time dat$rr <- dat$lo[1] / dat$hi[1] # compute proportional change of relative inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rr[t] <- dat$rr[t - 1] * ((1 + delta.low) / (1 + delta.hi)) } # trends in rate difference # compute absolute inequality for first time point, needed # as reference to compute proportional change over time dat$rd <- dat$lo[1] - dat$hi[1] # compute proportional change of absolute inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1 ] * delta.low - dat$hi[t - 1] * delta.hi) } # return structure(class = "sj_inequ_trend", list(data = dat)) } sjstats/R/se_ybar.R0000644000176200001440000000322413616514317013725 0ustar liggesusers#' @title Standard error of sample mean for mixed models #' @name se_ybar #' #' @description Compute the standard error for the sample mean for mixed models, #' regarding the extent to which clustering affects the standard errors. #' May be used as part of the multilevel power calculation for cluster sampling #' (see \cite{Gelman and Hill 2007, 447ff}). #' #' @param fit Fitted mixed effects model (\code{\link[lme4]{merMod}}-class). #' #' @return The standard error of the sample mean of \code{fit}. #' #' @references Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. 
Cambridge, New York: Cambridge University Press #' #' @examples #' if (require("lme4")) { #' fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy) #' se_ybar(fit) #' } #' @importFrom lme4 ngrps #' @importFrom stats nobs #' @importFrom purrr map_dbl #' @export se_ybar <- function(fit) { # get model icc vars <- insight::get_variance(fit, verbose = FALSE) # get group variances tau.00 <- unname(vars$var.intercept) # total variance tot_var <- sum(tau.00, vars$var.residual) # get number of groups m.cnt <- lme4::ngrps(fit) # compute number of observations per group (level-2-unit) obs <- round(stats::nobs(fit) / m.cnt) # compute simple icc icc <- tau.00 / tot_var # compute standard error of sample mean se <- purrr::map_dbl(seq_len(length(m.cnt)), ~ sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x]))) # give names for se, so user sees, which random effect has what impact names(se) <- names(m.cnt) se } sjstats/R/sjStatistics.R0000644000176200001440000000351613563265750015001 0ustar liggesusers#' @title Expected and relative table values #' @name table_values #' @description This function calculates a table's cell, row and column percentages as #' well as expected values and returns all results as lists of tables. #' #' @param tab Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which #' cell, row and column percentages as well as expected values are calculated. #' Tables of class \code{\link[stats]{xtabs}} and other will be coerced to #' \code{ftable} objects. #' @param digits Amount of digits for the table percentage values. #' #' @return (Invisibly) returns a list with four tables: #' \enumerate{ #' \item \code{cell} a table with cell percentages of \code{tab} #' \item \code{row} a table with row percentages of \code{tab} #' \item \code{col} a table with column percentages of \code{tab} #' \item \code{expected} a table with expected values of \code{tab} #' } #' #' @examples #' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) #' # show expected values #' table_values(tab)$expected #' # show cell percentages #' table_values(tab)$cell #' #' @export table_values <- function(tab, digits = 2) { # convert to ftable object if (!inherits(tab, "ftable")) tab <- ftable(tab) tab.cell <- round(100 * prop.table(tab), digits) tab.row <- round(100 * prop.table(tab, 1), digits) tab.col <- round(100 * prop.table(tab, 2), digits) tab.expected <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # return results invisible(structure(class = "sjutablevalues", list(cell = tab.cell, row = tab.row, col = tab.col, expected = tab.expected))) } sjstats/R/samplesize_mixed.R0000644000176200001440000001111113616514330015630 0ustar liggesusers#' @title Sample size for linear mixed models #' @name samplesize_mixed #' #' @description Compute an approximated sample size for linear mixed models #' (two-level-designs), based on power-calculation for standard #' design and adjusted for design effect for 2-level-designs. #' #' @param eff.size Effect size. #' @param df.n Optional argument for the degrees of freedom for numerator. See 'Details'. #' @param power Power of test (1 minus Type II error probability). #' @param sig.level Significance level (Type I error probability). #' @param k Number of cluster groups (level-2-unit) in multilevel-design. #' @param n Optional, number of observations per cluster groups #' (level-2-unit) in multilevel-design. #' @param icc Expected intraclass correlation coefficient for multilevel-model. 
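#' The standard sample size is inflated by the design effect, \code{1 + (n - 1) * icc} (see \code{\link{design_effect}}).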
#' #' @return A list with two values: The number of subjects per cluster, and the #' total sample size for the linear mixed model. #' #' @references Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} #' #' @details The sample size calculation is based on a power-calculation for the #' standard design. If \code{df.n} is not specified, a power-calculation #' for an unpaired two-sample t-test will be computed (using #' \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package). #' If \code{df.n} is given, a power-calculation for general linear models #' will be computed (using \code{\link[pwr]{pwr.f2.test}} of the #' \pkg{pwr}-package). The sample size of the standard design #' is then adjusted for the design effect of two-level-designs (see #' \code{\link{design_effect}}). Thus, the sample size calculation is appropriate #' in particular for two-level-designs (see \cite{Snijders 2005}). Models that #' additionally include repeated measures (three-level-designs) may work #' as well, however, the computed sample size may be less accurate. #' #' @examples #' # Sample size for multilevel model with 30 cluster groups and a small to #' # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and #' # hence a total sample size of about 802 observations is needed. #' samplesize_mixed(eff.size = .3, k = 30) #' #' # Sample size for multilevel model with 20 cluster groups and a medium #' # to large effect size for linear models of 0.2. Five subjects per cluster and #' # hence a total sample size of about 107 observations is needed. #' samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) #' @export samplesize_mixed <- function(eff.size, df.n = NULL, power = .8, sig.level = .05, k, n, icc = 0.05) { if (!requireNamespace("pwr", quietly = TRUE)) { stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE) } # compute sample size for standard design if (is.null(df.n)) # if we have no degrees of freedom specified, use t-test obs <- 2 * pwr::pwr.t.test(d = eff.size, sig.level = sig.level, power = power)$n else # we have df, so power-calc for linear models obs <- pwr::pwr.f2.test(u = df.n, f2 = eff.size, sig.level = sig.level, power = power)$v + df.n + 1 # if we have no information on the number of observations per cluster, # compute this number now if (missing(n) || is.null(n)) { n <- (obs * (1 - icc)) / (k - (obs * icc)) if (n < 1) { warning("Minimum required number of subjects per cluster is negative and was adjusted to be positive. You may reduce the requirements for the multi-level structure (i.e. reduce `k` or `icc`), or you can increase the effect-size.", call. 
= FALSE)
      n <- 1
    }
  }

  # adjust standard design by design effect
  total.n <- obs * design_effect(n = n, icc = icc)

  # sample size for each group and total n
  smpsz <- list(round(total.n / k), round(total.n))

  # name list
  names(smpsz) <- c("Subjects per Cluster", "Total Sample Size")

  smpsz
}

#' @rdname samplesize_mixed
#' @export
smpsize_lmm <- samplesize_mixed
sjstats/R/tidy_stan.R0000644000176200001440000002146413610330432014271 0ustar liggesusers#' @title Tidy summary output for stan models
#' @name tidy_stan
#'
#' @description Returns a tidy summary output for stan models.
#'
#' @param x A \code{stanreg}, \code{stanfit} or \code{brmsfit} object.
#' @param trans Name of a function or character vector naming a function, used
#' to apply transformations on the estimates and uncertainty intervals. The
#' values for standard errors are \emph{not} transformed! If \code{trans}
#' is not \code{NULL}, \emph{credible intervals} instead of \emph{HDI}
#' are computed, due to the possible asymmetry of the HDI.
#' @param digits Amount of digits to round numerical values in the output.
#' @param prob Vector of scalars between 0 and 1, indicating the mass within
#' the credible interval that is to be estimated.
#' @param typical The typical value that will represent the Bayesian point estimate.
#' By default, the posterior median is returned. See \code{\link[sjmisc]{typical_value}}
#' for possible values for this argument.
#' @inheritParams bayestestR::hdi
#'
#' @return A data frame, summarizing \code{x}, with consistent column names.
#' To distinguish multiple HDI values, column names for the HDI get a suffix
#' when \code{prob} has more than one element.
#'
#' @details The returned data frame has an additional class-attribute,
#' \code{tidy_stan}, to pass the result to its own \code{print()}-method.
#' The \code{print()}-method creates a cleaner output, especially for multilevel,
#' zero-inflated or multivariate response models, where - for instance -
#' the conditional part of a model is printed separately from the zero-inflated
#' part, or random and fixed effects are printed separately.
#' \cr \cr
#' The returned data frame gives information on:
#' \itemize{
#' \item{The Bayesian point estimate (column \emph{estimate}, which is by
#' default the posterior median; other statistics are also possible,
#' see argument \code{typical}).}
#' \item{
#' The standard error (which is actually the \emph{median absolute deviation}).
#' }
#' \item{
#' The HDI. Computation for HDI is based on the
#' code from Kruschke 2015, pp. 727f.
#' }
#' \item{
#' The Probability of Direction (pd), which is an index for "effect significance"
#' (see \cite{Makowski et al. 2019}). A value of 95\% or higher indicates a
#' "significant" (i.e. statistically clear) effect.
#' }
#' \item{
#' The effective number of samples, \emph{ESS}.
#' }
#' \item{
#' The Rhat statistics. When Rhat is above 1, it usually indicates that
#' the chain has not yet converged, indicating that the drawn samples
#' might not be trustworthy. Drawing more iterations may solve this issue.
#' }
#' \item{
#' The Monte Carlo standard error (see \code{\link{mcse}}). It is defined
#' as standard deviation of the chains divided by their effective sample
#' size and \dQuote{provides a quantitative suggestion of how big the
#' estimation noise is} (\emph{Kruschke 2015, p.187}).
#' }
#' }
#'
#' @references Kruschke JK. \emph{Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan} 2nd edition.
Academic Press, 2015 #' \cr \cr #' Gelman A, Carlin JB, Stern HS, Dunson DB, Vehtari A, Rubin DB. \emph{Bayesian data analysis} 3rd ed. Boca Raton: Chapman and Hall/CRC, 2013 #' \cr \cr #' Gelman A, Rubin DB. \emph{Inference from iterative simulation using multiple sequences} Statistical Science 1992;7: 457-511 #' \cr \cr #' Makowski D, Ben-Shachar MS, Lüdecke D. bayestestR: Describing Effects and their Uncertainty, Existence and Significance within the Bayesian Framework. Journal of Open Source Software 2019;4:1541. \doi{10.21105/joss.01541} #' \cr \cr #' McElreath R. \emph{Statistical Rethinking. A Bayesian Course with Examples in R and Stan} Chapman and Hall, 2015 #' #' @examples #' \dontrun{ #' if (require("rstanarm")) { #' fit <- stan_glm(mpg ~ wt + am, data = mtcars, chains = 1) #' tidy_stan(fit) #' tidy_stan(fit, prob = c(.89, .5)) #' }} #' #' @importFrom purrr map_dbl #' @importFrom stats mad formula sd #' @importFrom sjmisc is_empty trim seq_col typical_value #' @importFrom insight model_info get_parameters is_multivariate print_parameters #' @importFrom bayestestR hdi ci effective_sample mcse pd #' @export tidy_stan <- function(x, prob = .89, typical = "median", trans = NULL, effects = c("all", "fixed", "random"), component = c("all", "conditional", "zero_inflated", "zi"), digits = 2) { # only works for rstanarm- or brms-models if (!inherits(x, c("stanreg", "stanfit", "brmsfit"))) stop("`x` needs to be a stanreg- or brmsfit-object.", call. = F) .Deprecated("parameters::model_parameters()") # check arguments effects <- match.arg(effects) component <- match.arg(component) # get transformaton function if (!is.null(trans)) trans <- match.fun(trans) # family info faminfo <- insight::model_info(x) if (insight::is_multivariate(x)) { faminfo <- faminfo[[1]] } # get parameters ---- out.pars <- insight::get_parameters(x, effects = effects, component = component, parameters = "^(?!prior)") # compute HDI / ci ---- if (!is.null(trans)) { out.hdi <- bayestestR::ci(x, ci = prob, effects = effects, component = component) out.hdi$CI_low <- trans(out.hdi$CI_low) out.hdi$CI_high <- trans(out.hdi$CI_high) is_hdi <- FALSE } else { out.hdi <- bayestestR::hdi(x, ci = prob, effects = effects, component = component) is_hdi <- TRUE } out.hdi$CI_low <- sprintf("%.*f", digits, out.hdi$CI_low) out.hdi$CI_high <- sprintf("%.*f", digits, out.hdi$CI_high) # transform data frame for multiple ci-levels if (length(unique(out.hdi$CI)) > 1) { hdi_list <- lapply( split(out.hdi, out.hdi$CI, drop = FALSE), function(i) { .rename_ci_column(i, ifelse(is_hdi, "HDI", "CI")) }) hdi_frame <- Reduce(function(x, y) merge(x, y, all.y = TRUE, by = "Parameter"), hdi_list) to_remove <- string_starts_with("CI.", colnames(hdi_frame)) to_remove <- c(to_remove, string_starts_with("Component", colnames(hdi_frame))) to_remove <- c(to_remove, string_starts_with("Group", colnames(hdi_frame))) to_remove <- c(to_remove, string_starts_with("Effects", colnames(hdi_frame))) to_remove <- c(to_remove, string_starts_with("Response", colnames(hdi_frame))) to_remove <- c(to_remove, which(colnames(hdi_frame) == "CI")) out.hdi <- hdi_frame[, -to_remove, drop = FALSE] } else { out.hdi <- .rename_ci_column(out.hdi, ifelse(is_hdi, "HDI", "CI")) } remove <- which(colnames(out.hdi) == "CI" | grepl("^CI_", colnames(out.hdi))) out.hdi <- out.hdi[, -remove] # compute effective sample size ESS ---- out.ess <- bayestestR::effective_sample(x, effects = effects, component = component) # compute MCSE ---- out.mcse <- bayestestR::mcse(x, effects = effects, 
component = component) # compute Probability of Direction ---- out.pd <- bayestestR::pd(x, effects = effects, component = component, method = "direct") # compute RHat ---- out.rhat <- .rhat(x) # transform estimate, if requested if (!is.null(trans)) { all.cols <- sjmisc::seq_col(out.pars) simp.pars <- string_starts_with("simo_mo", colnames(out.pars)) if (!sjmisc::is_empty(simp.pars)) all.cols <- all.cols[-simp.pars] for (i in all.cols) out.pars[[i]] <- trans(out.pars[[i]]) } se.fun <- switch( typical, "median" = stats::mad, stats::sd ) out.parameters <- data.frame( Parameter = colnames(out.pars), Estimate = purrr::map_dbl(out.pars, ~ sjmisc::typical_value(.x, fun = typical)), Std.Error = purrr::map_dbl(out.pars, se.fun), stringsAsFactors = FALSE ) out <- insight::print_parameters(x, out.parameters, out.hdi, out.pd, out.ess, out.rhat, out.mcse) class(out) <- c("tidy_stan", class(out)) attr(out, "digits") <- digits out } .rename_ci_column <- function(x, col_name) { x$CI_low <- format(x$CI_low, justify = "right") x$CI_high <- format(x$CI_high, justify = "right") x$.ci <- sprintf("[%s, %s]", x$CI_low, x$CI_high) colnames(x)[ncol(x)] <- sprintf("%s(%g%%)", col_name, x$CI[1]) x } .rhat <- function(x) { if (!requireNamespace("rstan", quietly = TRUE)) stop("Package `rstan` is required. Please install it first.", call. = FALSE) if (inherits(x, "brmsfit")) x <- x$fit if (inherits(x, "stanfit")) { s <- rstan::summary(x)$summary } else if (inherits(x, "stanreg")) { s <- summary(x, pars = NULL, regex_pars = NULL) } data.frame( Parameter = make.names(rownames(s)), Rhat = s[, "Rhat"], stringsAsFactors = FALSE ) } sjstats/R/wtd_cor.R0000644000176200001440000000423313565520476013751 0ustar liggesusers#' @rdname weighted_sd #' @export weighted_correlation <- function(data, ...) { UseMethod("weighted_correlation") } #' @rdname weighted_sd #' @export weighted_correlation.default <- function(data, x, y, weights, ci.lvl = .95, ...) { if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) stop("'ci.lvl' must be a single number between 0 and 1") x.name <- deparse(substitute(x)) y.name <- deparse(substitute(y)) w.name <- deparse(substitute(weights)) if (w.name == "NULL") { w.name <- "weights" data$weights <- 1 } # create string with variable names vars <- c(x.name, y.name, w.name) # get data dat <- suppressMessages(dplyr::select(data, !! vars)) dat <- na.omit(dat) xv <- dat[[x.name]] yv <- dat[[y.name]] wv <- dat[[w.name]] weighted_correlation_helper(xv, yv, wv, ci.lvl) } #' @rdname weighted_sd #' @export weighted_correlation.formula <- function(formula, data, ci.lvl = .95, ...) { if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) stop("'ci.lvl' must be a single number between 0 and 1") vars <- all.vars(formula) if (length(vars) < 3) { vars <- c(vars, "weights") data$weights <- 1 } # get data dat <- suppressMessages(dplyr::select(data, !! 
vars))
  dat <- na.omit(dat)

  xv <- dat[[vars[1]]]
  yv <- dat[[vars[2]]]
  wv <- dat[[vars[3]]]

  weighted_correlation_helper(xv, yv, wv, ci.lvl)
}

#' @importFrom stats cor.test
weighted_correlation_helper <- function(xv, yv, wv, ci.lvl) {
  x <- xv - weighted_mean(xv, weights = wv)
  y <- yv - weighted_mean(yv, weights = wv)

  x <- x / weighted_sd(x, weights = wv)
  y <- y / weighted_sd(y, weights = wv)

  results <- stats::coef(summary(stats::lm(y ~ x, weights = wv)))[2, ]

  ci <- ci.lvl - ((1 - ci.lvl) / 2)
  ci <- results[1] + (stats::qnorm(ci) * c(-1, 1) * results[2])

  structure(
    class = "sj_wcor",
    list(
      estimate = results[1],
      method = "Pearson's Correlation Coefficient",
      p.value = results[4],
      ci = ci,
      ci.lvl = ci.lvl
    )
  )
}
sjstats/R/wtd_mean.R0000644000176200001440000000121713565520574014104 0ustar liggesusers#' @rdname weighted_sd
#' @export
weighted_mean <- function(x, weights = NULL) {
  UseMethod("weighted_mean")
}

#' @importFrom stats weighted.mean
#' @export
weighted_mean.default <- function(x, weights = NULL) {
  if (is.null(weights)) weights <- rep(1, length(x))
  stats::weighted.mean(x, w = weights, na.rm = TRUE)
}

#' @importFrom stats weighted.mean
#' @importFrom purrr map_dbl
#' @importFrom dplyr select_if
#' @export
weighted_mean.data.frame <- function(x, weights = NULL) {
  # default weights must have one entry per row (`length()` on a data
  # frame would return the number of columns, not rows)
  if (is.null(weights)) weights <- rep(1, nrow(x))
  dplyr::select_if(x, is.numeric) %>%
    purrr::map_dbl(~ weighted.mean(.x, w = weights))
}
sjstats/R/svy_scale_weights.R0000644000176200001440000001166313616514267016025 0ustar liggesusers#' @title Rescale design weights for multilevel analysis
#' @name scale_weights
#'
#' @description Most functions to fit multilevel and mixed effects models only
#' allow to specify frequency weights, but not design (i.e. sampling or probability)
#' weights, which should be used when analyzing complex samples and survey data.
#' \code{scale_weights()} implements an algorithm proposed by Asparouhov (2006)
#' and Carle (2009) to rescale design weights in survey data to account for
#' the grouping structure of multilevel models, which then can be used for
#' multilevel modelling.
#'
#' @param x A data frame.
#' @param cluster.id Variable indicating the grouping structure (strata) of
#' the survey data (level-2-cluster variable).
#' @param pweight Variable indicating the probability (design or sampling)
#' weights of the survey data (level-1-weight).
#'
#' @return \code{x}, with two new variables: \code{svywght_a} and \code{svywght_b},
#' which represent the rescaled design weights to use in multilevel models
#' (use these variables for the \code{weights} argument).
#'
#' @details Rescaling is based on two methods: For \code{svywght_a}, the sample
#' weights \code{pweight} are adjusted by a factor that represents the proportion
#' of cluster size divided by the sum of sampling weights within each cluster.
#' The adjustment factor for \code{svywght_b} is the sum of sample weights
#' within each cluster divided by the sum of squared sample weights within
#' each cluster (see Carle (2009), Appendix B).
#' \cr \cr
#' Regarding the choice between scaling methods A and B, Carle suggests
#' that "analysts who wish to discuss point estimates should report results
#' based on weighting method A. For analysts more interested in residual
#' between-cluster variance, method B may generally provide the least biased
#' estimates".
In general, it is recommended to fit a non-weighted model #' and weighted models with both scaling methods and when comparing the #' models, see whether the "inferential decisions converge", to gain #' confidence in the results. #' \cr \cr #' Though the bias of scaled weights decreases with increasing cluster size, #' method A is preferred when insufficient or low cluster size is a concern. #' \cr \cr #' The cluster ID and probably PSU may be used as random effects (e.g. #' nested design, or cluster and PSU as varying intercepts), depending #' on the survey design that should be mimicked. #' #' @references Carle AC. \emph{Fitting multilevel models in complex survey data with design weights: Recommendations} BMC Medical Research Methodology 2009, 9(49): 1-13 #' \cr \cr #' Asparouhov T. \emph{General Multi-Level Modeling with Sampling Weights} Communications in Statistics - Theory and Methods 2006, 35: 439-460 #' #' @examples #' data(nhanes_sample) #' scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) #' #' if (require("lme4")) { #' nhanes_sample <- scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) #' glmer( #' total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU), #' family = poisson(), #' data = nhanes_sample, #' weights = svywght_a #' ) #' } #' @importFrom dplyr group_by summarise n right_join enquo filter quo_name slice n #' @importFrom rlang .data #' @importFrom sjmisc is_empty #' @export scale_weights <- function(x, cluster.id, pweight) { message("This function will be removed in a future update. There's an enhanced implementation in 'parameters::rescale_weights()' that should be used instead.") # quote cluster.id and get name as string quo.id <- dplyr::enquo(cluster.id) id.name <- dplyr::quo_name(quo.id) # quote sample weights and get name as string quo.weights <- dplyr::enquo(pweight) pw.name <- dplyr::quo_name(quo.weights) # check if weight has missings. we need to remove them first, # and add back weights to correct cases later weight_missings <- which(is.na(x[[pw.name]])) weight_non_na <- which(!is.na(x[[pw.name]])) if (!sjmisc::is_empty(weight_missings)) { dummy_x <- dplyr::slice(x, !! weight_non_na) } else { dummy_x <- x } # copy data set, so we only append the two new weights tmp <- dummy_x tmp$s_q_w <- dummy_x[[pw.name]]^2 # compute sum of weights per cluster tmp <- tmp %>% dplyr::group_by(!! quo.id) %>% dplyr::summarise( sum_wij = sum(!! quo.weights), sum_sqwij = sum(.data$s_q_w), nj = dplyr::n() ) %>% dplyr::right_join(dummy_x, by = id.name) # multiply the original weight by the fraction of the # sampling unit total population based on Carle 2009 w_a <- tmp[[pw.name]] * tmp$nj / tmp$sum_wij w_b <- tmp[[pw.name]] * tmp$sum_wij / tmp$sum_sqwij x$svywght_a <- NA x$svywght_b <- NA x$svywght_a[weight_non_na] <- w_a x$svywght_b[weight_non_na] <- w_b x } sjstats/R/wtd_variance.R0000644000176200001440000000036313565520657014757 0ustar liggesusersweighted_variance <- function(x, w) { if (is.null(w)) w <- rep(1, length(x)) x[is.na(w)] <- NA w[is.na(x)] <- NA w <- na.omit(w) x <- na.omit(x) xbar <- sum(w * x) / sum(w) sum(w * ((x - xbar)^2)) / (sum(w) - 1) } sjstats/R/bootstrap.R0000644000176200001440000001050213616476363014323 0ustar liggesusers#' @title Generate nonparametric bootstrap replications #' @name bootstrap #' #' @description Generates \code{n} bootstrap samples of \code{data} and #' returns the bootstrapped data frames as list-variable. #' #' @param data A data frame. #' @param n Number of bootstraps to be generated. 
#' @param size Optional, size of the bootstrap samples. May either be a number #' between 1 and \code{nrow(data)} or a value between 0 and 1 to sample #' a proportion of observations from \code{data} (see 'Examples'). #' #' @return A data frame with one column: a list-variable #' \code{strap}, which contains resample-objects of class \code{sj_resample}. #' These resample-objects are lists with three elements: #' \enumerate{ #' \item the original data frame, \code{data} #' \item the row numbers \code{id}, i.e. row numbers of \code{data}, indicating the resampled rows with replacement #' \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap}) #' } #' #' @details By default, each bootstrap sample has the same number of observations #' as \code{data}. To generate bootstrap samples without resampling #' the same observations (i.e. sampling without replacement), use #' \code{size} to get bootstrapped data with a specific number #' of observations. However, specifying the \code{size}-argument is much #' less memory-efficient than the bootstrap with replacement. Hence, #' it is recommended to ignore the \code{size}-argument, if it is #' not really needed. #' #' @note This function applies nonparametric bootstrapping, i.e. the function #' draws samples with replacement. #' \cr \cr #' There is an \code{as.data.frame}- and a \code{print}-method to get or #' print the resampled data frames. See 'Examples'. The \code{as.data.frame}- #' method automatically applies whenever coercion is done because a data #' frame is required as input. See 'Examples' in \code{\link{boot_ci}}. #' #' #' @seealso \code{\link{boot_ci}} to calculate confidence intervals from #' bootstrap samples. #' #' @examples #' data(efc) #' bs <- bootstrap(efc, 5) #' #' # now run models for each bootstrapped sample #' lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x)) #' #' # generate bootstrap samples with 600 observations for each sample #' bs <- bootstrap(efc, 5, 600) #' #' # generate bootstrap samples with 70% observations of the original sample size #' bs <- bootstrap(efc, 5, .7) #' #' # compute standard error for a simple vector from bootstraps #' # use the `as.data.frame()`-method to get the resampled #' # data frame #' bs <- bootstrap(efc, 100) #' bs$c12hour <- unlist(lapply(bs$strap, function(x) { #' mean(as.data.frame(x)$c12hour, na.rm = TRUE) #' })) #' #' # or as tidyverse-approach #' if (require("dplyr") && require("purrr")) { #' bs <- efc %>% #' bootstrap(100) %>% #' mutate( #' c12hour = map_dbl(strap, ~mean(as.data.frame(.x)$c12hour, na.rm = TRUE)) #' ) #' #' # bootstrapped standard error #' boot_se(bs, c12hour) #' } #' @export bootstrap <- function(data, n, size) { if (!missing(size) && !is.null(size)) { # check for valid range if (size < 0 || size > nrow(data)) stop("`size` must be greater than 0, but not greater than the number of rows of `data`.", call.
= F) # check if we want proportions if (size < 1) size <- as.integer(nrow(data) * size) # generate bootstraps w/o replacement repl <- FALSE } else { # size = observations size <- nrow(data) # generate bootstraps with replacement repl <- TRUE } # generate bootstrap resamples strap <- replicate(n, resample(data, size, repl), simplify = FALSE) # add resample ID, may be used for other functions (like 'se()' for 'icc()') for (i in seq_len(length(strap))) strap[[i]]$resample.id <- i # return tibble data.frame(strap = I(strap)) } resample <- function(data, size, replace) { structure( class = "sj_resample", list( data = data, id = sample(nrow(data), size = size, replace = replace) )) } sjstats/R/wtd_sd.R0000644000176200001440000001142313616514224013562 0ustar liggesusers#' @title Weighted statistics for tests and variables #' @name weighted_sd #' @description \strong{Weighted statistics for variables} #' \cr \cr #' \code{weighted_sd()}, \code{weighted_se()}, \code{weighted_mean()} and \code{weighted_median()} #' compute weighted standard deviation, standard error, mean or median for a #' variable or for all variables of a data frame. \code{survey_median()} computes the #' median for a variable in a survey-design (see \code{\link[survey]{svydesign}}). #' \code{weighted_correlation()} computes a weighted correlation for a two-sided alternative #' hypothesis. #' \cr \cr #' \strong{Weighted tests} #' \cr \cr #' \code{weighted_ttest()} computes a weighted t-test, while \code{weighted_mannwhitney()} #' computes a weighted Mann-Whitney-U test or a Kruskal-Wallis test #' (for more than two groups). \code{weighted_chisqtest()} computes a weighted #' Chi-squared test for contingency tables. #' #' @param x (Numeric) vector or a data frame. For \code{survey_median()}, \code{weighted_ttest()}, #' \code{weighted_mannwhitney()} and \code{weighted_chisqtest()} the bare (unquoted) variable #' name, or a character vector with the variable name. #' @param weights Bare (unquoted) variable name, or a character vector with #' the variable name of the numeric vector of weights. If \code{weights = NULL}, #' an unweighted statistic is reported. #' @param data A data frame. #' @param formula A formula of the form \code{lhs ~ rhs1 + rhs2} where \code{lhs} is a #' numeric variable giving the data values and \code{rhs1} a factor with two #' levels giving the corresponding groups and \code{rhs2} a variable with weights. #' @param y Optional, bare (unquoted) variable name, or a character vector with #' the variable name. #' @param grp Bare (unquoted) name of the cross-classifying variable, where #' \code{x} is grouped into the categories represented by \code{grp}, #' or a character vector with the variable name. #' @param mu A number indicating the true value of the mean (or difference in #' means if you are performing a two sample test). #' @param ci.lvl Confidence level of the interval. #' @param alternative A character string specifying the alternative hypothesis, #' must be one of \code{"two.sided"} (default), \code{"greater"} or #' \code{"less"}. You can specify just the initial letter. #' @param paired Logical, whether to compute a paired t-test. #' @param ... For \code{weighted_ttest()} and \code{weighted_mannwhitney()}, currently not used. #' For \code{weighted_chisqtest()}, further arguments passed down to #' \code{\link[stats]{chisq.test}}. #' #' @inheritParams svyglm.nb #' #' @return The weighted (test) statistic. #' #' @note \code{weighted_chisqtest()} is a convenient wrapper for \code{\link{crosstable_statistics}}.
#' For a weighted one-way Anova, use \code{means_by_group()} with the #' \code{weights}-argument. #' \cr \cr #' \code{weighted_ttest()} assumes unequal variance between the two groups. #' #' @examples #' # weighted sd and se ---- #' #' weighted_sd(rnorm(n = 100, mean = 3), runif(n = 100)) #' #' data(efc) #' weighted_sd(efc[, 1:3], runif(n = nrow(efc))) #' weighted_se(efc[, 1:3], runif(n = nrow(efc))) #' #' # survey_median ---- #' #' # median for variables from weighted survey designs #' if (require("survey")) { #' data(nhanes_sample) #' #' des <- svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' #' survey_median(total, des) #' survey_median("total", des) #' } #' #' # weighted t-test ---- #' #' efc$weight <- abs(rnorm(nrow(efc), 1, .3)) #' weighted_ttest(efc, e17age, weights = weight) #' weighted_ttest(efc, e17age, c160age, weights = weight) #' weighted_ttest(e17age ~ e16sex + weight, efc) #' #' # weighted Mann-Whitney-U-test ---- #' #' weighted_mannwhitney(c12hour ~ c161sex + weight, efc) #' #' # weighted Chi-squared-test ---- #' #' weighted_chisqtest(efc, c161sex, e16sex, weights = weight, correct = FALSE) #' weighted_chisqtest(c172code ~ c161sex + weight, efc) #' @export weighted_sd <- function(x, weights = NULL) { UseMethod("weighted_sd") } #' @rdname weighted_sd #' @export wtd_sd <- weighted_sd #' @export weighted_sd.data.frame <- function(x, weights = NULL) { sd_result <- purrr::map_dbl(x, ~ sqrt(weighted_variance(.x, weights))) names(sd_result) <- colnames(x) sd_result } #' @export weighted_sd.matrix <- function(x, weights = NULL) { sd_result <- purrr::map_dbl(x, ~ sqrt(weighted_variance(.x, weights))) names(sd_result) <- colnames(x) sd_result } #' @export weighted_sd.default <- function(x, weights = NULL) { sqrt(weighted_variance(x, weights)) } sjstats/R/svy_median.R0000644000176200001440000000103713565521357014444 0ustar liggesusers#' @rdname weighted_sd #' @importFrom stats as.formula #' @export survey_median <- function(x, design) { # check if pkg survey is available if (!requireNamespace("survey", quietly = TRUE)) { stop("Package `survey` needed for this function to work. Please install it.", call. = FALSE) } # deparse v <- stats::as.formula(paste("~", as.character(substitute(x)))) as.vector( survey::svyquantile( v, design = design, quantiles = 0.5, ci = FALSE, na.rm = TRUE ) ) } sjstats/R/auto_prior.R0000644000176200001440000001253013563265750014471 0ustar liggesusers#' @title Create default priors for brms-models #' @name auto_prior #' #' @description This function creates default priors for brms-regression #' models, based on the same automatic prior-scale adjustment as in #' \pkg{rstanarm}. #' #' @param formula A formula describing the model, which just needs to contain #' the model terms, but no notation of interaction, splines etc. Usually, #' you want only those predictors in the formula, for which automatic #' priors should be generated. Add informative priors afterwards to the #' returned \code{brmsprior}-object. #' @param data The data that will be used to fit the model. #' @param gaussian Logical, if the outcome is gaussian or not. #' @param locations A numeric vector with location values for the priors. If #' \code{locations = NULL}, \code{0} is used as location parameter. #' #' @return A \code{brmsprior}-object.
#' #' @details \code{auto_prior()} is a small, convenient function to create #' some default priors for brms-models with automatically adjusted prior #' scales, in a similar way as \pkg{rstanarm} does. The default scale for #' the intercept is 10, for coefficients 2.5. If the outcome is gaussian, #' both scales are multiplied by \code{sd(y)}. Then, for categorical #' variables, nothing more is changed. For numeric variables, the scales #' are divided by the standard deviation of the related variable. #' \cr \cr #' All prior distributions are \emph{normal} distributions. \code{auto_prior()} #' is intended to quickly create default priors with feasible scales. If #' more precise definitions of priors are necessary, this needs to be done #' directly with brms-functions like \code{set_prior()}. #' #' @note As \code{auto_prior()} also sets priors on the intercept, the model #' formula used in \code{brms::brm()} must be rewritten to something like #' \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}. #' #' @examples #' library(sjmisc) #' data(efc) #' efc$c172code <- as.factor(efc$c172code) #' efc$c161sex <- to_label(efc$c161sex) #' #' mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) #' #' if (requireNamespace("brms", quietly = TRUE)) #' auto_prior(mf, efc, TRUE) #' #' ## compare to #' # library(rstanarm) #' # m <- stan_glm(mf, data = efc, chains = 2, iter = 200) #' # ps <- prior_summary(m) #' # ps$prior_intercept$adjusted_scale #' # ps$prior$adjusted_scale #' #' ## usage #' # ap <- auto_prior(mf, efc, TRUE) #' # brm(mf, data = efc, priors = ap) #' #' # add informative priors #' mf <- formula(neg_c_7 ~ c161sex + c172code) #' #' if (requireNamespace("brms", quietly = TRUE)) { #' auto_prior(mf, efc, TRUE) + #' brms::prior(normal(.1554, 40), class = "b", coef = "c160age") #' } #' #' # example with binary response #' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) #' mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) #' #' if (requireNamespace("brms", quietly = TRUE)) #' auto_prior(mf, efc, FALSE) #' #' @importFrom stats sd na.omit #' @importFrom dplyr select n_distinct #' @importFrom insight find_predictors find_response #' @export auto_prior <- function(formula, data, gaussian, locations = NULL) { if (!requireNamespace("brms", quietly = TRUE)) stop("Package `brms` required.", call. = FALSE) scale.b <- 2.5 scale.y <- 10 pred <- insight::find_predictors(formula, effects = "all", flatten = TRUE) y.name <- insight::find_response(formula, combine = TRUE) cols <- c(y.name, pred) data <- data %>% dplyr::select(!! cols) %>% stats::na.omit() %>% as.data.frame() y <- data[[y.name]] # check if response is binary if (missing(gaussian) && dplyr::n_distinct(y, na.rm = TRUE) == 2) gaussian <- FALSE if (isTRUE(gaussian) && dplyr::n_distinct(y, na.rm = TRUE) == 2) warning("Priors were calculated based on the assumption that the response is Gaussian, however it seems to be binary.", call. = F) if (gaussian) { scale.factor <- stats::sd(y, na.rm = TRUE) scale.b <- scale.b * scale.factor scale.y <- scale.y * scale.factor } if (!is.null(locations)) location.y <- locations[1] else location.y <- 0 priors <- brms::set_prior( sprintf("normal(%s, %s)", round(location.y, 2), round(scale.y, 2)), class = "Intercept" ) is.fac <- NULL term.names <- NULL scale.pred <- NULL # we need to check which predictors are categorical and then "mimic" #' their coefficient name as it is represented in the model (i.e.
variable # name + category name) for (i in pred) { f <- data[[i]] if (is.factor(f)) { i <- sprintf("%s%s", i, levels(f)[2:nlevels(f)]) is.fac <- c(is.fac, rep(TRUE, nlevels(f) - 1)) scale.pred <- c(scale.pred, rep(scale.b, nlevels(f) - 1)) } else { is.fac <- c(is.fac, FALSE) scale.pred <- c(scale.pred, scale.b / stats::sd(f, na.rm = TRUE)) } term.names <- c(term.names, i) } for (i in 1:length(term.names)) { if (!is.null(locations) && length(locations) >= (i + 1)) location.b <- locations[i + 1] else location.b <- 0 priors <- priors + brms::set_prior( sprintf("normal(%s, %s)", round(location.b, 2), round(scale.pred[i], 2)), class = "b", coef = term.names[i] ) } priors } sjstats/R/xtab_statistics.R0000644000176200001440000002327613565714117015525 0ustar liggesusers#' @title Measures of association for contingency tables #' @name crosstable_statistics #' #' @description This function calculates various measure of association for #' contingency tables and returns the statistic and p-value. #' Supported measures are Cramer's V, Phi, Spearman's rho, #' Kendall's tau and Pearson's r. #' #' @param data A data frame or a table object. If a table object, \code{x1} and #' \code{x2} will be ignored. For Kendall's \emph{tau}, Spearman's \emph{rho} #' or Pearson's product moment correlation coefficient, \code{data} needs #' to be a data frame. If \code{x1} and \code{x2} are not specified, #' the first two columns of the data frames are used as variables #' to compute the crosstab. #' @param formula A formula of the form \code{lhs ~ rhs} where \code{lhs} is a #' numeric variable giving the data values and \code{rhs} a factor giving the #' corresponding groups. #' @param tab A \code{\link{table}} or \code{\link[stats]{ftable}}. Tables of class #' \code{\link[stats]{xtabs}} and other will be coerced to \code{ftable} #' objects. #' @param x1 Name of first variable that should be used to compute the #' contingency table. If \code{data} is a table object, this argument #' will be irgnored. #' @param x2 Name of second variable that should be used to compute the #' contingency table. If \code{data} is a table object, this argument #' will be irgnored. #' @param statistics Name of measure of association that should be computed. May #' be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"}, #' \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'. #' @param ci.lvl Scalar between 0 and 1. If not \code{NULL}, returns a data #' frame including lower and upper confidence intervals. #' @param ... Other arguments, passed down to the statistic functions #' \code{\link[stats]{chisq.test}}, \code{\link[stats]{fisher.test}} or #' \code{\link[stats]{cor.test}}. #' #' @inheritParams means_by_group #' @inheritParams bootstrap #' @inheritParams boot_ci #' #' @return For \code{phi()}, the table's Phi value. For \code{cramer()}, the #' table's Cramer's V. 
#' \cr \cr #' For \code{crosstable_statistics()}, a list with the following components: #' \describe{ #' \item{\code{estimate}}{the value of the estimated measure of association.} #' \item{\code{p.value}}{the p-value for the test.} #' \item{\code{statistic}}{the value of the test statistic.} #' \item{\code{stat.name}}{the name of the test statistic.} #' \item{\code{stat.html}}{if applicable, the name of the test statistic, in HTML-format.} #' \item{\code{df}}{the degrees of freedom for the contingency table.} #' \item{\code{method}}{character string indicating the name of the measure of association.} #' \item{\code{method.html}}{if applicable, the name of the measure of association, in HTML-format.} #' \item{\code{method.short}}{the short form of the association measure, equals the \code{statistics}-argument.} #' \item{\code{fisher}}{logical, if Fisher's exact test was used to calculate the p-value.} #' } #' #' @details The p-values for Cramer's V and the Phi coefficient are based #' on \code{chisq.test()}. If any expected value of a table cell is #' smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()} #' is used to compute the p-value, unless \code{statistics = "fisher"}; in #' this case, the use of \code{fisher.test()} is forced to compute the #' p-value. The test statistic is calculated with \code{cramer()} or #' \code{phi()}, respectively. #' \cr \cr #' Both the test statistic and the p-value for Spearman's rho, Kendall's tau #' and Pearson's r are calculated with \code{cor.test()}. #' \cr \cr #' When \code{statistics = "auto"}, only Cramer's V or Phi are calculated, #' based on the dimension of the table (i.e. if the table has more than #' two rows or columns, Cramer's V is calculated, else Phi). #' #' @examples #' # Phi coefficient for 2x2 tables #' tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) #' phi(tab) #' #' # Cramer's V for nominal variables with more than 2 categories #' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) #' cramer(tab) #' #' # formula notation #' data(efc) #' cramer(e16sex ~ c161sex, data = efc) #' #' # bootstrapped confidence intervals #' cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100) #' #' # 2x2 table, compute Phi automatically #' crosstable_statistics(efc, e16sex, c161sex) #' #' # more dimensions than 2x2, compute Cramer's V automatically #' crosstable_statistics(efc, c172code, c161sex) #' #' # ordinal data, use Kendall's tau #' crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall") #' #' # calculate Spearman's rho, with continuity correction #' crosstable_statistics(efc, #' e42dep, #' quol_5, #' statistics = "spearman", #' exact = FALSE, #' continuity = TRUE #' ) #' #' @importFrom stats fisher.test chisq.test cor.test ftable #' @importFrom dplyr case_when #' @importFrom MASS loglm #' @importFrom rlang quo_name enquo #' @export crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ...)
{ # match arguments statistics <- match.arg(statistics) # name for test statistics in HTML stat.html <- NULL # check if data is a table if (!is.table(data)) { # evaluate unquoted names x1 <- deparse(substitute(x1)) x2 <- deparse(substitute(x2)) weights <- deparse(substitute(weights)) # if names were quotes, remove quotes x1 <- gsub("\"", "", x1, fixed = T) x2 <- gsub("\"", "", x2, fixed = T) weights <- gsub("\"", "", weights, fixed = T) if (sjmisc::is_empty(weights) || weights == "NULL") weights <- NULL else weights <- data[[weights]] # check for "NULL" and get data if (x1 != "NULL" && x2 != "NULL") data <- data[, c(x1, x2)] else data <- data[, 1:2] if (!is.null(weights)) data <- cbind(data, weights) # make table if (!is.null(weights)) { tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]]))) class(tab) <- "table" } else tab <- table(data) } else { # 'data' is a table - copy to table object tab <- data # check if statistics are possible to compute if (statistics %in% c("spearman", "kendall", "pearson")) { stop( sprintf( "Need arguments `data`, `x1` and `x2` to compute %s-statistics.", statistics ), call. = F ) } } # get expected values tab.val <- table_values(tab) # remember whether fisher's exact test was used or not use.fisher <- FALSE # select statistics automatically, based on number of rows/columns if (statistics %in% c("auto", "cramer", "phi", "fisher")) { # get chisq-statistics, for df and p-value chsq <- suppressWarnings(stats::chisq.test(tab, ...)) pv <- chsq$p.value test <- chsq$statistic # set statistics name names(test) <- "Chi-squared" stat.html <- "χ2" # check row/columns if ((nrow(tab) > 2 || ncol(tab) > 2 || statistics %in% c("cramer", "fisher")) && statistics != "phi") { # get cramer's V s <- cramer(tab) # if minimum expected values below 5, compute fisher's exact test if (statistics == "fisher" || min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) { pv <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)$p.value use.fisher <- TRUE } # set statistics statistics <- "cramer" } else { # get Phi s <- phi(tab) # if minimum expected values below 5 and df=1, compute fisher's exact test if (min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) { pv <- stats::fisher.test(tab, ...)$p.value use.fisher <- TRUE } # set statistics statistics <- "phi" } } else { # compute correlation coefficient cv <- stats::cor.test(x = data[[1]], y = data[[2]], method = statistics, ...) 
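  # for the correlation-based statistics (spearman, kendall, pearson),
  # weights are not passed to cor.test(), i.e. the coefficient and
  # p-value are computed from the unweighted data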
# get statistics and p-value s <- cv$estimate pv <- cv$p.value test <- cv$statistic stat.html <- names(test) } # compute method string method <- dplyr::case_when( statistics == "kendall" ~ "Kendall's tau", statistics == "spearman" ~ "Spearman's rho", statistics == "pearson" ~ "Pearson's r", statistics == "cramer" ~ "Cramer's V", statistics == "phi" ~ "Phi" ) # compute method string method.html <- dplyr::case_when( statistics == "kendall" ~ "Kendall's τ", statistics == "spearman" ~ "Spearman's ρ", statistics == "pearson" ~ "Pearson's r", statistics == "cramer" ~ "Cramer's V", statistics == "phi" ~ "φ" ) # return result structure(class = "sj_xtab_stat", list( estimate = s, p.value = pv, statistic = test, stat.name = names(test), stat.html = stat.html, df = (nrow(tab) - 1) * (ncol(tab) - 1), method = method, method.html = method.html, method.short = statistics, fisher = use.fisher )) } #' @rdname crosstable_statistics #' @export xtab_statistics <- crosstable_statistics sjstats/NEWS.md0000644000176200001440000004050113616514013013043 0ustar liggesusers# sjstats 0.17.9 ## Bug fixes * Fixed documentation for `chisq_gof()`. * Fixed issue in `anova_stats()` with incorrect effect sizes for certain Anova types (that included an intercept). # sjstats 0.17.8 ## Deprecated and defunct _sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. Therefore, following functions are now deprecated: * `cohens_f()`, please use `effectsize::cohens_f()`. * `std_beta()`, please use `effectsize::standardize_parameters()`. * `tidy_stan()`, please use `parameters::model_parameters()`. * `scale_weights()`, please use `parameters::rescale_weights()`. * `robust()`, please use `parameters::standard_error_robust()`. ## General * Functions for weighted statistics with prefix `wtd_*()` have been renamed to `weighted_*()`. * `svy_md()` was renamed to `survey_median()`. * `mannwhitney()` is an alias for `mwu()`. * `means_by_group()` is an alias for `grpmean()`. # sjstats 0.17.7 ## Deprecated and defunct _sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. The aim of **easystats** is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models. Therefore, following functions are now deprecated: * `p_value()`, please use `parameters::p_value()` * `se()`, please use `parameters::standard_error()` ## General * Revise some functions to cope with the forthcoming _insight_ update. # sjstats 0.17.6 ## General * Minor revisions to meet the changes in the forthcoming update from *tidyr*. * `design_effect()` is an alias for `deff()`. * `samplesize_mixed()` is an alias for `smpsize_lmm()`. * `crosstable_statistics()` is an alias for `xtab_statistics()`. ## New functions * `svyglm.zip()` to fit zero-inflated Poisson models for survey-designs. ## Changes to functions * `phi()` and `cramer()` can now compute confidence intervals. * `tidy_stan()` removes prior parameters from output. * `tidy_stan()` now also prints the probability of direction. ## Bug fixes * Fix bug with wrong computation in `odds_to_rr()`. # sjstats 0.17.5 ## New functions * `epsilon_sq()`, to compute epsilon-squared effect-size. ## Deprecated and defunct _sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. 
The aim of **easystats** is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models. Therefore, following functions are now deprecated: * `link_inverse()`, please use `insight::link_inverse()` * `model_family()`, please use `insight::model_info()` * `model_frame()`, please use `insight::get_data()` * `pred_vars()`, please use `insight::find_predictors()` * `re_grp_var()`, please use `insight::find_random()` * `grp_var()`, please use `insight::find_random()` * `resp_val()`, please use `insight::get_response()` * `resp_var()`, please use `insight::find_response()` * `var_names()`, please use `insight::clean_names()` * `overdisp()`, please use `performance::check_overdispersion()` * `zero_count()`, please use `performance::check_zeroinflation()` * `converge_ok()`, please use `performance::check_convergence()` * `is_singular()`, please use `performance::check_singularity()` * `reliab_test()`, please use `performance::item_reliability()` * `split_half()`, please use `performance::item_split_half()` * `predictive_accurarcy()`, please use `performance::performance_accuracy()` * `cronb()`, please use `performance::cronbachs_alpha()` * `difficulty()`, please use `performance::item_difficulty()` * `mic()`, please use `performance::item_intercor()` * `pca()`, please use `parameters::principal_components()` * `pca_rotate()`, please use `parameters::principal_components()` * `r2()`, please use `performance::r2()` * `icc()`, please use `performance::icc()` * `rmse()`, please use `performance::rmse()` * `rse()`, please use `performance::rse()` * `mse()`, please use `performance::mse()` * `hdi()`, please use `bayestestR::hdi()` * `cred_int()`, please use `bayestestR::ci()` * `rope()`, please use `bayestestR::rope()` * `n_eff()`, please use `bayestestR::effective_sample()` * `equi_test()`, please use `bayestestR::equivalence_test()` * `multicollin()`, please use `performance::check_collinearity()` * `normality()`, please use `performance::check_normality()` * `autocorrelation()`, please use `performance::check_autocorrelation()` * `heteroskedastic()`, please use `performance::check_heteroscedasticity()` * `outliers()`, please use `performance::check_outliers()` ## Changes to functions * Anova-stats functions (like `eta_sq()`) get a `method`-argument to define the method for computing confidence intervals from bootstrapping. ## Bug fixes * In some situations, `smpsize_lmm()` could result in negative sample-size recommendations. This was fixed, and a warning is now shown indicating that the parameters for the power-calculation should be modified. * Fixed issue with wrong calculated effect size `r` in `mwu()` if group-factor contained more than two groups. # sjstats 0.17.4 ## General * Following models/objects are now supported by model-information functions like `model_family()`, `link_inverse()` or `model_frame()`: `MixMod` (package **GLMMadaptive**), **MCMCglmm**, `mlogit` and `gmnl`. * Reduce package dependencies. ## New functions * `cred_int()`, to compute uncertainty intervals of Bayesian models. Mimics the behaviour and style of `hdi()` and is thus a convenient complement to functions like `posterior_interval()`. ## Changes to functions * `equi_test()` now finds better defaults for models with binomial outcome (like logistic regression models). * `r2()` for mixed models now also should work properly for mixed models fitted with **rstanarm**. * `anova_stats()` and alike (e.g. `eta_sq()`) now all preserve original term names. 
* `model_family()` now returns `$is_count = TRUE`, when model is a count-model, and `$is_beta = TRUE` for models with beta-family. * `pred_vars()` checks that the return value has only unique values. * `pred_vars()` gets a `zi`-argument to return the variables from a model's zero-inflation-formula. ## Bug fixes * Fix minor issues in `wtd_sd()` and `wtd_mean()` when weight was `NULL` (which usually shouldn't be the case anyway). * Fix potential issue with `deparse()`, cutting off very long formulas in various functions. * Fix encoding issues in help-files. # sjstats 0.17.3 ## General * Export `dplyr::n()`, to meet forthcoming changes in dplyr 0.8.0. ## Changes to functions * `boot_ci()` gets a `ci.lvl`-argument. * The `rotation`-argument in `pca_rotate()` now supports all rotations from `psych::principal()`. * `pred_vars()` gets a `fe.only`-argument to return only fixed effects terms from mixed models, and a `disp`-argument to return the variables from a model's dispersion-formula. * `icc()` for Bayesian models gets an `adjusted`-argument, to calculate adjusted and conditional ICC (however, only for Gaussian models). * For `icc()` for non-Gaussian Bayes-models, a message is printed that recommends setting argument `ppd` to `TRUE`. * `resp_val()` and `resp_var()` now also work for **brms**-models with additional response information (like `trial()` in formula). * `resp_var()` gets a `combine`-argument, to return either the name of the matrix-column or the original variable names for matrix-columns. * `model_frame()` now also returns the original variables for matrix-column-variables. * `model_frame()` now also returns the variable from the dispersion-formula of **glmmTMB**-models. * `model_family()` and `link_inverse()` now support **glmmPQL**, **felm** and **lm_robust**-models. * `anova_stats()` and similar functions (`omega_sq()` etc.) now support gam-models from package **gam**. * `p_value()` now supports objects of class `svyolr`. ## Bug fixes * Fix issue with `se()` and `get_re_var()` for objects returned by `icc()`. * Fix issue with `icc()` for Stan-models. * `var_names()` did not clear terms with log-log transformation, e.g. `log(log(y))`. * Fix issue in `model_frame()` for models with splines with only one column. # sjstats 0.17.2 ## General * Revised help-files for `r2()` and `icc()`, also by adding more references. ## New functions * `re_grp_var()` to find group factors of random effects in mixed models. ## Changes to functions * `omega_sq()` and `eta_sq()` give more informative messages when using non-supported objects. * `r2()` and `icc()` give more informative warnings and messages. * `tidy_stan()` supports printing simplex parameters of monotonic effects of **brms** models. * `grpmean()` and `mwu()` get a `file` and `encoding` argument, to save the HTML output as file. ## Bug fixes * `model_frame()` now correctly names the offset-columns for terms provided as `offset`-argument (i.e. for models where the offset was not specified inside the formula). * Fixed issue with `weights`-argument in `grpmean()` when variable name was passed as character vector. * Fixed issue with `r2()` for **glmmTMB** models with `ar1` random effects structure. # sjstats 0.17.1 ## New functions * `wtd_chisqtest()` to compute a weighted Chi-squared test. * `wtd_median()` to compute the weighted median of variables. * `wtd_cor()` to compute weighted correlation coefficients of variables. ## Changes to functions * `mediation()` can now cope with models from different families, e.g.
if the moderator or outcome is binary, while the treatment-effect is continuous. * `model_frame()`, `link_inverse()`, `pred_vars()`, `resp_var()`, `resp_val()`, `r2()` and `model_family()` now support `clm2`-objects from package **ordinal**. * `anova_stats()` gives a more informative message for non-supported models or ANOVA-options. ## Bug fixes * Fixed issue with `model_family()` and `link_inverse()` for models fitted with `pscl::hurdle()` or `pscl::zeroinfl()`. * Fixed issue with wrong title in `grpmean()` for grouped data frames, when grouping variable was an unlabelled factor. * Fix issue with `model_frame()` for **coxph**-models with polynomial or spline-terms. * Fix issue with `mediation()` for logical variables. # sjstats 0.17.0 ## General * Reduce package dependencies. ## New functions * `wtd_ttest()` to compute a weighted t-test. * `wtd_mwu()` to compute a weighted Mann-Whitney-U or Kruskal-Wallis test. ## Changes to functions * `robust()` was revised, getting more arguments to specify different types of covariance-matrix estimation, and handling these more flexible. * Improved `print()`-method for `tidy_stan()` for _brmsfit_-objects with categorical-families. * `se()` now also computes standard errors for relative frequencies (proportions) of a vector. * `r2()` now also computes r-squared values for _glmmTMB_-models from `genpois`-families. * `r2()` gives more precise warnings for non-supported model-families. * `xtab_statistics()` gets a `weights`-argument, to compute measures of association for contingency tables for weighted data. * The `statistics`-argument in `xtab_statistics()` gets a `"fisher"`-option, to force Fisher's Exact Test to be used. * Improved variance calculation in `icc()` for generalized linear mixed models with Poisson or negative binomial families. * `icc()` gets an `adjusted`-argument, to calculate the adjusted and conditional ICC for mixed models. * To get consistent argument names accross functions, argument `weight.by` is now deprecated and renamed into `weights`. ## Bug fixes * Fix issues with effect size computation for repeated-measure Anova when using bootstrapping to compute confidence intervals. * `grpmean()` now also adjusts the `n`-columm for weighted data. * `icc()`, `re_var()` and `get_re_var()` now correctly compute the random-effect-variances for models with multiple random slopes per random effect term (e.g., `(1 + rs1 + rs2 | grp)`). * Fix issues in `tidy_stan()`, `mcse()`, `hdi()` and `n_eff()` for `stan_polr()`-models. * Plotting `equi_test()` did not work for intercept-only models. # sjstats 0.16.0 ## General * The S3-generics for functions like `hdi()`, `rope()`, `equi_test()` etc. are now more generic, and function usage for each supported object is now included in the documentation. * Following functions are now S3-generic: `icc()`, `r2()`, `p_value()`, `se()`, and `std_beta()`. * Added `print()`-methods for some more functions, for a clearer output. * Revised `r2()` for mixed models (packages **lme4**, **glmmTMB**). The r-squared value should be much more precise now, and reports the marginal and conditional r-squared values. * Reduced package dependencies and removed _apaTables_ and _MBESS_ from suggested packages * `stanmvreg`-models are now supported by many functions. ## New functions * `binned_resid()` to plot binned residuals for logistic regression models. * `error_rate()` to compute model quality for logistic regression models. * `auto_prior()` to quickly create automatically adjusted priors for brms-models. 
* `difficulty()` to compute the item difficulty. ## Changes to functions * `icc()` gets a `ppd`-argument for Stan-models (*brmsfit* and *stanreg*), which performs a variance decomposition based on the posterior predictive distribution. This is the recommended way for non-Gaussian models. * For Stan-models (*brmsfit* and *stanreg*), `icc()` now also computes the HDI for the ICC and random-effect variances. Use the `prob`-argument to specify the limits of this interval. * `link_inverse()` and `model_family()` now support _clmm_-models (package *ordinal*) and _glmRob_ and _lmRob_-models (package *robust*). * `model_family()` gets a `multi.resp`-argument, to return a list of family-informations for multivariate-response models (of class `brmsfit` or `stanmvreg`). * `link_inverse()` gets a `multi.resp`-argument, to return a list of link-inverse-functions for multivariate-response models (of class `brmsfit` or `stanmvreg`). * `p_value()` now supports _rlm_-models (package *MASS*). * `check_assumptions()` for single models with `as.logical = FALSE` now has a nice print-method. * `eta_sq()` and `omega_sq()` now also work for repeated-measure Anovas, i.e. Anova with error term (requires broom > 0.4.5). ## Bug fixes * `model_frame()` and `var_names()` now correctly cleans nested patterns like `offset(log(x + 10))` from column names. * `model_frame()` now returns proper column names from _gamm4_ models. * `model_frame()` did not work when the model frame had spline-terms and weights. * Fix issue in `robust()` when `exponentiate = TRUE` and `conf.int = FALSE`. * `reliab_test()` returned an error when the provided data frame has less than three columns, instead of returning `NULL`. # sjstats 0.15.0 ## General * Added new Vignette _Statistics for Bayesian Models_. ## New functions * `equi_test()` to test if parameter values in Bayesian estimation should be accepted or rejected. * `mediation()` to print a summary of a mediation analysis from multivariate response models fitted with _brms_. ## Changes to functions * `link_inverse()` now also returns the link-inverse function for cumulative-family _brms_-models. * `model_family()` now also returns an `is_ordinal`-element with information if the model is ordinal resp. a cumulative link model. * Functions that access model information (like `model_family()`) now better support `vglm`-models (package _VGAM_). * `r2()` now also calculates the standard error for _brms_ or _stanreg_ models. * `r2()` gets a `loo`-argument to calculate LOO-adjusted rsquared values for _brms_ or _stanreg_ models. This measure comes conceptionally closer to an adjusted r-squared measure. * Effect sizes (`anova_stats()`, `eta_sq()` etc.) are now also computed for mixed models. * To avoid confusion, `n_eff()` now computes the number of effective samples, and no longer its ratio in relation to the total number of samples. * The column name for the ratio of the number of effective samples in `tidy_stan()` is now named *neff_ratio*, to avoid confusion. ## Bug fixes * Fixed issue in `se()` for `icc()`-objects, where random effect term could not be found. * Fixed issue in `se()` for `merMod`-objects. * Fixed issue in `p_value()` for mixed models with KR-approximation, which is now more accurate. 
sjstats/MD50000644000176200001440000001337113617050712012264 0ustar liggesusers1301befc070e7df6797f2cc38bfe1123 *DESCRIPTION f5d0a20bec0dc24adcd6bc9941d08ed2 *NAMESPACE 183b5cb780ced37ecf2f2301a5192dc0 *NEWS.md 346b6b3d5035547aca8af71585b41d65 *R/Deprecated.R 8cee91aacf9aa2bc3064775568510542 *R/S3-methods.R de49ba92f7dff1fbb419eca17e2653bd *R/anova_stats.R f07eed02567e501ebc291a4ebdbbd9b2 *R/auto_prior.R 02b748815816d7bb5fd8ed5c1828aaa0 *R/boot_ci.R 929d376c6d626882302a3d7e04060417 *R/bootstrap.R ea51177798ebdfd83ad092412b937b63 *R/confint_ncg.R d536ef944364404cd1eedc2bf6f3300d *R/cramer.R 490a5cfa14769c451d1728cc6279ef92 *R/cv.R 572a948ece805e4f15769713798c111d *R/cv_error.R b826d04f20826fbf7e12669244ca6e1f *R/design_effect.R 8231ebfe158dbfa38650b37167618e6b *R/epsilon_sq.R 16c3baee57c5180c5b369c0a4585eb95 *R/eta_sq.R 393c5a76c2a9592e1979b0c34d85d093 *R/find_beta.R 018629b9ceb9820b6e0be166adefbdf8 *R/gmd.R 2649cb20ef7ee2dd4f2de1a2af15c2e9 *R/gof.R bf073b637fa8fc7bd682d7a47bdd9768 *R/grpmean.R a7730239de9913b777aa3557a6fced12 *R/helpfunctions.R 1b9e99c02100ee536af9059f2a80a8d5 *R/inequ_trends.R dfccc2131f58c0a3641918a664039ad0 *R/is_prime.R 072cc67c0e694bbb3ebbb67dcf525096 *R/mean_n.R ce3fa7b4df12ec5c50aa4a444fc5fd89 *R/mediation.R c9bde8a5b6cfb500d72a792431d1d6de *R/mwu.R 29aa35fb05109929751b4c91ca249cbe *R/nhanes_sample.R 154f098b1b74a42cfcb44ecec4851f1b *R/odds_to_rr.R fda1c3b4203cc9a98e524d15a84b3eb5 *R/omega_sq.R 3937ad62a93f35083e24acd361618454 *R/partial_eta_sq_ci.R 63eccf7ae760a9fd1497fd374c168f16 *R/phi.R 1749d4363a249ba35a80ddeb35c52cd5 *R/prop.R dfc5bc76bc96881840ccdc1d20c0d28d *R/re-exports.R ec1e70b39f84ece79ba1bbc6e66c5b1e *R/samplesize_mixed.R 86af38a4c7f24e92bfa4156dd0be571b *R/se_ybar.R d6df9bca022b5be049b3a00a93103588 *R/select_helpers.R d792df143be0e996bbb0250b8040b4b0 *R/sjStatistics.R 9f085ad99cbf2a750a2983a2a24b8853 *R/svy_median.R 62a377ab72893e5c43d36b7bc7b597db *R/svy_scale_weights.R ff40d9acba8b23b58f79a64594b58656 *R/svyglmnb.R dc33faa5c79b0f3570d3cb735b64214c *R/svyglmzip.R b60ff0344885bc77ba3a9c2214ad2cf0 *R/tidy_stan.R d549884ec18b554f9c6580b889139352 *R/var_pop.R c2c15305243436e9e9a3e3267b6c9566 *R/weight.R 79ad3482e0d69dc811caf87f74949bc2 *R/wtd_chisqtest.R 27a862c6531a93dade159942c38d2d68 *R/wtd_cor.R 43e3310d2af50ef270842e050f3bb5c3 *R/wtd_mean.R a6e86670a86c94b5f1ba7cb75a696232 *R/wtd_median.R b80564c160b37f83d2902ec4d418c26f *R/wtd_mwu.R 94f373421163a88b3c38f36cb51ba044 *R/wtd_sd.R 553b33cbd7cb656ad3e3fe104f89c409 *R/wtd_se.R b4b6dab951e3e86d5b895c9dd0b64776 *R/wtd_ttest.R 941be6a71a50e541fbc4f868c3baf7e5 *R/wtd_variance.R 5a5f069af539c4f5ab76c2ca421ccc0d *R/xtab_statistics.R ceff9d385b6cb905fb4230b34e4c9e21 *README.md f03ff671758f39e742b81ca214620c10 *build/partial.rdb 40b9cdc0b610840b0cef64c84b13fd3b *build/vignette.rds 3172b22b3d87d0f86d78326bc26891fc *data/efc.RData e53dd52c7b8138f8fc573d713e2a5d78 *data/fish.RData ad3f18b79c24699a3c122d8b377b5028 *data/nhanes_sample.RData 87f8a6a5cb2f59e0b2110feef3926498 *inst/CITATION 87ab0d45bb9c28364777ca3040a3acd1 *inst/doc/anova-statistics.R f0dc04d03e34e7ca3f56cd931218ba38 *inst/doc/anova-statistics.Rmd f16775ead2860e4ca8e861f16584a9d1 *inst/doc/anova-statistics.html fcdf23532eca170659220c5cc25d09cd *inst/doc/bayesian-statistics.R 29f786780210151792acf916a8b17fbb *inst/doc/bayesian-statistics.Rmd f5d8a374141b67575c9d4d718e0964ac *inst/doc/bayesian-statistics.html f68860f1122e6a32d1ca7925cb73864e *inst/doc/mixedmodels-statistics.R 6ecb81462aba9d78416a69dfea231160 *inst/doc/mixedmodels-statistics.Rmd 
be55bcea40fe9d626af840ad5017c6ce *inst/doc/mixedmodels-statistics.html 3aec13c45b1db134d328251fc90c62e3 *man/auto_prior.Rd ee2f70a784bf2f64fe6b321611e78adf *man/boot_ci.Rd 059263f9a19bfa1bd80ef05a536137a2 *man/bootstrap.Rd 1baa9c02deed5d6cdf58ef8e0056367f *man/chisq_gof.Rd 190898720e6d9388c1f67c69e1672bd5 *man/crosstable_statistics.Rd 196e2c06b9db920496145dc5b6bf8262 *man/cv.Rd 2cb00c307543ac8e799df75227b37c6e *man/cv_error.Rd cc8c96d49a9ff5e7638c6bd7ba85f759 *man/design_effect.Rd 719afae75947c286c7848d6c9626c02d *man/efc.Rd fa331f326e409554c89254488aa41dc5 *man/eta_sq.Rd 00a3f8db1d678464d37705968d42286f *man/figures/logo.png 6e4050c7d457ebcdd164526512ff122b *man/find_beta.Rd 905e352012b658296bd16e395d0a5f7b *man/fish.Rd 87a6cf1c492ffa091e99a95a081841a8 *man/gmd.Rd ae3b5572ebfd84fd612e84f9e0add80e *man/inequ_trend.Rd dd8cf1ee0d3efa4a89706be8804764b5 *man/is_prime.Rd 1aa949852b5c1efbc383a7325af19f97 *man/mean_n.Rd 860e56fd7181acf1ca3db6f3c67a969a *man/means_by_group.Rd 5a0724fa89b3607195892eca0680a314 *man/mediation.Rd b163fa3f5ba0da21c47f36a0c718fd88 *man/mwu.Rd 56194e97259a636ddc45a645824c90a5 *man/nhanes_sample.Rd acd0c50f1393e5e31b8a20f80da38df9 *man/odds_to_rr.Rd 5b80e02dca6aa3fb93423f58b996092a *man/prop.Rd 3f8480e693512454473a259efe160580 *man/r2.Rd 78e6694489620e29f3612bb292cabcb1 *man/reexports.Rd 6ca1a43a6b25052477035f0ff814b2ac *man/samplesize_mixed.Rd 38dad435d260efc69ded7126fce3040c *man/scale_weights.Rd be0a6f71acacea063d7cfed512a5ccf7 *man/se_ybar.Rd 558e607b27a9fbcbd3d74b90395e3d01 *man/sjstats-package.Rd 1db3c7eb2b7084b907ff4517ed2805e0 *man/svyglm.nb.Rd 5fe0f19e41fe75056a5504251c34549b *man/svyglm.zip.Rd f30a70c9da7de433db7fad6cb9917cea *man/table_values.Rd 4ca50e307ab5cd5f2d557a1bfe586d85 *man/tidy_stan.Rd d4d7e8473a71fe077b9622eb976af019 *man/var_pop.Rd c152a771236fc6a552193935e3de8b94 *man/weight.Rd 66b075762a52b810ed33cb6e198a5d14 *man/weighted_sd.Rd 2db0f4e0dfa8eb2416dc74ed964a588f *tests/testthat.R bfe54c8e3e34765d498ad765dd657fdf *tests/testthat/test-anova_stats.R 888d793d09df5a1760e6a06f60661158 *tests/testthat/test-autoprior.R 874a31ce9ad4de626f6cf0c81399b939 *tests/testthat/test-grpmean.R 9c2529923e1214c6d53cb5f1d813eb21 *tests/testthat/test-wtd.R f0dc04d03e34e7ca3f56cd931218ba38 *vignettes/anova-statistics.Rmd 29f786780210151792acf916a8b17fbb *vignettes/bayesian-statistics.Rmd 6ecb81462aba9d78416a69dfea231160 *vignettes/mixedmodels-statistics.Rmd sjstats/inst/0000755000176200001440000000000013616771064012735 5ustar liggesuserssjstats/inst/doc/0000755000176200001440000000000013616771064013502 5ustar liggesuserssjstats/inst/doc/mixedmodels-statistics.Rmd0000644000176200001440000001340713616476363020661 0ustar liggesusers--- title: "Statistics for Mixed Effects Models" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Statistics for Mixed Effects Models} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r set-options, echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) options(width = 800) if (!requireNamespace("lme4", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } ``` # Statistics and Measures for Mixed Effects Models This vignettes demontrates those functions of the *sjstats*-package that deal especially with mixed effects models. 
*sjstats* provides the following functions: * `design_effect()` and `samplesize_mixed()` * `scale_weights()` Before we start, we fit a simple linear mixed model: ```{r} library(sjstats) library(lme4) # load sample data data(sleepstudy) # fit linear mixed model m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy) set.seed(2018) sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy) ``` ## Sample Size Calculation for Mixed Models The first two functions, `design_effect()` and `samplesize_mixed()`, can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward; for (linear) mixed models, however, statistical power is affected by the change in the variance of the test statistics. This is what _Hsieh et al. (2003)_ call a _design effect_ (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly. ### Design Effect for Two-Level Mixed Models `design_effect()` computes this design effect for linear mixed models with a two-level design. It requires the approximated average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel-model. Typically, the minimum assumed value for the ICC is _0.05_. ```{r} # Design effect for two-level model with 30 observations per # cluster group (level-2 unit) and an assumed intraclass # correlation coefficient of 0.05. design_effect(n = 30) # Design effect for two-level model with 24 observations per cluster # group and an assumed intraclass correlation coefficient of 0.2. design_effect(n = 24, icc = 0.2) ``` ### Calculating the Sample Size for Linear Mixed Models `samplesize_mixed()` combines the functions for power calculation from the **pwr**-package with the design effect from `design_effect()`. It computes an approximated sample size for linear mixed models (two-level designs), based on the power calculation for a standard design, adjusted for the design effect of two-level designs. ```{r} # Sample size for multilevel model with 30 cluster groups and a small to # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and # hence a total sample size of about 802 observations is needed. samplesize_mixed(eff.size = .3, k = 30) # Sample size for multilevel model with 20 cluster groups and a medium # to large effect size for linear models of 0.2. Five subjects per cluster and # hence a total sample size of about 107 observations is needed. samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) ``` There are more ways to perform power calculations for multilevel models; however, most of these require very detailed knowledge about the sample characteristics and involve simulation studies. `samplesize_mixed()` is a more pragmatic alternative to these approaches. ## Rescale model weights for complex samples Most functions to fit multilevel and mixed effects models only allow specifying frequency weights, but not design (i.e. _sampling_ or _probability_) weights, which should be used when analyzing complex samples and survey data. `scale_weights()` implements an algorithm proposed by _Asparouhov (2006)_ and _Carle (2009)_ to rescale design weights in survey data to account for the grouping structure of multilevel models, which then can be used for multilevel modelling.
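Conceptually, the rescaling is a simple group-wise adjustment. The following sketch is *not* the actual implementation, but illustrates the two factors described in _Carle (2009)_, assuming a data frame `d` with hypothetical columns `id` (cluster variable) and `pw` (design weight):

```{r eval=FALSE}
# minimal sketch of both scaling methods (hypothetical column names)
library(dplyr)
d %>%
  group_by(id) %>%
  mutate(
    svywght_a = pw * n() / sum(pw),       # method A: cluster size / sum of weights
    svywght_b = pw * sum(pw) / sum(pw^2)  # method B: sum of weights / sum of squared weights
  ) %>%
  ungroup()
```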
To calculate a weight-vector that can be used in multilevel models, `scale_weights()` needs the data frame with survey data as `x`-argument. This data frame should contain 1) a _cluster ID_ (argument `cluster.id`), which represents the _strata_ of the survey data (the level-2-cluster variable) and 2) the probability weights (argument `pweight`), which represent the design or sampling weights of the survey data (level-1-weight). `scale_weights()` then returns the original data frame, including two new variables: `svywght_a`, where the sample weights `pweight` are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for `svywght_b` is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see _Carle (2009)_, Appendix B, for details). ```{r} data(nhanes_sample) scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) ``` # References Asparouhov T. 2006. _General Multi-Level Modeling with Sampling Weights._ Communications in Statistics—Theory and Methods (35): 439–460 Carle AC. 2009. _Fitting multilevel models in complex survey data with design weights: Recommendations._ BMC Medical Research Methodology 9(49): 1-13 Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. _An Overview of Variance Inflation Factors for Sample-Size Calculation._ Evaluation & the Health Professions 26: 239–257. doi: [10.1177/0163278703255230](http://doi.org/10.1177/0163278703255230) sjstats/inst/doc/bayesian-statistics.Rmd0000644000176200001440000000746613616476363020142 0ustar liggesusers--- title: "Summary of Mediation Analysis using Bayesian Regression Models" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette --- ```{r, SETTINGS-knitr, echo = FALSE, warning = FALSE, message = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE ) options(width = 800) if (!requireNamespace("mediation", quietly = TRUE) || !requireNamespace("httr", quietly = TRUE) || !requireNamespace("brms", quietly = TRUE) || !requireNamespace("insight", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } ``` This vignette demonstrates the `mediation()`-function in **sjstats**. Before we start, we fit some models, including a mediation-object from the _mediation_-package, which we use for comparison with _brms_. ```{r} library(sjstats) library(mediation) library(brms) # load sample data data(jobs) set.seed(123) # linear models, for mediation analysis b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs) b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs) # mediation analysis, for comparison with brms m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek") ``` ```{r eval=FALSE} # Fit Bayesian mediation model f1 <- bf(job_seek ~ treat + econ_hard + sex + age) f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age) m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4) ``` ```{r echo=FALSE} m2 <- insight::download_model("brms_mv_6") ``` `mediation()` is a summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects. In the model _m2_, _treat_ is the treatment effect, *job_seek* is the mediator effect, _f1_ describes the mediator model and _f2_ describes the outcome model.
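The indirect effect summarized below can also be sketched by hand from the posterior draws. Note that the parameter names in this sketch are hypothetical (brms sanitizes response names, so check the actual names in your fit, e.g. with `parnames(m2)`):

```{r eval=FALSE}
# rough sketch: indirect effect as the product of the posterior draws of the
# treatment effect (mediator model) and the mediator effect (outcome model);
# parameter names below are hypothetical
post <- brms::posterior_samples(m2)
indirect <- post[["b_jobseek_treat"]] * post[["b_depress2_job_seek"]]
median(indirect)
```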
`mediation()` returns a data frame with information on the _direct effect_ (median value of posterior samples from treatment of the outcome model), _mediator effect_ (median value of posterior samples from mediator of the outcome model), _indirect effect_ (median value of the multiplication of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the _total effect_ (median value of sums of posterior samples used for the direct and indirect effect). The _proportion mediated_ is the indirect effect divided by the total effect. The simplest call just needs the model-object. ```{r, message=TRUE} mediation(m2) ``` Typically, `mediation()` finds the treatment and mediator variables automatically. If this does not work, use the `treatment` and `mediator` arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use `prob` to calculate a different interval. Here is a comparison with the _mediation_ package. Note that the `summary()`-output of the _mediation_ package shows the indirect effect first, followed by the direct effect. ```{r} summary(m1) mediation(m2, prob = .95) ``` If you want to calculate mean instead of median values from the posterior samples, use the `typical`-argument. Furthermore, there is a `print()`-method, which allows printing more digits. ```{r, message=TRUE} mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4) ``` As you can see, the results are similar to what the _mediation_ package produces for non-Bayesian models. # References Bürkner, P. C. (2017). brms: An R package for Bayesian multilevel models using Stan. Journal of Statistical Software, 80(1), 1-28 sjstats/inst/doc/anova-statistics.html0000644000176200001440000005610613616771053017672 0ustar liggesusers Statistics for Anova Tables

Statistics for Anova Tables

Daniel Lüdecke

2020-02-06

Effect Size Statistics for Anova Tables

This vignette demonstrates those functions of the sjstats-package that deal with Anova tables. These functions report different effect size measures, which are useful beyond significance tests (p-values), because they estimate the magnitude of effects independent of sample size. sjstats provides the following functions:

  • eta_sq()
  • omega_sq()
  • epsilon_sq()
  • anova_stats()

Before we start, we fit a simple model:

library(sjstats)
# load sample data
data(efc)

# fit linear model
fit <- aov(
  c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
  data = efc
)

All functions accept objects of class aov or anova, so you can also use model fits from the car package, which allows fitting Anovas with different types of sums of squares. Other objects, like lm, will be coerced to anova internally.

The following functions return the effect size statistic along with the model’s term names.

Eta-Squared

Eta-squared is the proportion of the total variability in the dependent variable that is accounted for by the variation in the independent variable. It is the ratio of the sum of squares for each group level to the total sum of squares. It can be interpreted as the percentage of variance accounted for by a variable.

For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient r. For variables with more than 1 degree of freedom, eta-squared equals R2. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses.

Eta-squared can be computed simply with:

eta_sq(fit)
#>                  term etasq
#> 1   as.factor(e42dep) 0.266
#> 2 as.factor(c172code) 0.005
#> 3             c160age 0.048
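
To see where these numbers come from, here is a minimal base-R sketch (an illustration assuming the fit object from above, not the package's internal code):

# eta-squared by hand: per-term sum of squares divided by total sum of squares
a <- anova(fit)
ss <- a[["Sum Sq"]]
setNames(ss[-length(ss)] / sum(ss), rownames(a)[-nrow(a)])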

Partial Eta-Squared

The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared.

Use the partial-argument to compute partial eta-squared values:

eta_sq(fit, partial = TRUE)
#>                  term partial.etasq
#> 1   as.factor(e42dep)         0.281
#> 2 as.factor(c172code)         0.008
#> 3             c160age         0.066
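
The partial variant follows from the same table; again a minimal sketch, assuming fit from above:

# partial eta-squared by hand: SS_effect / (SS_effect + SS_residual)
a <- anova(fit)
ss <- a[["Sum Sq"]]
ss_resid <- ss[length(ss)]
setNames(ss[-length(ss)] / (ss[-length(ss)] + ss_resid), rownames(a)[-nrow(a)])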

Omega-Squared

While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias.

Omega-squared can be simply computed with:

omega_sq(fit)
#>                  term omegasq
#> 1   as.factor(e42dep)   0.263
#> 2 as.factor(c172code)   0.004
#> 3             c160age   0.048
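
The correction subtracts each term's expected chance contribution from its sum of squares. A minimal sketch of the textbook formula, assuming fit from above:

# omega-squared by hand:
# (SS_effect - df_effect * MS_error) / (SS_total + MS_error)
a <- anova(fit)
ms_error <- a["Residuals", "Mean Sq"]
idx <- rownames(a) != "Residuals"
(a[idx, "Sum Sq"] - a[idx, "Df"] * ms_error) / (sum(a[["Sum Sq"]]) + ms_error)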

Partial Omega-Squared

omega_sq() also has a partial-argument to compute partial omega-squared values. Computing the partial omega-squared statistics is based on bootstrapping. In this case, use n to define the number of samples (1000 by default).

omega_sq(fit, partial = TRUE, n = 100)
#>                  term partial.omegasq
#> 1   as.factor(e42dep)           0.278
#> 2 as.factor(c172code)           0.005
#> 3             c160age           0.065

Epsilon Squared

Epsilon-squared is a less common measure of effect size. It is sometimes considered an “adjusted r-squared” value. You can compute this effect size using epsilon_sq().

epsilon_sq(fit)
#>                  term epsilonsq
#> 1   as.factor(e42dep)     0.264
#> 2 as.factor(c172code)     0.004
#> 3             c160age     0.048
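
Its textbook formula differs from omega-squared only in the denominator; a minimal sketch, assuming fit from above:

# epsilon-squared by hand: (SS_effect - df_effect * MS_error) / SS_total
a <- anova(fit)
ms_error <- a["Residuals", "Mean Sq"]
idx <- rownames(a) != "Residuals"
(a[idx, "Sum Sq"] - a[idx, "Df"] * ms_error) / sum(a[["Sum Sq"]])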

When the ci.lvl-argument is defined, bootstrapping is used to compute the confidence intervals.

epsilon_sq(fit, ci.lvl = .95, n = 100)
#>                  term epsilonsq conf.low conf.high
#> 1   as.factor(e42dep)     0.264    0.215     0.322
#> 2 as.factor(c172code)     0.004   -0.004     0.014
#> 3             c160age     0.048    0.022     0.071

Complete Statistical Table Output

The anova_stats() function takes a model input and computes a comprehensive summary, including the above effect size measures, returned as a tidy data frame:

anova_stats(fit)
#>                  term  df      sumsq     meansq statistic p.value etasq partial.etasq omegasq partial.omegasq epsilonsq cohens.f power
#> 1   as.factor(e42dep)   3  577756.33 192585.444   108.786   0.000 0.266         0.281   0.263           0.278     0.264    0.626  1.00
#> 2 as.factor(c172code)   2   11722.05   5861.024     3.311   0.037 0.005         0.008   0.004           0.005     0.004    0.089  0.63
#> 3             c160age   1  105169.60 105169.595    59.408   0.000 0.048         0.066   0.048           0.065     0.048    0.267  1.00
#> 4           Residuals 834 1476436.34   1770.307        NA      NA    NA            NA      NA              NA        NA       NA    NA

Like the other functions, the input may also be an object of class anova, so you can also use model fits from the car package, which allows fitting Anovas with different types of sums of squares:

anova_stats(car::Anova(fit, type = 3))
#>                  term       sumsq     meansq  df statistic p.value etasq partial.etasq omegasq partial.omegasq epsilonsq cohens.f power
#> 1   as.factor(e42dep)  426461.571 142153.857   3    80.299   0.000 0.212         0.224   0.209           0.221     0.209    0.537 1.000
#> 2 as.factor(c172code)    7352.049   3676.025   2     2.076   0.126 0.004         0.005   0.002           0.003     0.002    0.071 0.429
#> 3             c160age  105169.595 105169.595   1    59.408   0.000 0.052         0.066   0.051           0.065     0.051    0.267 1.000
#> 4           Residuals 1476436.343   1770.307 834        NA      NA    NA            NA      NA              NA        NA       NA    NA

Confidence Intervals

eta_sq() and omega_sq() have a ci.lvl-argument, which - if not NULL - also computes a confidence interval.

For eta-squared, i.e. eta_sq() with partial = FALSE, confidence intervals are based on bootstrap methods, due to the non-symmetry of the sampling distribution. Confidence intervals for partial omega-squared, i.e. omega_sq() with partial = TRUE, are also based on bootstrapping. In these cases, n indicates the number of bootstrap samples to be drawn to compute the confidence intervals.

eta_sq(fit, partial = TRUE, ci.lvl = .8)
#>                  term partial.etasq conf.low conf.high
#> 1   as.factor(e42dep)         0.281    0.247     0.310
#> 2 as.factor(c172code)         0.008    0.001     0.016
#> 3             c160age         0.066    0.047     0.089

# uses bootstrapping - here, for speed, just 100 samples
omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100)
#>                  term partial.omegasq conf.low conf.high
#> 1   as.factor(e42dep)           0.278    0.236     0.320
#> 2 as.factor(c172code)           0.005   -0.003     0.020
#> 3             c160age           0.065    0.035     0.096
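
The bootstrap idea behind these intervals can be sketched in a few lines: resample rows, refit, recompute the statistic, and take quantiles. This is an illustration of the principle, assuming efc and the model formula from above, not the package's exact resampling code:

set.seed(123)
boot_eta <- replicate(100, {
  d <- efc[sample(nrow(efc), replace = TRUE), ]
  a <- anova(aov(c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = d))
  a[["Sum Sq"]][1] / sum(a[["Sum Sq"]])  # eta-squared for the first term
})
quantile(boot_eta, probs = c(0.1, 0.9))  # 80% interval for as.factor(e42dep)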

References

Levine TR, Hullett CR. Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. Human Communication Research 28(4); 2002: 612-625

sjstats/inst/doc/anova-statistics.Rmd0000644000176200001440000001221313565522642017440 0ustar liggesusers--- title: "Statistics for Anova Tables" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Statistics for Anova Tables} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r set-options, echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) options(width = 800) ``` # Effect Size Statistics for Anova Tables This vignette demonstrates those functions of the *sjstats*-package that deal with Anova tables. These functions report different effect size measures, which are useful beyond significance tests (p-values), because they estimate the magnitude of effects independent of sample size. *sjstats* provides the following functions: * `eta_sq()` * `omega_sq()` * `epsilon_sq()` * `anova_stats()` Before we start, we fit a simple model: ```{r} library(sjstats) # load sample data data(efc) # fit linear model fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) ``` All functions accept objects of class `aov` or `anova`, so you can also use model fits from the *car* package, which allows fitting Anovas with different types of sums of squares. Other objects, like `lm`, will be coerced to `anova` internally. The following functions return the effect size statistic along with the model's term names. ## Eta-Squared Eta-squared is the proportion of the total variability in the dependent variable that is accounted for by the variation in the independent variable. It is the ratio of the sum of squares for each group level to the total sum of squares. It can be interpreted as the percentage of variance accounted for by a variable. For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient _r_. For variables with more than 1 degree of freedom, eta-squared equals _R2_. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses. Eta-squared can be computed simply with: ```{r} eta_sq(fit) ``` ## Partial Eta-Squared The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared. Use the `partial`-argument to compute partial eta-squared values: ```{r} eta_sq(fit, partial = TRUE) ``` ## Omega-Squared While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias. Omega-squared can be simply computed with: ```{r} omega_sq(fit) ``` ## Partial Omega-Squared `omega_sq()` also has a `partial`-argument to compute partial omega-squared values. Computing the partial omega-squared statistics is based on bootstrapping. In this case, use `n` to define the number of samples (1000 by default).
```{r} omega_sq(fit, partial = TRUE, n = 100) ``` ## Epsilon Squared Epsilon-squared is a less common measure of effect size. It is sometimes considered an "adjusted r-squared" value. You can compute this effect size using `epsilon_sq()`. ```{r} epsilon_sq(fit) ``` When the `ci.lvl`-argument is defined, bootstrapping is used to compute the confidence intervals. ```{r} epsilon_sq(fit, ci.lvl = .95, n = 100) ``` ## Complete Statistical Table Output The `anova_stats()` function takes a model input and computes a comprehensive summary, including the above effect size measures, returned as a tidy data frame: ```{r} anova_stats(fit) ``` Like the other functions, the input may also be an object of class `anova`, so you can also use model fits from the *car* package, which allows fitting Anovas with different types of sums of squares: ```{r} anova_stats(car::Anova(fit, type = 3)) ``` ## Confidence Intervals `eta_sq()` and `omega_sq()` have a `ci.lvl`-argument, which - if not `NULL` - also computes a confidence interval. For eta-squared, i.e. `eta_sq()` with `partial = FALSE`, confidence intervals are based on bootstrap methods, due to the non-symmetry of the sampling distribution. Confidence intervals for partial omega-squared, i.e. `omega_sq()` with `partial = TRUE`, are also based on bootstrapping. In these cases, `n` indicates the number of bootstrap samples to be drawn to compute the confidence intervals. ```{r} eta_sq(fit, partial = TRUE, ci.lvl = .8) # uses bootstrapping - here, for speed, just 100 samples omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100) ``` # References Levine TR, Hullett CR. Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. Human Communication Research 28(4); 2002: 612-625 sjstats/inst/doc/bayesian-statistics.html0000644000176200001440000005214513616771063020355 0ustar liggesusers Summary of Mediation Analysis using Bayesian Regression Models

Summary of Mediation Analysis using Bayesian Regression Models

Daniel Lüdecke

2020-02-06

This vignette demonstrates the mediation()-function in sjstats. Before we start, we fit some models, including a mediation-object from the mediation-package, which we use for comparison with brms.

library(sjstats)
library(mediation)
library(brms)

# load sample data
data(jobs)
set.seed(123)

# linear models, for mediation analysis
b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs)
b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs)

# mediation analysis, for comparison with brms
m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek")
# Fit Bayesian mediation model
f1 <- bf(job_seek ~ treat + econ_hard + sex + age)
f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age)

m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)

mediation() is a summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects.

In the model m2, treat is the treatment effect, job_seek is the mediator effect, f1 describes the mediator model and f2 describes the outcome model.

mediation() returns a data frame with information on the direct effect (median value of posterior samples from treatment of the outcome model), mediator effect (median value of posterior samples from mediator of the outcome model), indirect effect (median value of the product of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the total effect (median value of the sums of posterior samples used for the direct and indirect effect). The proportion mediated is the indirect effect divided by the total effect.

The simplest call just needs the model-object.

mediation(m2)
#> 
#> # Causal Mediation Analysis for Stan Model
#> 
#>   Treatment: treat
#>    Mediator: job_seek
#>    Response: depress2
#> 
#>                  Estimate    HDI (90%)
#>   Direct effect:    -0.04 [-0.11 0.03]
#> Indirect effect:    -0.02 [-0.04 0.00]
#>    Total effect:    -0.05 [-0.13 0.02]
#> 
#> Proportion mediated: 28.14% [-79.57% 135.86%]
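
To make these definitions concrete, here is a hedged sketch that derives the same quantities directly from the posterior draws. The parameter names below (b_jobseek_treat, b_depress2_jobseek, b_depress2_treat) are assumptions about how brms labels the coefficients of this multivariate model; check them with parnames(m2) before relying on them:

post <- posterior_samples(m2)  # one row per draw, one column per parameter
direct   <- post$b_depress2_treat                           # treat -> depress2
indirect <- post$b_jobseek_treat * post$b_depress2_jobseek  # treat -> job_seek -> depress2
median(direct)
median(indirect)
median(direct + indirect)                     # total effect
median(indirect) / median(direct + indirect)  # proportion mediated (one common definition)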

Typically, mediation() finds the treatment and mediator variables automatically. If this does not work, use the treatment and mediator arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use prob to calculate a different interval.

Here is a comparison with the mediation package. Note that the summary()-output of the mediation package shows the indirect effect first, followed by the direct effect.

summary(m1)
#> 
#> Causal Mediation Analysis 
#> 
#> Quasi-Bayesian Confidence Intervals
#> 
#>                Estimate 95% CI Lower 95% CI Upper p-value
#> ACME            -0.0157      -0.0387         0.01    0.19
#> ADE             -0.0438      -0.1315         0.04    0.35
#> Total Effect    -0.0595      -0.1530         0.02    0.21
#> Prop. Mediated   0.2137      -2.0277         2.70    0.32
#> 
#> Sample Size Used: 899 
#> 
#> 
#> Simulations: 1000

mediation(m2, prob = .95)
#> 
#> # Causal Mediation Analysis for Stan Model
#> 
#>   Treatment: treat
#>    Mediator: job_seek
#>    Response: depress2
#> 
#>                  Estimate    HDI (95%)
#>   Direct effect:    -0.04 [-0.12 0.04]
#> Indirect effect:    -0.02 [-0.04 0.01]
#>    Total effect:    -0.05 [-0.15 0.03]
#> 
#> Proportion mediated: 28.14% [-178.65% 234.94%]

If you want to calculate mean instead of median values from the posterior samples, use the typical-argument. Furthermore, there is a print()-method, which allows printing more digits.

mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4)
#> 
#> # Causal Mediation Analysis for Stan Model
#> 
#>   Treatment: treat
#>    Mediator: job_seek
#>    Response: depress2
#> 
#>                  Estimate        HDI (95%)
#>   Direct effect:  -0.0395 [-0.1244 0.0450]
#> Indirect effect:  -0.0158 [-0.0400 0.0086]
#>    Total effect:  -0.0553 [-0.1482 0.0302]
#> 
#> Proportion mediated: 28.5975% [-178.1953% 235.3902%]

As you can see, the results are similar to what the mediation package produces for non-Bayesian models.

References

Bürkner, P. C. (2017). brms: An R package for Bayesian multilevel models using Stan. Journal of Statistical Software, 80(1), 1-28

sjstats/inst/doc/mixedmodels-statistics.R0000644000176200001440000001265013616771063020332 0ustar liggesusers## ----set-options, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) options(width = 800) if (!requireNamespace("lme4", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(sjstats) library(lme4) # load sample data data(sleepstudy) # fit linear mixed model m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy) set.seed(2018) sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Design effect for two-level model with 30 observations per # cluster group (level-2 unit) and an assumed intraclass # correlation coefficient of 0.05. design_effect(n = 30) # Design effect for two-level model with 24 observation per cluster # group and an assumed intraclass correlation coefficient of 0.2. 
design_effect(n = 24, icc = 0.2) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Sample size for multilevel model with 30 cluster groups and a small to # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and # hence a total sample size of about 802 observations is needed. samplesize_mixed(eff.size = .3, k = 30) # Sample size for multilevel model with 20 cluster groups and a medium # to large effect size for linear models of 0.2. Five subjects per cluster and # hence a total sample size of about 107 observations is needed. samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- data(nhanes_sample) scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) sjstats/inst/doc/mixedmodels-statistics.html0000644000176200001440000005247613616771063021107 0ustar liggesusers Statistics for Mixed Effects Models

Statistics for Mixed Effects Models

Daniel Lüdecke

2020-02-06

Statistics and Measures for Mixed Effects Models

This vignette demonstrates those functions of the sjstats-package that deal especially with mixed effects models. sjstats provides the following functions:

  • design_effect() and samplesize_mixed()
  • scale_weights()

Before we start, we fit a simple linear mixed model:

library(sjstats)
library(lme4)
# load sample data
data(sleepstudy)

# fit linear mixed model
m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)

set.seed(2018)
sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE)
m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)

Sample Size Calculation for Mixed Models

The first two functions, design_effect() and samplesize_mixed(), can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward; for (linear) mixed models, however, statistical power is affected by the change in the variance of the test statistics. This is what Hsieh et al. (2003) call a design effect (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly.

Design Effect for Two-Level Mixed Models

design_effect() computes this design effect for linear mixed models with a two-level design. It requires the approximate average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel model. Typically, the minimum assumed value for the ICC is 0.05.

# Design effect for two-level model with 30 observations per
# cluster group (level-2 unit) and an assumed intraclass
# correlation coefficient of 0.05.
design_effect(n = 30)
#> [1] 2.45

# Design effect for two-level model with 24 observations per cluster
# group and an assumed intraclass correlation coefficient of 0.2.
design_effect(n = 24, icc = 0.2)
#> [1] 5.6
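
The returned values follow the standard design-effect formula, deff = 1 + (n - 1) * icc (see Hsieh et al. 2003), which can be verified by hand:

1 + (30 - 1) * 0.05  # 2.45, as above
1 + (24 - 1) * 0.2   # 5.6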

Calculating the Sample Size for Linear Mixed Models

samplesize_mixed() combines the power-calculation functions from the pwr-package with the design effect from design_effect(). It computes an approximate sample size for linear mixed models (two-level designs), based on the power calculation for a standard design and adjusted for the design effect of two-level designs.

# Sample size for multilevel model with 30 cluster groups and a small to
# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
# hence a total sample size of about 802 observations is needed.
samplesize_mixed(eff.size = .3, k = 30)
#> $`Subjects per Cluster`
#> [1] 27
#> 
#> $`Total Sample Size`
#> [1] 802

# Sample size for multilevel model with 20 cluster groups and a medium
# to large effect size for linear models of 0.2. Five subjects per cluster and
# hence a total sample size of about 107 observations is needed.
samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)
#> $`Subjects per Cluster`
#> [1] 5
#> 
#> $`Total Sample Size`
#> [1] 107
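
A rough sketch of the underlying logic, as an illustration and not necessarily the package's exact code: take the sample size for a standard design from the pwr package and inflate it by the design effect; because the design effect itself depends on the cluster size, the total sample size is found by solving a simple equation:

library(pwr)

# sample size for a standard (two-group) design, d = 0.3, power = 0.8
obs <- 2 * pwr.t.test(d = 0.3, sig.level = 0.05, power = 0.8)$n

# solve total = obs * deff(total / k) for k = 30 clusters, icc = 0.05
f <- function(total, k = 30, icc = 0.05) {
  total - obs * (1 + (total / k - 1) * icc)
}
total <- uniroot(f, interval = c(obs, 20 * obs))$root
ceiling(total)       # close to the 802 observations reported above
ceiling(total / 30)  # close to the 27 subjects per cluster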

There are more ways to perform power calculations for multilevel models; however, most of these require very detailed knowledge of the sample characteristics and involve running simulation studies. samplesize_mixed() is a more pragmatic alternative to these approaches.

Rescale model weights for complex samples

Most functions for fitting multilevel and mixed effects models only allow frequency weights to be specified, but not design (i.e. sampling or probability) weights, which should be used when analyzing complex samples and survey data.

scale_weights() implements an algorithm proposed by Asparouhov (2006) and Carle (2009) that rescales design weights in survey data to account for the grouping structure of multilevel models, so that the rescaled weights can be used for multilevel modelling.

To calculate a weight-vector that can be used in multilevel models, scale_weights() needs the data frame with survey data as the x-argument. This data frame should contain 1) a cluster ID (argument cluster.id), which represents the strata of the survey data (the level-2-cluster variable) and 2) the probability weights (argument pweight), which represent the design or sampling weights of the survey data (level-1-weight).

scale_weights() then returns the original data frame, including two new variables: svywght_a, where the sample weights pweight are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for svywght_b is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see Carle (2009), Appendix B, for details).

data(nhanes_sample)
scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR)
#> # A tibble: 2,992 x 9
#>    total   age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR svywght_a svywght_b
#>    <dbl> <dbl>    <dbl>    <dbl>   <dbl>    <dbl>    <dbl>     <dbl>     <dbl>
#>  1     1  2.2         1        3       2       31   97594.     1.57      1.20 
#>  2     7  2.08        2        3       1       29   39599.     0.623     0.525
#>  3     3  1.48        2        1       2       42   26620.     0.898     0.544
#>  4     4  1.32        2        4       2       33   34999.     0.708     0.550
#>  5     1  2           2        1       1       41   14746.     0.422     0.312
#>  6     6  2.2         2        4       1       38   28232.     0.688     0.516
#>  7   350  1.6         1        3       2       33   93162.     1.89      1.46 
#>  8    NA  1.48        2        3       1       29   82276.     1.29      1.09 
#>  9     3  2.28        2        4       1       41   24726.     0.707     0.523
#> 10    30  0.84        1        3       2       35   39895.     0.760     0.594
#> # ... with 2,982 more rows
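
The two adjustment factors are simple cluster-level summaries, so the result is easy to verify; a minimal base-R sketch of the formulas, assuming nhanes_sample from above (an illustration, not the package's internal code):

d  <- nhanes_sample
w  <- d$WTINT2YR  # design weights
cl <- d$SDMVSTRA  # cluster ID

n_j   <- ave(w, cl, FUN = length)  # cluster size
sw_j  <- ave(w, cl, FUN = sum)     # sum of weights within each cluster
sw2_j <- ave(w^2, cl, FUN = sum)   # sum of squared weights within each cluster

d$svywght_a <- w * n_j / sw_j    # method A: cluster size / sum of weights
d$svywght_b <- w * sw_j / sw2_j  # method B: sum of weights / sum of squared weights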

References

Asparouhov T. 2006. General Multi-Level Modeling with Sampling Weights. Communications in Statistics—Theory and Methods (35): 439–460

Carle AC. 2009. Fitting multilevel models in complex survey data with design weights: Recommendations. BMC Medical Research Methodology 9(49): 1-13

Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation & the Health Professions 26: 239–257. doi: 10.1177/0163278703255230

sjstats/inst/doc/bayesian-statistics.R0000644000176200001440000001533413616771063017615 0ustar liggesusers## ---- SETTINGS-knitr, echo = FALSE, warning = FALSE, message = FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE ) options(width = 800) if (!requireNamespace("mediation", quietly = TRUE) || !requireNamespace("httr", quietly = TRUE) || !requireNamespace("brms", quietly = TRUE) || !requireNamespace("insight", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(sjstats) library(mediation) library(brms) # load sample data data(jobs) set.seed(123) # linear models, for mediation analysis b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs) b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs) # mediation analysis, for comparison with brms m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek") ## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # Fit Bayesian mediation model # f1 <- bf(job_seek ~ treat + econ_hard + sex + age) # f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age) # # m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4) ## 
----echo=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- m2 <- insight::download_model("brms_mv_6") ## ---- message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ mediation(m2) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- summary(m1) mediation(m2, prob = .95) ## ---- message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4) sjstats/inst/doc/anova-statistics.R0000644000176200001440000002247613616771053017132 0ustar liggesusers## ----set-options, echo = 
FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) options(width = 800) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(sjstats) # load sample data data(efc) # fit linear model fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- eta_sq(fit) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- eta_sq(fit, partial = TRUE) ## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- omega_sq(fit) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- omega_sq(fit, partial = TRUE, n = 100) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- epsilon_sq(fit) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- epsilon_sq(fit, ci.lvl = .95, n = 100) ## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- anova_stats(fit) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- anova_stats(car::Anova(fit, type = 3)) ## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- eta_sq(fit, partial = TRUE, ci.lvl = .8) # uses bootstrapping - here, for speed, just 100 samples omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100) sjstats/inst/CITATION0000644000176200001440000000055713563265750014102 0ustar liggesusersyear <- sub("-.*", "", meta$Date) title <- sprintf("sjstats: Statistical Functions for Regression Models (Version %s)", meta$Version) bibentry(bibtype="manual", title = title, author = person("Daniel", "Lüdecke"), year = year, url = "https://CRAN.R-project.org/package=sjstats", doi = "10.5281/zenodo.1284472")