sjstats/NAMESPACE

# Generated by roxygen2: do not edit by hand

S3method(AIC,svyglm.nb)
S3method(as.data.frame,sj_resample)
S3method(as.integer,sj_resample)
S3method(cramers_v,formula)
S3method(cramers_v,ftable)
S3method(cramers_v,table)
S3method(deviance,svyglm.nb)
S3method(family,svyglm.nb)
S3method(formula,svyglm.nb)
S3method(formula,svyglm.zip)
S3method(model.frame,svyglm.nb)
S3method(model.frame,svyglm.zip)
S3method(phi,formula)
S3method(phi,ftable)
S3method(phi,table)
S3method(plot,sj_inequ_trend)
S3method(predict,svyglm.nb)
S3method(print,sj_anova_stat)
S3method(print,sj_chi2gof)
S3method(print,sj_htest_chi)
S3method(print,sj_htest_kw)
S3method(print,sj_htest_mwu)
S3method(print,sj_htest_t)
S3method(print,sj_htest_wilcox)
S3method(print,sj_resample)
S3method(print,sj_ttest)
S3method(print,sj_wcor)
S3method(print,sj_xtab_stat)
S3method(print,svyglm.nb)
S3method(print,svyglm.zip)
S3method(residuals,svyglm.nb)
S3method(terms,svyglm.nb)
S3method(weighted_correlation,default)
S3method(weighted_correlation,formula)
S3method(weighted_se,data.frame)
S3method(weighted_se,default)
S3method(weighted_se,matrix)
export(anova_stats)
export(auto_prior)
export(boot_ci)
export(boot_est)
export(boot_p)
export(boot_se)
export(bootstrap)
export(chi_squared_test)
export(chisq_gof)
export(cohens_f)
export(cramer)
export(cramers_v)
export(crosstable_statistics)
export(cv)
export(cv_compare)
export(cv_error)
export(design_effect)
export(epsilon_sq)
export(eta_sq)
export(find_beta)
export(find_beta2)
export(find_cauchy)
export(find_normal)
export(gmd)
export(icc)
export(inequ_trend)
export(is_prime)
export(kruskal_wallis_test)
export(link_inverse)
export(mann_whitney_test)
export(mean_n)
export(means_by_group)
export(mse)
export(omega_sq)
export(p_value)
export(phi)
export(prop)
export(props)
export(r2)
export(rmse)
export(robust)
export(samplesize_mixed)
export(scale_weights)
export(sd_pop)
export(se)
export(se_ybar)
export(smpsize_lmm)
export(survey_median)
export(svyglm.nb)
export(svyglm.zip)
export(t_test)
export(table_values)
export(var_pop)
export(weight)
export(weight2)
export(weighted_correlation)
export(weighted_mean)
export(weighted_median)
export(weighted_sd)
export(weighted_se)
export(wilcoxon_test)
export(xtab_statistics)
importFrom(datawizard,weighted_mean)
importFrom(datawizard,weighted_median)
importFrom(datawizard,weighted_sd)
importFrom(insight,link_inverse)
importFrom(performance,mse)
importFrom(performance,rmse)
importFrom(stats,family)

sjstats/README.md

# sjstats - Collection of Convenient Functions for Common Statistical Computations

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472)

Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages.

This package aims to provide, **first**, shortcuts for statistical measures that otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effect size statistics like Eta or Omega squared), or for which no functions are currently available. **Second**, another focus lies on implementations of common statistical significance tests with a consistent syntax, like the t-test, Mann-Whitney test, Chi-squared test, and more.
These functions are designed to be more user-friendly and also support weights, i.e. weighted statistics can be calculated. **Finally**, the package includes miscellaneous functions that are either not yet available in R (like `svyglm.nb()` or `svyglm.zip()` to calculate negative binomial or zero-inflated Poisson models for survey data) or are just convenient for daily work (like functions for bootstrapping, or ANOVA summary tables).

The tools include:

* Especially for mixed models: design effect, sample size calculation
* Significance tests: correlation, Chi-squared test, t-test, Mann-Whitney-U test, Wilcoxon rank sum test, Kruskal-Wallis test

Note that most functions that were formerly available in this package have been moved to the [**easystats** project](https://easystats.github.io/easystats/).

## Documentation

Please visit [https://strengejacke.github.io/sjstats/](https://strengejacke.github.io/sjstats/) for documentation and vignettes.

## Installation

### Latest development build

To install the latest development snapshot (see latest changes below), type the following commands into the R console:

```r
library(remotes)
remotes::install_github("strengejacke/sjstats")
```

### Official, stable release

[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sjstats)](https://cran.r-project.org/package=sjstats)

To install the latest stable release from CRAN, type the following command into the R console:

```r
install.packages("sjstats")
```

## Citation

In case you want / have to cite my package, please use `citation('sjstats')` for citation information.

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1284472.svg)](https://doi.org/10.5281/zenodo.1284472)

sjstats/data/efc.RData

[binary data file]
sjstats/data/nhanes_sample.RData

[binary data file]

sjstats/man/mann_whitney_test.Rd

[beginning of file truncated]

\itemize{
\item small effect >= 0.1
\item medium effect >= 0.3
\item large effect >= 0.5
}

\strong{r} is calculated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}.
}
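\section{Computing the effect size}{
A minimal sketch of how \strong{r} can be computed by hand, assuming the
Z-statistic is recovered from the two-sided p-value of \code{wilcox.test()}
(this mirrors the formula above and is not necessarily the exact internal
implementation):

\preformatted{
data(efc)
wt <- wilcox.test(e17age ~ e16sex, data = efc)
# |Z| from the two-sided p-value
z <- abs(qnorm(wt$p.value / 2))
# effect size r = |Z| / sqrt(n1 + n2)
r <- z / sqrt(sum(complete.cases(efc[c("e17age", "e16sex")])))
}
}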
\section{Which test to use}{
The following table provides an overview of which test to use for different
types of data. The choice of test depends on the scale of the outcome
variable and the number of samples to compare.\tabular{lll}{
\strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
1 \tab continuous, normal \tab \code{t_test()} \cr
2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
2, independent \tab continuous, normal \tab \code{t_test()} \cr
2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
>2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
>2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
>2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
>2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
}

(1) More than two dependent samples are considered as \emph{repeated
measurements}. For ordinal or not-normally distributed outcomes, these
samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}},
which requires the samples in one variable, the groups to compare in another
variable, and a third variable indicating the repeated measurements
(subject IDs).

(2) More than two dependent samples are considered as \emph{repeated
measurements}. For normally distributed outcomes, these samples are usually
tested using an ANOVA for repeated measurements. A more sophisticated
approach would be using a linear mixed model.
}

\examples{
\dontshow{if (requireNamespace("coin") && requireNamespace("survey")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(efc)
# Mann-Whitney-U tests for elder's age by elder's sex.
mann_whitney_test(efc, "e17age", by = "e16sex")
# base R equivalent
wilcox.test(e17age ~ e16sex, data = efc)

# when data is in wide-format, specify all relevant continuous
# variables in `select` and omit `by`
set.seed(123)
wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20))
mann_whitney_test(wide_data, select = c("scale1", "scale2"))
# base R equivalent
wilcox.test(wide_data$scale1, wide_data$scale2)

# same as if we had data in long format, with grouping variable
long_data <- data.frame(
  scales = c(wide_data$scale1, wide_data$scale2),
  groups = as.factor(rep(c("A", "B"), each = 20))
)
mann_whitney_test(long_data, select = "scales", by = "groups")
# base R equivalent
wilcox.test(scales ~ groups, long_data)
\dontshow{\}) # examplesIf}
}
\references{
\itemize{
\item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982}
\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25
\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
}
}
\seealso{
\itemize{
\item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples.
\item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples.
\item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples.
\item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples.
\item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent).
}
}

sjstats/man/gmd.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gmd.R
\name{gmd}
\alias{gmd}
\title{Gini's Mean Difference}
\usage{
gmd(x, select = NULL)
}
\arguments{
\item{x}{A vector or data frame.}

\item{select}{Optional, names of variables as character vector that should be
selected for further processing. Required, if \code{x} is a data frame (and
no vector) and only selected variables from \code{x} should be processed.}
}
\value{
For numeric vectors, Gini's mean difference. For non-numeric vectors or
vectors of length < 2, returns \code{NA}.
}
\description{
\code{gmd()} computes Gini's mean difference for a numeric vector or for all
numeric vectors in a data frame.
}
\note{
Gini's mean difference is defined as the mean absolute difference between any
two distinct elements of a vector. Missing values from \code{x} are silently
removed.
}
\examples{
data(efc)
gmd(efc$e17age)
gmd(efc, c("e17age", "c160age", "c12hour"))
}
\references{
David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575
}

sjstats/man/var_pop.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/var_pop.R
\name{var_pop}
\alias{var_pop}
\alias{sd_pop}
\title{Calculate population variance and standard deviation}
\usage{
var_pop(x)

sd_pop(x)
}
\arguments{
\item{x}{(Numeric) vector.}
}
\value{
The population variance or standard deviation of \code{x}.
}
\description{
Calculate the population variance or standard deviation of a vector.
}
\details{
Unlike \code{\link[stats]{var}}, which returns the sample variance,
\code{var_pop()} returns the population variance. \code{sd_pop()} returns
the standard deviation based on the population variance.
}
\examples{
data(efc)

# sampling variance
var(efc$c12hour, na.rm = TRUE)
# population variance
var_pop(efc$c12hour)

# sampling sd
sd(efc$c12hour, na.rm = TRUE)
# population sd
sd_pop(efc$c12hour)
}

sjstats/man/inequ_trend.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/inequ_trends.R
\name{inequ_trend}
\alias{inequ_trend}
\title{Compute trends in status inequalities}
\usage{
inequ_trend(data, prev.low, prev.hi)
}
\arguments{
\item{data}{A data frame that contains the variables with prevalence rates
for both low and high status groups (see 'Examples').}

\item{prev.low}{The name of the variable with the prevalence rates for the
low status groups.}

\item{prev.hi}{The name of the variable with the prevalence rates for the
high status groups.}
}
\value{
A data frame with the prevalence rates as well as the values for the
proportional change in absolute (\code{rd}) and relative (\code{rr})
inequalities.
}
\description{
This method computes the proportional change of absolute (rate differences)
and relative (rate ratios) inequalities of prevalence rates for two different
status groups, as proposed by Mackenbach et al. (2015).
}
\details{
Given the time trend of prevalence rates of an outcome for two status groups
(e.g. the mortality rates for people with lower and higher socioeconomic
status over 40 years), this function computes the proportional change of
absolute and relative inequalities, expressed in changes in rate differences
and rate ratios. The function implements the algorithm proposed by
\emph{Mackenbach et al. 2015}.
}
\examples{
\dontshow{if (requireNamespace("ggplot2")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5

# 40 simulated time points, with an initial rate ratio of 2 and
# a rate difference of 100 (i.e. low status group starts with a
# prevalence rate of 200, the high status group with 100)

# annual decline of prevalence is 1\% for the low, and 3\% for the
# high status group

n <- 40
time <- seq(1, n, by = 1)
lo <- rep(200, times = n)
for (i in 2:n) lo[i] <- lo[i - 1] * .99
hi <- rep(100, times = n)
for (i in 2:n) hi[i] <- hi[i - 1] * .97

prev.data <- data.frame(lo, hi)

# print values
inequ_trend(prev.data, "lo", "hi")

# plot trends - here we see that the relative inequalities
# are increasing over time, while the absolute inequalities
# are first increasing as well, but later are decreasing
# (while rel. inequ. are still increasing)
plot(inequ_trend(prev.data, "lo", "hi"))
\dontshow{\}) # examplesIf}
}
\references{
Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic
of Reducing Relative and Absolute Inequalities in Health: A Theoretical
Analysis Illustrated with European Mortality Data. Journal of Epidemiology
and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018}
}

sjstats/man/samplesize_mixed.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/samplesize_mixed.R
\name{samplesize_mixed}
\alias{samplesize_mixed}
\alias{smpsize_lmm}
\title{Sample size for linear mixed models}
\usage{
samplesize_mixed(
  eff.size,
  df.n = NULL,
  power = 0.8,
  sig.level = 0.05,
  k,
  n,
  icc = 0.05
)

smpsize_lmm(
  eff.size,
  df.n = NULL,
  power = 0.8,
  sig.level = 0.05,
  k,
  n,
  icc = 0.05
)
}
\arguments{
\item{eff.size}{Effect size.}

\item{df.n}{Optional argument for the degrees of freedom for numerator.
See 'Details'.}

\item{power}{Power of test (1 minus Type II error probability).}

\item{sig.level}{Significance level (Type I error probability).}

\item{k}{Number of cluster groups (level-2-unit) in multilevel-design.}

\item{n}{Optional, number of observations per cluster groups (level-2-unit)
in multilevel-design.}

\item{icc}{Expected intraclass correlation coefficient for multilevel-model.}
}
\value{
A list with two values: The number of subjects per cluster, and the total
sample size for the linear mixed model.
}
\description{
Compute an approximated sample size for linear mixed models
(two-level-designs), based on power-calculation for standard design and
adjusted for design effect for 2-level-designs.
}
\details{
The sample size calculation is based on a power-calculation for the standard
design. If \code{df.n} is not specified, a power-calculation for an unpaired
two-sample t-test will be computed (using \code{\link[pwr]{pwr.t.test}} of
the \CRANpkg{pwr}-package). If \code{df.n} is given, a power-calculation for
general linear models will be computed (using \code{\link[pwr]{pwr.f2.test}}
of the \pkg{pwr}-package).
The sample size of the standard design is then adjusted for the design effect
of two-level-designs (see \code{\link{design_effect}}). Thus, the sample size
calculation is appropriate in particular for two-level-designs (see
\cite{Snijders 2005}). Models that additionally include repeated measures
(three-level-designs) may work as well, however, the computed sample size
may be less accurate.
}
\examples{
\dontshow{if (requireNamespace("pwr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# Sample size for multilevel model with 30 cluster groups and a small to
# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
# hence a total sample size of about 802 observations is needed.
samplesize_mixed(eff.size = .3, k = 30)

# Sample size for multilevel model with 20 cluster groups and a medium
# to large effect size for linear models of 0.2. Five subjects per cluster and
# hence a total sample size of about 107 observations is needed.
samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)
\dontshow{\}) # examplesIf}
}
\references{
Cohen J. 1988. Statistical power analysis for the behavioral sciences
(2nd ed.). Hillsdale, NJ: Lawrence Erlbaum.
\cr \cr
Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance
Inflation Factors for Sample-Size Calculation. Evaluation and the Health
Professions 26: 239-257.
\cr \cr
Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In:
Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral
Science. Chichester, UK: John Wiley and Sons, Ltd.
}

sjstats/man/svyglm.zip.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/svyglmzip.R
\name{svyglm.zip}
\alias{svyglm.zip}
\title{Survey-weighted zero-inflated Poisson model}
\usage{
svyglm.zip(formula, design, ...)
}
\arguments{
\item{formula}{An object of class \code{formula}, i.e. a symbolic description
of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}.}

\item{design}{An object of class \code{\link[survey]{svydesign}}, providing
a specification of the survey design.}

\item{...}{Other arguments passed down to \code{\link[pscl]{zeroinfl}}.}
}
\value{
An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip}, with
some additional information about the model.
}
\description{
\code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package to fit
survey-weighted zero-inflated Poisson models. It uses
\code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood
estimates, based on starting values provided by \code{\link[pscl]{zeroinfl}}.
}
\details{
Code modified from
https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/.
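The model formula uses the two-part right-hand side of
\code{\link[pscl]{zeroinfl}}: terms before the \code{|} specify the count
component, terms after it the zero-inflation component. A minimal sketch,
reusing the variables from the example below but with a simplified
zero-inflation part (illustrative only):

\preformatted{
# count component: age + ethnicity; zero-inflation component: age only
svyglm.zip(malepartners ~ age + factor(RIDRETH1) | age, des)
}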
}
\examples{
if (require("survey")) {
  data(nhanes_sample)
  set.seed(123)
  nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2)
  nhanes_sample$malepartners[sample(1:2992, 400)] <- 0

  # create survey design
  des <- svydesign(
    id = ~SDMVPSU,
    strat = ~SDMVSTRA,
    weights = ~WTINT2YR,
    nest = TRUE,
    data = nhanes_sample
  )

  # fit zero-inflated Poisson regression
  fit <- svyglm.zip(
    malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1),
    des
  )

  # print coefficients and standard errors
  fit
}
}

sjstats/man/bootstrap.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bootstrap.R
\name{bootstrap}
\alias{bootstrap}
\title{Generate nonparametric bootstrap replications}
\usage{
bootstrap(data, n, size)
}
\arguments{
\item{data}{A data frame.}

\item{n}{Number of bootstraps to be generated.}

\item{size}{Optional, size of the bootstrap samples. May either be a number
between 1 and \code{nrow(data)} or a value between 0 and 1 to sample a
proportion of observations from \code{data} (see 'Examples').}
}
\value{
A data frame with one column: a list-variable \code{strap}, which contains
resample-objects of class \code{sj_resample}. These resample-objects are
lists with three elements:
\enumerate{
\item the original data frame, \code{data}
\item the row numbers \code{id}, i.e. row numbers of \code{data}, indicating
the resampled rows with replacement
\item the \code{resample.id}, indicating the index of the resample (i.e. the
position of the \code{sj_resample}-object in the list \code{strap})
}
}
\description{
Generates \code{n} bootstrap samples of \code{data} and returns the
bootstrapped data frames as list-variable.
}
\details{
By default, each bootstrap sample has the same number of observations as
\code{data}. To generate bootstrap samples without resampling same
observations (i.e. sampling without replacement), use \code{size} to get
bootstrapped data with a specific number of observations. However, specifying
the \code{size}-argument is much less memory-efficient than the bootstrap
with replacement. Hence, it is recommended to ignore the \code{size}-argument,
if it is not really needed.
}
\note{
This function applies nonparametric bootstrapping, i.e. the function draws
samples with replacement.
\cr \cr
There is an \code{as.data.frame}- and a \code{print}-method to get or print
the resampled data frames. See 'Examples'. The \code{as.data.frame}-method
automatically applies whenever coercion is done because a data frame is
required as input. See 'Examples' in \code{\link{boot_ci}}.
}
\examples{
data(efc)
bs <- bootstrap(efc, 5)

# now run models for each bootstrapped sample
lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x))

# generate bootstrap samples with 600 observations for each sample
bs <- bootstrap(efc, 5, 600)

# generate bootstrap samples with 70\% observations of the original sample size
bs <- bootstrap(efc, 5, .7)

# compute standard error for a simple vector from bootstraps
# use the `as.data.frame()`-method to get the resampled
# data frame
bs <- bootstrap(efc, 100)
bs$c12hour <- unlist(lapply(bs$strap, function(x) {
  mean(as.data.frame(x)$c12hour, na.rm = TRUE)
}))

# bootstrapped standard error
boot_se(bs, "c12hour")

# bootstrapped CI
boot_ci(bs, "c12hour")
}
\seealso{
\code{\link{boot_ci}} to calculate confidence intervals from bootstrap
samples.
}

sjstats/man/chi_squared_test.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/chi_squared_test.R
\name{chi_squared_test}
\alias{chi_squared_test}
\title{Chi-Squared test}
\usage{
chi_squared_test(
  data,
  select = NULL,
  by = NULL,
  probabilities = NULL,
  weights = NULL,
  paired = FALSE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Name(s) of the continuous variable(s) (as character vector) to
be used as samples for the test. \code{select} can be one of the following:
\itemize{
\item \code{select} can be used in combination with \code{by}, in which case
\code{select} is the name of the continuous variable (and \code{by} indicates
a grouping factor).
\item \code{select} can also be a character vector of length two or more
(more than two names only apply to \code{kruskal_wallis_test()}), in which
case the two continuous variables are treated as samples to be compared.
\code{by} must be \code{NULL} in this case.
\item If \code{select} is of length \strong{two} and \code{paired = TRUE},
the two samples are considered as \emph{dependent} and a paired test is
carried out.
\item If \code{select} specifies \strong{one} variable and \code{by = NULL},
a one-sample test is carried out (only applicable for \code{t_test()} and
\code{wilcoxon_test()})
\item For \code{chi_squared_test()}, if \code{select} specifies \strong{one}
variable and both \code{by} and \code{probabilities} are \code{NULL}, a
one-sample test against given probabilities is automatically conducted, with
equal probabilities for each level of \code{select}.
}}

\item{by}{Name of the variable indicating the groups. Required if
\code{select} specifies only one variable that contains all samples to be
compared in the test. If \code{by} is not a factor, it will be coerced to a
factor. For \code{chi_squared_test()}, if \code{probabilities} is provided,
\code{by} must be \code{NULL}.}

\item{probabilities}{A numeric vector of probabilities for each cell in the
contingency table. The length of the vector must match the number of cells
in the table, i.e. the number of unique levels of the variable specified in
\code{select}. If \code{probabilities} is provided, a chi-squared test for
given probabilities is conducted. Furthermore, if \code{probabilities} is
given, \code{by} must be \code{NULL}. The probabilities must sum to 1.}

\item{weights}{Name of an (optional) weighting variable to be used for the
test.}

\item{paired}{Logical, if \code{TRUE}, a McNemar test is conducted for 2x2
tables. Note that \code{paired} only works for 2x2 tables.}

\item{...}{Additional arguments passed down to
\code{\link[=chisq.test]{chisq.test()}}.}
}
\value{
A data frame with test results. The returned effect sizes are Cramer's V for
tables with more than two rows or columns, Phi (\eqn{\phi}) for 2x2 tables,
and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against given probabilities.
}
\description{
This function performs a \eqn{\chi^2} test for contingency tables or tests
for given probabilities. The returned effect sizes are Cramer's V for tables
with more than two rows or columns, Phi (\eqn{\phi}) for 2x2 tables, and Fei
(\ifelse{latex}{\eqn{Fei}}{פ}) for tests against given probabilities (see
\emph{Ben-Shachar et al. 2023}).
}
\details{
The function is a wrapper around \code{\link[=chisq.test]{chisq.test()}} and
\code{\link[=fisher.test]{fisher.test()}} (for small expected values) for
contingency tables, and \code{chisq.test()} for given probabilities. When
\code{probabilities} are provided, these are rescaled to sum to 1 (i.e.
\code{rescale.p = TRUE}). When \code{fisher.test()} is called, simulated
p-values are returned (i.e. \code{simulate.p.value = TRUE}, see
\code{?fisher.test}). If \code{paired = TRUE} and a 2x2 table is provided,
a McNemar test (see \code{\link[=mcnemar.test]{mcnemar.test()}}) is
conducted. The weighted version of the chi-squared test is based on a
weighted table, using \code{\link[=xtabs]{xtabs()}} as input for
\code{chisq.test()}.

Interpretation of effect sizes is based on rules described in
\code{\link[effectsize:interpret_r]{effectsize::interpret_phi()}},
\code{\link[effectsize:interpret_r]{effectsize::interpret_cramers_v()}}, and
\code{\link[effectsize:interpret_r]{effectsize::interpret_fei()}}. Use these
functions directly to get other interpretations, by providing the returned
effect size as argument, e.g. \code{interpret_phi(0.35, rules = "gignac2016")}.
}
\section{Which test to use}{

The following table provides an overview of which test to use for different
types of data. The choice of test depends on the scale of the outcome
variable and the number of samples to compare.\tabular{lll}{
\strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
1 \tab continuous, normal \tab \code{t_test()} \cr
2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
2, independent \tab continuous, normal \tab \code{t_test()} \cr
2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
>2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
>2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
>2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
>2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
}

(1) More than two dependent samples are considered as \emph{repeated
measurements}. For ordinal or not-normally distributed outcomes, these
samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}},
which requires the samples in one variable, the groups to compare in another
variable, and a third variable indicating the repeated measurements
(subject IDs).

(2) More than two dependent samples are considered as \emph{repeated
measurements}. For normally distributed outcomes, these samples are usually
tested using an ANOVA for repeated measurements. A more sophisticated
approach would be using a linear mixed model.
}
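\section{Worked example: weighted chi-squared test}{
As described in 'Details', the weighted test builds a weighted frequency
table with \code{xtabs()} and passes it to \code{chisq.test()}. A minimal
sketch, assuming a weight variable \code{weight} as created in the
'Examples' below (an illustration, not the full internal implementation):

\preformatted{
data(efc)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
# weighted frequency table, then the usual chi-squared test
tab <- xtabs(weight ~ c161sex + e16sex, data = efc)
chisq.test(tab)
}
}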
\examples{
\dontshow{if (requireNamespace("effectsize") && requireNamespace("MASS")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(efc)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))

# Chi-squared test
chi_squared_test(efc, "c161sex", by = "e16sex")

# weighted Chi-squared test
chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")

# Chi-squared test for given probabilities
chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
\dontshow{\}) # examplesIf}
}
\references{
\itemize{
\item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982}
\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25
\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
}
}
\seealso{
\itemize{
\item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples.
\item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples.
\item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples.
\item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples.
\item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent).
}
}

sjstats/man/is_prime.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/is_prime.R
\name{is_prime}
\alias{is_prime}
\title{Find prime numbers}
\usage{
is_prime(x)
}
\arguments{
\item{x}{An integer, or a vector of integers.}
}
\value{
\code{TRUE} for each prime number in \code{x}, \code{FALSE} otherwise.
}
\description{
This function checks whether a number is, or the numbers in a vector are,
prime numbers.
}
\examples{
is_prime(89)
is_prime(15)
is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
}

sjstats/man/design_effect.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/design_effect.R
\name{design_effect}
\alias{design_effect}
\title{Design effects for two-level mixed models}
\usage{
design_effect(n, icc = 0.05)
}
\arguments{
\item{n}{Average number of observations per grouping cluster (i.e. level-2
unit).}

\item{icc}{Assumed intraclass correlation coefficient for multilevel-model.}
}
\value{
The design effect (Variance Inflation Factor) for the two-level model.
}
\description{
Compute the design effect (also called \emph{Variance Inflation Factor}) for
mixed models with two-level design.
}
\details{
The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. For
example, with 30 observations per cluster and an assumed ICC of 0.05, the
design effect is \code{1 + 29 * 0.05 = 2.45}.
}
\examples{
# Design effect for two-level model with 30 observations per
# cluster group (level-2 unit) and an assumed intraclass
# correlation coefficient of 0.05.
design_effect(n = 30)

# Design effect for two-level model with 24 observations per cluster
# group and an assumed intraclass correlation coefficient of 0.2.
design_effect(n = 24, icc = 0.2)
}
\references{
Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20.
\cr \cr
Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003.
An Overview of Variance Inflation Factors for Sample-Size Calculation.
Evaluation and the Health Professions 26: 239-257.
\doi{10.1177/0163278703255230}
\cr \cr
Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In:
Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral
Science. Chichester, UK: John Wiley and Sons, Ltd.
\doi{10.1002/0470013192.bsa492}
\cr \cr
Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients
Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson
Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40.
\doi{10.1370/afm.1347}
}

sjstats/man/t_test.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/t_test.R
\name{t_test}
\alias{t_test}
\title{Student's t test}
\usage{
t_test(
  data,
  select = NULL,
  by = NULL,
  weights = NULL,
  paired = FALSE,
  mu = 0,
  alternative = "two.sided"
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Name(s) of the continuous variable(s) (as character vector) to
be used as samples for the test. \code{select} can be one of the following:
\itemize{
\item \code{select} can be used in combination with \code{by}, in which case
\code{select} is the name of the continuous variable (and \code{by} indicates
a grouping factor).
\item \code{select} can also be a character vector of length two or more
(more than two names only apply to \code{kruskal_wallis_test()}), in which
case the two continuous variables are treated as samples to be compared.
\code{by} must be \code{NULL} in this case.
\item If \code{select} is of length \strong{two} and \code{paired = TRUE},
the two samples are considered as \emph{dependent} and a paired test is
carried out.
\item If \code{select} specifies \strong{one} variable and \code{by = NULL},
a one-sample test is carried out (only applicable for \code{t_test()} and
\code{wilcoxon_test()})
\item For \code{chi_squared_test()}, if \code{select} specifies \strong{one}
variable and both \code{by} and \code{probabilities} are \code{NULL}, a
one-sample test against given probabilities is automatically conducted, with
equal probabilities for each level of \code{select}.
}}

\item{by}{Name of the variable indicating the groups. Required if
\code{select} specifies only one variable that contains all samples to be
compared in the test. If \code{by} is not a factor, it will be coerced to a
factor. For \code{chi_squared_test()}, if \code{probabilities} is provided,
\code{by} must be \code{NULL}.}

\item{weights}{Name of an (optional) weighting variable to be used for the
test.}

\item{paired}{Logical, whether to compute a paired t-test for dependent
samples.}

\item{mu}{The hypothesized difference in means (for \code{t_test()}) or
location shift (for \code{wilcoxon_test()} and \code{mann_whitney_test()}).
The default is 0.}

\item{alternative}{A character string specifying the alternative hypothesis,
must be one of \code{"two.sided"} (default), \code{"greater"} or
\code{"less"}. See \code{?t.test} and \code{?wilcox.test}.}
}
\value{
A data frame with test results. The effect size Cohen's \emph{d} is returned
for larger samples (n > 20), while Hedges' \emph{g} is returned for smaller
samples.
}
\description{
This function performs a Student's t test for two independent samples, for
paired samples, or for one sample. It is a parametric test for the null
hypothesis that the means of two independent samples are equal, or that the
mean of one sample is equal to a specified value. The hypothesis can be one-
or two-sided.

Unlike the underlying base R function \code{t.test()}, this function allows
for weighted tests and automatically calculates effect sizes. Cohen's
\emph{d} is returned for larger samples (n > 20), while Hedges' \emph{g} is
returned for smaller samples.
}
\details{
Interpretation of effect sizes is based on rules described in
\code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_cohens_d()}}
and \code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_hedges_g()}}.
Use these functions directly to get other interpretations, by providing the
returned effect size (\emph{Cohen's d} or \emph{Hedges's g} in this case) as
argument, e.g. \code{interpret_cohens_d(0.35, rules = "sawilowsky2009")}.
}
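\section{Computing Cohen's d by hand}{
For two independent samples, Cohen's \emph{d} is the difference in means
divided by the pooled standard deviation. A minimal sketch of this textbook
formula (not necessarily the exact internal implementation, which also
supports weights):

\preformatted{
x <- mtcars$mpg[mtcars$am == 0]
y <- mtcars$mpg[mtcars$am == 1]
# pooled standard deviation of the two groups
sd_pooled <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
  (length(x) + length(y) - 2))
# Cohen's d: standardized mean difference
d <- (mean(x) - mean(y)) / sd_pooled
}
}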
\section{Which test to use}{

The following table provides an overview of which test to use for different
types of data. The choice of test depends on the scale of the outcome
variable and the number of samples to compare.\tabular{lll}{
\strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
1 \tab continuous, normal \tab \code{t_test()} \cr
2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
2, independent \tab continuous, normal \tab \code{t_test()} \cr
2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
>2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
>2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
>2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
>2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
}

(1) More than two dependent samples are considered as \emph{repeated
measurements}. For ordinal or not-normally distributed outcomes, these
samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}},
which requires the samples in one variable, the groups to compare in another
variable, and a third variable indicating the repeated measurements
(subject IDs).

(2) More than two dependent samples are considered as \emph{repeated
measurements}. For normally distributed outcomes, these samples are usually
tested using an ANOVA for repeated measurements. A more sophisticated
approach would be using a linear mixed model.
}

\examples{
\dontshow{if (requireNamespace("effectsize")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(sleep)
# one-sample t-test
t_test(sleep, "extra")
# base R equivalent
t.test(extra ~ 1, data = sleep)

# two-sample t-test, by group
t_test(mtcars, "mpg", by = "am")
# base R equivalent
t.test(mpg ~ am, data = mtcars)

# paired t-test
t_test(mtcars, c("mpg", "hp"), paired = TRUE)
# base R equivalent
t.test(mtcars$mpg, mtcars$hp, paired = TRUE)
\dontshow{\}) # examplesIf}
}
\references{
\itemize{
\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25
\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M.
Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
}
}
\seealso{
\itemize{
\item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples.
\item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples.
\item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples.
\item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples.
\item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent).
}
}

sjstats/man/weighted_se.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/svy_median.R, R/wtd_cor.R, R/wtd_se.R
\name{survey_median}
\alias{survey_median}
\alias{weighted_correlation}
\alias{weighted_correlation.default}
\alias{weighted_correlation.formula}
\alias{weighted_se}
\title{Weighted statistics for variables}
\usage{
survey_median(x, design)

weighted_correlation(data, ...)

\method{weighted_correlation}{default}(data, x, y, weights, ci.lvl = 0.95, ...)

\method{weighted_correlation}{formula}(formula, data, ci.lvl = 0.95, ...)

weighted_se(x, weights = NULL)
}
\arguments{
\item{x}{(Numeric) vector or a data frame. For \code{survey_median()} or
\code{weighted_ttest()}, the bare (unquoted) variable name, or a character
vector with the variable name.}

\item{design}{An object of class \code{\link[survey]{svydesign}}, providing
a specification of the survey design.}

\item{data}{A data frame.}

\item{...}{Currently not used.}

\item{y}{Optional, bare (unquoted) variable name, or a character vector with
the variable name.}

\item{weights}{Bare (unquoted) variable name, or a character vector with the
variable name of the numeric vector of weights. If \code{weights = NULL},
the unweighted statistic is reported.}

\item{ci.lvl}{Confidence level of the interval.}

\item{formula}{A formula of the form \code{lhs ~ rhs1 + rhs2} where
\code{lhs} is a numeric variable giving the data values and \code{rhs1} a
factor with two levels giving the corresponding groups and \code{rhs2} a
variable with weights.}
}
\value{
The weighted (test) statistic.
}
\description{
\code{weighted_se()} computes weighted standard errors of a variable or for
all variables of a data frame. \code{survey_median()} computes the median for
a variable in a survey-design (see \code{\link[survey]{svydesign}}).
\code{weighted_correlation()} computes a weighted correlation for a two-sided
alternative hypothesis.
}
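\details{
A minimal sketch of calling \code{weighted_correlation()} with its default
method, based on the usage above (the weight variable \code{w} is simulated
here purely for illustration):

\preformatted{
data(efc)
efc$w <- abs(runif(nrow(efc)))
weighted_correlation(efc, x = "c12hour", y = "e17age", weights = "w")
}
}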
\examples{
\dontshow{if (requireNamespace("survey")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(efc)
weighted_se(efc$c12hour, abs(runif(n = nrow(efc))))

# survey_median ----
# median for variables from weighted survey designs
data(nhanes_sample)

des <- survey::svydesign(
  id = ~SDMVPSU,
  strat = ~SDMVSTRA,
  weights = ~WTINT2YR,
  nest = TRUE,
  data = nhanes_sample
)

survey_median(total, des)
survey_median("total", des)
\dontshow{\}) # examplesIf}
}

sjstats/man/nhanes_sample.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nhanes_sample.R
\docType{data}
\name{nhanes_sample}
\alias{nhanes_sample}
\title{Sample dataset from the National Health and Nutrition Examination Survey}
\description{
Selected variables from the National Health and Nutrition Examination Survey
that are used in the example from Lumley (2010), Appendix E. See
\code{\link{svyglm.nb}} for examples.
}
\references{
Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
}
\keyword{data}

sjstats/man/se_ybar.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/se_ybar.R
\name{se_ybar}
\alias{se_ybar}
\title{Standard error of sample mean for mixed models}
\usage{
se_ybar(fit)
}
\arguments{
\item{fit}{Fitted mixed effects model (\code{\link[lme4]{merMod}}-class).}
}
\value{
The standard error of the sample mean of \code{fit}.
}
\description{
Compute the standard error for the sample mean for mixed models, regarding
the extent to which clustering affects the standard errors. May be used as
part of the multilevel power calculation for cluster sampling (see
\cite{Gelman and Hill 2007, 447ff}).
}
\examples{
\dontshow{if (require("lme4")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
se_ybar(fit)
\dontshow{\}) # examplesIf}
}
\references{
Gelman A, Hill J. 2007. Data analysis using regression and
multilevel/hierarchical models. Cambridge, New York: Cambridge University
Press
}

sjstats/man/auto_prior.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/auto_prior.R
\name{auto_prior}
\alias{auto_prior}
\title{Create default priors for brms-models}
\usage{
auto_prior(formula, data, gaussian, locations = NULL)
}
\arguments{
\item{formula}{A formula describing the model, which just needs to contain
the model terms, but no notation of interaction, splines etc. Usually, you
want only those predictors in the formula for which automatic priors should
be generated. Add informative priors afterwards to the returned
\code{brmsprior}-object.}

\item{data}{The data that will be used to fit the model.}

\item{gaussian}{Logical, if the outcome is gaussian or not.}

\item{locations}{A numeric vector with location values for the priors. If
\code{locations = NULL}, \code{0} is used as location parameter.}
}
\value{
A \code{brmsprior}-object.
}
\description{
This function creates default priors for brms-regression models, based on
the same automatic prior-scale adjustment as in \pkg{rstanarm}.
}
\details{
\code{auto_prior()} is a small, convenient function to create some default
priors for brms-models with automatically adjusted prior scales, similar to
what \pkg{rstanarm} does. The default scale for the intercept is 10, for
coefficients 2.5. If the outcome is gaussian, both scales are multiplied by
\code{sd(y)}. Then, for categorical variables, nothing more is changed. For
numeric variables, the scales are divided by the standard deviation of the
related variable.
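A minimal sketch of the scale adjustment just described, for a gaussian
outcome and one numeric predictor, reusing variables from the 'Examples'
below (illustrative only, not the internal code):

\preformatted{
data(efc)
y <- efc$neg_c_7
x <- efc$c160age
# intercept scale: 10, multiplied by sd(y) for gaussian outcomes
scale_intercept <- 10 * sd(y, na.rm = TRUE)
# coefficient scale: 2.5 * sd(y), divided by sd(x) for numeric predictors
scale_coefficient <- 2.5 * sd(y, na.rm = TRUE) / sd(x, na.rm = TRUE)
}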
If the outcome is gaussian, both scales are multiplied with \code{sd(y)}. Then, for categorical variables, nothing more is changed. For numeric variables, the scales are divided by the standard deviation of the related variable. \cr \cr All prior distributions are \emph{normal} distributions. \code{auto_prior()} is intended to quickly create default priors with feasible scales. If more precise definitions of priors are necessary, this needs to be done directly with brms-functions like \code{set_prior()}. } \note{ As \code{auto_prior()} also sets priors on the intercept, the model formula used in \code{brms::brm()} must be rewritten to something like \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}. } \examples{ \dontshow{if (requireNamespace("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- as.factor(efc$c161sex) mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) auto_prior(mf, efc, TRUE) ## compare to # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200) # ps <- prior_summary(m) # ps$prior_intercept$adjusted_scale # ps$prior$adjusted_scale ## usage # ap <- auto_prior(mf, efc, TRUE) # brm(mf, data = efc, prior = ap) # add informative priors mf <- formula(neg_c_7 ~ c161sex + c172code) auto_prior(mf, efc, TRUE) + brms::prior(normal(.1554, 40), class = "b", coef = "c160age") # example with binary response efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) auto_prior(mf, efc, FALSE) \dontshow{\}) # examplesIf} } sjstats/man/chisq_gof.Rd0000644000176200001440000000414014616613032014752 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gof.R \name{chisq_gof} \alias{chisq_gof} \title{Compute model quality} \usage{ chisq_gof(x, prob = NULL, weights = NULL) } \arguments{ \item{x}{A numeric vector or a \code{glm}-object.} \item{prob}{Vector of probabilities (indicating the population probabilities) of the same length as the number of categories / factor levels of \code{x}. Use \code{nrow(table(x))} to determine the number of necessary values for \code{prob}. Only used when \code{x} is a vector, not a \code{glm}-object.} \item{weights}{Vector with weights, used to weight \code{x}.} } \value{ For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}. For \code{glm}-objects, an object of class \code{chisq_gof} with the following values: \code{p.value}, the p-value for the goodness-of-fit test; \code{z.score}, the standardized z-score for the goodness-of-fit test; \code{rss}, the residual sum of squares term, and \code{chisq}, the Pearson chi-squared statistic. } \description{ For logistic regression models, performs a Chi-squared goodness-of-fit-test. } \details{ For vectors, this function is a convenient wrapper around \code{chisq.test()}, performing a goodness-of-fit test. For \code{glm}-objects, this function performs a goodness-of-fit test. A well-fitting model shows \emph{no} significant difference between the model and the observed data, i.e. the reported p-values should be greater than 0.05.
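For the vector case, the computation essentially reduces to a call to \code{chisq.test()} against the given population probabilities. A minimal sketch with made-up data (illustrative only, not the exact internal code):
\preformatted{x <- sample(1:3, 100, replace = TRUE)
prob <- c(0.3, 0.3, 0.4)
# goodness-of-fit test of the observed frequencies against 'prob'
chisq.test(table(x), p = prob)}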
} \examples{ data(efc) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) m <- glm( neg_c_7d ~ c161sex + barthtot + c172code, data = efc, family = binomial(link = "logit") ) # goodness-of-fit test for logistic regression chisq_gof(m) # goodness-of-fit test for vectors against probabilities # differing from population chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28)) # equal to population chisq_gof(efc$e42dep, prop.table(table(efc$e42dep))) } \references{ Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. } sjstats/man/table_values.Rd0000644000176200001440000000231414616613032015457 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sjStatistics.R \name{table_values} \alias{table_values} \title{Expected and relative table values} \usage{ table_values(tab, digits = 2) } \arguments{ \item{tab}{Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which cell, row and column percentages as well as expected values are calculated. Tables of class \code{\link[stats]{xtabs}} and others will be coerced to \code{ftable} objects.} \item{digits}{Number of digits for the table percentage values.} } \value{ (Invisibly) returns a list with four tables: \enumerate{ \item \code{cell} a table with cell percentages of \code{tab} \item \code{row} a table with row percentages of \code{tab} \item \code{col} a table with column percentages of \code{tab} \item \code{expected} a table with expected values of \code{tab} } } \description{ This function calculates a table's cell, row and column percentages as well as expected values and returns all results as lists of tables. } \examples{ tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) # show expected values table_values(tab)$expected # show cell percentages table_values(tab)$cell } sjstats/man/cv_error.Rd0000644000176200001440000000313014616613032014631 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv_error.R \name{cv_error} \alias{cv_error} \alias{cv_compare} \title{Test and training error from model cross-validation} \usage{ cv_error(data, formula, k = 5) cv_compare(data, formulas, k = 5) } \arguments{ \item{data}{A data frame.} \item{formula}{The formula to fit the linear model for the test and training data.} \item{k}{The number of folds for the k-fold cross-validation.} \item{formulas}{A list of formulas, to fit linear models for the test and training data.} } \value{ A data frame with the root mean squared errors for the training and test data. } \description{ \code{cv_error()} computes the root mean squared error from a model fitted to k-fold cross-validated test-training data. \code{cv_compare()} does the same, for multiple formulas at once (by calling \code{cv_error()} for each formula). } \details{ \code{cv_error()} first generates cross-validated test-training pairs, using \code{\link[modelr]{crossv_kfold}}, and then fits a linear model, as described in \code{formula}, to the training data. Then, predictions for the test data are computed, based on the trained models. The \emph{training error} is the mean value of the \code{\link{rmse}} for all \emph{trained} models; the \emph{test error} is the rmse based on all residuals from the test data.
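The workflow can be sketched as follows (a conceptual outline using \pkg{modelr}; note that this sketch averages the per-fold RMSE on the test data instead of pooling all test residuals, so its results may differ slightly from \code{cv_error()}):
\preformatted{folds <- modelr::crossv_kfold(mtcars, k = 5)
models <- lapply(folds$train, function(d) lm(mpg ~ wt, data = d))
# training error: mean RMSE across the trained models
mean(mapply(function(m, d) modelr::rmse(m, d), models, folds$train))
# test error: mean RMSE of the trained models on the held-out folds
mean(mapply(function(m, d) modelr::rmse(m, d), models, folds$test))}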
} \examples{ data(efc) cv_error(efc, neg_c_7 ~ barthtot + c161sex) cv_compare(efc, formulas = list( neg_c_7 ~ barthtot + c161sex, neg_c_7 ~ barthtot + c161sex + e42dep, neg_c_7 ~ barthtot + c12hour )) } sjstats/man/anova_stats.Rd0000644000176200001440000000251614620444364015344 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/anova_stats.R \name{anova_stats} \alias{anova_stats} \title{Effect size statistics for anova} \usage{ anova_stats(model, digits = 3) } \arguments{ \item{model}{A fitted anova-model of class \code{aov} or \code{anova}. Other models are coerced to \code{\link[stats]{anova}}.} \item{digits}{Number of digits for returned values.} } \value{ A data frame with all statistics is returned (excluding confidence intervals). } \description{ Returns the (partial) eta-squared, (partial) omega-squared, epsilon-squared statistic or Cohen's F for all terms in an ANOVA. \code{anova_stats()} returns a tidy summary, including all these statistics and power for each term. } \examples{ \dontshow{if (requireNamespace("car")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # load sample data data(efc) # fit linear model fit <- aov( c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, data = efc ) anova_stats(car::Anova(fit, type = 2)) \dontshow{\}) # examplesIf} } \references{ Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. \cr \cr Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. } sjstats/man/figures/0000755000176200001440000000000014616613032014166 5ustar liggesuserssjstats/man/figures/logo.png0000644000176200001440000004411314616613032015637 0ustar liggesusers[binary PNG image data: package hex logo, not reproducible as text]
"ػlٲَ͜mo |NKJmh83|~H>.msஆf4I-'D'QX9h@hlh#o,@Ơ`_ Tum|yxo-r8¹/`݊V{h@٬aqWCakl@a!}0k-+mvYF' =ooBY8*,obˎGMO`-Qp]A>&#/c7w«0D|e3gƨL^teeey~K6|ypj=gEfw\ncλErKEwvw\DRtX ;q%x%&&8 A6µ`ğz#](ض8SlAb=!EM d<OM,4EAmfB6 yMf*$U{p{woE-j2k"ׂZ(wN)Ѱ0*tٔ4ښFS5qXlY+Fpg֓KR0)W F>q4f̪uUp~NEsOuEIBرVbIabS L@ah'[|t 槯_|~'sJv%N4-[fϸ==\A)bmÒL\;Btezܞq\@3;>=Sp/Nq/*9\4vV=[ݒ>Ã̞Y^ZȘzksGAƱ^uTJ@^SNn/y+p?FmM%E̾նvܞ {JWoO|EBAXxӑZ݋XqdmvbrZ7_sP?dVX0и۰_~>[VYljB@:XqKT;͠k ͍^[}šDϵ4FUG۶o{J~{sn~4 uF Z{dum>]K-) [[B1F8td'^Tb"܋d&Lx]qAW^iaW]t5\1u]aɹ6 x&>q)/Em=ܞ\6$J%k޽gp/j-Nߘv]YũgM,}'ܹ:GEήxػN!D/_ο;X}q?z=hEs3YPuU KmoF>K~Vr;]ٖҁ%g^/mYj 嶄SR)JKKq!=M7d̜i,Īo*=%-'-5%k'9$=3'T6c?HɬE2~PdWX@쎈sIDATKWJ` rVaQx 鑒*v*@Xi5`g\ߙ?{!ľ2l~뉥#Zsއ}`8|3r#[DWJVaɭΫq{ÕX5+DEڸL'7ered"̈́"!kwY*g7p'^`Dvmyt=PZJr~6=#k,xXpnOC%\|% Xa|VcI^Wa?9lrēJu=&Uo=~,{vT/`.p㭫1̅%5sW XX'#9nڐ={rj]%ۛ{5hiO=jֵU%oCn5ىs]l듣D娀]Y/>X̙𢮫hmϽJjlj^^`oڶVK?2şwQ})7Ǎp0ض)s{[l |J40`El>t?IҒ`%}޷We~pgm'CIENDB`sjstats/man/reexports.Rd0000644000176200001440000000146314620444364015055 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/re-exports.R \docType{import} \name{reexports} \alias{reexports} \alias{mse} \alias{rmse} \alias{link_inverse} \alias{weighted_sd} \alias{weighted_mean} \alias{weighted_median} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{datawizard}{\code{\link[datawizard]{weighted_mean}}, \code{\link[datawizard:weighted_mean]{weighted_median}}, \code{\link[datawizard:weighted_mean]{weighted_sd}}} \item{insight}{\code{\link[insight]{link_inverse}}} \item{performance}{\code{\link[performance:performance_mse]{mse}}, \code{\link[performance:performance_rmse]{rmse}}} }} sjstats/man/efc.Rd0000644000176200001440000000070214616613032013545 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nhanes_sample.R \docType{data} \name{efc} \alias{efc} \title{Sample dataset from the EUROFAMCARE project} \description{ German data set from the European study on family care of older people. } \references{ Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008. } \keyword{data} sjstats/man/find_beta.Rd0000644000176200001440000000760714617141032014733 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/find_beta.R \name{find_beta} \alias{find_beta} \alias{find_beta2} \alias{find_cauchy} \alias{find_normal} \title{Determining distribution parameters} \usage{ find_beta(x1, p1, x2, p2) find_beta2(x, se, ci, n) find_cauchy(x1, p1, x2, p2) find_normal(x1, p1, x2, p2) } \arguments{ \item{x1}{Value for the first percentile.} \item{p1}{Probability of the first percentile.} \item{x2}{Value for the second percentile.} \item{p2}{Probability of the second percentile.} \item{x}{Numeric, a probability value between 0 and 1. Typically indicates a prevalence rate of an outcome of interest; Or an integer value with the number of observed events. In this case, specify \code{n} to indicate the toral number of observations.} \item{se}{The standard error of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{ci}{The upper limit of the confidence interval of \code{x}. Either \code{se} or \code{ci} must be specified.} \item{n}{Numeric, number of total observations. Needs to be specified, if \code{x} is an integer (number of observed events), and no probability. 
See 'Examples'.} } \value{ A list of length two, with the two distribution parameters that can be used to define the distribution, which (best) describes the shape for the given input parameters. } \description{ \code{find_beta()}, \code{find_normal()} and \code{find_cauchy()} find the shape, mean and standard deviation, or the location and scale parameters, respectively, to describe the beta, normal or Cauchy distribution, based on two percentiles. \code{find_beta2()} finds the shape parameters for a Beta distribution, based on a probability value and its standard error or confidence intervals. } \details{ These functions can be used to find parameters for various distributions, to define prior probabilities for Bayesian analyses. \code{x1}, \code{p1}, \code{x2} and \code{p2} are parameters that describe two quantiles. Given this knowledge, the distribution parameters are returned. Use \code{find_beta2()} if the known parameters are, e.g., a prevalence rate or similar probability, and its standard deviation or confidence interval. In this case, \code{x} should be a probability, for example a prevalence rate of a certain event. \code{se} then needs to be the standard error for this probability. Alternatively, \code{ci} can be specified, which should indicate the upper limit of the confidence interval of the probability (prevalence rate) \code{x}. If the number of events out of a total number of trials is known (e.g. 12 heads out of 30 coin tosses), \code{x} can also be the number of observed events, while \code{n} indicates the total number of trials (in the above example, the function call would be: \code{find_beta2(x = 12, n = 30)}). } \examples{ # example from blogpost: # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ # 10\% of patients respond within 30 days of treatment # and 80\% respond within 90 days of treatment find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dnorm(x, mean = parms$mean, sd = parms$sd), from = 0, to = 200 ) parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) curve( dcauchy(x, location = parms$location, scale = parms$scale), from = 0, to = 200 ) find_beta2(x = .25, ci = .5) shapes <- find_beta2(x = .25, ci = .5) curve(dbeta(x, shapes[[1]], shapes[[2]])) # find Beta distribution for 3 events out of 20 observations find_beta2(x = 3, n = 20) shapes <- find_beta2(x = 3, n = 20) curve(dbeta(x, shapes[[1]], shapes[[2]])) } \references{ Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF}) } sjstats/man/wilcoxon_test.Rd0000644000176200001440000001562114620602403015712 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/wilcoxon_test.R \name{wilcoxon_test} \alias{wilcoxon_test} \title{Wilcoxon rank sum test} \usage{ wilcoxon_test( data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continuous variable (and \code{by} indicates a grouping factor).
\item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}). \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} \item{mu}{The hypothesized difference in means (for \code{t_test()}) or location shift (for \code{wilcoxon_test()} and \code{mann_whitney_test()}). The default is 0.} \item{alternative}{A character string specifying the alternative hypothesis, must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. See \code{?t.test} and \code{?wilcox.test}.} \item{...}{Additional arguments passed to \code{wilcox.test()} (for unweighted tests, i.e. when \code{weights = NULL}).} } \value{ A data frame with test results. The function returns p and Z-values as well as effect size r and group-rank-means. } \description{ This function performs Wilcoxon rank sum tests for one sample or for two \emph{paired} (dependent) samples. For \emph{unpaired} (independent) samples, please use the \code{mann_whitney_test()} function. A Wilcoxon rank sum test is a non-parametric test for the null hypothesis that two samples have identical continuous distributions. The implementation in \code{wilcoxon_test()} is only used for \emph{paired}, i.e. \emph{dependent} samples. For independent (unpaired) samples, use \code{mann_whitney_test()}. \code{wilcoxon_test()} can be used for ordinal scales or when the continuous variables are not normally distributed. For large samples, or approximately normally distributed variables, the \code{t_test()} function can be used (with \code{paired = TRUE}). } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data.
The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}. For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using an ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ \dontshow{if (requireNamespace("coin")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(mtcars) # one-sample test wilcoxon_test(mtcars, "mpg") # base R equivalent, we set exact = FALSE to avoid a warning wilcox.test(mtcars$mpg ~ 1, exact = FALSE) # paired test wilcoxon_test(mtcars, c("mpg", "hp")) # base R equivalent, we set exact = FALSE to avoid a warning wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE) # when `by` is specified, each group must be of same length data(iris) d <- iris[iris$Species != "setosa", ] wilcoxon_test(d, "Sepal.Width", by = "Species") \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent).
} } sjstats/man/boot_ci.Rd0000644000176200001440000000703214620333763014436 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boot_ci.R \name{boot_ci} \alias{boot_ci} \alias{boot_se} \alias{boot_p} \alias{boot_est} \title{Standard error and confidence intervals for bootstrapped estimates} \usage{ boot_ci(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) boot_se(data, select = NULL) boot_p(data, select = NULL) boot_est(data, select = NULL) } \arguments{ \item{data}{A data frame that contains the vector with bootstrapped estimates, or directly the vector (see 'Examples').} \item{select}{Optional, names of variables (as character vector) with bootstrapped estimates. Required if \code{data} is a data frame (and not a vector) and only selected variables from \code{data} should be processed.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{ci.lvl}{Numeric, the level of the confidence intervals.} } \value{ A data frame with either bootstrap estimate, standard error, the lower and upper confidence intervals or the p-value for all bootstrapped estimates. } \description{ Compute nonparametric bootstrap estimate, standard error, confidence intervals and p-value for a vector of bootstrap replicate estimates. } \details{ The methods require one or more vectors of bootstrap replicate estimates as input. \itemize{ \item \code{boot_est()}: returns the bootstrapped estimate, simply by computing the mean value of all bootstrap estimates. \item \code{boot_se()}: computes the nonparametric bootstrap standard error by calculating the standard deviation of the input vector. \item The mean value of the input vector and its standard error is used by \code{boot_ci()} to calculate the lower and upper confidence interval, assuming a t-distribution of bootstrap estimate replicates (for \code{method = "dist"}, the default, which is \verb{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for \code{method = "quantile"}, 95\\% sample quantiles are used to compute the confidence intervals (\code{quantile(x, probs = c(0.025, 0.975))}). Use \code{ci.lvl} to change the level for the confidence interval. \item P-values from \code{boot_p()} are also based on t-statistics, assuming normal distribution. } } \examples{ data(efc) bs <- bootstrap(efc, 100) # now run models for each bootstrapped sample bs$models <- lapply( bs$strap, function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x) ) # extract coefficient "dependency" and "gender" from each model bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1)) bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1)) # get bootstrapped confidence intervals boot_ci(bs$dependency) # compare with model fit fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc) confint(fit)[2, ] # alternative function calls. boot_ci(bs$dependency) boot_ci(bs, "dependency") boot_ci(bs, c("dependency", "gender")) boot_ci(bs, c("dependency", "gender"), method = "q") # compare coefficients mean(bs$dependency) boot_est(bs$dependency) coef(fit)[2] } \references{ Carpenter J, Bithell J. Bootstrap confidence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med.
2000; 19:1141-1164 } \seealso{ \code{\link[=bootstrap]{bootstrap()}} to generate nonparametric bootstrap samples. } sjstats/man/crosstable_statistics.Rd0000644000176200001440000001406514620333763017435 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cramer.R, R/phi.R, R/xtab_statistics.R \name{cramers_v} \alias{cramers_v} \alias{cramer} \alias{cramers_v.formula} \alias{phi} \alias{crosstable_statistics} \alias{xtab_statistics} \title{Measures of association for contingency tables} \usage{ cramers_v(tab, ...) cramer(tab, ...) \method{cramers_v}{formula}( formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ... ) phi(tab, ...) crosstable_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) xtab_statistics( data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ... ) } \arguments{ \item{tab}{A \code{\link[=table]{table()}} or \code{\link[=ftable]{ftable()}}. Tables of class \code{\link[=xtabs]{xtabs()}} and others will be coerced to \code{ftable} objects.} \item{...}{Other arguments, passed down to the statistic functions \code{\link[=chisq.test]{chisq.test()}}, \code{\link[=fisher.test]{fisher.test()}} or \code{\link[=cor.test]{cor.test()}}.} \item{formula}{A formula of the form \code{lhs ~ rhs} where \code{lhs} is a numeric variable giving the data values and \code{rhs} a factor giving the corresponding groups.} \item{data}{A data frame or a table object. If a table object, \code{x1} and \code{x2} will be ignored. For Kendall's \emph{tau}, Spearman's \emph{rho} or Pearson's product moment correlation coefficient, \code{data} needs to be a data frame. If \code{x1} and \code{x2} are not specified, the first two columns of the data frame are used as variables to compute the crosstab.} \item{ci.lvl}{Scalar between 0 and 1. If not \code{NULL}, returns a data frame including lower and upper confidence intervals.} \item{n}{Number of bootstraps to be generated.} \item{method}{Character vector, indicating if confidence intervals should be based on bootstrap standard error, multiplied by the value of the quantile function of the t-distribution (default), or on sample quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.} \item{x1}{Name of first variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be ignored.} \item{x2}{Name of second variable that should be used to compute the contingency table. If \code{data} is a table object, this argument will be ignored.} \item{statistics}{Name of measure of association that should be computed. May be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"}, \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'.} \item{weights}{Name of variable in \code{x} that indicates the vector of weights that will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} } \value{ For \code{\link[=phi]{phi()}}, the table's Phi value. For \code{cramers_v()}, the table's Cramer's V. For \code{crosstable_statistics()}, a list with the following components: \itemize{ \item \code{estimate}: the value of the estimated measure of association. \item \code{p.value}: the p-value for the test. \item \code{statistic}: the value of the test statistic. \item \code{stat.name}: the name of the test statistic.
\item \code{stat.html}: if applicable, the name of the test statistic, in HTML-format. \item \code{df}: the degrees of freedom for the contingency table. \item \code{method}: character string indicating the name of the measure of association. \item \code{method.html}: if applicable, the name of the measure of association, in HTML-format. \item \code{method.short}: the short form of association measure, equals the \code{statistics}-argument. \item \code{fisher}: logical, if Fisher's exact test was used to calculate the p-value. } } \description{ This function calculates various measures of association for contingency tables and returns the statistic and p-value. Supported measures are Cramer's V, Phi, Spearman's rho, Kendall's tau and Pearson's r. } \details{ The p-values for Cramer's V and the Phi coefficient are based on \code{chisq.test()}. If any expected value of a table cell is smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()} is used to compute the p-value, unless \code{statistics = "fisher"}; in this case, the use of \code{fisher.test()} is forced to compute the p-value. The test statistic is calculated with \code{cramers_v()} or \code{phi()}, respectively. Both test statistic and p-value for Spearman's rho, Kendall's tau and Pearson's r are calculated with \code{cor.test()}. When \code{statistics = "auto"}, only Cramer's V or Phi are calculated, based on the dimension of the table (i.e. if the table has more than two rows or columns, Cramer's V is calculated, else Phi). } \examples{ # Phi coefficient for 2x2 tables tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) phi(tab) # Cramer's V for nominal variables with more than 2 categories tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) cramer(tab) # formula notation data(efc) cramer(e16sex ~ c161sex, data = efc) # bootstrapped confidence intervals cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100) # 2x2 table, compute Phi automatically crosstable_statistics(efc, e16sex, c161sex) # more dimensions than 2x2, compute Cramer's V automatically crosstable_statistics(efc, c172code, c161sex) # ordinal data, use Kendall's tau crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall") # calculate Spearman's rho, with continuity correction crosstable_statistics(efc, e42dep, quol_5, statistics = "spearman", exact = FALSE, continuity = TRUE ) } \references{ Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982} } sjstats/man/kruskal_wallis_test.Rd0000644000176200001440000001427214620602402017077 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/kruskal_wallis_test.R \name{kruskal_wallis_test} \alias{kruskal_wallis_test} \title{Kruskal-Wallis test} \usage{ kruskal_wallis_test(data, select = NULL, by = NULL, weights = NULL) } \arguments{ \item{data}{A data frame.} \item{select}{Name(s) of the continuous variable(s) (as character vector) to be used as samples for the test. \code{select} can be one of the following: \itemize{ \item \code{select} can be used in combination with \code{by}, in which case \code{select} is the name of the continuous variable (and \code{by} indicates a grouping factor).
\item \code{select} can also be a character vector of length two or more (more than two names only apply to \code{kruskal_wallis_test()}), in which case the two continuous variables are treated as samples to be compared. \code{by} must be \code{NULL} in this case. \item If \code{select} is of length \strong{two} and \code{paired = TRUE}, the two samples are considered as \emph{dependent} and a paired test is carried out. \item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}). \item For \code{chi_squared_test()}, if \code{select} specifies \strong{one} variable and both \code{by} and \code{probabilities} are \code{NULL}, a one-sample test against given probabilities is automatically conducted, with equal probabilities for each level of \code{select}. }} \item{by}{Name of the variable indicating the groups. Required if \code{select} specifies only one variable that contains all samples to be compared in the test. If \code{by} is not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.} \item{weights}{Name of an (optional) weighting variable to be used for the test.} } \value{ A data frame with test results. } \description{ This function performs a Kruskal-Wallis rank sum test, which is a non-parametric method to test the null hypothesis that the population medians of all groups are equal. The alternative is that at least one group differs. Unlike the underlying base R function \code{kruskal.test()}, this function allows for weighted tests. } \details{ The function is simply a wrapper around \code{\link[=kruskal.test]{kruskal.test()}}. The weighted version of the Kruskal-Wallis test is based on the \strong{survey} package, using \code{\link[survey:svyranktest]{survey::svyranktest()}}. } \section{Which test to use}{ The following table provides an overview of which test to use for different types of data. The choice of test depends on the scale of the outcome variable and the number of samples to compare.\tabular{lll}{ \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr 1 \tab binary / nominal \tab \code{chi_squared_test()} \cr 1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 1 \tab continuous, normal \tab \code{t_test()} \cr 2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr 2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr 2, independent \tab continuous, normal \tab \code{t_test()} \cr 2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr 2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr 2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr } (1) More than two dependent samples are considered as \emph{repeated measurements}.
For ordinal or not-normally distributed outcomes, these samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples in one variable, the groups to compare in another variable, and a third variable indicating the repeated measurements (subject IDs). (2) More than two dependent samples are considered as \emph{repeated measurements}. For normally distributed outcomes, these samples are usually tested using an ANOVA for repeated measurements. A more sophisticated approach would be using a linear mixed model. } \examples{ data(efc) # Kruskal-Wallis test for elder's age by education kruskal_wallis_test(efc, "e17age", by = "c172code") # when data is in wide-format, specify all relevant continuous # variables in `select` and omit `by` set.seed(123) wide_data <- data.frame( scale1 = runif(20), scale2 = runif(20), scale3 = runif(20) ) kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3")) # same as if we had data in long format, with grouping variable long_data <- data.frame( scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3), groups = rep(c("A", "B", "C"), each = 20) ) kruskal_wallis_test(long_data, select = "scales", by = "groups") # base R equivalent kruskal.test(scales ~ groups, data = long_data) } \references{ \itemize{ \item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. Dtsch Med Wochenschr 2007; 132: e24–e25 \item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 } } \seealso{ \itemize{ \item \code{\link[=t_test]{t_test()}} for parametric t-tests of dependent and independent samples. \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for non-parametric tests of unpaired (independent) samples. \item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon rank sum tests for non-parametric tests of paired (dependent) samples. \item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric tests with more than two independent samples. \item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables, dependent and independent). } } sjstats/man/svyglm.nb.Rd0000644000176200001440000000526114616613032014734 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/svyglmnb.R \name{svyglm.nb} \alias{svyglm.nb} \title{Survey-weighted negative binomial generalised linear model} \usage{ svyglm.nb(formula, design, ...) } \arguments{ \item{formula}{An object of class \code{formula}, i.e. a symbolic description of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}.} \item{design}{An object of class \code{\link[survey]{svydesign}}, providing a specification of the survey design.} \item{...}{Other arguments passed down to \code{\link[MASS]{glm.nb}}.} } \value{ An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, with some additional information about the model. } \description{ \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package to fit survey-weighted negative binomial models. It uses \code{\link[survey]{svymle}} to fit sampling-weighted maximum likelihood estimates, based on starting values provided by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley (2010, pp249)}. } \details{ For details on the computation method, see Lumley (2010), Appendix E (especially 254ff.)
\cr \cr \pkg{sjstats} implements the following S3-methods for \code{svyglm.nb}-objects: \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, \code{predict()} and \code{residuals()}. However, these functions have some limitations: \itemize{ \item{\code{family()} simply returns the family-object from the underlying \code{\link[MASS]{glm.nb}}-model.} \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} from this model-object with the coefficients from the returned \code{\link[survey]{svymle}}-object and finally calls \code{\link[stats]{predict.glm}} to compute the predicted values.} \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals from the \code{glm.nb}-object.} } } \examples{ # ------------------------------------------ # This example reproduces the results from # Lumley 2010, figure E.7 (Appendix E, p256) # ------------------------------------------ if (require("survey")) { data(nhanes_sample) # create survey design des <- svydesign( id = ~SDMVPSU, strat = ~SDMVSTRA, weights = ~WTINT2YR, nest = TRUE, data = nhanes_sample ) # fit negative binomial regression fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) # print coefficients and standard errors fit } } \references{ Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley } sjstats/man/cv.Rd0000644000176200001440000000145414616613032013425 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cv.R \name{cv} \alias{cv} \title{Compute model quality} \usage{ cv(x, ...) } \arguments{ \item{x}{Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) or \code{lme} (\pkg{nlme}).} \item{...}{More fitted model objects, to compute multiple coefficients of variation at once.} } \value{ Numeric, the coefficient of variation. } \description{ Compute the coefficient of variation. } \details{ The advantage of the CV is that it is unitless. This allows coefficients of variation to be compared to each other in ways that other measures, like standard deviations or root mean squared residuals, cannot be. } \examples{ data(efc) fit <- lm(barthtot ~ c160age + c12hour, data = efc) cv(fit) } sjstats/man/r2.Rd0000644000176200001440000000131214616725410013335 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/Deprecated.R \name{r2} \alias{r2} \alias{cohens_f} \alias{eta_sq} \alias{epsilon_sq} \alias{omega_sq} \alias{scale_weights} \alias{robust} \alias{icc} \alias{p_value} \alias{se} \alias{means_by_group} \alias{mean_n} \title{Deprecated functions} \usage{ r2(x) cohens_f(x, ...) eta_sq(x, ...) epsilon_sq(x, ...) omega_sq(x, ...) scale_weights(x, ...) robust(x, ...) icc(x) p_value(x, ...) se(x, ...) means_by_group(x, ...) mean_n(x, ...) } \arguments{ \item{x}{An object.} \item{...}{Currently not used.} } \value{ Nothing. } \description{ A list of deprecated functions. } sjstats/man/weight.Rd0000644000176200001440000000316314616613032014303 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/weight.R \name{weight} \alias{weight} \alias{weight2} \title{Weight a variable} \usage{ weight(x, weights, digits = 0) weight2(x, weights) } \arguments{ \item{x}{(Unweighted) variable.} \item{weights}{Vector with same length as \code{x}, which contains weight factors.
Each value of \code{x} has a specific assigned weight in \code{weights}.} \item{digits}{Numeric value indicating the number of decimal places to be used for rounding the weighted values. By default, this value is \code{0}, i.e. the returned values are integer values.} } \value{ The weighted \code{x}. } \description{ These functions weight the variable \code{x} by a specific vector of \code{weights}. } \details{ \code{weight2()} sums up all \code{weights} values of the associated categories of \code{x}, whereas \code{weight()} uses a \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()} may return a vector of different length than \code{x}. } \note{ The values of the returned vector are in sorted order, whereas the values of the original \code{x} may be in random order. Hence, \code{x} can't be used, for instance, for further cross tabulation. In case you want to have weighted contingency tables or (grouped) box plots etc., use the \code{weightBy} argument of most functions. } \examples{ v <- sample(1:4, 20, TRUE) table(v) w <- abs(rnorm(20)) table(weight(v, w)) table(weight2(v, w)) set.seed(1) x <- sample(letters[1:5], size = 20, replace = TRUE) w <- runif(n = 20) table(x) table(weight(x, w)) } sjstats/man/prop.Rd0000644000176200001440000000647714620333763014004 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/prop.R \name{prop} \alias{prop} \alias{props} \title{Proportions of values in a vector} \usage{ prop(data, ..., weights = NULL, na.rm = TRUE, digits = 4) props(data, ..., na.rm = TRUE, digits = 4) } \arguments{ \item{data}{A data frame. May also be a grouped data frame (see 'Examples').} \item{...}{One or more value pairs of comparisons (logical predicates). Put variable names on the left-hand side and values to match on the right-hand side. Expressions may be quoted or unquoted. See 'Examples'.} \item{weights}{Vector of weights that will be applied to weight all observations. Must be a vector of same length as the input vector. Default is \code{NULL}, so no weights are used.} \item{na.rm}{Logical, whether to remove NA values from the vector when the proportion is calculated. \code{na.rm = FALSE} gives you the raw percentage of a value in a vector, \code{na.rm = TRUE} the valid percentage.} \item{digits}{Number of digits for returned values.} } \value{ For one condition, a numeric value with the proportion of the values inside a vector. For more than one condition, a data frame with one column of conditions and one column with proportions. For grouped data frames, returns a data frame with one column per group with grouping categories, followed by one column with proportions per condition. } \description{ \code{prop()} calculates the proportion of a value or category in a variable. \code{props()} does the same, but allows for multiple logical conditions in one statement. It is similar to \code{mean()} with logical predicates; however, both \code{prop()} and \code{props()} work with grouped data frames. } \details{ \code{prop()} only allows one logical statement per comparison, while \code{props()} allows multiple logical statements per comparison. However, \code{prop()} supports weighting of variables before calculating proportions, and comparisons may also be quoted. Hence, \code{prop()} also processes comparisons, which are passed as character vector (see 'Examples').
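For a single unweighted comparison, the result is conceptually equivalent to taking the mean of the logical vector, up to rounding (an illustrative sketch, not the actual implementation):
\preformatted{data(efc)
# proportion of value 1 in e42dep, conceptually:
mean(efc$e42dep == 1, na.rm = TRUE)
# weighted proportions are conceptually a weighted mean:
w <- abs(rnorm(nrow(efc)))
weighted.mean(efc$e42dep == 1, w = w, na.rm = TRUE)}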
} \examples{ \dontshow{if (getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(efc) # proportion of value 1 in e42dep prop(efc, e42dep == 1) # expression may also be completely quoted prop(efc, "e42dep == 1") # use "props()" for multiple logical statements props(efc, e17age > 70 & e17age < 80) # proportion of value 1 in e42dep, and all values greater # than 2 in e42dep, including missing values. will return a data frame prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) # for factors or character vectors, use quoted or unquoted values library(datawizard) # convert numeric to factor, using labels as factor levels efc$e16sex <- to_factor(efc$e16sex) efc$n4pstu <- to_factor(efc$n4pstu) # get proportion of female older persons prop(efc, e16sex == female) # get proportion of male older persons prop(efc, e16sex == "male") # "props()" needs quotes around non-numeric factor levels props(efc, e17age > 70 & e17age < 80, n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' ) # also works with pipe-chains efc |> prop(e17age > 70) efc |> prop(e17age > 70, e16sex == 1) \dontshow{\}) # examplesIf} } sjstats/DESCRIPTION0000644000176200001440000000307314620625152013461 0ustar liggesusersPackage: sjstats Type: Package Encoding: UTF-8 Title: Collection of Convenient Functions for Common Statistical Computations Version: 0.19.0 Authors@R: person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")) Maintainer: Daniel Lüdecke <d.luedecke@uke.de> Description: Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages. This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer's V, Phi, or effect size statistics like Eta or Omega squared), or for which currently no functions are available. Second, another focus lies on weighted variants of common statistical measures and tests like weighted standard error, mean, t-test, correlation, and more.
License: GPL-3 Depends: R (>= 3.4), utils Imports: datawizard, effectsize (>= 0.8.8), insight, parameters, performance, stats Suggests: brms, car, coin, ggplot2, lme4, MASS, pscl, pwr, survey, testthat URL: https://strengejacke.github.io/sjstats/ BugReports: https://github.com/strengejacke/sjstats/issues RoxygenNote: 7.3.1 Config/testthat/edition: 3 Config/testthat/parallel: true NeedsCompilation: no Packaged: 2024-05-14 08:43:22 UTC; Daniel Author: Daniel Lüdecke [aut, cre] (<https://orcid.org/0000-0002-8895-3206>) Repository: CRAN Date/Publication: 2024-05-14 09:10:02 UTC sjstats/build/0000755000176200001440000000000014620622047013047 5ustar liggesuserssjstats/build/partial.rdb0000644000176200001440000000007414620622047015175 0ustar liggesusers[binary R data: partial.rdb] sjstats/tests/0000755000176200001440000000000014620333763013116 5ustar liggesuserssjstats/tests/testthat/0000755000176200001440000000000014620625152014752 5ustar liggesuserssjstats/tests/testthat/test-wtd.R0000644000176200001440000000042714620333763016657 0ustar liggesuserstest_that("wtd", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) expect_equal(weighted_se(efc$c12hour, weights = efc$weight), 1.704182, tolerance = 1e-5) expect_equal(weighted_se(efc$c12hour, weights = NULL), 1.691623, tolerance = 1e-5) }) sjstats/tests/testthat/test-kruskal_wallis_test.R0000644000176200001440000000265614620333763022153 0ustar liggesusersskip_if_not_installed("survey") skip_if_not_installed("datawizard") test_that("kruskal_wallis_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code") out2 <- kruskal.test(e17age ~ c172code, data = efc) expect_equal(out1$Chi2, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) set.seed(123) wide_data <- data.frame( scale1 = runif(20), scale2 = runif(20), scale3 = runif(20) ) long_data <- data.frame( scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3), groups = as.factor(rep(c("A", "B", "C"), each = 20)), stringsAsFactors = FALSE ) out1 <- kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3")) out2 <- kruskal_wallis_test(long_data, select = "scales", by = "groups") out3 <- kruskal.test(scales ~ groups, data = long_data) expect_equal(out1$Chi2, out2$Chi2, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$Chi2, out3$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out3$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out1 <- kruskal_wallis_test(efc, "e17age", by = "c172code", weights = "weight") }) sjstats/tests/testthat/test-mann_whitney_test.R0000644000176200001440000000364414620333763021624 0ustar liggesusersskip_if_not_installed("coin") skip_if_not_installed("survey") skip_if_not_installed("datawizard") test_that("mann_whitney_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- mann_whitney_test(efc, "e17age", by = "e16sex") out2 <- wilcox.test(e17age ~ e16sex, data = efc) expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$estimate, -151, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$r, 0.2571254, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) set.seed(123) wide_data <- data.frame(scale1 = runif(20), scale2 =
runif(20)) out1 <- mann_whitney_test(wide_data, select = c("scale1", "scale2")) out2 <- wilcox.test(wide_data$scale1, wide_data$scale2) expect_equal(out1$w, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$r, 0.05132394, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- mann_whitney_test(efc, "e17age", by = "e16sex", weights = "weight") expect_equal(out$p, 1.976729e-14, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$estimate, 0.1594972, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$r, 0.2599877, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) }) test_that("mann_whitney_test, sanity checks", { data(efc) expect_error(mann_whitney_test(efc, "e17age", by = "c172code"), regex = "Only two groups are") expect_error(mann_whitney_test(efc, c("e17age", "c172code", "e16sex")), regex = "You may only specify") expect_error(mann_whitney_test(efc, c("e17age", "c172code"), by = "e17age"), regex = "If `select` specifies more") expect_error(mann_whitney_test(efc, "e17age"), regex = "Only one variable provided") }) sjstats/tests/testthat/test-t_test.R0000644000176200001440000000273314620445611017361 0ustar liggesusersskip_if_not_installed("datawizard") skip_if_not_installed("effectsize") test_that("t_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) expect_snapshot(t_test(efc, "e17age")) expect_snapshot(t_test(efc, "e17age", "e16sex")) expect_snapshot(t_test(efc, c("e17age", "c160age"))) expect_snapshot(t_test(efc, c("e17age", "c160age"), paired = TRUE)) expect_snapshot(t_test(efc, "e17age", weights = "weight")) expect_snapshot(t_test(efc, "e17age", "e16sex", weights = "weight")) expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight")) expect_snapshot(t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE)) out1 <- t_test(efc, "e17age") out2 <- t.test(efc$e17age ~ 1) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, 9.774916, tolerance = 1e-4, ignore_attr = TRUE) out1 <- t_test(efc, "e17age", "e16sex") out2 <- t.test(efc$e17age ~ efc$e16sex) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, -0.5641989, tolerance = 1e-4, ignore_attr = TRUE) efc$e17age <- as.factor(efc$e17age) expect_error(t_test(efc, "e17age", "c161sex"), regex = "Variable provided in `select` must be numeric") }) sjstats/tests/testthat/test-autoprior.R0000644000176200001440000000177114620333763020110 0ustar liggesusers.runThisTest <- Sys.getenv("RunAllsjstatsTests") == "yes" if (.runThisTest) { if (suppressWarnings( require("testthat") && require("sjstats") && require("brms") )) { context("sjstats, autoprior") data(efc) efc$c172code <- as.factor(efc$c172code) efc$c161sex <- to_label(efc$c161sex) efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) test_that("auto_prior", { mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior") mf <- formula(neg_c_7 ~ c161sex + c160age + c172code + c12hour + e17age) expect_s3_class(auto_prior(mf, efc, TRUE), "brmsprior") expect_error(auto_prior(mf, efc)) mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) expect_s3_class(auto_prior(mf, efc, 
FALSE), "brmsprior") expect_s3_class(auto_prior(mf, efc), "brmsprior") expect_warning(auto_prior(mf, efc, TRUE)) }) } } sjstats/tests/testthat/_snaps/0000755000176200001440000000000014620333763016241 5ustar liggesuserssjstats/tests/testthat/_snaps/chi_squared_test.md0000644000176200001440000000253414620333763022115 0ustar liggesusers# chi_squared_test Code print(out1) Output # Chi-squared test for contingency tables Data: c161sex by e16sex (n = 900) χ² = 2.233, ϕ = 0.053 (very small effect), df = 1, p = 0.135 --- Code print(out) Output # Chi-squared test for contingency tables (weighted) Data: c161sex by e16sex (n = 904) χ² = 2.416, ϕ = 0.054 (very small effect), df = 1, p = 0.120 --- Code print(out1) Output # Chi-squared test for given probabilities Data: c161sex against probabilities 30% and 70% (n = 901) χ² = 16.162, פ‎ = 0.088 (very small effect), df = 1, p < .001 --- Code print(out) Output # Chi-squared test for given probabilities (weighted) Data: c161sex against probabilities 30% and 70% (n = 906) χ² = 20.074, פ‎ = 0.097 (very small effect), df = 1, p < .001 --- Code print(out1) Output # Chi-squared test for contingency tables (using McNemar's test for paired data) Data: survey_1 by survey_2 (n = 1000) χ² = 10.868, ϕ = 0.032 (tiny effect), df = 1, p < .001 sjstats/tests/testthat/_snaps/wilcoxon_test.md0000644000176200001440000000127214620333763021466 0ustar liggesusers# wilcoxon_test Code print(out1) Output # One Sample Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 528, p < .001 --- Code print(out1) Output # Paired Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 0, r = 0.87, Z = -4.94, p < .001 --- Code print(out) Output # Paired Wilcoxon signed rank test Alternative hypothesis: true location shift is not equal to 0 V = 247, r = 0.39, Z = -2.76, p = 0.006 sjstats/tests/testthat/_snaps/kruskal_wallis_test.md0000644000176200001440000000066114620333763022654 0ustar liggesusers# kruskal_wallis_test Code print(out1) Output # Kruskal-Wallis test Data: e17age by c172code (3 groups, n = 506, 180 and 156) χ² = 4.05, df = 2, p = 0.132 --- Code print(out1) Output # Kruskal-Wallis test Data: scale1 by scale2 (3 groups, n = 20, 20 and 20) χ² = 4.86, df = 2, p = 0.088 sjstats/tests/testthat/_snaps/mann_whitney_test.md0000644000176200001440000000165014620333763022324 0ustar liggesusers# mann_whitney_test Code print(out1) Output # Mann-Whitney test Group 1: male (n = 294, rank mean = 147.50) Group 2: female (n = 596, rank mean = 298.50) Alternative hypothesis: true location shift is not equal to 0 W = 59684 , r = 0.26, Z = -7.75, p < .001 --- Code print(out1) Output # Mann-Whitney test Group 1: scale1 (n = 20, rank mean = 10.50) Group 2: scale2 (n = 20, rank mean = 10.50) Alternative hypothesis: true location shift is not equal to 0 W = 188 , r = 0.05, Z = -0.32, p = 0.758 --- Code print(out) Output # Mann-Whitney test (weighted) Group 1: male (n = 296, rank mean = 147.58) Group 2: female (n = 600, rank mean = 299.42) r = 0.26, Z = 7.78, p < .001 sjstats/tests/testthat/_snaps/t_test.md0000644000176200001440000000574314620333763020076 0ustar liggesusers# t_test Code t_test(efc, "e17age") Output # One Sample t-test Data: e17age Group 1: e17age (mean = 79.12) Alternative hypothesis: true mean is not equal to 0 t = 291.78, Cohen's d = 9.77 (large effect), df = 890, p < .001 --- Code t_test(efc, "e17age", "e16sex") Output # Welch Two Sample t-test Data: e17age by e16sex Group 1: 1 (n = 294, mean = 76.16) Group 2: 2 
(n = 596, mean = 80.57) Alternative hypothesis: true difference in means is not equal to 0 t = -8.05, Cohen's d = -0.56 (medium effect), df = 610.8, p < .001 --- Code t_test(efc, c("e17age", "c160age")) Output # Welch Two Sample t-test Data: e17age by c160age Group 1: c160age (n = 890, mean = 53.42) Group 2: e17age (n = 890, mean = 79.12) Alternative hypothesis: true difference in means is not equal to 0 t = -49.22, Cohen's d = -2.33 (large effect), df = 1468.1, p < .001 --- Code t_test(efc, c("e17age", "c160age"), paired = TRUE) Output # Paired t-test Data: e17age and c160age (mean difference = 25.70) Alternative hypothesis: true mean is not equal to 0 t = 54.11, Cohen's d = 1.81 (large effect), df = 889, p < .001 --- Code t_test(efc, "e17age", weights = "weight") Output # One Sample t-test (weighted) Data: e17age Group 1: e17age (n = 897, mean = 79.17) Alternative hypothesis: true mean is not equal to 0 t = 291.31, Cohen's d = 3.17 (large effect), df = 890, p < .001 --- Code t_test(efc, "e17age", "e16sex", weights = "weight") Output # Two-Sample t-test (weighted) Data: e17age by e16sex Group 1: 1 (n = 600, mean = 80.63) Group 2: 2 (n = 296, mean = 76.19) Alternative hypothesis: true difference in means is not equal to 0 t = 8.03, Cohen's d = -0.17 (very small effect), df = 604.5, p < .001 --- Code t_test(efc, c("e17age", "c160age"), weights = "weight") Output # Two-Sample t-test (weighted) Data: e17age by c160age Group 1: c160age (n = 896, mean = 79.17) Group 2: e17age (n = 896, mean = 53.40) Alternative hypothesis: true difference in means is not equal to 0 t = 49.31, Cohen's d = -1.12 (large effect), df = 1470.0, p < .001 --- Code t_test(efc, c("e17age", "c160age"), weights = "weight", paired = TRUE) Output # Paired t-test (weighted) Data: e17age and c160age (mean difference = 25.77) Alternative hypothesis: true mean difference is not equal to 0 t = 54.37, Cohen's d = 1.54 (large effect), df = 889, p < .001 sjstats/tests/testthat/test-chi_squared_test.R0000644000176200001440000000404114620333763021403 0ustar liggesusersskip_if_not_installed("effectsize") skip_if_not_installed("datawizard") test_that("chi_squared_test", { data(efc) set.seed(123) efc$weight <- abs(rnorm(nrow(efc), 1, 0.3)) out1 <- chi_squared_test(efc, "c161sex", by = "e16sex") out2 <- chisq.test(efc$c161sex, efc$e16sex) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight") expect_equal(out$statistic, 2.415755, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$effect_size, 0.05448519, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$p, 0.1201201, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) out1 <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7)) out2 <- chisq.test(table(efc$c161sex), p = c(0.3, 0.7)) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out <- chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7), weights = "weight") expect_equal(out$statistic, 20.07379, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out$effect_size, 0.0974456, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out)) set.seed(1234) d <- data.frame( survey_1 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.45, 
0.55)), survey_2 = sample(c("Approve", "Disapprove"), size = 1000, replace = TRUE, prob = c(0.42, 0.58)) ) out1 <- chi_squared_test(d, "survey_1", "survey_2", paired = TRUE) out2 <- mcnemar.test(table(d)) expect_equal(out1$statistic, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$effect_size, 0.03170437, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) }) sjstats/tests/testthat/test-wilcoxon_test.R0000644000176200001440000000157414620333763020766 0ustar liggesusersskip_if_not_installed("survey") skip_if_not_installed("datawizard") skip_if_not_installed("coin") test_that("wilcoxon_test", { data(mtcars) out1 <- wilcoxon_test(mtcars, "mpg") out2 <- wilcox.test(mtcars$mpg ~ 1, exact = FALSE) expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) out1 <- wilcoxon_test(mtcars, c("mpg", "hp")) out2 <- wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE) expect_equal(out1$v, out2$statistic, tolerance = 1e-4, ignore_attr = TRUE) expect_equal(out1$p, out2$p.value, tolerance = 1e-4, ignore_attr = TRUE) expect_snapshot(print(out1)) data(iris) d <- iris[iris$Species != "setosa", ] out <- wilcoxon_test(d, "Sepal.Width", by = "Species") expect_snapshot(print(out)) }) sjstats/tests/testthat.R0000644000176200001440000000007414620333763015102 0ustar liggesuserslibrary(testthat) library(sjstats) test_check("sjstats") sjstats/R/0000755000176200001440000000000014620500520012140 5ustar liggesuserssjstats/R/nhanes_sample.R0000644000176200001440000000150514616613032015111 0ustar liggesusers#' @docType data #' @title Sample dataset from the National Health and Nutrition Examination Survey #' @name nhanes_sample #' @keywords data #' #' @description Selected variables from the National Health and Nutrition Examination #' Survey that are used in the example from Lumley (2010), Appendix E. #' See \code{\link{svyglm.nb}} for examples. #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley NULL #' @docType data #' @title Sample dataset from the EUROFAMCARE project #' @name efc #' @keywords data #' #' @description German data set from the European study on family care of older people. #' #' @references Lamura G, Döhner H, Kofahl C, editors. Family carers of older people in Europe: a six-country comparative study. Münster: LIT, 2008. NULL sjstats/R/find_beta.R0000644000176200001440000001421714620333763014220 0ustar liggesusers#' @title Determining distribution parameters #' @name find_beta #' #' @description `find_beta()`, `find_normal()` and `find_cauchy()` find the #' shape, mean and standard deviation resp. the location and scale parameters #' to describe the beta, normal or cauchy distribution, based on two #' percentiles. `find_beta2()` finds the shape parameters for a Beta #' distribution, based on a probability value and its standard error #' or confidence intervals. #' #' @param x1 Value for the first percentile. #' @param p1 Probability of the first percentile. #' @param x2 Value for the second percentile. #' @param p2 Probability of the second percentile. #' @param x Numeric, a probability value between 0 and 1. Typically indicates #' a prevalence rate of an outcome of interest; Or an integer value #' with the number of observed events. In this case, specify `n` #' to indicate the toral number of observations. 
#' @param se The standard error of `x`. Either `se` or `ci` must #' be specified. #' @param ci The upper limit of the confidence interval of `x`. Either #' `se` or `ci` must be specified. #' @param n Numeric, number of total observations. Needs to be specified, if #' `x` is an integer (number of observed events), and no #' probability. See 'Examples'. #' #' @return A list of length two, with the two distribution parameters than can #' be used to define the distribution, which (best) describes #' the shape for the given input parameters. #' #' @details These functions can be used to find parameter for various distributions, #' to define prior probabilities for Bayesian analyses. `x1`, `p1`, `x2` and #' `p2` are parameters that describe two quantiles. Given this knowledge, the #' distribution parameters are returned. #' #' Use `find_beta2()`, if the known parameters are, e.g. a prevalence rate or #' similar probability, and its standard deviation or confidence interval. In #' this case. `x` should be a probability, for example a prevalence rate of a #' certain event. `se` then needs to be the standard error for this probability. #' Alternatively, `ci` can be specified, which should indicate the upper limit #' of the confidence interval od the probability (prevalence rate) `x`. If the #' number of events out of a total number of trials is known (e.g. 12 heads out #' of 30 coin tosses), `x` can also be the number of observed events, while `n` #' indicates the total amount of trials (in the above example, the function #' call would be: `find_beta2(x = 12, n = 30)`). #' #' @references Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF}) #' #' @examples #' # example from blogpost: #' # https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ #' # 10% of patients respond within 30 days of treatment #' # and 80% respond within 90 days of treatment #' find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' #' parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' curve( #' dnorm(x, mean = parms$mean, sd = parms$sd), #' from = 0, to = 200 #' ) #' #' parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) #' curve( #' dcauchy(x, location = parms$location, scale = parms$scale), #' from = 0, to = 200 #' ) #' #' #' find_beta2(x = .25, ci = .5) #' #' shapes <- find_beta2(x = .25, ci = .5) #' curve(dbeta(x, shapes[[1]], shapes[[2]])) #' #' # find Beta distribution for 3 events out of 20 observations #' find_beta2(x = 3, n = 20) #' #' shapes <- find_beta2(x = 3, n = 20) #' curve(dbeta(x, shapes[[1]], shapes[[2]])) #' #' @export find_beta <- function(x1, p1, x2, p2) { logK <- seq(-5, 10, length = 200) K <- exp(logK) m <- unlist(lapply(K, betaprior, x = x1, p = p1)) prob2 <- stats::pbeta(x2, K * m, K * (1 - m)) ind <- ((prob2 > 0) & (prob2 < 1)) app <- stats::approx(prob2[ind], logK[ind], p2) K0 <- exp(app$y) m0 <- betaprior(K0, x1, p1) s1 <- K0 * m0 s2 <- K0 * (1 - m0) list(shape1 = s1, shape2 = s2) } betaprior <- function(K, x, p) { m.lo <- 0 m.hi <- 1 flag <- TRUE while (flag) { m0 <- (m.lo + m.hi) / 2 p0 <- stats::pbeta(x, K * m0, K * (1 - m0)) if (p0 < p) m.hi <- m0 else m.lo <- m0 if (abs(p0 - p) < 1e-04) flag <- FALSE } m0 } #' @rdname find_beta #' @export find_beta2 <- function(x, se, ci, n) { # check if all required arguments are given if (missing(se) && missing(ci) && missing(n)) { insight::format_error("Either `se` or 
`ci`, or `n` must be specified.") } # for number of observations, compute variance of beta distribution if (!missing(n)) { if (!is.integer(x) && x < 1) insight::format_error("If `n` is given, x` must be an integer value greater than 0.") # compute 2 SD from beta variance bvar <- 2 * sqrt((x * n) / ((x + n)^2 * (x + n + 1))) # need to compute proportion x <- x / n p2 <- 0.95 x2 <- x + bvar } # for standard errors, we assume a 68% quantile if (!missing(se)) { p2 <- 0.68 x2 <- x + se } # for CI, we assume a 68% quantile if (!missing(ci)) { p2 <- 0.95 x2 <- ci } # the probability is assumed to be the median p1 <- 0.5 x1 <- x find_beta(x1, p1, x2, p2) } #' @rdname find_beta #' @export find_cauchy <- function(x1, p1, x2, p2) { # find location paramater l <- (x1 * stats::qcauchy(p2) ^ -1 - x2 * stats::qcauchy(p1) ^ -1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1) s <- (x2 - x1) / (stats::qcauchy(p2) ^ -1 - stats::qcauchy(p1) ^ -1) list(location = l, scale = s) } #' @rdname find_beta #' @export find_normal <- function(x1, p1, x2, p2) { # find location paramater mw <- (x1 * stats::qnorm(p2) ^ -1 - x2 * stats::qnorm(p1) ^ -1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1) stddev <- (x2 - x1) / (stats::qnorm(p2) ^ -1 - stats::qnorm(p1) ^ -1) list(mean = mw, sd = stddev) } sjstats/R/t_test.R0000644000176200001440000003200014620601140013560 0ustar liggesusers#' @title Student's t test #' @name t_test #' @description This function performs a Student's t test for two independent #' samples, for paired samples, or for one sample. It's a parametric test for #' the null hypothesis that the means of two independent samples are equal, or #' that the mean of one sample is equal to a specified value. The hypothesis #' can be one- or two-sided. #' #' Unlike the underlying base R function `t.test()`, this function allows for #' weighted tests and automatically calculates effect sizes. Cohen's _d_ is #' returned for larger samples (n > 20), while Hedges' _g_ is returned for #' smaller samples. #' #' @inheritParams mann_whitney_test #' @param paired Logical, whether to compute a paired t-test for dependent #' samples. #' @inherit mann_whitney_test seealso #' #' @inheritSection mann_whitney_test Which test to use #' #' @details Interpretation of effect sizes are based on rules described in #' [`effectsize::interpret_cohens_d()`] and [`effectsize::interpret_hedges_g()`]. #' Use these function directly to get other interpretations, by providing the #' returned effect size (_Cohen's d_ or _Hedges's g_ in this case) as argument, #' e.g. `interpret_cohens_d(0.35, rules = "sawilowsky2009")`. #' #' @return A data frame with test results. Effectsize Cohen's _d_ is returned #' for larger samples (n > 20), while Hedges' _g_ is returned for smaller samples. #' #' @references #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. 
#'
#' @examplesIf requireNamespace("effectsize")
#' data(sleep)
#' # one-sample t-test
#' t_test(sleep, "extra")
#' # base R equivalent
#' t.test(extra ~ 1, data = sleep)
#'
#' # two-sample t-test, by group
#' t_test(mtcars, "mpg", by = "am")
#' # base R equivalent
#' t.test(mpg ~ am, data = mtcars)
#'
#' # paired t-test
#' t_test(mtcars, c("mpg", "hp"), paired = TRUE)
#' # base R equivalent
#' t.test(mtcars$mpg, mtcars$hp, data = mtcars, paired = TRUE)
#' @export
t_test <- function(data,
                   select = NULL,
                   by = NULL,
                   weights = NULL,
                   paired = FALSE,
                   mu = 0,
                   alternative = "two.sided") {
  insight::check_if_installed(c("datawizard", "effectsize"))
  alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))

  # sanity checks
  .sanitize_htest_input(data, select, by, weights, test = "t_test")
  data_name <- NULL

  # filter and remove NA
  data <- stats::na.omit(data[c(select, by, weights)])

  # does select indicate more than one variable? We then reshape the data
  # to have one continuous scale and one grouping variable
  if (length(select) > 1) {
    # paired?
    if (paired) {
      # subtract the two variables for paired t-test, and "set" by to NULL
      data[[select[1]]] <- data[[select[1]]] - data[[select[2]]]
      data_name <- paste(select[1], "and", select[2])
      select <- select[1]
      by <- NULL
    } else {
      # we convert the data into long format, and create a grouping variable
      data <- datawizard::data_to_long(
        data[c(select, weights)],
        select = select,
        names_to = "group",
        values_to = "scale"
      )
      by <- select[2]
      select <- select[1]
      # after converting to long, we have the "grouping" variable first in the data
      colnames(data) <- c(weights, by, select)
    }
  }

  # get data
  dv <- data[[select]]

  # for two-sample t-test...
  if (!is.null(by)) {
    grp <- data[[by]]
    # coerce to factor
    grp <- datawizard::to_factor(grp)
    # only two groups allowed
    if (insight::n_unique(grp) > 2) {
      insight::format_error("Only two groups are allowed for Student's t test.") # nolint
    }
    # value labels
    group_labels <- names(attr(data[[by]], "labels", exact = TRUE))
    if (is.null(group_labels)) {
      group_labels <- levels(droplevels(grp))
    }
    data_name <- paste(select, "by", by)
  } else {
    # one-sample t-test...
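    # (no grouping variable was provided, so the mean of `select` is tested
    # against `mu`; the group label then simply carries the variable name)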
    grp <- NULL
    group_labels <- select
    if (is.null(data_name)) {
      data_name <- select
    }
  }

  if (is.null(weights)) {
    .calculate_ttest(dv, grp, mu, paired, alternative, group_labels, data_name)
  } else {
    .calculate_weighted_ttest(dv, grp, mu, paired, alternative, data[[weights]], group_labels, data_name)
  }
}


# t-test for one or two groups -------------------------------------------------

.calculate_ttest <- function(dv, grp, mu, paired, alternative, group_labels, data_name) {
  insight::check_if_installed("effectsize")
  # prepare data
  if (is.null(grp)) {
    tdat <- data.frame(dv)
    t_formula <- stats::as.formula("dv ~ 1")
  } else {
    tdat <- data.frame(dv, grp)
    t_formula <- stats::as.formula("dv ~ grp")
  }
  # perform t-test
  htest <- stats::t.test(
    t_formula,
    data = tdat,
    alternative = alternative,
    mu = mu
  )
  test_statistic <- htest$statistic
  if (nrow(tdat) > 20) {
    effect_size <- stats::setNames(
      effectsize::cohens_d(
        t_formula,
        data = tdat,
        alternative = alternative,
        mu = mu
      )$Cohens_d,
      "Cohens_d"
    )
  } else {
    effect_size <- stats::setNames(
      effectsize::hedges_g(
        t_formula,
        data = tdat,
        alternative = alternative,
        mu = mu
      )$Hedges_g,
      "Hedges_g"
    )
  }
  # return result
  out <- data.frame(
    data = data_name,
    statistic_name = "t",
    statistic = test_statistic,
    effect_size_name = names(effect_size),
    effect_size = as.numeric(effect_size),
    p = as.numeric(htest$p.value),
    df = as.numeric(htest$parameter),
    method = ifelse(paired, "Paired t-test", htest$method),
    alternative = alternative,
    mu = mu,
    stringsAsFactors = FALSE
  )
  class(out) <- c("sj_htest_t", "data.frame")
  attr(out, "group_labels") <- group_labels
  attr(out, "means") <- as.numeric(htest$estimate)
  attr(out, "paired") <- isTRUE(paired)
  attr(out, "one_sample") <- is.null(grp)
  attr(out, "weighted") <- FALSE
  if (!is.null(grp)) {
    attr(out, "n_groups") <- stats::setNames(
      c(as.numeric(table(grp))),
      c("N Group 1", "N Group 2")
    )
  }
  out
}


# weighted t-test for one or two groups ----------------------------------------

.calculate_weighted_ttest <- function(dv, grp, mu, paired, alternative, weights, group_labels, data_name) {
  insight::check_if_installed(c("datawizard", "effectsize"))
  if (is.null(grp)) {
    dat <- stats::na.omit(data.frame(dv, weights))
    colnames(dat) <- c("y", "w")
    x_values <- dat$y
    x_weights <- dat$w
    y_values <- NULL
    # group N's
    n_groups <- stats::setNames(round(sum(x_weights)), "N Group 1")
  } else {
    dat <- stats::na.omit(data.frame(dv, grp, weights))
    colnames(dat) <- c("y", "g", "w")
    # unique groups
    groups <- unique(dat$g)
    # values for sample 1
    x_values <- dat$y[dat$g == groups[1]]
    x_weights <- dat$w[dat$g == groups[1]]
    # values for sample 2
    y_values <- dat$y[dat$g == groups[2]]
    y_weights <- dat$w[dat$g == groups[2]]
    # group N's
    n_groups <- stats::setNames(
      c(round(sum(x_weights)), round(sum(y_weights))),
      c("N Group 1", "N Group 2")
    )
  }

  mu_x <- stats::weighted.mean(x_values, x_weights, na.rm = TRUE)
  var_x <- datawizard::weighted_sd(x_values, x_weights)^2
  se_x <- sqrt(var_x / length(x_values))

  if (paired || is.null(y_values)) {
    # paired
    se <- se_x
    dof <- length(x_values) - 1
    test_statistic <- (mu_x - mu) / se
    estimate <- mu_x
    method <- if (paired) "Paired t-test" else "One Sample t-test"
  } else {
    # unpaired t-test
    mu_y <- stats::weighted.mean(y_values, y_weights)
    var_y <- datawizard::weighted_sd(y_values, y_weights)^2
    se_y <- sqrt(var_y / length(y_values))
    se <- sqrt(se_x^2 + se_y^2)
    dof <- se^4 / (se_x^4 / (length(x_values) - 1) + se_y^4 / (length(y_values) - 1))
    test_statistic <- (mu_x - mu_y - mu) / se
    estimate <- c(mu_x, mu_y)
    method <- "Two-Sample t-test"
  }

  # p-values
  if (alternative == "less") {
    pval <- stats::pt(test_statistic, dof)
  } else if (alternative == "greater") {
    pval <- stats::pt(test_statistic, dof, lower.tail = FALSE)
  } else {
    pval <- 2 * stats::pt(-abs(test_statistic), dof)
  }

  # effect size
  dat$y <- dat$y * dat$w
  if (is.null(y_values)) {
    t_formula <- stats::as.formula("y ~ 1")
  } else {
    t_formula <- stats::as.formula("y ~ g")
  }

  if (nrow(dat) > 20) {
    effect_size <- stats::setNames(
      effectsize::cohens_d(
        t_formula,
        data = dat,
        alternative = alternative,
        mu = mu,
        paired = FALSE
      )$Cohens_d,
      "Cohens_d"
    )
  } else {
    effect_size <- stats::setNames(
      effectsize::hedges_g(
        t_formula,
        data = dat,
        alternative = alternative,
        mu = mu,
        paired = FALSE
      )$Hedges_g,
      "Hedges_g"
    )
  }

  # return result
  out <- data.frame(
    data = data_name,
    statistic_name = "t",
    statistic = test_statistic,
    effect_size_name = names(effect_size),
    effect_size = as.numeric(effect_size),
    p = pval,
    df = dof,
    method = method,
    alternative = alternative,
    mu = mu,
    stringsAsFactors = FALSE
  )
  class(out) <- c("sj_htest_t", "data.frame")
  attr(out, "means") <- estimate
  attr(out, "n_groups") <- n_groups
  attr(out, "group_labels") <- group_labels
  attr(out, "paired") <- isTRUE(paired)
  attr(out, "one_sample") <- is.null(y_values) && !isTRUE(paired)
  attr(out, "weighted") <- TRUE
  out
}


# methods ---------------------------------------------------------------------

#' @export
print.sj_htest_t <- function(x, ...) {
  insight::check_if_installed("effectsize")
  # fetch attributes
  group_labels <- attributes(x)$group_labels
  means <- attributes(x)$means
  n_groups <- attributes(x)$n_groups
  weighted <- attributes(x)$weighted
  paired <- isTRUE(attributes(x)$paired)
  one_sample <- isTRUE(attributes(x)$one_sample)

  if (weighted) {
    weight_string <- " (weighted)"
  } else {
    weight_string <- ""
  }

  # same width
  group_labels <- format(group_labels)

  # header
  insight::print_color(sprintf("# %s%s\n\n", x$method, weight_string), "blue")

  # print for paired t-test
  if (paired) {
    # data
    insight::print_color(sprintf(
      " Data: %s (mean difference = %s)\n",
      x$data,
      insight::format_value(means[1], protect_integers = TRUE)
    ), "cyan")
  } else {
    # data
    insight::print_color(sprintf(" Data: %s\n", x$data), "cyan")

    # group-1-info
    if (is.null(n_groups)) {
      insight::print_color(
        sprintf(
          " Group 1: %s (mean = %s)\n",
          group_labels[1],
          insight::format_value(means[1], protect_integers = TRUE)
        ), "cyan"
      )
    } else {
      insight::print_color(
        sprintf(
          " Group 1: %s (n = %i, mean = %s)\n",
          group_labels[1],
          n_groups[1],
          insight::format_value(means[1], protect_integers = TRUE)
        ), "cyan"
      )
    }

    # group-2-info
    if (length(group_labels) > 1) {
      if (is.null(n_groups)) {
        insight::print_color(
          sprintf(
            " Group 2: %s (mean = %s)\n",
            group_labels[2],
            insight::format_value(means[2], protect_integers = TRUE)
          ), "cyan"
        )
      } else {
        insight::print_color(
          sprintf(
            " Group 2: %s (n = %i, mean = %s)\n",
            group_labels[2],
            n_groups[2],
            insight::format_value(means[2], protect_integers = TRUE)
          ), "cyan"
        )
      }
    }
  }

  # alternative hypothesis
  alt_string <- switch(x$alternative,
    two.sided = "not equal to",
    less = "less than",
    greater = "greater than"
  )
  if (one_sample) {
    alt_string <- paste("true mean is", alt_string, x$mu)
  } else if (paired) {
    alt_string <- paste("true mean difference is", alt_string, x$mu)
  } else {
    alt_string <- paste("true difference in means is", alt_string, x$mu)
  }
  insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan")

  # string for effect sizes
  if (x$effect_size_name == "Cohens_d") {
    eff_string <- sprintf(
      "Cohen's d = %.2f (%s effect)",
      x$effect_size,
      effectsize::interpret_cohens_d(x$effect_size)
    )
  } else {
    eff_string <- sprintf(
      "Hedges' g = %.2f (%s effect)",
      x$effect_size,
      effectsize::interpret_hedges_g(x$effect_size)
    )
  }

  cat(sprintf(
    "\n t = %.2f, %s, df = %s, %s\n\n",
    x$statistic,
    eff_string,
    insight::format_value(x$df, digits = 1, protect_integers = TRUE),
    insight::format_p(x$p)
  ))
}
sjstats/R/gof.R0000644000176200001440000000727214620333763013057 0ustar liggesusers#' @title Compute model quality
#' @name chisq_gof
#'
#' @description For logistic regression models, performs a Chi-squared
#' goodness-of-fit test.
#'
#' @param x A numeric vector or a \code{glm}-object.
#' @param prob Vector of probabilities (indicating the population probabilities)
#' of the same length as \code{x}'s amount of categories / factor levels.
#' Use \code{nrow(table(x))} to determine the amount of necessary values
#' for \code{prob}. Only used, when \code{x} is a vector, and not a
#' \code{glm}-object.
#' @param weights Vector with weights, used to weight \code{x}.
#'
#' @references
#' Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc.
#'
#' @details For vectors, this function is a convenient wrapper for
#' \code{chisq.test()}, performing a goodness-of-fit test. For
#' \code{glm}-objects, this function performs a goodness-of-fit test.
#' A well-fitting model shows \emph{no} significant difference between the
#' model and the observed data, i.e. the reported p-values should be
#' greater than 0.05.
#'
#' @return For vectors, returns the object of the computed \code{\link[stats]{chisq.test}}.
#' For \code{glm}-objects, an object of class \code{chisq_gof} with
#' the following values: \code{p.value}, the p-value for the goodness-of-fit test;
#' \code{z.score}, the standardized z-score for the goodness-of-fit test;
#' \code{rss}, the residual sums of squares term and \code{chisq}, the Pearson
#' chi-squared statistic.
#'
#' @examples
#' data(efc)
#' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
#' m <- glm(
#'   neg_c_7d ~ c161sex + barthtot + c172code,
#'   data = efc,
#'   family = binomial(link = "logit")
#' )
#'
#' # goodness-of-fit test for logistic regression
#' chisq_gof(m)
#'
#' # goodness-of-fit test for vectors against probabilities
#' # differing from population
#' chisq_gof(efc$e42dep, c(0.3, 0.2, 0.22, 0.28))
#'
#' # equal to population
#' chisq_gof(efc$e42dep, prop.table(table(efc$e42dep)))
#'
#' @export
chisq_gof <- function(x, prob = NULL, weights = NULL) {
  if (inherits(x, "glm")) {
    # This is an adapted version from the
    # "binomTools" package. The "X2GOFtest()"
    # function did not work when model data frame
    # had missing values.
    y_hat <- stats::fitted(x)
    wt <- x$prior.weight
    vJ <- wt * y_hat * (1 - y_hat)
    cJ <- (1 - 2 * y_hat) / vJ
    X2 <- sum(stats::resid(x, type = "pearson")^2)
    form <- stats::as.formula(x$formula)
    form[[2]] <- as.name("cJ")

    # use model matrix instead of data values,
    # because data may contain more variables
    # than needed, and due to missing may have
    # different row length
    dat <- stats::na.omit(x$model)
    dat$cJ <- cJ
    dat$vJ <- vJ

    RSS <- sum(vJ * stats::resid(stats::lm(form, data = dat, weights = vJ))^2)
    A <- 2 * (length(y_hat) - sum(1 / wt))
    z <- (X2 - x$df.residual) / sqrt(A + RSS)
    p.value <- 2 * stats::pnorm(abs(z), lower.tail = FALSE)

    chi2gof <- list(
      p.value = p.value,
      z.score = z,
      rss = RSS,
      chisq = X2
    )
    class(chi2gof) <- c("sj_chi2gof", "list")
  } else {
    # check if we have probs
    if (is.null(prob)) {
      warning("`prob` needs to be specified.", call. = FALSE)
      return(invisible(NULL))
    }

    # performs a Chi-squared goodness-of-fit test
    if (!is.null(weights)) x <- weight(x, weights)
    dummy <- as.vector(table(x))

    # goodness-of-fit test. x is one-dimensional and y not given
    chi2gof <- stats::chisq.test(dummy, p = prob)
  }

  chi2gof
}
sjstats/R/boot_ci.R0000644000176200001440000001361414620333763013723 0ustar liggesusers#' @title Standard error and confidence intervals for bootstrapped estimates
#' @name boot_ci
#'
#' @description Compute nonparametric bootstrap estimate, standard error,
#' confidence intervals and p-value for a vector of bootstrap
#' replicate estimates.
#'
#' @param data A data frame that contains the vector with bootstrapped
#' estimates, or directly the vector (see 'Examples').
#' @param ci.lvl Numeric, the level of the confidence intervals.
#' @param select Optional, unquoted names of variables (as character vector)
#' with bootstrapped estimates. Required, if either `data` is a data frame
#' (and no vector), and only selected variables from `data` should be processed.
#' @param method Character vector, indicating if confidence intervals should be
#' based on bootstrap standard error, multiplied by the value of the quantile
#' function of the t-distribution (default), or on sample quantiles of the
#' bootstrapped values. See 'Details' in `boot_ci()`. May be abbreviated.
#'
#' @return A data frame with either bootstrap estimate, standard error, the
#' lower and upper confidence intervals or the p-value for all bootstrapped
#' estimates.
#'
#' @details The methods require one or more vectors of bootstrap replicate
#' estimates as input.
#'
#' - `boot_est()`: returns the bootstrapped estimate, simply by computing
#'   the mean value of all bootstrap estimates.
#' - `boot_se()`: computes the nonparametric bootstrap standard error by
#'   calculating the standard deviation of the input vector.
#' - The mean value of the input vector and its standard error is used by
#'   `boot_ci()` to calculate the lower and upper confidence interval,
#'   assuming a t-distribution of bootstrap estimate replicates (for
#'   `method = "dist"`, the default, which is
#'   `mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)`); for
#'   `method = "quantile"`, 95\% sample quantiles are used to compute the
#'   confidence intervals (`quantile(x, probs = c(0.025, 0.975))`). Use
#'   `ci.lvl` to change the level for the confidence interval.
#' - P-values from `boot_p()` are also based on t-statistics, assuming normal
#'   distribution.
#'
#' @references Carpenter J, Bithell J. Bootstrap confidence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164
#'
#' @seealso [`bootstrap()`] to generate nonparametric bootstrap samples.
#'
#' @examples
#' data(efc)
#' bs <- bootstrap(efc, 100)
#'
#' # now run models for each bootstrapped sample
#' bs$models <- lapply(
#'   bs$strap,
#'   function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x)
#' )
#'
#' # extract coefficient "dependency" and "gender" from each model
#' bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1))
#' bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1))
#'
#' # get bootstrapped confidence intervals
#' boot_ci(bs$dependency)
#'
#' # compare with model fit
#' fit <- lm(neg_c_7 ~ e42dep + c161sex, data = efc)
#' confint(fit)[2, ]
#'
#' # alternative function calls.
#' boot_ci(bs$dependency)
#' boot_ci(bs, "dependency")
#' boot_ci(bs, c("dependency", "gender"))
#' boot_ci(bs, c("dependency", "gender"), method = "q")
#'
#'
#' # compare coefficients
#' mean(bs$dependency)
#' boot_est(bs$dependency)
#' coef(fit)[2]
#' @export
boot_ci <- function(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) {
  # match arguments
  method <- match.arg(method)

  # evaluate arguments, generate data
  if (is.null(select)) {
    .dat <- as.data.frame(data)
  } else {
    .dat <- data[select]
  }

  # compute confidence intervals for all values
  transform_boot_result(lapply(.dat, function(x) {
    # check if method should be based on t-distribution of
    # bootstrap values or quantiles
    if (method == "dist") {
      # get bootstrap standard error
      bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = TRUE)
      # lower and upper confidence interval
      ci <- mean(x, na.rm = TRUE) + c(-bootse, bootse)
    } else {
      # CI based on quantiles of bootstrapped values
      ci <- stats::quantile(x, probs = c((1 - ci.lvl) / 2, (1 + ci.lvl) / 2))
    }
    # give proper names
    names(ci) <- c("conf.low", "conf.high")
    ci
  }))
}

#' @rdname boot_ci
#' @export
boot_se <- function(data, select = NULL) {
  # evaluate arguments, generate data
  if (is.null(select)) {
    .dat <- as.data.frame(data)
  } else {
    .dat <- data[select]
  }

  # compute standard errors for all values
  transform_boot_result(lapply(.dat, function(x) {
    # get bootstrap standard error
    se <- stats::sd(x, na.rm = TRUE)
    names(se) <- "std.err"
    se
  }))
}

#' @rdname boot_ci
#' @export
boot_p <- function(data, select = NULL) {
  # evaluate arguments, generate data
  if (is.null(select)) {
    .dat <- as.data.frame(data)
  } else {
    .dat <- data[select]
  }

  # compute p-values for all values
  transform_boot_result(lapply(.dat, function(x) {
    # compute t-statistic
    t.stat <- mean(x, na.rm = TRUE) / stats::sd(x, na.rm = TRUE)
    # compute p-value
    p <- 2 * stats::pt(abs(t.stat), df = length(x) - 1, lower.tail = FALSE)
    names(p) <- "p.value"
    p
  }))
}

#' @rdname boot_ci
#' @export
boot_est <- function(data, select = NULL) {
  # evaluate arguments, generate data
  if (is.null(select)) {
    .dat <- as.data.frame(data)
  } else {
    .dat <- data[select]
  }

  # compute mean for all values (= bootstrapped estimate)
  transform_boot_result(lapply(.dat, function(x) {
    estimate <- mean(x, na.rm = TRUE)
    names(estimate) <- "estimate"
    estimate
  }))
}

transform_boot_result <- function(res) {
  # transform a bit, so we have each estimate in a row, and ci's as columns...
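  # `rownames_as_column()` is a package-internal helper (comparable to
  # tibble::rownames_to_column()), so the result is a data frame with a
  # "term" column plus one column per computed statistic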
  rownames_as_column(as.data.frame(t(as.data.frame(res))), var = "term")
}
sjstats/R/wilcoxon_test.R0000644000176200001440000001675314620602205015177 0ustar liggesusers#' @title Wilcoxon signed rank test
#' @name wilcoxon_test
#' @description This function performs Wilcoxon signed rank tests for one sample
#' or for two _paired_ (dependent) samples. For _unpaired_ (independent)
#' samples, please use the `mann_whitney_test()` function.
#'
#' A Wilcoxon signed rank test is a non-parametric test for the null hypothesis
#' that two paired samples have identical continuous distributions. The
#' implementation in `wilcoxon_test()` is only used for _paired_, i.e.
#' _dependent_ samples. For independent (unpaired) samples, use
#' `mann_whitney_test()`.
#'
#' `wilcoxon_test()` can be used for ordinal scales or when the continuous
#' variables are not normally distributed. For large samples, or approximately
#' normally distributed variables, the `t_test()` function can be used (with
#' `paired = TRUE`).
#'
#' @inheritParams mann_whitney_test
#' @inherit mann_whitney_test seealso
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @return A data frame with test results. The function returns p and Z-values
#' as well as effect size r and group-rank-means.
#'
#' @references
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @examplesIf requireNamespace("coin")
#' data(mtcars)
#' # one-sample test
#' wilcoxon_test(mtcars, "mpg")
#' # base R equivalent, we set exact = FALSE to avoid a warning
#' wilcox.test(mtcars$mpg ~ 1, exact = FALSE)
#'
#' # paired test
#' wilcoxon_test(mtcars, c("mpg", "hp"))
#' # base R equivalent, we set exact = FALSE to avoid a warning
#' wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE)
#'
#' # when `by` is specified, each group must be of same length
#' data(iris)
#' d <- iris[iris$Species != "setosa", ]
#' wilcoxon_test(d, "Sepal.Width", by = "Species")
#' @export
wilcoxon_test <- function(data,
                          select = NULL,
                          by = NULL,
                          weights = NULL,
                          mu = 0,
                          alternative = "two.sided",
                          ...) {
  insight::check_if_installed("datawizard")
  alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))

  # sanity checks
  .sanitize_htest_input(data, select, by, weights, test = "wilcoxon_test")

  # alternative only if weights are NULL
  if (!is.null(weights) && alternative != "two.sided") {
    insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.")
  }

  # for paired two-sample, do groups all have same length?
  if (!is.null(by)) {
    group_len <- as.numeric(table(as.vector(data[[by]])))
    if (!all(group_len == group_len[1])) {
      insight::format_error("For paired two-sample Wilcoxon test, all groups specified in `by` must have the same length.") # nolint
    }
    # convert to wide format
    out <- split(data[select], as.character(data[[by]]))
    data <- stats::setNames(do.call(cbind, out), names(out))
    select <- colnames(data)
  }

  # value labels
  group_labels <- select

  x <- data[[select[1]]]
  if (length(select) > 1) {
    y <- data[[select[2]]]
  } else {
    y <- NULL
  }

  if (is.null(weights)) {
    .calculate_wilcox(x, y, alternative, mu, group_labels, ...)
  } else {
    .calculate_weighted_wilcox(x, y, data[[weights]], group_labels)
  }
}


# Wilcoxon signed rank test -----------------------------------------------------

.calculate_wilcox <- function(x, y, alternative, mu, group_labels, ...)
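# (the V statistic and the p-value come from stats::wilcox.test(); for paired
# samples, coin::wilcoxsign_test() additionally provides the standardized Z
# statistic, from which the effect size r = |Z| / sqrt(n) is derived)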
{
  insight::check_if_installed("coin")
  # for paired Wilcoxon test, we have effect sizes
  if (!is.null(y)) {
    # prepare data
    wcdat <- data.frame(x, y)
    # perform Wilcoxon signed rank test
    wt <- coin::wilcoxsign_test(x ~ y, data = wcdat)
    # compute statistics
    u <- as.numeric(coin::statistic(wt, type = "linear"))
    z <- as.numeric(coin::statistic(wt, type = "standardized"))
    r <- abs(z / sqrt(nrow(wcdat)))
  } else {
    wt <- u <- z <- r <- NULL
  }

  # prepare data
  if (is.null(y)) {
    dv <- x
  } else {
    dv <- x - y
  }
  htest <- suppressWarnings(stats::wilcox.test(
    dv ~ 1,
    alternative = alternative,
    mu = mu,
    ...
  ))
  v <- htest$statistic
  p <- htest$p.value

  out <- data.frame(
    group1 = group_labels[1],
    v = v,
    p = as.numeric(p),
    mu = mu,
    alternative = alternative
  )
  # two groups?
  if (length(group_labels) > 1) {
    out$group2 <- group_labels[2]
  }
  # add effect sizes, when we have them
  if (!is.null(wt)) {
    out$u <- u
    out$z <- z
    out$r <- r
  }

  attr(out, "group_labels") <- group_labels
  attr(out, "method") <- "wilcoxon"
  attr(out, "weighted") <- FALSE
  attr(out, "one_sample") <- length(group_labels) == 1
  class(out) <- c("sj_htest_wilcox", "data.frame")

  out
}


# Weighted Wilcoxon test -------------------------------------------------------

.calculate_weighted_wilcox <- function(x, y, weights, group_labels) {
  # check if pkg survey is available
  insight::check_if_installed("survey")

  # prepare data
  if (is.null(y)) {
    dv <- x
  } else {
    dv <- x - y
  }
  dat <- stats::na.omit(data.frame(dv, weights))
  colnames(dat) <- c("y", "w")

  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
  result <- survey::svyranktest(formula = y ~ 1, design, test = "wilcoxon")

  # statistics and effect sizes
  z <- result$statistic
  r <- abs(z / sqrt(nrow(dat)))

  out <- data_frame(
    group1 = group_labels[1],
    estimate = result$estimate,
    z = z,
    r = r,
    p = as.numeric(result$p.value),
    mu = 0,
    alternative = "two.sided"
  )
  # two groups?
  if (length(group_labels) > 1) {
    out$group2 <- group_labels[2]
  }

  attr(out, "group_labels") <- group_labels
  attr(out, "weighted") <- TRUE
  attr(out, "one_sample") <- length(group_labels) == 1
  attr(out, "method") <- "wilcoxon"
  class(out) <- c("sj_htest_wilcox", "data.frame")

  out
}


# methods ---------------------------------------------------------------------

#' @export
print.sj_htest_wilcox <- function(x, ...)
{ # fetch attributes group_labels <- attributes(x)$group_labels weighted <- attributes(x)$weighted one_sample <- attributes(x)$one_sample if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } if (one_sample) { onesample_string <- "One Sample" } else { onesample_string <- "Paired" } # same width group_labels <- format(group_labels) # header insight::print_color(sprintf( "# %s Wilcoxon signed rank test%s\n\n", onesample_string, weight_string ), "blue") # alternative hypothesis if (!is.null(x$alternative) && !is.null(x$mu)) { alt_string <- switch(x$alternative, two.sided = "not equal to", less = "less than", greater = "greater than" ) alt_string <- paste("true location shift is", alt_string, x$mu) insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") } if (!is.null(x[["v"]])) { v_stat <- sprintf("V = %i, ", round(x$v)) } else { v_stat <- "" } if (!is.null(x[["r"]])) { cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", v_stat, x$r, x$z, insight::format_p(x$p))) } else { cat(sprintf("\n %s%s\n\n", v_stat, insight::format_p(x$p))) } } sjstats/R/svyglmnb.R0000644000176200001440000001161714620333763014147 0ustar liggesusersutils::globalVariables("scaled.weights") #' @title Survey-weighted negative binomial generalised linear model #' @name svyglm.nb #' @description \code{svyglm.nb()} is an extension to the \CRANpkg{survey}-package #' to fit survey-weighted negative binomial models. It uses #' \code{\link[survey]{svymle}} to fit sampling-weighted #' maximum likelihood estimates, based on starting values provided #' by \code{\link[MASS]{glm.nb}}, as proposed by \emph{Lumley #' (2010, pp249)}. #' #' #' @param formula An object of class \code{formula}, i.e. a symbolic description #' of the model to be fitted. See 'Details' in \code{\link[stats]{glm}}. #' @param design An object of class \code{\link[survey]{svydesign}}, providing #' a specification of the survey design. #' @param ... Other arguments passed down to \code{\link[MASS]{glm.nb}}. #' #' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.nb}, #' with some additional information about the model. #' #' @details For details on the computation method, see Lumley (2010), Appendix E #' (especially 254ff.) #' \cr \cr #' \pkg{sjstats} implements following S3-methods for \code{svyglm.nb}-objects: #' \code{family()}, \code{model.frame()}, \code{formula()}, \code{print()}, #' \code{predict()} and \code{residuals()}. However, these functions have some #' limitations: #' \itemize{ #' \item{\code{family()} simply returns the family-object from the #' underlying \code{\link[MASS]{glm.nb}}-model.} #' \item{The \code{predict()}-method just re-fits the \code{svyglm.nb}-model #' with \code{\link[MASS]{glm.nb}}, overwrites the \code{$coefficients} #' from this model-object with the coefficients from the returned #' \code{\link[survey]{svymle}}-object and finally calls #' \code{\link[stats]{predict.glm}} to compute the predicted values.} #' \item{\code{residuals()} re-fits the \code{svyglm.nb}-model with #' \code{\link[MASS]{glm.nb}} and then computes the Pearson-residuals #' from the \code{glm.nb}-object.} #' } #' #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. 
Wiley
#' @examples
#' # ------------------------------------------
#' # This example reproduces the results from
#' # Lumley 2010, figure E.7 (Appendix E, p256)
#' # ------------------------------------------
#' if (require("survey")) {
#'   data(nhanes_sample)
#'
#'   # create survey design
#'   des <- svydesign(
#'     id = ~SDMVPSU,
#'     strat = ~SDMVSTRA,
#'     weights = ~WTINT2YR,
#'     nest = TRUE,
#'     data = nhanes_sample
#'   )
#'
#'   # fit negative binomial regression
#'   fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des)
#'
#'   # print coefficients and standard errors
#'   fit
#' }
#' @export
svyglm.nb <- function(formula, design, ...) {
  insight::check_if_installed(c("survey", "MASS"))

  # get design weights. we need to scale these weights for the glm.nb() function
  dw <- stats::weights(design)

  # update design with scaled weights
  design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))

  # fit negative binomial model, with scaled design weights
  mod <- MASS::glm.nb(formula, data = stats::model.frame(design), weights = scaled.weights, ...)
  fam <- stats::family(mod)

  # fit survey model, using maximum likelihood estimation
  svyfit <- survey::svymle(
    loglike = sjstats_loglik,
    grad = sjstats_score,
    design = design,
    formulas = list(theta = ~1, eta = formula),
    start = c(mod$theta, stats::coef(mod)),
    na.action = "na.omit"
  )

  # add additional information
  class(svyfit) <- c("svyglm.nb", class(svyfit))
  attr(svyfit, "nb.terms") <- all.vars(formula)
  attr(svyfit, "nb.formula") <- formula
  attr(svyfit, "family") <- fam
  attr(svyfit, "nb.theta") <- mod[["theta"]]
  attr(svyfit, "nb.theta.se") <- mod[["SE.theta"]]

  svyfit$deviance <- mod$deviance
  svyfit$df.residual <- mod$df.residual
  svyfit$df <- length(stats::coef(mod)) + 1
  svyfit$aic <- mod$aic

  svyfit
}

# log-likelihood function used in "svymle()"
sjstats_loglik <- function(y, theta, eta) {
  mu <- exp(eta)
  return(
    lgamma(theta + y) - lgamma(theta) - lgamma(y + 1) + theta * log(theta) +
      y * log(mu + (y == 0)) - (theta + y) * log(theta + mu)
  )
}

# derivative
sjstats_deta <- function(y, theta, eta) {
  mu <- exp(eta)
  dmu <- y / mu - (theta + y) / (theta + mu)
  dmu * mu
}

# derivative
sjstats_dtheta <- function(y, theta, eta) {
  mu <- exp(eta)
  digamma(theta + y) - digamma(theta) + log(theta) + 1 - log(theta + mu) -
    (y + theta) / (mu + theta)
}

# score function, combines derivatives
sjstats_score <- function(y, theta, eta) {
  cbind(sjstats_dtheta(y, theta, eta), sjstats_deta(y, theta, eta))
}
sjstats/R/chi_squared_test.R0000644000176200001440000002475114620603324015630 0ustar liggesusers#' @title Chi-Squared test
#' @name chi_squared_test
#' @description This function performs a \eqn{\chi^2} test for contingency
#' tables or tests for given probabilities. The returned effect sizes are
#' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi})
#' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against
#' given probabilities (see _Ben-Shachar et al. 2023_).
#'
#' @param probabilities A numeric vector of probabilities for each cell in the
#' contingency table. The length of the vector must match the number of cells
#' in the table, i.e. the number of unique levels of the variable specified
#' in `select`. If `probabilities` is provided, a chi-squared test for given
#' probabilities is conducted. Furthermore, if `probabilities` is given, `by`
#' must be `NULL`. The probabilities must sum to 1.
#' @param paired Logical, if `TRUE`, a McNemar test is conducted for 2x2 tables.
#' Note that `paired` only works for 2x2 tables.
#' @param ... Additional arguments passed down to [`chisq.test()`].
#' @inheritParams mann_whitney_test
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @inherit mann_whitney_test seealso
#'
#' @return A data frame with test results. The returned effect sizes are
#' Cramer's V for tables with more than two rows or columns, Phi (\eqn{\phi})
#' for 2x2 tables, and Fei (\ifelse{latex}{\eqn{Fei}}{פ}) for tests against
#' given probabilities.
#'
#' @details The function is a wrapper around [`chisq.test()`] and
#' [`fisher.test()`] (for small expected values) for contingency tables, and
#' `chisq.test()` for given probabilities. When `probabilities` are provided,
#' these are rescaled to sum to 1 (i.e. `rescale.p = TRUE`). When `fisher.test()`
#' is called, simulated p-values are returned (i.e. `simulate.p.value = TRUE`,
#' see `?fisher.test`). If `paired = TRUE` and a 2x2 table is provided,
#' a McNemar test (see [`mcnemar.test()`]) is conducted.
#'
#' The weighted version of the chi-squared test is based on a weighted
#' table, using [`xtabs()`] as input for `chisq.test()`.
#'
#' Interpretation of effect sizes is based on rules described in
#' [`effectsize::interpret_phi()`], [`effectsize::interpret_cramers_v()`],
#' and [`effectsize::interpret_fei()`]. Use these functions directly to get other
#' interpretations, by providing the returned effect size as argument, e.g.
#' `interpret_phi(0.35, rules = "gignac2016")`.
#'
#' @references
#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
#' \doi{10.3390/math11091982}
#'
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#' Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @examplesIf requireNamespace("effectsize") && requireNamespace("MASS")
#' data(efc)
#' efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
#'
#' # Chi-squared test
#' chi_squared_test(efc, "c161sex", by = "e16sex")
#'
#' # weighted Chi-squared test
#' chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")
#'
#' # Chi-squared test for given probabilities
#' chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
#' @export
chi_squared_test <- function(data,
                             select = NULL,
                             by = NULL,
                             probabilities = NULL,
                             weights = NULL,
                             paired = FALSE,
                             ...) {
  # sanity check - if we only have one variable in "select" and "by" and
  # "probabilities" are NULL, set probabilities (equal probability for each
  # level of the variable in "select")
  if (is.null(probabilities) && !is.null(select) && is.null(by) && length(select) == 1) {
    n_levels <- length(unique(stats::na.omit(data[[select]])))
    probabilities <- rep(1 / n_levels, n_levels)
  }
  if (is.null(probabilities)) {
    .calculate_chisq(data, select, by, weights, paired, ...)
  } else {
    # sanity check - `paired = TRUE` is not available for given probabilities
    if (paired) {
      insight::format_error("When `probabilities` are provided, `paired = TRUE` is not available.") # nolint
    }
    .calculate_chisq_gof(data, select, probabilities, weights, ...)
  }
}


# Chi-squared test for contingency tables ---------------------------------------

.calculate_chisq <- function(data, select, by, weights, paired = FALSE, ...)
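# (builds a - possibly weighted - contingency table; weighted cell counts are
# rounded, because chisq.test() expects integer frequencies. For small
# expected values, Fisher's exact test with simulated p-values is used
# instead, and for paired 2x2 tables, McNemar's test. The effect size is
# Cramer's V for tables larger than 2x2, otherwise Phi)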
{ insight::check_if_installed(c("datawizard", "MASS")) # sanity checks .sanitize_htest_input(data, select, by, weights) # get data grp1 <- data[[select]] grp2 <- data[[by]] # if paired = TRUE, we only allow a 2x2 table if (paired && (length(stats::na.omit(unique(grp1))) != 2 || length(stats::na.omit(unique(grp2))) != 2)) { insight::format_error("When `paired = TRUE`, only 2x2 tables are allowed (i.e. both variables must have exactly two levels).") # nolint } # create data frame for table x <- data.frame( grp1 = datawizard::to_factor(grp1), grp2 = datawizard::to_factor(grp2) ) # add weights if (!is.null(weights)) { x$weights <- data[[weights]] } # remove missings x <- stats::na.omit(x) # contingency table if (is.null(weights)) { tab <- table(x) } else { tab <- as.table(round(stats::xtabs(x[[3]] ~ x[[1]] + x[[2]]))) class(tab) <- "table" } # expected values, to identify whether Fisher's test is needed expected_values <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # nolint # paired? mc-nemar test if (paired) { htest <- suppressWarnings(stats::mcnemar.test(tab, ...)) test_statistic <- htest$statistic } else { # chi-squared test htest <- suppressWarnings(stats::chisq.test(tab, ...)) test_statistic <- htest$statistic # need fisher? if (min(expected_values) < 5 || (min(expected_values) < 10 && htest$parameter == 1)) { htest <- stats::fisher.test(tab, simulate.p.value = TRUE, ...) } } p_value <- htest$p.value # effect size if (nrow(tab) > 2 || ncol(tab) > 2) { effect_size <- stats::setNames(cramer(tab), "Cramer's V") } else { effect_size <- stats::setNames(phi(tab), "Phi") } # return result out <- data.frame( data = paste(select, "by", by), statistic_name = "Chi-squared", statistic = test_statistic, effect_size_name = names(effect_size), effect_size = as.numeric(effect_size), p = p_value, df = (nrow(tab) - 1) * (ncol(tab) - 1), n_obs = sum(tab, na.rm = TRUE), stringsAsFactors = FALSE ) class(out) <- c("sj_htest_chi", "data.frame") attr(out, "weighted") <- !is.null(weights) attr(out, "fisher") <- isTRUE(startsWith(htest$method, "Fisher")) attr(out, "mcnemar") <- isTRUE(paired) attr(out, "caption") <- "contingency tables" out } .calculate_chisq_gof <- function(data, select, probabilities, weights, ...) { insight::check_if_installed("effectsize") # get data x <- data.frame(grp = data[[select]]) # add weights if (!is.null(weights)) { x$weights <- data[[weights]] } # remove missings x <- stats::na.omit(x) # contingency table if (is.null(weights)) { tab <- table(x) } else { tab <- as.table(round(stats::xtabs(x[[2]] ~ x[[1]]))) class(tab) <- "table" } # table dimensions n_rows <- nlevels(droplevels(as.factor(x$grp))) # sanity check if (length(probabilities) != n_rows) { insight::format_error("Length of probabilities must match number of cells in table (i.e. 
number of levels of input factor).") # nolint } if (!isTRUE(all.equal(sum(probabilities), 1))) { insight::format_error("Probabilities must sum to 1.") } # chi-squared test htest <- suppressWarnings(stats::chisq.test(tab, p = probabilities, rescale.p = TRUE, ...)) test_statistic <- htest$statistic p_value <- htest$p.value effect_size <- effectsize::chisq_to_fei( test_statistic, n = sum(tab), nrow = n_rows, ncol = 1, p = probabilities, alternative = "two.sided" )$Fei # return result out <- data.frame( data = paste( select, "against probabilities", datawizard::text_concatenate(sprintf("%i%%", round(100 * probabilities))) ), statistic_name = "Chi-squared", statistic = test_statistic, effect_size_name = "Fei", effect_size = as.numeric(effect_size), p = p_value, df = n_rows - 1, n_obs = sum(tab, na.rm = TRUE), stringsAsFactors = FALSE ) class(out) <- c("sj_htest_chi", "data.frame") attr(out, "caption") <- "given probabilities" attr(out, "weighted") <- !is.null(weights) out } # methods --------------------------------------------------------------------- #' @export print.sj_htest_chi <- function(x, ...) { weighted <- attributes(x)$weighted if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } fisher <- attributes(x)$fisher mcnemar <- attributes(x)$mcnemar # headline insight::print_color(sprintf( "\n# Chi-squared test for %s%s\n", attributes(x)$caption, weight_string ), "blue") # Fisher's exact test? if (isTRUE(fisher)) { insight::print_color(" (using Fisher's exact test due to small expected values)\n", "blue") # nolint } else if (isTRUE(mcnemar)) { insight::print_color(" (using McNemar's test for paired data)\n", "blue") # nolint } cat("\n") # data info insight::print_color( sprintf(" Data: %s (n = %i)\n", x$data, round(x$n_obs)), "cyan" ) # prepare and align strings eff_symbol <- .format_symbols(x$effect_size_name) stat_symbol <- .format_symbols(x$statistic_name) # string for effectsizes eff_string <- switch(x$effect_size_name, Fei = sprintf( "%s = %.3f (%s effect)", eff_symbol, x$effect_size, effectsize::interpret_fei(x$effect_size) ), Phi = sprintf( "%s = %.3f (%s effect)", eff_symbol, x$effect_size, effectsize::interpret_phi(x$effect_size) ), sprintf( "Cramer's V = %.3f (%s effect)", x$effect_size, effectsize::interpret_cramers_v(x$effect_size) ) ) cat(sprintf( "\n %s = %.3f, %s, df = %i, %s\n\n", stat_symbol, x$statistic, eff_string, round(x$df), insight::format_p(x$p) )) } sjstats/R/S3-methods.R0000644000176200001440000002612114620333763014230 0ustar liggesusers#' @export print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.nb(x, digits, v_se = se)[-1, -2] pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint sprintf("%.*f ", digits, sm$p.value) ) ) ) ) sm$p.value <- pan print(sm, ...) # add dispersion parameter cat(sprintf("\nDispersion parameter Theta: %.*f", digits, attr(x, "nb.theta", exact = TRUE))) cat(sprintf("\n Standard Error of Theta: %.*f", digits, attr(x, "nb.theta.se", exact = TRUE))) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } #' @export print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) 
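# analogous to print.svyglm.nb(), but for survey-weighted zero-inflated
# Poisson models, which have no dispersion parameter to report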
{ se <- match.arg(se) sm <- tidy_svyglm.zip(x, digits, v_se = se)[-1, ] pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint sprintf("%.*f ", digits, sm$p.value) ) ) ) ) sm$p.value <- pan print(sm, ...) message(sprintf("\nShowing %s standard errors on link-scale (untransformed).", se)) } tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), irr = round(exp(est), digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) # keep original value, not rounded est <- stats::coef(x) se <- sqrt(diag(stats::vcov(x, stderr = v_se))) data_frame( term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), std.error = round(se, digits), conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } #' @export model.frame.svyglm.nb <- function(formula, ...) { pred <- attr(formula, "nb.terms", exact = TRUE) formula$design$variables[intersect(pred, colnames(formula$design$variables))] } #' @export model.frame.svyglm.zip <- function(formula, ...) { pred <- attr(formula, "zip.terms", exact = TRUE) formula$design$variables[intersect(pred, colnames(formula$design$variables))] } #' @importFrom stats family #' @export family.svyglm.nb <- function(object, ...) { attr(object, "family", exact = TRUE) } #' @export formula.svyglm.nb <- function(x, ...) { attr(x, "nb.formula", exact = TRUE) } #' @export formula.svyglm.zip <- function(x, ...) { attr(x, "zip.formula", exact = TRUE) } #' @export predict.svyglm.nb <- function(object, newdata = NULL, type = c("link", "response", "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, na.action = stats::na.pass, ...) { insight::check_if_installed(c("survey", "MASS")) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) cf <- stats::coef(fnb) names.cf <- names(cf) cf <- stats::coef(object)[-1] cf <- stats::setNames(cf, names.cf) fnb$coefficients <- cf stats::predict.glm( object = fnb, newdata = newdata, type = type, se.fit = se.fit, dispersion = dispersion, terms = terms, na.action = na.action, ... ) } #' @export residuals.svyglm.nb <- function(object, ...) { if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), data = object$design$variables, weights = scaled.weights ) y <- insight::get_response(fnb) mu <- stats::predict.glm(fnb, type = "response") wts <- fnb$prior.weights (y - mu) * sqrt(wts) / sqrt(fnb$family$variance(mu)) } #' @export terms.svyglm.nb <- function(x, ...) 
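# Note: residuals.svyglm.nb() above returns Pearson residuals,
# r_i = (y_i - mu_i) * sqrt(w_i) / sqrt(V(mu_i)), based on a glm.nb() refit
# with the scaled design weights. A quick plausibility check with
# hypothetical values: y = 3, mu = 2, w = 1 and a Poisson-like V(mu) = mu
# would give r = (3 - 2) / sqrt(2), i.e. about 0.71.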
{
  if (!isNamespaceLoaded("survey")) requireNamespace("survey", quietly = TRUE)
  stats::terms(stats::formula(x), ...)
}

#' @export
AIC.svyglm.nb <- function(object, ...) {
  ## FIXME this one just returns the AIC of the underlying glm.nb() model
  aics <- lapply(list(object, ...), getaic)
  as.data.frame(do.call(rbind, aics))
}

getaic <- function(x) {
  c(df = x$df, AIC = x$aic)
}

#' @export
deviance.svyglm.nb <- function(object, ...) {
  ## FIXME this one just returns the deviance of the underlying glm.nb() model
  object$deviance
}

#' @export
as.integer.sj_resample <- function(x, ...) {
  x$id
}

#' @export
as.data.frame.sj_resample <- function(x, ...) {
  x$data[x$id, , drop = FALSE]
}

#' @export
print.sj_resample <- function(x, ...) {
  n <- length(x$id)
  if (n > 12)
    id10 <- c(x$id[1:12], "...")
  else
    id10 <- x$id

  cat("<", paste0(
    "id's of resample [",
    prettyNum(nrow(x$data), big.mark = ","),
    " x ",
    prettyNum(ncol(x$data), big.mark = ","),
    "]"
  ), "> ", toString(id10), "\n", sep = "")
}

#' @export
plot.sj_inequ_trend <- function(x, ...) {
  .data <- NULL
  insight::check_if_installed("ggplot2")

  # add time indicator
  x$data$zeit <- seq_len(nrow(x$data))

  # get gather column names
  gather.cols1 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "lo", "hi")]
  gather.cols2 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "rr", "rd")]

  # gather data to plot rr and rd
  dat1 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y")

  # gather data for raw prevalences
  dat2 <- datawizard::data_to_long(x$data, select = gather.cols2, names_to = "grp", values_to = "y")

  # Proper value names, for facet labels
  dat1$grp[dat1$grp == "rr"] <- "Rate Ratios"
  dat1$grp[dat1$grp == "rd"] <- "Rate Differences"

  # plot prevalences
  gp1 <- ggplot2::ggplot(dat2, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
    ggplot2::geom_smooth(method = "loess", se = FALSE) +
    ggplot2::labs(title = "Prevalence Rates for Lower and Higher SES Groups",
                  y = "Prevalences", x = "Time", colour = "") +
    ggplot2::scale_color_manual(values = c("darkblue", "darkred"), labels = c("High SES", "Low SES"))

  # plot rr and rd
  gp2 <- ggplot2::ggplot(dat1, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
    ggplot2::geom_smooth(method = "loess", se = FALSE) +
    ggplot2::facet_wrap(~grp, ncol = 1, scales = "free") +
    ggplot2::labs(title = "Proportional Change in Rate Ratios and Rate Differences",
                  colour = NULL, y = NULL, x = "Time") +
    ggplot2::guides(colour = "none")

  suppressMessages(graphics::plot(gp1))
  suppressMessages(graphics::plot(gp2))
}

#' @export
print.sj_xtab_stat <- function(x, ...) {
  # get length of method name, to align output
  l <- max(nchar(c(x$method, x$stat.name, "p-value", "Observations")))

  # headline
  insight::print_color("\n# Measure of Association for Contingency Tables\n", "blue")

  # used fisher?
  if (x$fisher) insight::print_color(" (using Fisher's Exact Test)\n", "blue")

  cat("\n")

  # print test statistic
  cat(sprintf(" %*s: %.4f\n", l, x$stat.name, x$statistic))
  cat(sprintf(" %*s: %.4f\n", l, x$method, x$estimate))
  cat(sprintf(" %*s: %g\n", l, "df", x$df))
  cat(sprintf(" %*s: %s\n", l, "p-value", insight::format_p(x$p.value, stars = TRUE, name = NULL)))
  cat(sprintf(" %*s: %g\n", l, "Observations", x$n_obs))
}

#' @export
print.sj_chi2gof <- function(x, ...)
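# The "%*s" format in print.sj_xtab_stat() above right-aligns each label to
# the common width `l`; for example (illustrative values only):
#
#   sprintf(" %*s: %.4f", 12, "p-value", 0.0312)
#   #> "      p-value: 0.0312"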
{ insight::print_color("\n# Chi-squared Goodness-of-Fit Test\n\n", "blue") v1 <- sprintf("%.3f", x$chisq) v2 <- sprintf("%.3f", x$z.score) v3 <- sprintf("%.3f", x$p.value) space <- max(nchar(c(v1, v2, v3))) cat(sprintf(" Chi-squared: %*s\n", space, v1)) cat(sprintf(" z-score: %*s\n", space, v2)) cat(sprintf(" p-value: %*s\n\n", space, v3)) if (x$p.value >= 0.05) message("Summary: model seems to fit well.") else message("Summary: model does not fit well.") } #' @export print.sj_ttest <- function(x, ...) { insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue") group <- attr(x, "group.name", exact = TRUE) xn <- attr(x, "x.name", exact = TRUE) yn <- attr(x, "y.name", exact = TRUE) if (!is.null(group)) verbs <- c("of", "by") else verbs <- c("between", "and") st <- sprintf("# t=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$df), x$p.value) if (!is.null(yn)) { insight::print_color(sprintf("\n# comparison %s %s %s %s\n", verbs[1], xn, verbs[2], yn), "cyan") } insight::print_color(st, "cyan") if (!is.null(yn)) { if (!is.null(group)) { l1 <- sprintf("mean in group %s", group[1]) l2 <- sprintf("mean in group %s", group[2]) } else { l1 <- sprintf("mean of %s", xn) l2 <- sprintf("mean of %s", yn) } l3 <- "difference of mean" slen <- max(nchar(c(l1, l2, l3))) cat(sprintf(" %s: %.3f\n", format(l1, width = slen), x$estimate[1])) cat(sprintf(" %s: %.3f\n", format(l2, width = slen), x$estimate[2])) cat(sprintf(" %s: %.3f [%.3f %.3f]\n", format(l3, width = slen), x$estimate[1] - x$estimate[2], x$ci[1], x$ci[2])) } else { cat(sprintf(" mean of %s: %.3f [%.3f, %.3f]\n", xn, x$estimate[1], x$ci[1], x$ci[2])) } cat("\n") } #' @export print.sj_wcor <- function(x, ...) { insight::print_color(sprintf("\nWeighted %s\n\n", x$method), "blue") if (!is.null(x$ci)) { cilvl <- sprintf("%.2i%%", as.integer(100 * x$ci.lvl)) cat(sprintf(" estimate [%s CI]: %.3f [%.3f %.3f]\n", cilvl, x$estimate, x$ci[1], x$ci[2])) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } else { cat(sprintf(" estimate: %.3f\n", x$estimate)) cat(sprintf(" p-value: %.3f\n\n", x$p.value)) } } #' @export print.sj_anova_stat <- function(x, digits = 3, ...) { x$p.value <- insight::format_p(x$p.value, name = NULL) cat(insight::export_table(x, digits = digits, protect_integers = TRUE)) } sjstats/R/design_effect.R0000644000176200001440000000364214616613032015065 0ustar liggesusers#' @title Design effects for two-level mixed models #' @name design_effect #' #' @description Compute the design effect (also called \emph{Variance Inflation Factor}) #' for mixed models with two-level design. #' #' @param n Average number of observations per grouping cluster (i.e. level-2 unit). #' @param icc Assumed intraclass correlation coefficient for multilevel-model. #' #' @return The design effect (Variance Inflation Factor) for the two-level model. #' #' @references Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. \doi{10.1177/0163278703255230} #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. \doi{10.1002/0470013192.bsa492} #' \cr \cr #' Thompson DM, Fernald DH, Mold JW. 2012. 
Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. \doi{10.1370/afm.1347} #' #' @details The formula for the design effect is simply \code{(1 + (n - 1) * icc)}. #' #' @examples #' # Design effect for two-level model with 30 observations per #' # cluster group (level-2 unit) and an assumed intraclass #' # correlation coefficient of 0.05. #' design_effect(n = 30) #' #' # Design effect for two-level model with 24 observation per cluster #' # group and an assumed intraclass correlation coefficient of 0.2. #' design_effect(n = 24, icc = 0.2) #' #' @export design_effect <- function(n, icc = 0.05) { 1 + (n - 1) * icc } sjstats/R/cv.R0000644000176200001440000000311314620333763012706 0ustar liggesusers#' @title Compute model quality #' @name cv #' #' @description Compute the coefficient of variation. #' #' @param x Fitted linear model of class \code{lm}, \code{merMod} (\pkg{lme4}) #' or \code{lme} (\pkg{nlme}). #' @param ... More fitted model objects, to compute multiple coefficients of #' variation at once. #' #' @details The advantage of the cv is that it is unitless. This allows #' coefficient of variation to be compared to each other in ways #' that other measures, like standard deviations or root mean #' squared residuals, cannot be. #' #' @return Numeric, the coefficient of variation. #' #' @examples #' data(efc) #' fit <- lm(barthtot ~ c160age + c12hour, data = efc) #' cv(fit) #' #' @export cv <- function(x, ...) { # return value cv_ <- cv_helper(x) # check if we have multiple parameters if (nargs() > 1) { # get input list params_ <- list(...) cv_ <- c(cv_, sapply(params_, cv_helper)) } cv_ } cv_helper <- function(x) { insight::check_if_installed("performance") # check if we have a fitted linear model if (inherits(x, c("lm", "lmerMod", "lme", "merModLmerTest")) && !inherits(x, "glm")) { # get response dv <- insight::get_response(x) mw <- mean(dv, na.rm = TRUE) stddev <- performance::rmse(x) } else { mw <- mean(x, na.rm = TRUE) stddev <- stats::sd(x, na.rm = TRUE) } # check if mean is zero? if (mw == 0) stop("Mean of dependent variable is zero. Cannot compute model's coefficient of variation.", call. = F) stddev / mw } sjstats/R/is_prime.R0000644000176200001440000000121414620333763014105 0ustar liggesusers#' @title Find prime numbers #' @name is_prime #' #' @description This functions checks whether a number is, or numbers in a #' vector are prime numbers. #' #' @param x An integer, or a vector of integers. #' #' @return `TRUE` for each prime number in `x`, `FALSE` otherwise. #' #' @examples #' is_prime(89) #' is_prime(15) #' is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) #' #' @export is_prime <- function(x) { if (is.numeric(x) && !all(x %% 1 == 0, na.rm = TRUE)) { insight::format_error("`x` needs to be an integer value.") } vapply(x, function(.x) .x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0), logical(1)) } sjstats/R/gmd.R0000644000176200001440000000263614620333763013056 0ustar liggesusers#' @title Gini's Mean Difference #' @name gmd #' @description `gmd()` computes Gini's mean difference for a numeric vector #' or for all numeric vectors in a data frame. #' #' @param x A vector or data frame. #' @param select Optional, names of variables as character vector that should be #' selected for further processing. Required, if `x` is a data frame (and no vector) #' and only selected variables from `x` should be processed. 
#' #' @return For numeric vectors, Gini's mean difference. For non-numeric vectors #' or vectors of length < 2, returns `NA`. #' #' @note Gini's mean difference is defined as the mean absolute difference between #' any two distinct elements of a vector. Missing values from `x` are silently #' removed. #' #' @references David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575 #' #' @examples #' data(efc) #' gmd(efc$e17age) #' gmd(efc, c("e17age", "c160age", "c12hour")) #' #' @export gmd <- function(x, select = NULL) { if (is.data.frame(x)) { do.call(rbind, lapply(select, function(i) { data.frame( variable = i, gmd = gmd_helper(x[[i]]) ) })) } else { gmd_helper(x) } } gmd_helper <- function(x) { if (!is.numeric(x)) return(NA) x <- stats::na.omit(x) n <- length(x) if (n < 2) return(NA) w <- 4 * ((1:n) - (n - 1) / 2) / n / (n - 1) sum(w * sort(x - mean(x))) } sjstats/R/cramer.R0000644000176200001440000000235414620333763013555 0ustar liggesusers#' @rdname crosstable_statistics #' @export cramers_v <- function(tab, ...) { UseMethod("cramers_v") } #' @rdname crosstable_statistics #' @export cramer <- cramers_v #' @export cramers_v.table <- function(tab, ...) { .cramers_v(tab) } #' @export cramers_v.ftable <- function(tab, ...) { .cramers_v(tab) } #' @rdname crosstable_statistics #' @export cramers_v.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) { fterms <- all.vars(formula) tab <- table(data[[fterms[1]]], data[[fterms[2]]]) method <- match.arg(method) if (is.null(ci.lvl) || is.na(ci.lvl)) { .cramers_v(tab) } else { straps <- sjstats::bootstrap(data[fterms], n) tables <- lapply(straps$strap, function(x) { dat <- as.data.frame(x) table(dat[[1]], dat[[2]]) }) cramers <- sapply(tables, .cramers_v) ci <- boot_ci(cramers, ci.lvl = ci.lvl, method = method) data_frame( cramer = .cramers_v(tab), conf.low = ci$conf.low, conf.high = ci$conf.high ) } } .cramers_v <- function(tab) { # convert to flat table if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) sqrt(phi(tab)^2 / min(dim(tab) - 1)) } sjstats/R/wtd_se.R0000644000176200001440000000473314620333763013574 0ustar liggesusers#' @title Weighted statistics for variables #' @name weighted_se #' @description #' `weighted_se()` computes weighted standard errors of a variable or for #' all variables of a data frame. `survey_median()` computes the median for #' a variable in a survey-design (see [`survey::svydesign()]`). #' `weighted_correlation()` computes a weighted correlation for a two-sided #' alternative hypothesis. #' #' @param x (Numeric) vector or a data frame. For `survey_median()` or `weighted_ttest()`, #' the bare (unquoted) variable name, or a character vector with the variable name. #' @param weights Bare (unquoted) variable name, or a character vector with #' the variable name of the numeric vector of weights. If `weights = NULL`, #' unweighted statistic is reported. #' @param data A data frame. #' @param formula A formula of the form `lhs ~ rhs1 + rhs2` where `lhs` is a #' numeric variable giving the data values and `rhs1` a factor with two #' levels giving the corresponding groups and `rhs2` a variable with weights. #' @param y Optional, bare (unquoted) variable name, or a character vector with #' the variable name. #' @param ci.lvl Confidence level of the interval. #' @param ... Currently not used. #' #' @inheritParams svyglm.nb #' #' @return The weighted (test) statistic. 
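#'
#' @details For `weighted_se()`, the weighted standard error is computed as
#' `sqrt(weighted_variance(x, weights) / n)`, where `n` is the number of
#' non-missing observations in `x` (see `weighted_se_helper()` in this file);
#' with `weights = NULL`, all weights default to 1.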
#' #' @examplesIf requireNamespace("survey") #' data(efc) #' weighted_se(efc$c12hour, abs(runif(n = nrow(efc)))) #' #' # survey_median ---- #' # median for variables from weighted survey designs #' data(nhanes_sample) #' #' des <- survey::svydesign( #' id = ~SDMVPSU, #' strat = ~SDMVSTRA, #' weights = ~WTINT2YR, #' nest = TRUE, #' data = nhanes_sample #' ) #' survey_median(total, des) #' survey_median("total", des) #' @export weighted_se <- function(x, weights = NULL) { UseMethod("weighted_se") } #' @export weighted_se.data.frame <- function(x, weights = NULL) { se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights) names(se_result) <- colnames(x) se_result } #' @export weighted_se.matrix <- function(x, weights = NULL) { se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights) names(se_result) <- colnames(x) se_result } #' @export weighted_se.default <- function(x, weights = NULL) { weighted_se_helper(x, weights) } weighted_se_helper <- function(x, weights) { if (is.null(weights)) weights <- rep(1, length(x)) sqrt(weighted_variance(x, weights) / length(stats::na.omit(x))) } sjstats/R/var_pop.R0000644000176200001440000000276214620333763013755 0ustar liggesusers#' @title Calculate population variance and standard deviation #' @name var_pop #' @description Calculate the population variance or standard deviation of a vector. #' #' @param x (Numeric) vector. #' #' @return The population variance or standard deviation of \code{x}. #' #' @details Unlike \code{\link[stats]{var}}, which returns the sample variance, #' \code{var_pop()} returns the population variance. \code{sd_pop()} #' returns the standard deviation based on the population variance. #' #' @examples #' data(efc) #' #' # sampling variance #' var(efc$c12hour, na.rm = TRUE) #' # population variance #' var_pop(efc$c12hour) #' #' # sampling sd #' sd(efc$c12hour, na.rm = TRUE) #' # population sd #' sd_pop(efc$c12hour) #' @export var_pop <- function(x) { insight::check_if_installed("datawizard") # check for categorical if (is.factor(x)) { # only allow numeric factors if (!.is_pseudo_numeric(x)) { insight::format_error("`x` must be numeric vector or a factor with numeric levels.") } # convert factor to numeric x <- datawizard::to_numeric(x, dummy_factors = FALSE) } # remove NA x <- stats::na.omit(x) n <- length(x) # population variance stats::var(x) * ((n - 1) / n) } #' @rdname var_pop #' @export sd_pop <- function(x) { # get population variance pv <- var_pop(x) # factors with non-numeric level return NULL if (!is.null(pv) && !is.na(pv)) sqrt(pv) else NA } sjstats/R/re-exports.R0000644000176200001440000000070014620444364014405 0ustar liggesusers#' @importFrom performance mse #' @export performance::mse #' @importFrom performance rmse #' @export performance::rmse #' @importFrom insight link_inverse #' @export insight::link_inverse #' @importFrom datawizard weighted_sd #' @export datawizard::weighted_sd #' @importFrom datawizard weighted_mean #' @export datawizard::weighted_mean #' @importFrom datawizard weighted_median #' @export datawizard::weighted_median sjstats/R/Deprecated.R0000644000176200001440000000340514616725410014341 0ustar liggesusers#' @title Deprecated functions #' @name r2 #' @description A list of deprecated functions. #' #' @param x An object. #' @param ... Currently not used. #' #' @return Nothing. #' #' @export r2 <- function(x) { .Defunct("performance::r2()") performance::r2(x) } #' @rdname r2 #' @export cohens_f <- function(x, ...) 
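# Quick check of var_pop() above with hypothetical data: for x = c(2, 4, 6),
# var(x) is 4, so the population variance is 4 * (2 / 3) = 8 / 3 (~2.67),
# and sd_pop(x) is sqrt(8 / 3) (~1.63).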
{
  .Defunct("effectsize::cohens_f()")
  effectsize::cohens_f(x)
}

#' @rdname r2
#' @export
eta_sq <- function(x, ...) {
  .Defunct("effectsize::eta_squared()")
  effectsize::eta_squared(x)
}

#' @rdname r2
#' @export
epsilon_sq <- function(x, ...) {
  .Defunct("effectsize::epsilon_squared()")
  effectsize::epsilon_squared(x)
}

#' @rdname r2
#' @export
omega_sq <- function(x, ...) {
  .Defunct("effectsize::omega_squared()")
  effectsize::omega_squared(x)
}

#' @rdname r2
#' @export
scale_weights <- function(x, ...) {
  .Defunct("datawizard::rescale_weights()")
  datawizard::rescale_weights(x, ...)
}

#' @rdname r2
#' @export
robust <- function(x, ...) {
  .Defunct("parameters::standard_error()")
  parameters::standard_error(x, ...)
}

#' @rdname r2
#' @export
icc <- function(x) {
  .Defunct("performance::icc()")
  performance::icc(x)
}

#' @rdname r2
#' @export
p_value <- function(x, ...) {
  .Defunct("parameters::p_value()")
  parameters::p_value(x)
}

#' @rdname r2
#' @export
se <- function(x, ...) {
  .Defunct("parameters::standard_error()")
  parameters::standard_error(x)
}

#' @rdname r2
#' @export
means_by_group <- function(x, ...) {
  .Defunct("datawizard::means_by_group()")
  datawizard::means_by_group(x, ...)
}

#' @rdname r2
#' @export
mean_n <- function(x, ...) {
  .Defunct("datawizard::row_means()")
  datawizard::row_means(x, ...)
}
sjstats/R/select_helpers.R0000644000176200001440000000154514620333763015306 0ustar liggesusers
string_starts_with <- function(pattern, x) {
  pattern <- paste0("^\\Q", pattern, "\\E")
  grep(pattern, x, perl = TRUE)
}

string_contains <- function(pattern, x) {
  pattern <- paste0("\\Q", pattern, "\\E")
  grep(pattern, x, perl = TRUE)
}

string_ends_with <- function(pattern, x) {
  pattern <- paste0("\\Q", pattern, "\\E$")
  grep(pattern, x, perl = TRUE)
}

string_one_of <- function(pattern, x) {
  m <- unlist(lapply(pattern, grep, x = x, fixed = TRUE, useBytes = TRUE))
  x[m]
}

rownames_as_column <- function(x, var = "rowname") {
  rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE)
  x <- cbind(rn, x)
  colnames(x)[1] <- var
  rownames(x) <- NULL
  x
}

obj_has_name <- function(x, name) {
  name %in% names(x)
}

obj_has_rownames <- function(x) {
  !identical(as.character(seq_len(nrow(x))), rownames(x))
}
sjstats/R/svyglmzip.R0000644000176200001440000000767514620333763014353 0ustar liggesusers
utils::globalVariables("scaled.weights")

#' @title Survey-weighted zero-inflated Poisson model
#' @name svyglm.zip
#' @description \code{svyglm.zip()} is an extension to the \CRANpkg{survey}-package
#' to fit survey-weighted zero-inflated Poisson models. It uses
#' \code{\link[survey]{svymle}} to fit sampling-weighted
#' maximum likelihood estimates, based on starting values provided
#' by \code{\link[pscl]{zeroinfl}}.
#'
#'
#' @param formula An object of class \code{formula}, i.e. a symbolic description
#' of the model to be fitted. See 'Details' in \code{\link[pscl]{zeroinfl}}.
#' @param design An object of class \code{\link[survey]{svydesign}}, providing
#' a specification of the survey design.
#' @param ... Other arguments passed down to \code{\link[pscl]{zeroinfl}}.
#'
#' @return An object of class \code{\link[survey]{svymle}} and \code{svyglm.zip},
#' with some additional information about the model.
#'
#' @details Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/.
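#' The per-observation likelihood maximized via \code{svymle()} is
#' \code{p * (y == 0) + (1 - p) * dpois(y, mu)}, where \code{mu = exp(eta)}
#' and \code{p} is the inverse-logit of \code{logitp} (see
#' \code{sjstats_loglik_zip()} below, which returns this on the log scale).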
#'
#' @examples
#' if (require("survey")) {
#'   data(nhanes_sample)
#'   set.seed(123)
#'   nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2)
#'   nhanes_sample$malepartners[sample(1:2992, 400)] <- 0
#'
#'   # create survey design
#'   des <- svydesign(
#'     id = ~SDMVPSU,
#'     strat = ~SDMVSTRA,
#'     weights = ~WTINT2YR,
#'     nest = TRUE,
#'     data = nhanes_sample
#'   )
#'
#'   # fit zero-inflated poisson regression
#'   fit <- svyglm.zip(
#'     malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1),
#'     des
#'   )
#'
#'   # print coefficients and standard errors
#'   fit
#' }
#' @export
svyglm.zip <- function(formula, design, ...) {
  insight::check_if_installed(c("survey", "pscl"))

  # get design weights. we need to scale these weights for the zeroinfl() function
  dw <- stats::weights(design)

  # update design with scaled weights
  design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))

  # fit ZIP model, with scaled design weights
  mod <- suppressWarnings(pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...))
  ff <- insight::find_formula(mod)

  # fit survey model, using maximum likelihood estimation
  svyfit <- survey::svymle(
    loglike = sjstats_loglik_zip,
    grad = sjstats_score_zip,
    design = design,
    formulas = list(eta = ff$conditional, logitp = ff$zero_inflated),
    start = stats::coef(mod),
    na.action = "na.omit"
  )

  # add additional information
  class(svyfit) <- c("svyglm.zip", class(svyfit))
  attr(svyfit, "zip.terms") <- all.vars(formula)
  attr(svyfit, "zip.formula") <- formula

  svyfit$deviance <- mod$deviance
  svyfit$df.residuals <- mod$df.residuals
  svyfit$df <- length(stats::coef(mod)) + 1
  svyfit$aic <- mod$aic

  svyfit
}

# log-likelihood function used in "svymle()"
sjstats_loglik_zip <- function(y, eta, logitp) {
  mu <- exp(eta)
  p <- exp(logitp) / (1 + exp(logitp))
  log(p * (y == 0) + (1 - p) * stats::dpois(y, mu))
}

sjstats_dlogitp <- function(y, eta, logitp) {
  mu <- exp(eta)
  p <- exp(logitp) / (1 + exp(logitp))
  dexpit <- p / (1 + p) ^ 2
  num <- dexpit * (y == 0) - dexpit * stats::dpois(y, mu)
  denom <- p * (y == 0) + (1 - p) * stats::dpois(y, mu)
  num / denom
}

# derivative
sjstats_deta_zip <- function(y, eta, logitp) {
  mu <- exp(eta)
  p <- exp(logitp) / (1 + exp(logitp))
  dmutoy <- 0 * y
  dmutoy[y > 0] <- exp(-mu[y > 0]) * mu[y > 0] ^ (y[y > 0] - 1) / factorial(y[y > 0] - 1)
  num <- (1 - p) * (-stats::dpois(y, mu) + dmutoy)
  denom <- p * (y == 0) + (1 - p) * stats::dpois(y, mu)
  num / denom
}

# score function, combines derivatives
sjstats_score_zip <- function(y, eta, logitp) {
  cbind(sjstats_deta_zip(y, eta, logitp), sjstats_dlogitp(y, eta, logitp))
}
sjstats/R/cv_error.R0000644000176200001440000000521214620333763014121 0ustar liggesusers
#' @title Test and training error from model cross-validation
#' @name cv_error
#'
#' @description \code{cv_error()} computes the root mean squared error from a model fitted
#' to cross-validated test-training-data. \code{cv_compare()}
#' does the same, for multiple formulas at once (by calling \code{cv_error()}
#' for each formula).
#'
#' @param data A data frame.
#' @param formula The formula to fit the linear model for the test and training data.
#' @param formulas A list of formulas, to fit linear models for the test and training data.
#' @param k The number of test-training splits used for the cross-validation.
#'
#' @return A data frame with the root mean squared errors for the training and test data.
#'
#' @details \code{cv_error()} first generates \code{k} cross-validated test-training
#' pairs (random 80/20 splits, using \code{\link[datawizard]{data_partition}})
#' and then fits a linear model, which is described in \code{formula},
#' to each training data set. Then, predictions for the test data are computed,
#' based on the trained models. The \emph{training error} is the mean value of
#' the \code{\link{rmse}} for all \emph{trained} models; the \emph{test error}
#' is the mean rmse of the predictions on the test data.
#'
#' @examples
#' data(efc)
#' cv_error(efc, neg_c_7 ~ barthtot + c161sex)
#'
#' cv_compare(efc, formulas = list(
#'   neg_c_7 ~ barthtot + c161sex,
#'   neg_c_7 ~ barthtot + c161sex + e42dep,
#'   neg_c_7 ~ barthtot + c12hour
#' ))
#'
#' @export
cv_error <- function(data, formula, k = 5) {
  insight::check_if_installed("datawizard")

  # response
  resp <- insight::find_response(formula)

  # compute cross validation data
  cv_data <- lapply(seq_len(k), function(i) {
    datawizard::data_partition(data, proportion = 0.8)
  })

  # get train and test datasets
  train_data <- lapply(cv_data, function(cvdat) cvdat[[1]])
  test_data <- lapply(cv_data, function(cvdat) cvdat[[2]])

  # fit models on training data
  trained_models <- lapply(train_data, function(x) stats::lm(formula, data = x))

  # training error: mean RMSE of the trained models
  train_error <- mean(vapply(trained_models, performance::rmse, numeric(1)), na.rm = TRUE)

  # test error: mean RMSE of predictions from the trained models on the test data
  test_error <- mean(vapply(seq_len(k), function(i) {
    predicted <- stats::predict(trained_models[[i]], newdata = test_data[[i]])
    observed <- test_data[[i]][[resp]]
    sqrt(mean((observed - predicted)^2, na.rm = TRUE))
  }, numeric(1)), na.rm = TRUE)

  data_frame(
    model = deparse(formula),
    train.error = round(train_error, 4),
    test.error = round(test_error, 4)
  )
}

#' @rdname cv_error
#' @export
cv_compare <- function(data, formulas, k = 5) {
  do.call(rbind, lapply(formulas, function(f) cv_error(data, formula = f, k = k)))
}
sjstats/R/prop.R0000644000176200001440000001546314620333763013267 0ustar liggesusers
#' @title Proportions of values in a vector
#' @name prop
#'
#' @description `prop()` calculates the proportion of a value or category
#' in a variable. `props()` does the same, but allows for
#' multiple logical conditions in one statement. It is similar
#' to `mean()` with logical predicates, however, both
#' `prop()` and `props()` work with grouped data frames.
#'
#' @param data A data frame. May also be a grouped data frame (see 'Examples').
#' @param ... One or more value pairs of comparisons (logical predicates). Put
#' variable names on the left-hand side and values to match on the
#' right-hand side. Expressions may be quoted or unquoted. See
#' 'Examples'.
#' @param weights Vector of weights that will be applied to weight all observations.
#' Must be a vector of same length as the input vector. Default is
#' `NULL`, so no weights are used.
#' @param na.rm Logical, whether to remove NA values from the vector when the
#' proportion is calculated. `na.rm = FALSE` gives you the raw
#' percentage of a value in a vector, `na.rm = TRUE` the valid
#' percentage.
#' @param digits Amount of digits for returned values.
#'
#' @details `prop()` only allows one logical statement per comparison,
#' while `props()` allows multiple logical statements per comparison.
#' However, `prop()` supports weighting of variables before calculating
#' proportions, and comparisons may also be quoted. Hence, `prop()`
#' also processes comparisons, which are passed as character vector
#' (see 'Examples').
#'
#'
#' @return For one condition, a numeric value with the proportion of the values
#' inside a vector.
For more than one condition, a data frame with one column #' of conditions and one column with proportions. For grouped data frames, #' returns a data frame with one column per group with grouping categories, #' followed by one column with proportions per condition. #' #' @examplesIf getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE) #' data(efc) #' #' # proportion of value 1 in e42dep #' prop(efc, e42dep == 1) #' #' # expression may also be completely quoted #' prop(efc, "e42dep == 1") #' #' # use "props()" for multiple logical statements #' props(efc, e17age > 70 & e17age < 80) #' #' # proportion of value 1 in e42dep, and all values greater #' # than 2 in e42dep, including missing values. will return a data frame #' prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) #' #' # for factors or character vectors, use quoted or unquoted values #' library(datawizard) #' # convert numeric to factor, using labels as factor levels #' efc$e16sex <- to_factor(efc$e16sex) #' efc$n4pstu <- to_factor(efc$n4pstu) #' #' # get proportion of female older persons #' prop(efc, e16sex == female) #' #' # get proportion of male older persons #' prop(efc, e16sex == "male") #' #' # "props()" needs quotes around non-numeric factor levels #' props(efc, #' e17age > 70 & e17age < 80, #' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' #' ) #' #' # also works with pipe-chains #' efc |> prop(e17age > 70) #' efc |> prop(e17age > 70, e16sex == 1) #' @export prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } dots <- match.call(expand.dots = FALSE)[["..."]] .proportions(data, dots = dots, weight.by = weights, na.rm, digits, multi_logical = FALSE) } #' @rdname prop #' @export props <- function(data, ..., na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } dots <- match.call(expand.dots = FALSE)[["..."]] .proportions(data, dots = dots, NULL, na.rm, digits, multi_logical = TRUE) } .proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) { # remember comparisons comparisons <- lapply(dots, function(x) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = TRUE) x <- gsub("\"", "", x, fixed = TRUE) x }) if (inherits(data, "grouped_df")) { grps <- attributes(data)$groups result <- lapply(grps[[".rows"]], function(x) { .process_prop(data[x, , drop = FALSE], comparisons, dots, multi_logical, na.rm, digits, weight.by) }) } else { result <- .process_prop(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) } result } .process_prop <- function(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) { # iterate dots (comparing conditions) if (multi_logical) result <- lapply(dots, get_multiple_proportion, data, na.rm, digits) else result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits) # if we have more than one proportion, return a data frame. 
this allows us # to save more information, the condition and the proportion value if (length(comparisons) > 1) { return(data_frame( condition = as.character(unlist(comparisons)), prop = unlist(result) )) } unlist(result) } get_proportion <- function(x, data, weight.by, na.rm, digits) { # to character, and remove spaces and quotes x <- gsub(" ", "", deparse(x), fixed = TRUE) x <- gsub("\"", "", x, fixed = TRUE) # split expression at ==, < or > x.parts <- unlist(regmatches(x, gregexpr("[!=]=|[<>]|(?:(?![=!]=)[^<>])+", x, perl = TRUE))) # shorter version, however, does not split variable names with dots # x.parts <- unlist(regmatches(x, regexec("(\\w+)(\\W+)(\\w+)", x)))[-1] # correct == assignment? if (length(x.parts) < 3) { message("?Syntax error in argument. You possibly used `=` instead of `==`.") return(NULL) } # get variable from data and value from equation f <- data[[x.parts[1]]] v <- suppressWarnings(as.numeric(x.parts[3])) # if we have factor, values maybe non-numeric if (is.na(v)) v <- x.parts[3] # weight vector? if (!is.null(weight.by)) f <- weight(f, weights = weight.by) # get proportions dummy <- switch(x.parts[2], "==" = f == v, "!=" = f != v, "<" = f < v, ">" = f > v, f == v ) # remove missings? if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) } get_multiple_proportion <- function(x, data, na.rm, digits) { # evaluate argument dummy <- with(data, eval(parse(text = deparse(x)))) # remove missings? if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) } sjstats/R/mann_whitney_test.R0000644000176200001440000004232214620603142016031 0ustar liggesusers#' @title Mann-Whitney test #' @name mann_whitney_test #' @description This function performs a Mann-Whitney test (or Wilcoxon rank #' sum test for _unpaired_ samples). Unlike the underlying base R function #' `wilcox.test()`, this function allows for weighted tests and automatically #' calculates effect sizes. For _paired_ (dependent) samples, or for one-sample #' tests, please use the `wilcoxon_test()` function. #' #' A Mann-Whitney test is a non-parametric test for the null hypothesis that two #' _independent_ samples have identical continuous distributions. It can be used #' for ordinal scales or when the two continuous variables are not normally #' distributed. For large samples, or approximately normally distributed variables, #' the `t_test()` function can be used. #' #' @param data A data frame. #' @param select Name(s) of the continuous variable(s) (as character vector) #' to be used as samples for the test. `select` can be one of the following: #' #' - `select` can be used in combination with `by`, in which case `select` is #' the name of the continous variable (and `by` indicates a grouping factor). #' - `select` can also be a character vector of length two or more (more than #' two names only apply to `kruskal_wallis_test()`), in which case the two #' continuous variables are treated as samples to be compared. `by` must be #' `NULL` in this case. #' - If `select` select is of length **two** and `paired = TRUE`, the two samples #' are considered as *dependent* and a paired test is carried out. 
#' - If `select` specifies **one** variable and `by = NULL`, a one-sample test #' is carried out (only applicable for `t_test()` and `wilcoxon_test()`) #' - For `chi_squared_test()`, if `select` specifies **one** variable and #' both `by` and `probabilities` are `NULL`, a one-sample test against given #' probabilities is automatically conducted, with equal probabilities for #' each level of `select`. #' @param by Name of the variable indicating the groups. Required if `select` #' specifies only one variable that contains all samples to be compared in the #' test. If `by` is not a factor, it will be coerced to a factor. For #' `chi_squared_test()`, if `probabilities` is provided, `by` must be `NULL`. #' @param weights Name of an (optional) weighting variable to be used for the test. #' @param alternative A character string specifying the alternative hypothesis, #' must be one of `"two.sided"` (default), `"greater"` or `"less"`. See `?t.test` #' and `?wilcox.test`. #' @param mu The hypothesized difference in means (for `t_test()`) or location #' shift (for `wilcoxon_test()` and `mann_whitney_test()`). The default is 0. #' @param ... Additional arguments passed to `wilcox.test()` (for unweighted #' tests, i.e. when `weights = NULL`). #' #' @section Which test to use: #' The following table provides an overview of which test to use for different #' types of data. The choice of test depends on the scale of the outcome #' variable and the number of samples to compare. #' #' | **Samples** | **Scale of Outcome** | **Significance Test** | #' |-----------------|------------------------|---------------------------------| #' | 1 | binary / nominal | `chi_squared_test()` | #' | 1 | continuous, not normal | `wilcoxon_test()` | #' | 1 | continuous, normal | `t_test()` | #' | 2, independent | binary / nominal | `chi_squared_test()` | #' | 2, independent | continuous, not normal | `mann_whitney_test()` | #' | 2, independent | continuous, normal | `t_test()` | #' | 2, dependent | binary (only 2x2) | `chi_squared_test(paired=TRUE)` | #' | 2, dependent | continuous, not normal | `wilcoxon_test()` | #' | 2, dependent | continuous, normal | `t_test(paired=TRUE)` | #' | >2, independent | continuous, not normal | `kruskal_wallis_test()` | #' | >2, independent | continuous, normal | `datawizard::means_by_group()` | #' | >2, dependent | continuous, not normal | _not yet implemented_ (1) | #' | >2, dependent | continuous, normal | _not yet implemented_ (2) | #' #' (1) More than two dependent samples are considered as _repeated measurements_. #' For ordinal or not-normally distributed outcomes, these samples are #' usually tested using a [`friedman.test()`], which requires the samples #' in one variable, the groups to compare in another variable, and a third #' variable indicating the repeated measurements (subject IDs). #' #' (2) More than two dependent samples are considered as _repeated measurements_. #' For normally distributed outcomes, these samples are usually tested using #' a ANOVA for repeated measurements. A more sophisticated approach would #' be using a linear mixed model. #' #' @seealso #' - [`t_test()`] for parametric t-tests of dependent and independent samples. #' - [`mann_whitney_test()`] for non-parametric tests of unpaired (independent) #' samples. #' - [`wilcoxon_test()`] for Wilcoxon rank sum tests for non-parametric tests #' of paired (dependent) samples. #' - [`kruskal_wallis_test()`] for non-parametric tests with more than two #' independent samples. 
#' - [`chi_squared_test()`] for chi-squared tests (two categorical variables, #' dependent and independent). #' #' @return A data frame with test results. The function returns p and Z-values #' as well as effect size r and group-rank-means. #' #' @references #' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., #' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data #' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. #' \doi{10.3390/math11091982} #' #' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests. #' Dtsch Med Wochenschr 2007; 132: e24–e25 #' #' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer #' Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8 #' #' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`] #' (the latter to extract effect sizes). The weighted version of the test is #' based on [`survey::svyranktest()`]. #' #' Interpretation of the effect size **r**, as a rule-of-thumb: #' #' - small effect >= 0.1 #' - medium effect >= 0.3 #' - large effect >= 0.5 #' #' **r** is calcuated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}. #' #' @examplesIf requireNamespace("coin") && requireNamespace("survey") #' data(efc) #' # Mann-Whitney-U tests for elder's age by elder's sex. #' mann_whitney_test(efc, "e17age", by = "e16sex") #' # base R equivalent #' wilcox.test(e17age ~ e16sex, data = efc) #' #' # when data is in wide-format, specify all relevant continuous #' # variables in `select` and omit `by` #' set.seed(123) #' wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20)) #' mann_whitney_test(wide_data, select = c("scale1", "scale2")) #' # base R equivalent #' wilcox.test(wide_data$scale1, wide_data$scale2) #' # same as if we had data in long format, with grouping variable #' long_data <- data.frame( #' scales = c(wide_data$scale1, wide_data$scale2), #' groups = as.factor(rep(c("A", "B"), each = 20)) #' ) #' mann_whitney_test(long_data, select = "scales", by = "groups") #' # base R equivalent #' wilcox.test(scales ~ groups, long_data) #' @export mann_whitney_test <- function(data, select = NULL, by = NULL, weights = NULL, mu = 0, alternative = "two.sided", ...) { insight::check_if_installed("datawizard") alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater")) # sanity checks .sanitize_htest_input(data, select, by, weights, test = "mann_whitney_test") # alternative only if weights are NULL if (!is.null(weights) && alternative != "two.sided") { insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.") } # does select indicate more than one variable? if (length(select) > 1) { # we convert the data into long format, and create a grouping variable data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale") by <- select[2] select <- select[1] # after converting to long, we have the "grouping" variable first in the data colnames(data) <- c(by, select) } # get data dv <- data[[select]] grp <- data[[by]] # coerce to factor grp <- datawizard::to_factor(grp) # only two groups allowed if (insight::n_unique(grp) > 2) { insight::format_error("Only two groups are allowed for Mann-Whitney test. Please use `kruskal_wallis_test()` for more than two groups.") # nolint } # value labels group_labels <- names(attr(data[[by]], "labels", exact = TRUE)) if (is.null(group_labels)) { group_labels <- levels(droplevels(grp)) } if (is.null(weights)) { .calculate_mwu(dv, grp, alternative, mu, group_labels, ...) 
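# (illustration of the effect size computed in `.calculate_mwu()` below:
# r = |Z| / sqrt(n); e.g. |Z| = 2.6 with n = 60 gives r ~ 0.34, a medium
# effect by the rule of thumb documented above)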
} else { .calculate_weighted_mwu(dv, grp, data[[weights]], group_labels) } } # Mann-Whitney-Test for two groups -------------------------------------------- .calculate_mwu <- function(dv, grp, alternative, mu, group_labels, ...) { insight::check_if_installed("coin") # prepare data wcdat <- data.frame(dv, grp) # perfom wilcox test wt <- coin::wilcox_test(dv ~ grp, data = wcdat) # for rank mean group_levels <- levels(grp) # compute statistics u <- as.numeric(coin::statistic(wt, type = "linear")) z <- as.numeric(coin::statistic(wt, type = "standardized")) r <- abs(z / sqrt(length(dv))) htest <- suppressWarnings(stats::wilcox.test( dv ~ grp, data = wcdat, alternative = alternative, mu = mu, ... )) w <- htest$statistic p <- htest$p.value # group means dat_gr1 <- stats::na.omit(dv[grp == group_levels[1]]) dat_gr2 <- stats::na.omit(dv[grp == group_levels[2]]) rank_mean_1 <- mean(rank(dat_gr1)) rank_mean_2 <- mean(rank(dat_gr2)) # compute n for each group n_grp1 <- length(dat_gr1) n_grp2 <- length(dat_gr2) out <- data.frame( group1 = group_levels[1], group2 = group_levels[2], estimate = rank_mean_1 - rank_mean_2, u = u, w = w, z = z, r = r, p = as.numeric(p), mu = mu, alternative = alternative ) attr(out, "rank_means") <- stats::setNames( c(rank_mean_1, rank_mean_2), c("Mean Group 1", "Mean Group 2") ) attr(out, "n_groups") <- stats::setNames( c(n_grp1, n_grp2), c("N Group 1", "N Group 2") ) attr(out, "group_labels") <- group_labels attr(out, "method") <- "wilcoxon" attr(out, "weighted") <- FALSE class(out) <- c("sj_htest_mwu", "data.frame") out } # Weighted Mann-Whitney-Test for two groups ---------------------------------- .calculate_weighted_mwu <- function(dv, grp, weights, group_labels) { # check if pkg survey is available insight::check_if_installed("survey") dat <- stats::na.omit(data.frame(dv, grp, weights)) colnames(dat) <- c("x", "g", "w") design <- survey::svydesign(ids = ~0, data = dat, weights = ~w) result <- survey::svyranktest(formula = x ~ g, design, test = "wilcoxon") # for rank mean group_levels <- levels(droplevels(grp)) # subgroups dat_gr1 <- dat[dat$g == group_levels[1], ] dat_gr2 <- dat[dat$g == group_levels[2], ] dat_gr1$rank_x <- rank(dat_gr1$x) dat_gr2$rank_x <- rank(dat_gr2$x) # rank means design_mean1 <- survey::svydesign( ids = ~0, data = dat_gr1, weights = ~w ) rank_mean_1 <- survey::svymean(~rank_x, design_mean1) design_mean2 <- survey::svydesign( ids = ~0, data = dat_gr2, weights = ~w ) rank_mean_2 <- survey::svymean(~rank_x, design_mean2) # group Ns n_grp1 <- round(sum(dat_gr1$w)) n_grp2 <- round(sum(dat_gr2$w)) # statistics and effect sizes z <- result$statistic r <- abs(z / sqrt(sum(n_grp1, n_grp2))) out <- data_frame( group1 = group_levels[1], group2 = group_levels[2], estimate = result$estimate, z = z, r = r, p = as.numeric(result$p.value), alternative = "two.sided" ) attr(out, "rank_means") <- stats::setNames( c(rank_mean_1, rank_mean_2), c("Mean Group 1", "Mean Group 2") ) attr(out, "n_groups") <- stats::setNames( c(n_grp1, n_grp2), c("N Group 1", "N Group 2") ) attr(out, "group_labels") <- group_labels attr(out, "weighted") <- TRUE class(out) <- c("sj_htest_mwu", "data.frame") out } # helper ---------------------------------------------------------------------- .sanitize_htest_input <- function(data, select, by, weights, test = NULL) { # check if arguments are NULL if (is.null(select)) { insight::format_error("Argument `select` is missing.") } # sanity check - may only specify two variable names if (identical(test, "mann_whitney_test") && length(select) > 2) 
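# note: in the weighted test above, group sizes are the rounded sums of the
# weights per group (not raw counts), and r is again |Z| / sqrt(n1 + n2),
# here with Z taken from survey::svyranktest()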
{ insight::format_error("You may only specify two variables for Mann-Whitney test.") } if (identical(test, "mann_whitney_test") && length(select) == 1 && is.null(by)) { insight::format_error("Only one variable provided in `select`, but none in `by`. You need to specify a second continuous variable in `select`, or a grouping variable in `by` for Mann-Whitney test.") # nolint } # sanity check - may only specify two variable names if (identical(test, "t_test") && length(select) > 2) { insight::format_error("You may only specify two variables for Student's t test.") } if ((!is.null(test) && test %in% c("t_test", "kruskal_wallis_test", "mann_whitney_test")) && length(select) > 1 && !is.null(by)) { # nolint insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.") } # check if arguments have correct length or are of correct type if (!is.character(select)) { insight::format_error("Argument `select` must be a character string with the name(s) of the variable(s).") } if (!is.null(by) && (length(by) != 1 || !is.character(by))) { insight::format_error("Argument `by` must be a character string with the name of a single variable.") } if (!is.null(weights) && (length(weights) != 1 || !is.character(weights))) { insight::format_error("Argument `weights` must be a character string with the name of a single variable.") } # check if "select" is in data if (!all(select %in% colnames(data))) { not_found <- setdiff(select, colnames(data))[1] insight::format_error( sprintf("Variable '%s' not found in data frame.", not_found), .misspelled_string(colnames(data), not_found, "Maybe misspelled?") ) } # check if "by" is in data if (!is.null(by) && !by %in% colnames(data)) { insight::format_error( sprintf("Variable '%s' not found in data frame.", by), .misspelled_string(colnames(data), by, "Maybe misspelled?") ) } # check if "weights" is in data if (!is.null(weights) && !weights %in% colnames(data)) { insight::format_error( sprintf("Weighting variable '%s' not found in data frame.", weights), .misspelled_string(colnames(data), weights, "Maybe misspelled?") ) } # select variable type for certain tests if (identical(test, "t_test") && !all(vapply(data[select], is.numeric, logical(1)))) { insight::format_error("Variable provided in `select` must be numeric for Student's t test.") } } # methods --------------------------------------------------------------------- #' @export print.sj_htest_mwu <- function(x, ...) 
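# sketch of the input checks above (hypothetical calls): select = "e17age"
# with by = "e16sex" passes; select = c("a", "b") with by = "g" fails,
# because `by` must be NULL when `select` names more than one variable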
{ # fetch attributes group_labels <- attributes(x)$group_labels rank_means <- attributes(x)$rank_means n_groups <- attributes(x)$n_groups weighted <- attributes(x)$weighted if (weighted) { weight_string <- " (weighted)" } else { weight_string <- "" } # same width group_labels <- format(group_labels) # header insight::print_color(sprintf("# Mann-Whitney test%s\n\n", weight_string), "blue") # group-1-info insight::print_color( sprintf( " Group 1: %s (n = %i, rank mean = %s)\n", group_labels[1], n_groups[1], insight::format_value(rank_means[1], protect_integers = TRUE) ), "cyan" ) # group-2-info insight::print_color( sprintf( " Group 2: %s (n = %i, rank mean = %s)\n", group_labels[2], n_groups[2], insight::format_value(rank_means[2], protect_integers = TRUE) ), "cyan" ) # alternative hypothesis if (!is.null(x$alternative) && !is.null(x$mu)) { alt_string <- switch(x$alternative, two.sided = "not equal to", less = "less than", greater = "greater than" ) alt_string <- paste("true location shift is", alt_string, x$mu) insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") } if (!is.null(x$w)) { w_stat <- paste("W =", insight::format_value(x$w, protect_integers = TRUE), ", ") } else { w_stat <- "" } cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", w_stat, x$r, x$z, insight::format_p(x$p))) } sjstats/R/anova_stats.R0000644000176200001440000001577714620444364014643 0ustar liggesusers#' @title Effect size statistics for anova #' @name anova_stats #' @description Returns the (partial) eta-squared, (partial) omega-squared, #' epsilon-squared statistic or Cohen's F for all terms in an anovas. #' \code{anova_stats()} returns a tidy summary, including all these statistics #' and power for each term. #' #' @param model A fitted anova-model of class \code{aov} or \code{anova}. Other #' models are coerced to \code{\link[stats]{anova}}. #' @param digits Amount of digits for returned values. #' #' @return A data frame with all statistics is returned (excluding confidence intervals). #' #' @references Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. #' \cr \cr #' Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. 
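#'
#' @details The effect sizes are computed from the sums of squares of the
#' (coerced) anova table: for each term, eta-squared is
#' \code{ss.term / ss.total}, partial eta-squared is
#' \code{ss.term / (ss.term + ss.resid)}, and Cohen's F is
#' \code{sqrt(peta / (1 - peta))} (see \code{aov_stat_core()} below).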
#' #' @examplesIf requireNamespace("car") #' # load sample data #' data(efc) #' #' # fit linear model #' fit <- aov( #' c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, #' data = efc #' ) #' anova_stats(car::Anova(fit, type = 2)) #' @export anova_stats <- function(model, digits = 3) { # .Deprecated("effectsize::effectsize()", package = "effectsize") # get tidy summary table aov.sum <- aov_stat_summary(model) # compute all model statistics etasq <- aov_stat_core(aov.sum, type = "eta") partial.etasq <- aov_stat_core(aov.sum, type = "peta") omegasq <- aov_stat_core(aov.sum, type = "omega") partial.omegasq <- aov_stat_core(aov.sum, type = "pomega") epsilonsq <- aov_stat_core(aov.sum, type = "epsilon") # compute power for each estimate cohens.f <- sqrt(partial.etasq / (1 - partial.etasq)) # bind as data frame anov_stat <- rbind( data.frame(etasq, partial.etasq, omegasq, partial.omegasq, epsilonsq, cohens.f), data.frame(etasq = NA, partial.etasq = NA, omegasq = NA, partial.omegasq = NA, epsilonsq = NA, cohens.f = NA) ) anov_stat <- cbind(anov_stat, data.frame(aov.sum)) # get nr of terms nt <- nrow(anov_stat) - 1 # finally, compute power as_power <- tryCatch( c(.calculate_power( df1 = anov_stat$df[1:nt], df2 = anov_stat$df[nrow(anov_stat)], effect_size = anov_stat$cohens.f[1:nt]^2 ), NA ), error = function(x) { NA } ) out <- cbind(anov_stat, data.frame(power = as_power)) out[] <- lapply(out, function(i) { if (is.numeric(i)) { round(i, digits) } else { i } }) class(out) <- c("sj_anova_stat", class(out)) out } aov_stat <- function(model, type) { aov.sum <- aov_stat_summary(model) aov.res <- aov_stat_core(aov.sum, type) if (obj_has_name(aov.sum, "stratum")) attr(aov.res, "stratum") <- aov.sum[["stratum"]] aov.res } aov_stat_summary <- function(model) { insight::check_if_installed("parameters") # check if we have a mixed model mm <- is_merMod(model) ori.model <- model # check that model inherits from correct class # else, try to coerce to anova table if (!inherits(model, c("Gam", "aov", "anova", "anova.rms", "aovlist"))) model <- stats::anova(model) # get summary table aov.sum <- insight::standardize_names(as.data.frame(parameters::model_parameters(model)), style = "broom") # for mixed models, add information on residuals if (mm) { res <- stats::residuals(ori.model) aov.sum <- rbind( aov.sum, data_frame( term = "Residuals", df = length(res) - sum(aov.sum[["df"]]), sumsq = sum(res^2, na.rm = TRUE), meansq = mse(ori.model), statistic = NA ) ) } # check if object has sums of square if (!obj_has_name(aov.sum, "sumsq")) { stop("Model object has no sums of squares. Cannot compute effect size statistic.", call. 
= FALSE) } # need special handling for rms-anova if (inherits(model, "anova.rms")) colnames(aov.sum) <- c("term", "df", "sumsq", "meansq", "statistic", "p.value") # for car::Anova, the meansq-column might be missing, so add it manually if (!obj_has_name(aov.sum, "meansq")) { pos_sumsq <- which(colnames(aov.sum) == "sumsq") aov.sum <- cbind( aov.sum[1:pos_sumsq], data.frame(meansq = aov.sum$sumsq / aov.sum$df), aov.sum[(pos_sumsq + 1):ncol(aov.sum)] ) } intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } aov.sum } aov_stat_core <- function(aov.sum, type) { intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { aov.sum <- aov.sum[-intercept, ] } # get mean squared of residuals meansq.resid <- aov.sum[["meansq"]][nrow(aov.sum)] # get total sum of squares ss.total <- sum(aov.sum[["sumsq"]]) # get sum of squares of residuals ss.resid <- aov.sum[["sumsq"]][nrow(aov.sum)] # number of terms in model n_terms <- nrow(aov.sum) - 1 # number of observations N <- sum(aov.sum[["df"]]) + 1 aovstat <- switch(type, # compute omega squared for each model term omega = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / (ss.total + meansq.resid) })), # compute partial omega squared for each model term pomega = unlist(lapply(1:n_terms, function(x) { df.term <- aov.sum[["df"]][x] meansq.term <- aov.sum[["meansq"]][x] (df.term * (meansq.term - meansq.resid)) / (df.term * meansq.term + (N - df.term) * meansq.resid) })), # compute epsilon squared for each model term epsilon = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / ss.total })), # compute eta squared for each model term eta = unlist(lapply(1:n_terms, function(x) { aov.sum[["sumsq"]][x] / sum(aov.sum[["sumsq"]]) })), # compute partial eta squared for each model term cohens.f = , peta = unlist(lapply(1:n_terms, function(x) { aov.sum[["sumsq"]][x] / (aov.sum[["sumsq"]][x] + ss.resid) })) ) # compute Cohen's F if (type == "cohens.f") aovstat <- sqrt(aovstat / (1 - aovstat)) # give values names of terms names(aovstat) <- aov.sum[["term"]][1:n_terms] aovstat } .which_intercept <- function(x) { which(tolower(x) %in% c("(intercept)_zi", "intercept (zero-inflated)", "intercept", "zi_intercept", "(intercept)", "b_intercept", "b_zi_intercept")) } .calculate_power <- function(df1, df2, effect_size) { if (any(effect_size < 0)) { return(NA) } if (!is.null(df1) && any(df1 < 1)) { return(NA) } if (!is.null(df2) && any(df2 < 1)) { return(NA) } lambda <- effect_size * (df1 + df2 + 1) stats::pf( stats::qf(0.05, df1 = df1, df2 = df2, lower.tail = FALSE), df1 = df1, df2 = df2, ncp = lambda, lower.tail = FALSE ) } sjstats/R/kruskal_wallis_test.R0000644000176200001440000001434214620600424016361 0ustar liggesusers#' @title Kruskal-Wallis test #' @name kruskal_wallis_test #' @description This function performs a Kruskal-Wallis rank sum test, which is #' a non-parametric method to test the null hypothesis that the population median #' of all of the groups are equal. The alternative is that they differ in at #' least one. Unlike the underlying base R function `kruskal.test()`, this #' function allows for weighted tests. #' #' @inheritParams mann_whitney_test #' @inherit mann_whitney_test seealso #' #' @return A data frame with test results. 
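#'
#' @note The test statistic is reported as Chi2, since under the null
#' hypothesis the Kruskal-Wallis H statistic approximately follows a
#' chi-squared distribution with `k - 1` degrees of freedom, where `k`
#' is the number of groups.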
#'
#' @inheritSection mann_whitney_test Which test to use
#'
#' @references
#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
#'   Dtsch Med Wochenschr 2007; 132: e24–e25
#'
#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
#'
#' @details The function is simply a wrapper around [`kruskal.test()`]. The
#' weighted version of the Kruskal-Wallis test is based on the **survey** package,
#' using [`survey::svyranktest()`].
#'
#' @examples
#' data(efc)
#' # Kruskal-Wallis test for elder's age by education
#' kruskal_wallis_test(efc, "e17age", by = "c172code")
#'
#' # when data is in wide-format, specify all relevant continuous
#' # variables in `select` and omit `by`
#' set.seed(123)
#' wide_data <- data.frame(
#'   scale1 = runif(20),
#'   scale2 = runif(20),
#'   scale3 = runif(20)
#' )
#' kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3"))
#'
#' # same as if we had data in long format, with grouping variable
#' long_data <- data.frame(
#'   scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3),
#'   groups = rep(c("A", "B", "C"), each = 20)
#' )
#' kruskal_wallis_test(long_data, select = "scales", by = "groups")
#' # base R equivalent
#' kruskal.test(scales ~ groups, data = long_data)
#' @export
kruskal_wallis_test <- function(data,
                                select = NULL,
                                by = NULL,
                                weights = NULL) {
  insight::check_if_installed("datawizard")

  # sanity checks
  .sanitize_htest_input(data, select, by, weights, test = "kruskal_wallis_test")

  # does select indicate more than one variable?
  if (length(select) > 1) {
    # we convert the data into long format, and create a grouping variable
    data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
    by <- select[2]
    select <- select[1]
    # after converting to long, we have the "grouping" variable first in the data
    colnames(data) <- c(by, select)
  }

  # get data
  dv <- data[[select]]
  grp <- data[[by]]

  # coerce to factor
  grp <- datawizard::to_factor(grp)

  # at least two groups are required
  if (insight::n_unique(grp) < 2) {
    insight::format_error("At least two groups are required, i.e. data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint
  }
  if (is.null(weights)) {
    .calculate_kw(dv, grp, group_labels = c(select, by))
  } else {
    .calculate_weighted_kw(dv, grp, data[[weights]], group_labels = c(select, by))
  }
}


# Kruskal-Wallis-Test --------------------------------------------

.calculate_kw <- function(dv, grp, paired = FALSE, group_labels = NULL) {
  # prepare data
  wcdat <- data.frame(dv, grp)
  if (paired) {
    # perform friedman test for paired data
    wt <- stats::friedman.test(table(wcdat))
  } else {
    # perform kruskal wallis test
    wt <- stats::kruskal.test(dv ~ grp, data = wcdat)
  }

  # number of groups
  n_groups <- vapply(
    stats::na.omit(unique(grp)),
    function(g) sum(grp == g, na.rm = TRUE),
    numeric(1)
  )

  out <- data.frame(
    data = paste(group_labels[1], "by", group_labels[2]),
    Chi2 = wt$statistic,
    df = wt$parameter,
    p = as.numeric(wt$p.value),
    stringsAsFactors = FALSE
  )
  attr(out, "n_groups") <- n_groups
  attr(out, "method") <- ifelse(paired, "friedman", "kruskal")
  attr(out, "weighted") <- FALSE
  class(out) <- c("sj_htest_kw", "data.frame")

  out
}


# Weighted Kruskal-Wallis test ------------------------------------------------

.calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE, group_labels = NULL) {
  # check if pkg survey is available
  insight::check_if_installed("survey")

  dat <- stats::na.omit(data.frame(dv, grp, weights))
  colnames(dat) <- c("x", "g", "w")

  # number of groups (sum of weights per group)
  n_groups <- vapply(stats::na.omit(unique(grp)), function(g) {
    sum(dat$w[dat$g == g], na.rm = TRUE)
  }, numeric(1))

  if (paired) {
    ## TODO: paired not working yet. should call `friedman.test()`
  } else {
    design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
    result <- survey::svyranktest(formula = x ~ g, design, test = "KruskalWallis")
  }

  out <- data.frame(
    data = paste(group_labels[1], "by", group_labels[2]),
    Chi2 = result$statistic,
    df = result$parameter,
    p = as.numeric(result$p.value),
    stringsAsFactors = FALSE
  )
  attr(out, "n_groups") <- n_groups
  attr(out, "method") <- ifelse(paired, "friedman", "kruskal")
  attr(out, "weighted") <- TRUE
  class(out) <- c("sj_htest_kw", "data.frame")

  out
}


# methods ---------------------------------------------------------------------

#' @export
print.sj_htest_kw <- function(x, ...) {
  insight::check_if_installed("datawizard")

  # fetch attributes
  n_groups <- attributes(x)$n_groups
  weighted <- attributes(x)$weighted
  method <- attributes(x)$method

  if (weighted) {
    weight_string <- " (weighted)"
  } else {
    weight_string <- ""
  }

  # header
  if (identical(method, "kruskal")) {
    insight::print_color(sprintf("# Kruskal-Wallis test%s\n\n", weight_string), "blue")
  } else {
    insight::print_color(sprintf("# Friedman test%s\n\n", weight_string), "blue")
  }

  # data info
  insight::print_color(
    sprintf(
      "  Data: %s (%i groups, n = %s)\n",
      x$data, length(n_groups), datawizard::text_concatenate(n_groups)
    ), "cyan"
  )

  stat_symbol <- .format_symbols("Chi2")
  cat(sprintf(
    "\n  %s = %.2f, df = %i, %s\n\n",
    stat_symbol, x$Chi2, round(x$df), insight::format_p(x$p)
  ))
}
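# A minimal usage sketch for the weighted variant: the weights are passed by
# name. `myweight` is a hypothetical weighting variable, not part of the
# shipped `efc` data:
#
#   efc$myweight <- abs(rnorm(nrow(efc), 1, 0.3))
#   kruskal_wallis_test(efc, "e17age", by = "c172code", weights = "myweight")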
sjstats/R/phi.R0000644000176200001440000000225714620333763013066 0ustar liggesusers#' @rdname crosstable_statistics
#' @export
phi <- function(tab, ...) {
  UseMethod("phi")
}

#' @export
phi.table <- function(tab, ...) {
  .phi(tab)
}

#' @export
phi.ftable <- function(tab, ...) {
  .phi(tab)
}

#' @export
phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
  formula_terms <- all.vars(formula)
  tab <- table(data[[formula_terms[1]]], data[[formula_terms[2]]])
  method <- match.arg(method)

  if (is.null(ci.lvl) || is.na(ci.lvl)) {
    .phi(tab)
  } else {
    straps <- sjstats::bootstrap(data[formula_terms], n)
    tables <- lapply(straps$strap, function(x) {
      dat <- as.data.frame(x)
      table(dat[[1]], dat[[2]])
    })
    phis <- sapply(tables, .phi)
    ci <- boot_ci(phis, ci.lvl = ci.lvl, method = method)

    data_frame(
      phi = .phi(tab),
      conf.low = ci$conf.low,
      conf.high = ci$conf.high
    )
  }
}

.phi <- function(tab) {
  insight::check_if_installed("MASS")

  # convert to flat table
  if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)

  tb <- summary(MASS::loglm(~1 + 2, tab))$tests
  sqrt(tb[2, 1] / sum(tab))
}
sjstats/R/helpfunctions.R0000644000176200001440000000672014620333763015166 0ustar liggesusers# Help-functions


data_frame <- function(...) {
  x <- data.frame(..., stringsAsFactors = FALSE)
  rownames(x) <- NULL
  x
}

is_merMod <- function(fit) {
  inherits(fit, c("lmerMod", "glmerMod", "nlmerMod", "merModLmerTest"))
}

.compact_character <- function(x) {
  x[!sapply(x, function(i) is.null(i) || !nzchar(i, keepNA = TRUE) || is.na(i) || any(i == "NULL", na.rm = TRUE))]
}

.format_symbols <- function(x) {
  if (.unicode_symbols()) {
    x <- gsub("Delta", "\u0394", x, ignore.case = TRUE)
    x <- gsub("Phi", "\u03D5", x, ignore.case = TRUE)
    x <- gsub("Eta", "\u03B7", x, ignore.case = TRUE)
    x <- gsub("Epsilon", "\u03b5", x, ignore.case = TRUE)
    x <- gsub("Omega", "\u03c9", x, ignore.case = TRUE)
    x <- gsub("R2", "R\u00b2", x, ignore.case = TRUE)
    x <- gsub("Chi2", "\u03C7\u00b2", x, ignore.case = TRUE)
    x <- gsub("Chi-squared", "\u03C7\u00b2", x, ignore.case = TRUE)
    x <- gsub("Chi", "\u03C7", x, ignore.case = TRUE)
    x <- gsub("Sigma", "\u03C3", x, ignore.case = TRUE)
    x <- gsub("Rho", "\u03C1", x, ignore.case = TRUE)
    x <- gsub("Mu", "\u03BC", x, ignore.case = TRUE)
    x <- gsub("Theta", "\u03B8", x, ignore.case = TRUE)
    x <- gsub("Fei", "\u05E4\u200E", x, ignore.case = TRUE)
  }
  x
}

.unicode_symbols <- function() {
  win_os <- tryCatch(
    {
      si <- Sys.info()
      if (is.null(si["sysname"])) {
        FALSE
      } else {
        si["sysname"] == "Windows" || startsWith(R.version$os, "mingw")
      }
    },
    error = function(e) {
      TRUE
    }
  )
  l10n_info()[["UTF-8"]] && ((win_os && getRversion() >= "4.2") || (!win_os && getRversion() >= "4.0"))
}


.is_pseudo_numeric <- function(x) {
  (is.character(x) && !anyNA(suppressWarnings(as.numeric(stats::na.omit(x[nzchar(x, keepNA = TRUE)]))))) || (is.factor(x) && !anyNA(suppressWarnings(as.numeric(levels(x))))) # nolint
}


.misspelled_string <- function(source, searchterm, default_message = NULL) {
  if (is.null(searchterm) || length(searchterm) < 1) {
    return(default_message)
  }
  # used for many matches
  more_found <- ""
  # init default
  msg <- ""
  # remove matching strings
  same <- intersect(source, searchterm)
  searchterm <- setdiff(searchterm, same)
  source <- setdiff(source, same)
  # guess the misspelled string
  possible_strings <- unlist(lapply(searchterm, function(s) {
    source[.fuzzy_grep(source, s)] # nolint
  }), use.names = FALSE)
  if (length(possible_strings)) {
    msg <- "Did you mean "
    if (length(possible_strings) > 1) {
      # make sure we don't print dozens of alternatives for larger data frames
      if (length(possible_strings) > 5) {
        more_found <- sprintf(
          " We even found %i more possible matches, not shown here.",
          length(possible_strings) - 5
        )
        possible_strings <- possible_strings[1:5]
      }
      msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\"")))
    } else {
      msg <- paste0(msg, "\"", possible_strings, "\"")
    }
    msg <-
paste0(msg, "?", more_found) } else { msg <- default_message } # no double white space insight::trim_ws(msg) } .fuzzy_grep <- function(x, pattern, precision = NULL) { if (is.null(precision)) { precision <- round(nchar(pattern) / 3) } if (precision > nchar(pattern)) { return(NULL) } p <- sprintf("(%s){~%i}", pattern, precision) grep(pattern = p, x = x, ignore.case = FALSE) } sjstats/R/weight.R0000644000176200001440000000623514620333763013575 0ustar liggesusers#' @title Weight a variable #' @name weight #' @description These functions weight the variable \code{x} by #' a specific vector of \code{weights}. #' #' @param x (Unweighted) variable. #' @param weights Vector with same length as \code{x}, which #' contains weight factors. Each value of \code{x} has a #' specific assigned weight in \code{weights}. #' @param digits Numeric value indicating the number of decimal places to be #' used for rounding the weighted values. By default, this value is #' \code{0}, i.e. the returned values are integer values. #' #' @return The weighted \code{x}. #' #' @details \code{weight2()} sums up all \code{weights} values of the associated #' categories of \code{x}, whereas \code{weight()} uses a #' \code{\link[stats]{xtabs}} formula to weight cases. Thus, \code{weight()} #' may return a vector of different length than \code{x}. #' #' @note The values of the returned vector are in sorted order, whereas the values' #' order of the original \code{x} may be spread randomly. Hence, \code{x} can't be #' used, for instance, for further cross tabulation. In case you want to have #' weighted contingency tables or (grouped) box plots etc., use the \code{weightBy} #' argument of most functions. #' #' @examples #' v <- sample(1:4, 20, TRUE) #' table(v) #' w <- abs(rnorm(20)) #' table(weight(v, w)) #' table(weight2(v, w)) #' #' set.seed(1) #' x <- sample(letters[1:5], size = 20, replace = TRUE) #' w <- runif(n = 20) #' #' table(x) #' table(weight(x, w)) #' #' @export weight <- function(x, weights, digits = 0) { # remember if x is numeric x.is.num <- is.numeric(x) # init values weightedvar <- c() wtab <- round(stats::xtabs(weights ~ x, data = data.frame(weights = weights, x = x), na.action = stats::na.pass, exclude = NULL), digits = digits) # iterate all table values for (w in seq_len(length(wtab))) { # retrieve count of each table cell w_count <- wtab[[w]] # retrieve "cell name" which is identical to the variable value # first check whether values are numeric or not nval_ <- suppressWarnings(as.numeric(names(wtab[w]))) # if value is not numeric, use as is if (is.na(nval_)) w_value <- names(wtab[w]) else # else, use numeric value w_value <- nval_ # append variable value, repeating it "w_count" times. weightedvar <- c(weightedvar, rep(w_value, w_count)) } # if we have NA values, weighted var is coerced to character. 
# coerce back to numeric then here if (!is.numeric(weightedvar) && x.is.num) { weightedvar <- datawizard::to_numeric(weightedvar, dummy_factors = FALSE) } # return result weightedvar } #' @rdname weight #' @export weight2 <- function(x, weights) { items <- unique(x) newvar <- c() for (i in seq_len(length(items))) { newcount <- round(sum(weights[which(x == items[i])])) newvar <- c(newvar, rep(items[i], newcount)) } newvar } sjstats/R/confint_ncg.R0000644000176200001440000000743714620333763014602 0ustar liggesusers# This function is a modified version from package MBESS # copied from https://github.com/cran/MBESS/blob/master/R/conf.limits.ncf.R # Author: Ken Kelley # License: GPL-3 confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = NULL) { alpha.lower <- alpha.upper <- (1 - conf.level) / 2 tol <- 1e-09 Jumping.Prop <- 0.1 FAILED <- NULL LL.0 <- stats::qf(p = alpha.lower * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) - (1 - alpha.lower) if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < (1 - alpha.lower)) { FAILED <- if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = 0) < 1 - alpha.lower) LL.0 <- 1e-08 if (stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.0) < 1 - alpha.lower) FAILED <- TRUE } if (is.null(FAILED)) { LL.1 <- LL.2 <- LL.0 while (Diff > tol) { LL.2 <- LL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.2) - (1 - alpha.lower) LL.1 <- LL.2 } LL.1 <- LL.2 / (1 + Jumping.Prop) LL.Bounds <- c(LL.1, (LL.1 + LL.2) / 2, LL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[1]) - (1 - alpha.lower) > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[3]) - (1 - alpha.lower) > tol if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[2], (LL.Bounds[2] + LL.Bounds[3]) / 2, LL.Bounds[3]) } if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) { LL.Bounds <- c(LL.Bounds[1], (LL.Bounds[1] + LL.Bounds[2]) / 2, LL.Bounds[2]) } Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) } LL <- LL.Bounds[2] } if (!is.null(FAILED)) LL <- NA FAILED.Up <- NULL UL.0 <- stats::qf(p = 1 - alpha.upper * 5e-04, df1 = df.1, df2 = df.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) UL.0 <- 1e-08 Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.0) - alpha.upper if (Diff < 0) FAILED.Up <- TRUE if (is.null(FAILED.Up)) { UL.1 <- UL.2 <- UL.0 while (Diff > tol) { UL.2 <- UL.1 * (1 + Jumping.Prop) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.2) - alpha.upper UL.1 <- UL.2 } UL.1 <- UL.2 / (1 + Jumping.Prop) UL.Bounds <- c(UL.1, (UL.1 + UL.2) / 2, UL.2) Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper while (abs(Diff) > tol) { Diff.1 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[1]) - alpha.upper > tol Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper > tol Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[3]) - alpha.upper > tol if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) { UL.Bounds <- c(UL.Bounds[2], (UL.Bounds[2] + 
UL.Bounds[3]) / 2, UL.Bounds[3])
      }
      if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) {
        UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] + UL.Bounds[2]) / 2, UL.Bounds[2])
      }
      Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper
    }
    UL <- UL.Bounds[2]
  }
  if (!is.null(FAILED.Up))
    UL <- NA

  list(Lower.Limit = LL, Upper.Limit = UL)
}
sjstats/R/inequ_trends.R0000644000176200001440000001002314620333763014774 0ustar liggesusers#' @title Compute trends in status inequalities
#' @name inequ_trend
#'
#' @description This method computes the proportional change of absolute
#' (rate differences) and relative (rate ratios) inequalities
#' of prevalence rates for two different status groups, as proposed
#' by Mackenbach et al. (2015).
#'
#' @param data A data frame that contains the variables with prevalence rates for both low
#' and high status groups (see 'Examples').
#' @param prev.low The name of the variable with the prevalence rates for
#' the low status groups.
#' @param prev.hi The name of the variable with the prevalence rates for
#' the high status groups.
#'
#' @return A data frame with the prevalence rates as well as the values for the
#' proportional change in absolute (\code{rd}) and relative (\code{rr})
#' inequalities.
#'
#' @references Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. \doi{10.1136/jech-2015-207018}
#'
#' @details Given the time trend of prevalence rates of an outcome for two status
#' groups (e.g. the mortality rates for people with lower and higher
#' socioeconomic status over 40 years), this function computes the
#' proportional change of absolute and relative inequalities, expressed
#' in changes in rate differences and rate ratios. The function implements
#' the algorithm proposed by \emph{Mackenbach et al. 2015}.
#'
#' @examplesIf requireNamespace("ggplot2")
#' # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5
#'
#' # 40 simulated time points, with an initial rate ratio of 2 and
#' # a rate difference of 100 (i.e. low status group starts with a
#' # prevalence rate of 200, the high status group with 100)
#'
#' # annual decline of prevalence is 1% for the low, and 3% for the
#' # high status group
#'
#' n <- 40
#' time <- seq(1, n, by = 1)
#' lo <- rep(200, times = n)
#' for (i in 2:n) lo[i] <- lo[i - 1] * .99
#'
#' hi <- rep(100, times = n)
#' for (i in 2:n) hi[i] <- hi[i - 1] * .97
#'
#' prev.data <- data.frame(lo, hi)
#'
#' # print values
#' inequ_trend(prev.data, "lo", "hi")
#'
#' # plot trends - here we see that the relative inequalities
#' # are increasing over time, while the absolute inequalities
#' # are first increasing as well, but later are decreasing
#' # (while rel. inequ.
are still increasing) #' plot(inequ_trend(prev.data, "lo", "hi")) #' #' @export inequ_trend <- function(data, prev.low, prev.hi) { # prepare data for prevalence rates for low and hi status groups if (is.null(data) || missing(data)) { dat <- data.frame(prev.low, prev.hi) } else { dat <- data[c(prev.low, prev.hi)] } # ensure common column names colnames(dat) <- c("lo", "hi") # trends in rate ratios # compute relative inequality for first time point, needed # as reference to compute proportional change over time dat$rr <- dat$lo[1] / dat$hi[1] # compute proportional change of relative inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rr[t] <- dat$rr[t - 1] * ((1 + delta.low) / (1 + delta.hi)) } # trends in rate difference # compute absolute inequality for first time point, needed # as reference to compute proportional change over time dat$rd <- dat$lo[1] - dat$hi[1] # compute proportional change of absolute inequalities over time for (t in 2:nrow(dat)) { delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1] delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1] dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1] * delta.low - dat$hi[t - 1] * delta.hi) } # return structure(class = "sj_inequ_trend", list(data = dat)) } sjstats/R/se_ybar.R0000644000176200001440000000312114620333763013721 0ustar liggesusers#' @title Standard error of sample mean for mixed models #' @name se_ybar #' #' @description Compute the standard error for the sample mean for mixed models, #' regarding the extent to which clustering affects the standard errors. #' May be used as part of the multilevel power calculation for cluster sampling #' (see \cite{Gelman and Hill 2007, 447ff}). #' #' @param fit Fitted mixed effects model (\code{\link[lme4]{merMod}}-class). #' #' @return The standard error of the sample mean of \code{fit}. #' #' @references Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. Cambridge, New York: Cambridge University Press #' #' @examplesIf require("lme4") #' fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy) #' se_ybar(fit) #' @export se_ybar <- function(fit) { # get model icc vars <- insight::get_variance(fit, verbose = FALSE) # get group variances tau.00 <- unname(vars$var.intercept) # total variance tot_var <- sum(tau.00, vars$var.residual) # get number of groups m.cnt <- vapply(fit@flist, nlevels, 1) # compute number of observations per group (level-2-unit) obs <- round(stats::nobs(fit) / m.cnt) # compute simple icc icc <- tau.00 / tot_var # compute standard error of sample mean se <- unlist(lapply(seq_len(length(m.cnt)), function(.x) { sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x])) })) # give names for se, so user sees, which random effect has what impact names(se) <- names(m.cnt) se } sjstats/R/sjStatistics.R0000644000176200001440000000352514616613032014767 0ustar liggesusers#' @title Expected and relative table values #' @name table_values #' @description This function calculates a table's cell, row and column percentages as #' well as expected values and returns all results as lists of tables. #' #' @param tab Simple \code{\link{table}} or \code{\link[stats]{ftable}} of which #' cell, row and column percentages as well as expected values are calculated. #' Tables of class \code{\link[stats]{xtabs}} and other will be coerced to #' \code{ftable} objects. 
#' @param digits Amount of digits for the table percentage values. #' #' @return (Invisibly) returns a list with four tables: #' \enumerate{ #' \item \code{cell} a table with cell percentages of \code{tab} #' \item \code{row} a table with row percentages of \code{tab} #' \item \code{col} a table with column percentages of \code{tab} #' \item \code{expected} a table with expected values of \code{tab} #' } #' #' @examples #' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) #' # show expected values #' table_values(tab)$expected #' # show cell percentages #' table_values(tab)$cell #' #' @export table_values <- function(tab, digits = 2) { # convert to ftable object if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) tab.cell <- round(100 * prop.table(tab), digits) tab.row <- round(100 * prop.table(tab, 1), digits) tab.col <- round(100 * prop.table(tab, 2), digits) tab.expected <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # return results invisible(structure(class = "sjutablevalues", list(cell = tab.cell, row = tab.row, col = tab.col, expected = tab.expected))) } sjstats/R/samplesize_mixed.R0000644000176200001440000001133214620333763015642 0ustar liggesusers#' @title Sample size for linear mixed models #' @name samplesize_mixed #' #' @description Compute an approximated sample size for linear mixed models #' (two-level-designs), based on power-calculation for standard #' design and adjusted for design effect for 2-level-designs. #' #' @param eff.size Effect size. #' @param df.n Optional argument for the degrees of freedom for numerator. See 'Details'. #' @param power Power of test (1 minus Type II error probability). #' @param sig.level Significance level (Type I error probability). #' @param k Number of cluster groups (level-2-unit) in multilevel-design. #' @param n Optional, number of observations per cluster groups #' (level-2-unit) in multilevel-design. #' @param icc Expected intraclass correlation coefficient for multilevel-model. #' #' @return A list with two values: The number of subjects per cluster, and the #' total sample size for the linear mixed model. #' #' @references Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum. #' \cr \cr #' Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. #' \cr \cr #' Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. #' #' @details The sample size calculation is based on a power-calculation for the #' standard design. If \code{df.n} is not specified, a power-calculation #' for an unpaired two-sample t-test will be computed (using #' \code{\link[pwr]{pwr.t.test}} of the \CRANpkg{pwr}-package). #' If \code{df.n} is given, a power-calculation for general linear models #' will be computed (using \code{\link[pwr]{pwr.f2.test}} of the #' \pkg{pwr}-package). The sample size of the standard design #' is then adjusted for the design effect of two-level-designs (see #' \code{\link{design_effect}}). Thus, the sample size calculation is appropriate #' in particular for two-level-designs (see \cite{Snijders 2005}). Models that #' additionally include repeated measures (three-level-designs) may work #' as well, however, the computed sample size may be less accurate. 
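#' \cr \cr
#' As a rough sketch of the computation (assuming the default approach without
#' \code{df.n}): the number of observations for the standard design is taken
#' from \code{pwr::pwr.t.test()} and then multiplied by the design effect,
#' which is essentially \code{1 + (n - 1) * icc} (see \code{\link{design_effect}}).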
#' #' @examplesIf requireNamespace("pwr") #' # Sample size for multilevel model with 30 cluster groups and a small to #' # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and #' # hence a total sample size of about 802 observations is needed. #' samplesize_mixed(eff.size = .3, k = 30) #' #' # Sample size for multilevel model with 20 cluster groups and a medium #' # to large effect size for linear models of 0.2. Five subjects per cluster and #' # hence a total sample size of about 107 observations is needed. #' samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) #' @export samplesize_mixed <- function(eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, n, icc = 0.05) { if (!requireNamespace("pwr", quietly = TRUE)) { stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE) } # compute sample size for standard design if (is.null(df.n)) # if we have no degrees of freedom specified, use t-test obs <- 2 * pwr::pwr.t.test(d = eff.size, sig.level = sig.level, power = power)$n else # we have df, so power-calc for linear models obs <- pwr::pwr.f2.test(u = df.n, f2 = eff.size, sig.level = sig.level, power = power)$v + df.n + 1 # if we have no information on the number of observations per cluster, # compute this number now if (missing(n) || is.null(n)) { n <- (obs * (1 - icc)) / (k - (obs * icc)) if (n < 1) { warning("Minimum required number of subjects per cluster is negative and was adjusted to be positive. You may reduce the requirements for the multi-level structure (i.e. reduce `k` or `icc`), or you can increase the effect-size.", call. = FALSE) n <- 1 } } # adjust standard design by design effect total.n <- obs * design_effect(n = n, icc = icc) # sample size for each group and total n smpsz <- list(round(total.n / k), round(total.n)) # name list names(smpsz) <- c("Subjects per Cluster", "Total Sample Size") smpsz } #' @rdname samplesize_mixed #' @export smpsize_lmm <- samplesize_mixed sjstats/R/wtd_cor.R0000644000176200001440000000421614620333763013744 0ustar liggesusers#' @rdname weighted_se #' @export weighted_correlation <- function(data, ...) { UseMethod("weighted_correlation") } #' @rdname weighted_se #' @export weighted_correlation.default <- function(data, x, y, weights, ci.lvl = 0.95, ...) { if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1)) insight::format_error("'ci.lvl' must be a single number between 0 and 1.") x.name <- deparse(substitute(x)) y.name <- deparse(substitute(y)) w.name <- deparse(substitute(weights)) if (w.name == "NULL") { w.name <- "weights" data$weights <- 1 } # create string with variable names vars <- c(x.name, y.name, w.name) # get data dat <- suppressMessages(data[vars]) dat <- stats::na.omit(dat) xv <- dat[[x.name]] yv <- dat[[y.name]] wv <- dat[[w.name]] weighted_correlation_helper(xv, yv, wv, ci.lvl) } #' @rdname weighted_se #' @export weighted_correlation.formula <- function(formula, data, ci.lvl = 0.95, ...) 
{
  if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
    insight::format_error("'ci.lvl' must be a single number between 0 and 1.")

  vars <- all.vars(formula)

  if (length(vars) < 3) {
    vars <- c(vars, "weights")
    data$weights <- 1
  }

  # get data
  dat <- suppressMessages(data[vars])
  dat <- stats::na.omit(dat)

  xv <- dat[[vars[1]]]
  yv <- dat[[vars[2]]]
  wv <- dat[[vars[3]]]

  weighted_correlation_helper(xv, yv, wv, ci.lvl)
}


weighted_correlation_helper <- function(xv, yv, wv, ci.lvl) {
  x <- xv - weighted_mean(xv, weights = wv)
  y <- yv - weighted_mean(yv, weights = wv)

  x <- x / weighted_sd(x, weights = wv)
  y <- y / weighted_sd(y, weights = wv)

  results <- stats::coef(summary(stats::lm(y ~ x, weights = wv)))[2, ]
  ci <- ci.lvl - ((1 - ci.lvl) / 2)
  ci <- results[1] + (stats::qnorm(ci) * c(-1, 1) * results[2])

  structure(
    class = "sj_wcor",
    list(
      estimate = results[1],
      method = "Pearson's Correlation Coefficient",
      p.value = results[4],
      ci = ci,
      ci.lvl = ci.lvl
    )
  )
}
sjstats/R/wtd_variance.R0000644000176200001440000000040114620333763014741 0ustar liggesusersweighted_variance <- function(x, w) {
  if (is.null(w)) w <- rep(1, length(x))

  x[is.na(w)] <- NA
  w[is.na(x)] <- NA

  w <- stats::na.omit(w)
  x <- stats::na.omit(x)

  xbar <- sum(w * x) / sum(w)
  sum(w * ((x - xbar)^2)) / (sum(w) - 1)
}
sjstats/R/bootstrap.R0000644000176200001440000001020414620333763014312 0ustar liggesusers#' @title Generate nonparametric bootstrap replications
#' @name bootstrap
#'
#' @description Generates \code{n} bootstrap samples of \code{data} and
#' returns the bootstrapped data frames as list-variable.
#'
#' @param data A data frame.
#' @param n Number of bootstraps to be generated.
#' @param size Optional, size of the bootstrap samples. May either be a number
#' between 1 and \code{nrow(data)} or a value between 0 and 1 to sample
#' a proportion of observations from \code{data} (see 'Examples').
#'
#' @return A data frame with one column: a list-variable
#' \code{strap}, which contains resample-objects of class \code{sj_resample}.
#' These resample-objects are lists with three elements:
#' \enumerate{
#' \item the original data frame, \code{data}
#' \item the rownumbers \code{id}, i.e. rownumbers of \code{data}, indicating the resampled rows with replacement
#' \item the \code{resample.id}, indicating the index of the resample (i.e. the position of the \code{sj_resample}-object in the list \code{strap})
#' }
#'
#' @details By default, each bootstrap sample has the same number of observations
#' as \code{data}. To generate bootstrap samples without resampling
#' same observations (i.e. sampling without replacement), use
#' \code{size} to get bootstrapped data with a specific number
#' of observations. However, specifying the \code{size}-argument is much
#' less memory-efficient than the bootstrap with replacement. Hence,
#' it is recommended to ignore the \code{size}-argument, if it is
#' not really needed.
#'
#' @note This function applies nonparametric bootstrapping, i.e. the function
#' draws samples with replacement.
#' \cr \cr
#' There is an \code{as.data.frame}- and a \code{print}-method to get or
#' print the resampled data frames. See 'Examples'. The \code{as.data.frame}-
#' method automatically applies whenever coercion is done because a data
#' frame is required as input. See 'Examples' in \code{\link{boot_ci}}.
#'
#'
#' @seealso \code{\link{boot_ci}} to calculate confidence intervals from
#' bootstrap samples.
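#' \code{\link{boot_se}} computes the bootstrapped standard error (see the
#' examples below).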
#'
#' @examples
#' data(efc)
#' bs <- bootstrap(efc, 5)
#'
#' # now run models for each bootstrapped sample
#' lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x))
#'
#' # generate bootstrap samples with 600 observations for each sample
#' bs <- bootstrap(efc, 5, 600)
#'
#' # generate bootstrap samples with 70% observations of the original sample size
#' bs <- bootstrap(efc, 5, .7)
#'
#' # compute standard error for a simple vector from bootstraps
#' # use the `as.data.frame()`-method to get the resampled
#' # data frame
#' bs <- bootstrap(efc, 100)
#' bs$c12hour <- unlist(lapply(bs$strap, function(x) {
#'   mean(as.data.frame(x)$c12hour, na.rm = TRUE)
#' }))
#'
#' # bootstrapped standard error
#' boot_se(bs, "c12hour")
#'
#' # bootstrapped CI
#' boot_ci(bs, "c12hour")
#' @export
bootstrap <- function(data, n, size) {
  if (!missing(size) && !is.null(size)) {
    # check for valid range
    if (size < 0 || size > nrow(data))
      stop("`size` must be greater than 0, but not greater than number of rows of `data`.", call. = FALSE)

    # check if we want proportions
    if (size < 1) size <- as.integer(nrow(data) * size)

    # generate bootstraps w/o replacement
    repl <- FALSE
  } else {
    # size = observations
    size <- nrow(data)

    # generate bootstraps with replacement
    repl <- TRUE
  }

  # generate bootstrap resamples
  strap <- replicate(n, resample(data, size, repl), simplify = FALSE)

  # add resample ID, may be used for other functions (like 'se()' for 'icc()')
  for (i in seq_len(length(strap))) strap[[i]]$resample.id <- i

  # return data frame
  data.frame(strap = I(strap))
}


resample <- function(data, size, replace) {
  structure(
    class = "sj_resample",
    list(
      data = data,
      id = sample(nrow(data), size = size, replace = replace)
    ))
}
sjstats/R/svy_median.R0000644000176200001440000000060714620333763014441 0ustar liggesusers#' @rdname weighted_se
#' @export
survey_median <- function(x, design) {
  # check if pkg survey is available
  insight::check_if_installed("survey")

  # deparse
  v <- stats::as.formula(paste("~", as.character(substitute(x))))

  as.vector(
    survey::svyquantile(
      v,
      design = design,
      quantiles = 0.5,
      ci = FALSE,
      na.rm = TRUE
    )
  )
}
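# A minimal usage sketch (assuming a hypothetical survey design object `des`
# created with survey::svydesign(), whose data contain a numeric variable
# `total`):
#
#   survey_median(total, design = des)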
sjstats/R/auto_prior.R0000644000176200001440000001157614620500520014460 0ustar liggesusers#' @title Create default priors for brms-models
#' @name auto_prior
#'
#' @description This function creates default priors for brms-regression
#' models, based on the same automatic prior-scale adjustment as in
#' \pkg{rstanarm}.
#'
#' @param formula A formula describing the model, which just needs to contain
#' the model terms, but no notation of interaction, splines etc. Usually,
#' you want only those predictors in the formula, for which automatic
#' priors should be generated. Add informative priors afterwards to the
#' returned \code{brmsprior}-object.
#' @param data The data that will be used to fit the model.
#' @param gaussian Logical, if the outcome is gaussian or not.
#' @param locations A numeric vector with location values for the priors. If
#' \code{locations = NULL}, \code{0} is used as location parameter.
#'
#' @return A \code{brmsprior}-object.
#'
#' @details \code{auto_prior()} is a small, convenient function to create
#' some default priors for brms-models with automatically adjusted prior
#' scales, in a similar way as \pkg{rstanarm} does. The default scale for
#' the intercept is 10, for coefficients 2.5. If the outcome is gaussian,
#' both scales are multiplied with \code{sd(y)}. Then, for categorical
#' variables, nothing more is changed. For numeric variables, the scales
#' are divided by the standard deviation of the related variable.
#' \cr \cr
#' All prior distributions are \emph{normal} distributions. \code{auto_prior()}
#' is intended to quickly create default priors with feasible scales. If
#' more precise definitions of priors are necessary, this needs to be done
#' directly with brms-functions like \code{set_prior()}.
#'
#' @note As \code{auto_prior()} also sets priors on the intercept, the model
#' formula used in \code{brms::brm()} must be rewritten to something like
#' \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}.
#'
#' @examplesIf requireNamespace("brms")
#' data(efc)
#' efc$c172code <- as.factor(efc$c172code)
#' efc$c161sex <- as.factor(efc$c161sex)
#'
#' mf <- formula(neg_c_7 ~ c161sex + c160age + c172code)
#' auto_prior(mf, efc, TRUE)
#'
#' ## compare to
#' # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200)
#' # ps <- prior_summary(m)
#' # ps$prior_intercept$adjusted_scale
#' # ps$prior$adjusted_scale
#'
#' ## usage
#' # ap <- auto_prior(mf, efc, TRUE)
#' # brm(mf, data = efc, prior = ap)
#'
#' # add informative priors
#' mf <- formula(neg_c_7 ~ c161sex + c172code)
#'
#' auto_prior(mf, efc, TRUE) +
#'   brms::prior(normal(.1554, 40), class = "b", coef = "c160age")
#'
#' # example with binary response
#' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
#' mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age)
#' auto_prior(mf, efc, FALSE)
#' @export
auto_prior <- function(formula, data, gaussian, locations = NULL) {
  insight::check_if_installed("brms")

  scale.b <- 2.5
  scale.y <- 10

  pred <- insight::find_predictors(formula, effects = "all", flatten = TRUE)
  y.name <- insight::find_response(formula, combine = TRUE)

  data <- stats::na.omit(data[c(y.name, pred)])
  y <- data[[y.name]]

  # check if response is binary
  if (missing(gaussian) && insight::n_unique(y) == 2) gaussian <- FALSE

  if (isTRUE(gaussian) && insight::n_unique(y) == 2)
    insight::format_alert("Priors were calculated based on the assumption that the response is Gaussian, however it seems to be binary.") # nolint

  if (gaussian) {
    scale.factor <- stats::sd(y, na.rm = TRUE)
    scale.b <- scale.b * scale.factor
    scale.y <- scale.y * scale.factor
  }

  if (!is.null(locations))
    location.y <- locations[1]
  else
    location.y <- 0

  priors <- brms::set_prior(
    sprintf("normal(%s, %s)", round(location.y, 2), round(scale.y, 2)),
    class = "Intercept"
  )

  is.fac <- NULL
  term.names <- NULL
  scale.pred <- NULL
variable # name + category name) for (i in pred) { f <- data[[i]] if (is.factor(f)) { i <- sprintf("%s%s", i, levels(f)[2:nlevels(f)]) is.fac <- c(is.fac, rep(TRUE, nlevels(f) - 1)) scale.pred <- c(scale.pred, rep(scale.b, nlevels(f) - 1)) } else { is.fac <- c(is.fac, FALSE) scale.pred <- c(scale.pred, scale.b / stats::sd(f, na.rm = TRUE)) } term.names <- c(term.names, i) } for (i in seq_along(term.names)) { if (!is.null(locations) && length(locations) >= (i + 1)) location.b <- locations[i + 1] else location.b <- 0 priors <- priors + brms::set_prior( sprintf("normal(%s, %s)", round(location.b, 2), round(scale.pred[i], 2)), class = "b", coef = term.names[i] ) } priors } sjstats/R/xtab_statistics.R0000644000176200001440000002240214620333763015510 0ustar liggesusers#' @title Measures of association for contingency tables #' @name crosstable_statistics #' #' @description This function calculates various measure of association for #' contingency tables and returns the statistic and p-value. #' Supported measures are Cramer's V, Phi, Spearman's rho, #' Kendall's tau and Pearson's r. #' #' @param data A data frame or a table object. If a table object, `x1` and #' `x2` will be ignored. For Kendall's _tau_, Spearman's _rho_ or Pearson's #' product moment correlation coefficient, `data` needs to be a data frame. #' If `x1` and `x2` are not specified, the first two columns of the data #' frames are used as variables to compute the crosstab. #' @param formula A formula of the form `lhs ~ rhs` where `lhs` is a #' numeric variable giving the data values and `rhs` a factor giving the #' corresponding groups. #' @param tab A [`table()`] or [`ftable()`]. Tables of class [`xtabs()`] and #' other will be coerced to `ftable` objects. #' @param x1 Name of first variable that should be used to compute the #' contingency table. If `data` is a table object, this argument will be #' irgnored. #' @param x2 Name of second variable that should be used to compute the #' contingency table. If `data` is a table object, this argument will be #' irgnored. #' @param statistics Name of measure of association that should be computed. May #' be one of `"auto"`, `"cramer"`, `"phi"`, `"spearman"`, `"kendall"`, #' `"pearson"` or `"fisher"`. See 'Details'. #' @param ci.lvl Scalar between 0 and 1. If not `NULL`, returns a data #' frame including lower and upper confidence intervals. #' @param weights Name of variable in `x` that indicated the vector of weights #' that will be applied to weight all observations. Default is `NULL`, so no #' weights are used. #' @param ... Other arguments, passed down to the statistic functions #' [`chisq.test()`], [`fisher.test()`] or [`cor.test()`]. #' #' @inheritParams bootstrap #' @inheritParams boot_ci #' #' @return For [`phi()`], the table's Phi value. For [`cramers_v()]`, the #' table's Cramer's V. #' #' For `crosstable_statistics()`, a list with following components: #' #' - `estimate`: the value of the estimated measure of association. #' - `p.value`: the p-value for the test. #' - `statistic`: the value of the test statistic. #' - `stat.name`: the name of the test statistic. #' - `stat.html`: if applicable, the name of the test statistic, in HTML-format. #' - `df`: the degrees of freedom for the contingency table. #' - `method`: character string indicating the name of the measure of association. #' - `method.html`: if applicable, the name of the measure of association, in HTML-format. #' - `method.short`: the short form of association measure, equals the `statistics`-argument. 
#' - `fisher`: logical, if Fisher's exact test was used to calculate the p-value.
#'
#' @details The p-values for Cramer's V and the Phi coefficient are based
#' on `chisq.test()`. If any expected value of a table cell is smaller than 5,
#' or smaller than 10 and the df is 1, then `fisher.test()` is used to compute
#' the p-value, unless `statistics = "fisher"`; in this case, the use of
#' `fisher.test()` is forced to compute the p-value. The test statistic is
#' calculated with `cramers_v()` resp. `phi()`.
#'
#' Both test statistic and p-value for Spearman's rho, Kendall's tau and
#' Pearson's r are calculated with `cor.test()`.
#'
#' When `statistics = "auto"`, only Cramer's V or Phi are calculated, based on
#' the dimension of the table (i.e. if the table has more than two rows or
#' columns, Cramer's V is calculated, else Phi).
#'
#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
#' \doi{10.3390/math11091982}
#'
#' @examples
#' # Phi coefficient for 2x2 tables
#' tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE))
#' phi(tab)
#'
#' # Cramer's V for nominal variables with more than 2 categories
#' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE))
#' cramer(tab)
#'
#' # formula notation
#' data(efc)
#' cramer(e16sex ~ c161sex, data = efc)
#'
#' # bootstrapped confidence intervals
#' cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100)
#'
#' # 2x2 table, compute Phi automatically
#' crosstable_statistics(efc, e16sex, c161sex)
#'
#' # more dimensions than 2x2, compute Cramer's V automatically
#' crosstable_statistics(efc, c172code, c161sex)
#'
#' # ordinal data, use Kendall's tau
#' crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall")
#'
#' # calculate Spearman's rho, with continuity correction
#' crosstable_statistics(efc,
#'   e42dep,
#'   quol_5,
#'   statistics = "spearman",
#'   exact = FALSE,
#'   continuity = TRUE
#' )
#' @export
crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ...) {
  # match arguments
  statistics <- match.arg(statistics)

  # name for test statistics in HTML
  stat.html <- NULL

  # check if data is a table
  if (is.table(data)) {
    # 'data' is a table - copy to table object
    tab <- data
    # check if statistics are possible to compute
    if (statistics %in% c("spearman", "kendall", "pearson")) {
      stop(
        sprintf(
          "Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
          statistics
        ),
        call. = FALSE
      )
    }
  } else {
    # evaluate unquoted names
    x1 <- deparse(substitute(x1))
    x2 <- deparse(substitute(x2))
    weights <- deparse(substitute(weights))

    # if names were quoted, remove quotes
    x1 <- gsub("\"", "", x1, fixed = TRUE)
    x2 <- gsub("\"", "", x2, fixed = TRUE)
    weights <- gsub("\"", "", weights, fixed = TRUE)

    if (insight::is_empty_object(weights) || weights == "NULL")
      weights <- NULL
    else
      weights <- data[[weights]]

    # check for "NULL" and get data
    if (x1 != "NULL" && x2 != "NULL")
      data <- data[, c(x1, x2)]
    else
      data <- data[, 1:2]

    if (!is.null(weights)) data <- cbind(data, weights)

    # make table
    if (!is.null(weights)) {
      tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]])))
      class(tab) <- "table"
    } else {
      tab <- table(data)
    }
  }

  # get expected values
  tab.val <- table_values(tab)

  # remember whether fisher's exact test was used or not
  use.fisher <- FALSE

  # select statistics automatically, based on number of rows/columns
  if (statistics %in% c("auto", "cramer", "phi", "fisher")) {
    # get chisq-statistics, for df and p-value
    chsq <- suppressWarnings(stats::chisq.test(tab, ...))
    pv <- chsq$p.value
    test <- chsq$statistic

    # set statistics name
    names(test) <- "Chi-squared"
    stat.html <- "&chi;&sup2;"

    # check row/columns
    if ((nrow(tab) > 2 || ncol(tab) > 2 || statistics %in% c("cramer", "fisher")) && statistics != "phi") {
      # get cramer's V
      s <- cramer(tab)

      # if minimum expected values below 5, compute fisher's exact test
      if (statistics == "fisher" || min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) {
        pv <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)$p.value
        use.fisher <- TRUE
      }

      # set statistics
      statistics <- "cramer"
    } else {
      # get Phi
      s <- phi(tab)

      # if minimum expected values below 5 and df=1, compute fisher's exact test
      if (min(tab.val$expected) < 5 || (min(tab.val$expected) < 10 && chsq$parameter == 1)) {
        pv <- stats::fisher.test(tab, ...)$p.value
        use.fisher <- TRUE
      }

      # set statistics
      statistics <- "phi"
    }
  } else {
    # compute correlation coefficient
    cv <- stats::cor.test(x = data[[1]], y = data[[2]], method = statistics, ...)
    # get statistics and p-value
    s <- cv$estimate
    pv <- cv$p.value
    test <- cv$statistic
    stat.html <- names(test)
  }

  # compute method string
  method <- ifelse(statistics == "kendall", "Kendall's tau",
    ifelse(statistics == "spearman", "Spearman's rho", # nolint
      ifelse(statistics == "pearson", "Pearson's r", # nolint
        ifelse(statistics == "cramer", "Cramer's V", "Phi") # nolint
      )
    )
  )

  # compute method string
  method.html <- ifelse(statistics == "kendall", "Kendall's &tau;",
    ifelse(statistics == "spearman", "Spearman's &rho;", # nolint
      ifelse(statistics == "pearson", "Pearson's r", # nolint
        ifelse(statistics == "cramer", "Cramer's V", "&phi;") # nolint
      )
    )
  )

  # return result
  structure(class = "sj_xtab_stat", list(
    estimate = s,
    p.value = pv,
    statistic = test,
    stat.name = names(test),
    stat.html = stat.html,
    df = (nrow(tab) - 1) * (ncol(tab) - 1),
    n_obs = sum(tab, na.rm = TRUE),
    method = method,
    method.html = method.html,
    method.short = statistics,
    fisher = use.fisher
  ))
}

#' @rdname crosstable_statistics
#' @export
xtab_statistics <- crosstable_statistics
sjstats/NEWS.md0000644000176200001440000003470514620444364013057 0ustar liggesusers# sjstats 0.19.0

* Weighted significance tests have been re-designed. The functions `weighted_ttest()`, `weighted_mannwhitney()` and `weighted_chisqtest()` are no longer available. These are now re-implemented in `t_test()`, `mann_whitney_test()` and `chi_squared_test()`.
If weights are required, the `weights` argument can be used. Furthermore, new functions for significance testing were added: `kruskal_wallis_test()` and `wilcoxon_test()`.

* `means_by_group()` and `mean_n()` were removed. The replacements are `datawizard::means_by_group()` and `datawizard::row_means()` (using the `min_valid` argument).

* `weighted_median()`, `weighted_sd()` and `weighted_mean()` were removed. Their replacements are `datawizard::weighted_median()`, `datawizard::weighted_sd()` and `datawizard::weighted_mean()`.

* Package dependency was dramatically reduced. _sjstats_ now requires much fewer and much more light-weight packages to work.

* Some minor bugs were fixed.

# sjstats 0.18.2

* Fix issues with changes in forthcoming upstream packages.

# sjstats 0.18.1

## Deprecated and defunct

_sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. Therefore, following functions are now defunct:

* `mediation()`, please use `bayestestR::mediation()`.
* `eta_sq()`, please use `effectsize::eta_squared()`.
* `omega_sq()`, please use `effectsize::omega_squared()`.
* `epsilon_sq()`, please use `effectsize::epsilon_squared()`.
* `odds_to_rr()`, please use `effectsize::oddsratio_to_riskratio()`.
* `std_beta()`, please use `effectsize::standardize_parameters()`.
* `robust()`, please use `parameters::standard_error_robust()`.
* `scale_weights()`, please use `datawizard::rescale_weights()`.

## General

* Improved printing for `weighted_mannwhitney()`.
* `weighted_chisqtest()` can now be computed for given probabilities.
* `means_by_group()` now contains numeric values in the returned data frame. Value formatting is completely done inside the print-method.
* Updated imports.

# sjstats 0.18.0

## General

* Effect size computation functions (like `eta_sq()`) now internally call the related functions from the *effectsize* package.
* Remove packages from "Suggest" that have been removed from CRAN.

# sjstats 0.17.9

## Bug fixes

* Fixed documentation for `chisq_gof()`.
* Fixed issue in `anova_stats()` with incorrect effect sizes for certain Anova types (that included an intercept).

# sjstats 0.17.8

## Deprecated and defunct

_sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. Therefore, following functions are now deprecated:

* `cohens_f()`, please use `effectsize::cohens_f()`.
* `std_beta()`, please use `effectsize::standardize_parameters()`.
* `tidy_stan()`, please use `parameters::model_parameters()`.
* `scale_weights()`, please use `parameters::rescale_weights()`.
* `robust()`, please use `parameters::standard_error_robust()`.

## General

* Functions for weighted statistics with prefix `wtd_*()` have been renamed to `weighted_*()`.
* `svy_md()` was renamed to `survey_median()`.
* `mannwhitney()` is an alias for `mwu()`.
* `means_by_group()` is an alias for `grpmean()`.

# sjstats 0.17.7

## Deprecated and defunct

_sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. The aim of **easystats** is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models.
Therefore, following functions are now deprecated: * `p_value()`, please use `parameters::p_value()` * `se()`, please use `parameters::standard_error()` ## General * Revise some functions to cope with the forthcoming _insight_ update. # sjstats 0.17.6 ## General * Minor revisions to meet the changes in the forthcoming update from *tidyr*. * `design_effect()` is an alias for `deff()`. * `samplesize_mixed()` is an alias for `smpsize_lmm()`. * `crosstable_statistics()` is an alias for `xtab_statistics()`. ## New functions * `svyglm.zip()` to fit zero-inflated Poisson models for survey-designs. ## Changes to functions * `phi()` and `cramer()` can now compute confidence intervals. * `tidy_stan()` removes prior parameters from output. * `tidy_stan()` now also prints the probability of direction. ## Bug fixes * Fix bug with wrong computation in `odds_to_rr()`. # sjstats 0.17.5 ## New functions * `epsilon_sq()`, to compute epsilon-squared effect-size. ## Deprecated and defunct _sjstats_ is being re-structured, and many functions are re-implemented in new packages that are part of a new project called **easystats**. The aim of **easystats** is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models. Therefore, following functions are now deprecated: * `link_inverse()`, please use `insight::link_inverse()` * `model_family()`, please use `insight::model_info()` * `model_frame()`, please use `insight::get_data()` * `pred_vars()`, please use `insight::find_predictors()` * `re_grp_var()`, please use `insight::find_random()` * `grp_var()`, please use `insight::find_random()` * `resp_val()`, please use `insight::get_response()` * `resp_var()`, please use `insight::find_response()` * `var_names()`, please use `insight::clean_names()` * `overdisp()`, please use `performance::check_overdispersion()` * `zero_count()`, please use `performance::check_zeroinflation()` * `converge_ok()`, please use `performance::check_convergence()` * `is_singular()`, please use `performance::check_singularity()` * `reliab_test()`, please use `performance::item_reliability()` * `split_half()`, please use `performance::item_split_half()` * `predictive_accurarcy()`, please use `performance::performance_accuracy()` * `cronb()`, please use `performance::cronbachs_alpha()` * `difficulty()`, please use `performance::item_difficulty()` * `mic()`, please use `performance::item_intercor()` * `pca()`, please use `parameters::principal_components()` * `pca_rotate()`, please use `parameters::principal_components()` * `r2()`, please use `performance::r2()` * `icc()`, please use `performance::icc()` * `rmse()`, please use `performance::rmse()` * `rse()`, please use `performance::rse()` * `mse()`, please use `performance::mse()` * `hdi()`, please use `bayestestR::hdi()` * `cred_int()`, please use `bayestestR::ci()` * `rope()`, please use `bayestestR::rope()` * `n_eff()`, please use `bayestestR::effective_sample()` * `equi_test()`, please use `bayestestR::equivalence_test()` * `multicollin()`, please use `performance::check_collinearity()` * `normality()`, please use `performance::check_normality()` * `autocorrelation()`, please use `performance::check_autocorrelation()` * `heteroskedastic()`, please use `performance::check_heteroscedasticity()` * `outliers()`, please use `performance::check_outliers()` ## Changes to functions * Anova-stats functions (like `eta_sq()`) get a `method`-argument to define the method for computing confidence intervals from bootstrapping. 
## Bug fixes

* In some situations, `smpsize_lmm()` could result in negative sample-size recommendations. This was fixed, and a warning is now shown indicating that the parameters for the power-calculation should be modified.
* Fixed issue with incorrectly calculated effect size `r` in `mwu()` if group-factor contained more than two groups.

# sjstats 0.17.4

## General

* Following models/objects are now supported by model-information functions like `model_family()`, `link_inverse()` or `model_frame()`: `MixMod` (package **GLMMadaptive**), **MCMCglmm**, `mlogit` and `gmnl`.
* Reduce package dependencies.

## New functions

* `cred_int()`, to compute uncertainty intervals of Bayesian models. Mimics the behaviour and style of `hdi()` and is thus a convenient complement to functions like `posterior_interval()`.

## Changes to functions

* `equi_test()` now finds better defaults for models with binomial outcome (like logistic regression models).
* `r2()` for mixed models now also should work properly for mixed models fitted with **rstanarm**.
* `anova_stats()` and the like (e.g. `eta_sq()`) now all preserve original term names.
* `model_family()` now returns `$is_count = TRUE`, when model is a count-model, and `$is_beta = TRUE` for models with beta-family.
* `pred_vars()` checks that return value has only unique values.
* `pred_vars()` gets a `zi`-argument to return the variables from a model's zero-inflation-formula.

## Bug fixes

* Fix minor issues in `wtd_sd()` and `wtd_mean()` when weight was `NULL` (which usually shouldn't be the case anyway).
* Fix potential issue with `deparse()`, cutting off very long formulas in various functions.
* Fix encoding issues in help-files.

# sjstats 0.17.3

## General

* Export `dplyr::n()`, to meet forthcoming changes in dplyr 0.8.0.

## Changes to functions

* `boot_ci()` gets a `ci.lvl`-argument.
* The `rotation`-argument in `pca_rotate()` now supports all rotations from `psych::principal()`.
* `pred_vars()` gets a `fe.only`-argument to return only fixed effects terms from mixed models, and a `disp`-argument to return the variables from a model's dispersion-formula.
* `icc()` for Bayesian models gets an `adjusted`-argument, to calculate adjusted and conditional ICC (however, only for Gaussian models).
* For `icc()` for non-Gaussian Bayes-models, a message is printed that recommends setting argument `ppd` to `TRUE`.
* `resp_val()` and `resp_var()` now also work for **brms**-models with additional response information (like `trial()` in formula).
* `resp_var()` gets a `combine`-argument, to return either the name of the matrix-column or the original variable names for matrix-columns.
* `model_frame()` now also returns the original variables for matrix-column-variables.
* `model_frame()` now also returns the variable from the dispersion-formula of **glmmTMB**-models.
* `model_family()` and `link_inverse()` now support **glmmPQL**, **felm** and **lm_robust**-models.
* `anova_stats()` and the like (`omega_sq()` etc.) now support gam-models from package **gam**.
* `p_value()` now supports objects of class `svyolr`.

## Bug fixes

* Fix issue with `se()` and `get_re_var()` for objects returned by `icc()`.
* Fix issue with `icc()` for Stan-models.
* `var_names()` did not clear terms with log-log transformation, e.g. `log(log(y))`.
* Fix issue in `model_frame()` for models with splines with only one column.

# sjstats 0.17.2

## General

* Revised help-files for `r2()` and `icc()`, also by adding more references.
## New functions

* `re_grp_var()` to find group factors of random effects in mixed models.

## Changes to functions

* `omega_sq()` and `eta_sq()` give more informative messages when using non-supported objects.
* `r2()` and `icc()` give more informative warnings and messages.
* `tidy_stan()` supports printing simplex parameters of monotonic effects of **brms** models.
* `grpmean()` and `mwu()` get a `file` and `encoding` argument, to save the HTML output as a file.

## Bug fixes

* `model_frame()` now correctly names the offset-columns for terms provided as `offset`-argument (i.e. for models where the offset was not specified inside the formula).
* Fixed an issue with the `weights`-argument in `grpmean()` when the variable name was passed as a character vector.
* Fixed an issue with `r2()` for **glmmTMB** models with `ar1` random effects structure.

# sjstats 0.17.1

## New functions

* `wtd_chisqtest()` to compute a weighted Chi-squared test.
* `wtd_median()` to compute the weighted median of variables (a weighted-statistics sketch follows at the end of this changelog).
* `wtd_cor()` to compute weighted correlation coefficients of variables.

## Changes to functions

* `mediation()` can now cope with models from different families, e.g. if the mediator or outcome is binary, while the treatment effect is continuous.
* `model_frame()`, `link_inverse()`, `pred_vars()`, `resp_var()`, `resp_val()`, `r2()` and `model_family()` now support `clm2`-objects from package **ordinal**.
* `anova_stats()` gives a more informative message for non-supported models or ANOVA-options.

## Bug fixes

* Fixed an issue with `model_family()` and `link_inverse()` for models fitted with `pscl::hurdle()` or `pscl::zeroinfl()`.
* Fixed an issue with a wrong title in `grpmean()` for grouped data frames, when the grouping variable was an unlabelled factor.
* Fixed an issue with `model_frame()` for **coxph**-models with polynomial or spline-terms.
* Fixed an issue with `mediation()` for logical variables.

# sjstats 0.17.0

## General

* Reduced package dependencies.

## New functions

* `wtd_ttest()` to compute a weighted t-test.
* `wtd_mwu()` to compute a weighted Mann-Whitney-U or Kruskal-Wallis test.

## Changes to functions

* `robust()` was revised, getting more arguments to specify different types of covariance-matrix estimation, and now handles these more flexibly.
* Improved the `print()`-method for `tidy_stan()` for _brmsfit_-objects with categorical families.
* `se()` now also computes standard errors for relative frequencies (proportions) of a vector.
* `r2()` now also computes r-squared values for _glmmTMB_-models from `genpois`-families.
* `r2()` gives more precise warnings for non-supported model-families.
* `xtab_statistics()` gets a `weights`-argument, to compute measures of association for contingency tables for weighted data.
* The `statistics`-argument in `xtab_statistics()` gets a `"fisher"`-option, to force Fisher's Exact Test to be used.
* Improved the variance calculation in `icc()` for generalized linear mixed models with Poisson or negative binomial families.
* `icc()` gets an `adjusted`-argument, to calculate the adjusted and conditional ICC for mixed models.
* To get consistent argument names across functions, the argument `weight.by` is now deprecated and renamed to `weights`.

## Bug fixes

* Fixed issues with the effect-size computation for repeated-measures ANOVA when using bootstrapping to compute confidence intervals.
* `grpmean()` now also adjusts the `n`-column for weighted data.
* `icc()`, `re_var()` and `get_re_var()` now correctly compute the random-effect variances for models with multiple random slopes per random-effect term (e.g., `(1 + rs1 + rs2 | grp)`).
* Fixed issues in `tidy_stan()`, `mcse()`, `hdi()` and `n_eff()` for `stan_polr()`-models.
* Plotting `equi_test()` did not work for intercept-only models.
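As a closing note to this changelog: several of the weighted helpers introduced in 0.17.0 and 0.17.1 live on under new names in the current package (for instance, `weighted_se()` and `weighted_median()` replaced `wtd_se()` and `wtd_median()`). A minimal sketch follows; the weights are simulated purely for illustration, since `efc` ships without a weighting variable:

```r
library(sjstats)
data(efc)

# complete cases of "hours of care per week"
x <- na.omit(efc$c12hour)

# hypothetical weights, simulated for this sketch only
set.seed(123)
w <- abs(rnorm(length(x), mean = 1, sd = 0.2))

weighted_mean(x, weights = w)  # re-exported from datawizard
weighted_se(x, weights = w)    # weighted standard error
```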
sjstats/MD50000644000176200001440000001135214620625152012262 0ustar liggesusers
517d5b88f63ee13206d33280567b26ff *DESCRIPTION
74455df2ddc77d6e8704d5268f9c6204 *NAMESPACE
96785cdbb9b484e4406e99f469aba879 *NEWS.md
e10dd983c750c5e52dcbd672c628fbf2 *R/Deprecated.R
d9dba7573d21e93660ec5a360b45ce43 *R/S3-methods.R
292fb71d8ced0f748dd0f4e66f29925b *R/anova_stats.R
94f104911f33da856c41be988c125168 *R/auto_prior.R
afa0b3edfad717886d11f70b5a3b8dc2 *R/boot_ci.R
9e96fc4c88cf6dde542dad506fbd024c *R/bootstrap.R
2bd500387eb22a566943d0088f44fd1c *R/chi_squared_test.R
bea37c3b504f502316ea0c0ea13ace6d *R/confint_ncg.R
fdc049d40c1128eda89e331c5c90d9ba *R/cramer.R
a4d0e395069f0b478afde3129cc335f1 *R/cv.R
d4bd61851d9cbf5c43ec4eb85307155c *R/cv_error.R
b826d04f20826fbf7e12669244ca6e1f *R/design_effect.R
1ad1908f342e4326610442ec1f87a733 *R/find_beta.R
9e41625c78d4f0fc616fda45af0ddb8c *R/gmd.R
999e7912c1aefa456570911f5d78fe6c *R/gof.R
0cc0afccd5e8618261a77a39fcaf48ab *R/helpfunctions.R
6a846d9f6d8e46234264fafb97ccb650 *R/inequ_trends.R
1fa8b40681d24aee9e3f61fe2f5c3280 *R/is_prime.R
b47c158d336c2a122b0175fe15b07a70 *R/kruskal_wallis_test.R
1d5d666a8a158249ded68970dbc2b1a7 *R/mann_whitney_test.R
95ac928d84740007386966a5acd85477 *R/nhanes_sample.R
d6b62025eb44e0cec4b5b2b3ac35d95c *R/phi.R
c312d4430cc490f1dfef3d22c2caaefa *R/prop.R
3b73410ea4df483db4d088b26cd67fcd *R/re-exports.R
e05b4f907d36a356776c91a2efd4dd64 *R/samplesize_mixed.R
361bc5f678c9c2319dbdb8eeac7eb45a *R/se_ybar.R
086354712673aab0cc83fd08f1989359 *R/select_helpers.R
7d974672824a933c8324bcd8d01469a2 *R/sjStatistics.R
045c06ea2a7397394b2953a9b583298c *R/svy_median.R
6493371b66b336aa577c64dcb106c184 *R/svyglmnb.R
93a3a551f4b1f0aee327253aa5c8900e *R/svyglmzip.R
3e75c25d915766e171526bdf56c751be *R/t_test.R
b31743de39c751d4d377fa619511b60f *R/var_pop.R
974429e0f1399cf4bf3fd978cd48ea88 *R/weight.R
f116a04b43b7ccd58e83a073d3127d92 *R/wilcoxon_test.R
87f10c7fc8851aadc42785eb054f1513 *R/wtd_cor.R
3d485f8685d0eab81d5ef06afc2a6ebe *R/wtd_se.R
0ff693defb2e755fcf9335563a05af75 *R/wtd_variance.R
5aabab241149572088cd6eee7608af2d *R/xtab_statistics.R
ae778cf204793e0dfedee9997d09129b *README.md
181b5ac5a69bd3e3e542680739e88fd5 *build/partial.rdb
3172b22b3d87d0f86d78326bc26891fc *data/efc.RData
ad3f18b79c24699a3c122d8b377b5028 *data/nhanes_sample.RData
87f8a6a5cb2f59e0b2110feef3926498 *inst/CITATION
e7b45979874646edfe06751f551ecac0 *man/anova_stats.Rd
59646cdd3bf69f527ab406e00ab87fa9 *man/auto_prior.Rd
70644108d79f2803c7942ac793429ff5 *man/boot_ci.Rd
1145032561d5db541e2d6507fb7a0767 *man/bootstrap.Rd
aa75706bfe65fddfef67c466659bb300 *man/chi_squared_test.Rd
d442b8e1e4f45c610e7f36156b45f337 *man/chisq_gof.Rd
7c79e41fdc70e1658f589abeb6b461e8 *man/crosstable_statistics.Rd
ffdb50b8e82e22adbf2a02c0109ad4df *man/cv.Rd
fc1e6d5ec9fa612aff5b4bb84a3859bc *man/cv_error.Rd
14ad0d0e233fe45af5a81dcdea171414 *man/design_effect.Rd
07b7d0ca4d598d4623ef85ce91255f2c *man/efc.Rd
00a3f8db1d678464d37705968d42286f *man/figures/logo.png
5972d9fb77c826cf2dfc1bedf013ae6d *man/find_beta.Rd
1d3f4f4138dfbce4505b8d4a115b50e3 *man/gmd.Rd
aba99aa993b4a557f0263a74ac9d03e3 *man/inequ_trend.Rd
c8358bb5d63a741d5fc35a043228614a *man/is_prime.Rd
7d0da9e0dddae98ae29ef61067d92858 *man/kruskal_wallis_test.Rd
a260eeaa63dfd35efd31bef099bc52ca *man/mann_whitney_test.Rd
63d8c10d8c662c3265b596aa2dee6ddf *man/nhanes_sample.Rd
84219c1c63adff086c055edf59993124 *man/prop.Rd
489cce3aeab9138295c8f9b96bc8b411 *man/r2.Rd
df2053d8e790c20ba0628b8e14c6661f *man/reexports.Rd
8b6bca7c1d0ca430e195d7f85957d2b3 *man/samplesize_mixed.Rd
6376e9c3db371cfc7941e2867d848567 *man/se_ybar.Rd
ba2a06e0f10b01f98b0063f695e86a3b *man/svyglm.nb.Rd
905e7cfed53af047f7b763d3662fdde5 *man/svyglm.zip.Rd
bf3b5ec64335984fc48f271d39f15f9a *man/t_test.Rd
2e48d0e005e36561b6d3ebc03c2970f8 *man/table_values.Rd
12f085a686effc5f7215f57dbe9a337a *man/var_pop.Rd
e40b1a815405bce16f8851e02e1a1214 *man/weight.Rd
2e7b20401a5f7788b47893d95ee60a29 *man/weighted_se.Rd
fe079ad00575d5017383eadab1fb321c *man/wilcoxon_test.Rd
bcdd319f419f3a3c026bd5c6cf6c6606 *tests/testthat.R
c3c62f713122eb923ceb784e811bfaa8 *tests/testthat/_snaps/chi_squared_test.md
cdc4c6c33056ce23e1f8ea10c81c6153 *tests/testthat/_snaps/kruskal_wallis_test.md
44f5aa1953dacf2508276030089c1b7d *tests/testthat/_snaps/mann_whitney_test.md
8abd67404361072251463e4ec47310c4 *tests/testthat/_snaps/t_test.md
4dc8c45020c146986def33cf0994b64f *tests/testthat/_snaps/wilcoxon_test.md
79df90b4a1ef0dfa51a14229e13354ec *tests/testthat/test-autoprior.R
d27fa76035935d6f1a89e537ff11f132 *tests/testthat/test-chi_squared_test.R
26a2f03b97851963289a3b6ea589cf36 *tests/testthat/test-kruskal_wallis_test.R
0fe1150cbf1b8f688c88f6dc6ebfea8f *tests/testthat/test-mann_whitney_test.R
113db343ce6d3454faa6f62cc812a317 *tests/testthat/test-t_test.R
b70593e19a84da47a7fd5631e0817458 *tests/testthat/test-wilcoxon_test.R
3ddf1a5493907c2646b615b2aaeb56e4 *tests/testthat/test-wtd.R
sjstats/inst/0000755000176200001440000000000014616613032012724 5ustar liggesusers
sjstats/inst/CITATION0000644000176200001440000000055714616613032014070 0ustar liggesusers
year <- sub("-.*", "", meta$Date)
title <- sprintf("sjstats: Statistical Functions for Regression Models (Version %s)", meta$Version)

bibentry(bibtype="manual",
         title = title,
         author = person("Daniel", "Lüdecke"),
         year = year,
         url = "https://CRAN.R-project.org/package=sjstats",
         doi = "10.5281/zenodo.1284472")