sjmisc/0000755000176200001440000000000014620415212011537 5ustar liggesuserssjmisc/NAMESPACE0000644000176200001440000000730214153357271012773 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(all_na,data.frame) S3method(all_na,default) S3method(all_na,list) S3method(as.data.frame,sjmisc_frq) S3method(big_mark,data.frame) S3method(big_mark,default) S3method(big_mark,list) S3method(center,default) S3method(center,mids) S3method(dicho,default) S3method(dicho,mids) S3method(format,sjmisc_frq) S3method(is_even,data.frame) S3method(is_even,default) S3method(is_even,list) S3method(is_float,data.frame) S3method(is_float,default) S3method(is_float,list) S3method(is_odd,data.frame) S3method(is_odd,default) S3method(is_odd,list) S3method(is_whole,data.frame) S3method(is_whole,default) S3method(is_whole,list) S3method(prcn,data.frame) S3method(prcn,default) S3method(print,sj_has_na) S3method(print,sj_merge.imp) S3method(print,sjmisc_descr) S3method(print,sjmisc_frq) S3method(print,sjmisc_frq2) S3method(print,sjmisc_grpdescr) S3method(print_html,sjmisc_frq) S3method(print_md,sjmisc_frq) S3method(rec,default) S3method(rec,mids) S3method(recode_to,default) S3method(recode_to,mids) S3method(round_num,data.frame) S3method(round_num,default) S3method(round_num,list) S3method(row_count,default) S3method(row_count,mids) S3method(row_means,default) S3method(row_means,mids) S3method(row_sums,default) S3method(row_sums,mids) S3method(std,default) S3method(std,mids) S3method(to_long,default) S3method(to_long,mids) S3method(total_mean,data.frame) S3method(trim,data.frame) S3method(trim,default) S3method(trim,list) export("%>%") export("%nin%") export(add_case) export(add_columns) export(add_id) export(add_rows) export(add_variables) export(all_na) export(big_mark) export(center) export(center_if) export(clean_values) export(col_count) export(complete_cases) export(complete_vars) export(count_na) export(de_mean) export(descr) export(dicho) export(dicho_if) 
export(empty_cols) export(empty_rows) export(find_in_data) export(find_var) export(flat_table) export(frq) export(group_labels) export(group_labels_if) export(group_str) export(group_var) export(group_var_if) export(has_na) export(incomplete_cases) export(incomplete_vars) export(is_cross_classified) export(is_crossed) export(is_empty) export(is_even) export(is_float) export(is_nested) export(is_num_chr) export(is_num_fac) export(is_odd) export(is_whole) export(merge_df) export(merge_imputations) export(move_columns) export(numeric_to_factor) export(prcn) export(print_html) export(print_md) export(rec) export(rec_if) export(rec_pattern) export(recode_to) export(recode_to_if) export(ref_lvl) export(remove_cols) export(remove_empty_cols) export(remove_empty_rows) export(remove_var) export(rename_columns) export(rename_variables) export(replace_columns) export(replace_na) export(reshape_longer) export(rotate_df) export(round_num) export(row_count) export(row_means) export(row_sums) export(seq_col) export(seq_row) export(set_na) export(set_na_if) export(shorten_string) export(split_var) export(split_var_if) export(spread_coef) export(std) export(std_if) export(str_contains) export(str_end) export(str_find) export(str_start) export(tidy_values) export(to_character) export(to_dummy) export(to_factor) export(to_label) export(to_long) export(to_numeric) export(to_value) export(total_mean) export(trim) export(typical_value) export(var_rename) export(var_type) export(word_wrap) export(zap_inf) importFrom(insight,print_html) importFrom(insight,print_md) importFrom(magrittr,"%>%") importFrom(rlang,.data) importFrom(sjlabelled,set_na) importFrom(sjlabelled,to_character) importFrom(sjlabelled,to_factor) importFrom(sjlabelled,to_label) importFrom(sjlabelled,to_numeric) sjmisc/README.md0000644000176200001440000000575114620407517013037 0ustar liggesusers# sjmisc - Data and Variable Transformation Functions 
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sjmisc)](https://cran.r-project.org/package=sjmisc)    [![DOI](http://joss.theoj.org/papers/10.21105/joss.00754/status.svg)](https://doi.org/10.21105/joss.00754)    [![Documentation](https://img.shields.io/badge/documentation-sjmisc-orange.svg?colorB=E91E63)](https://strengejacke.github.io/sjmisc/)    [![downloads](http://cranlogs.r-pkg.org/badges/sjmisc)](https://cranlogs.r-pkg.org:443/)    [![total](http://cranlogs.r-pkg.org/badges/grand-total/sjmisc)](https://cranlogs.r-pkg.org:443/) Data preparation is a common task in research, which usually takes the most amount of time in the analytical process. Packages for data preparation have been released recently as part of the _tidyverse_, focussing on the transformation of data sets. Packages with special focus on transformation of _variables_, which fit into the workflow and design-philosophy of the tidyverse, are missing. **sjmisc** tries to fill this gap. Basically, this package complements the **dplyr** package in that **sjmisc** takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. A distinctive feature of **sjmisc** is the support for labelled data, which is especially useful for users who often work with data sets from other statistical software packages like _SPSS_ or _Stata_. The functions of **sjmisc** are designed to work together seamlessly with other packages from the tidyverse, like **dplyr**. For instance, you can use the functions from **sjmisc** both within a pipe-workflow to manipulate data frames, or to create new variables with `mutate()`. See `vignette("design_philosophy", "sjmisc")` for more details. ## Contributing to the package Please follow [this guide](https://github.com/strengejacke/sjmisc/blob/master/.github/CONTRIBUTING.md) if you like to contribute to this package. 
## Installation ### Latest development build To install the latest development snapshot (see latest changes below), type following commands into the R console: ```r library(devtools) devtools::install_github("strengejacke/sjmisc") ``` ### Officiale, stable release To install the latest stable release from CRAN, type following command into the R console: ```r install.packages("sjmisc") ``` ## References, documentation and examples Please visit [https://strengejacke.github.io/sjmisc/](https://strengejacke.github.io/sjmisc/) for documentation and vignettes. ## Citation In case you want / have to cite my package, please cite as (see also `citation('sjmisc')`): Lüdecke D (2018). sjmisc: Data and Variable Transformation Functions. _Journal of Open Source Software_, *3*(26), 754. doi: 10.21105/joss.00754 [![DOI](http://joss.theoj.org/papers/10.21105/joss.00754/status.svg)](https://doi.org/10.21105/joss.00754) sjmisc/data/0000755000176200001440000000000013451124270012452 5ustar liggesuserssjmisc/data/efc.RData0000644000176200001440000004213413451124270014130 0ustar liggesusers՝]qߏIJ9 1*TRhjIlHI)AIˡ. ÿ( p/gι V_3ܻKw{{򉧺'ϧ?.|rsXq{?9/ pӍ~<=|o11}z G 7~g9Pv^1r<371?Wü+1y5KA FO> }z6̼cz'ZcB?AS|n<_#}DXK?3UӍaEG||] |+w< A#:;wgc="˾9GUҏqƥ_zKBn#ZW?6<_Q/`ޛh ~艾_"e܏OyѮ71ۀE@# :_ A#K q?x9/{s{ u`a܈@z^MSqsK#O1ޫh˟T1_H/z^釶zM?qߒCr)owcݠ/OET_}ճʺ'fK鳀u81ƕW*QrI^': <1X2?'ᾥy)=~3?Yw=v'/bu\puR;_A .\>E)oH?W/E:G[+u~-d?׸W(uR?!}|:W?-swGWP/;U|kS?\RP柕2x.O|y;=q!wCyP5@/> Aj?ެKXKq8к>SYgJWЯ~+qd\K{ ^tq<ȼsYi\tY${J ^J: ΒCr3I?kS1#7|mٝ 9#zzu%_=S?1{s}y?I/8[xCGXǯοf!A~k|n>on>:v6[w;l]W\v`C7^̍L_Զ|kV@Jyj嶰7Vg?/l?fgW`oIwuH;.k!;;%kGx2N0?'puvz6`I^uwߟM7<0VA7rvj7K|&~ .6o&H?v=̷I?ۼ}7Od$/+`~>o#:\?3-7Q>g`nѿB?f2l5nƃf2щtun?ߏۡ?ߣ 4 q)Xv[</;y-{+}26%g7 ?|Y?ܻ:9\morT==; j|zx/o>]k7}w뻷?y֣ova7?ڽ}p2?:\HkwHÓWǷ6n|{\F:_S_ȢnVhvnWbr̕^R0SKuVuprΕ{vY:s6͛ۆu_t\nn0Wu+Wv}N.VV[+G_;\j#"_ ʿz{4ޚOxY~54>KieG߶?.g-VX[_,pf,]ϭ"zrDmG[~\VX{XJmוKvwy헭c-ut|սُl#ڿ9^}9(jt r?<1Py^7޹OūC~s@߱<8_\?7H. 
o[wc“<?%:])'r@Kø~Sn'9 >/<ѻ*yGE^]I_>nS阯"=eOqihz4^3›Uz8a\~#{79ܢϗ.)@%\c}/4̧h=Dڲ[mtJԺmK[_zNW Oo?1yWX.ҁOIGqvi#vYDK>`=WP?+qq?!bn/yOr֩Oz_'؍aON|$Ax%/Pv,~=oG%7a>^;ҫ D)zЖi=$&bn.P9[.%+~y+%N/'zN0_v+tcyJ>C'&@G1?3r]p?r2%ҁ?RzOc~ܷ5z~Yߍ^% mօ$!'g_X_^zI<گ/}%~yMyWx|`/r7o~'7_<ެ/qֳzL]%<9x~;OdYx{n7SRIu>wB(zЯ3gQX⇐@yq(~cXп̸諮[9o/<%+w uhړs`ǐ1^YeC /\WħY`?Ճ83wg[z +yӑ<ϕ~Wa4\][n,Oo}JI5uԖ+tYG{.vb r%~C?I}MУnL笲Ϟ^8/׼r4cXn=sm?(x#XG_ %L@KuqCx^;cy$nHo3%}q{ ޿yxߡ]M~re=%K?r,|ov?HO#`yd^^Kٻ^ےR'Tfpc vt0^B>NGϭ#؟{)z.dGD wh;?ҋ镍xrK:(m [0._NHHO'wmܸ=Y9H?қrDuRkE9\6Ku#9"ġ+5ym~ y}*gh◜M؁~4߬[m} m'_m5P7{Ooȅ{9c/7o#*g; n4-ڢ>_6xjkf7|\a$g>Y|E$~oժwEszfv~|%ή ڭvM-$lt,Z9:VIq۹v'ed_W?N~VxY/rY9"~%R޺N<],,.h8590]_3Wv{kdY*:yɟx?+_>Ѻf[+F"<G7?/͏=jj_H?U_`mUY+ja<‹dyuvuCϬ]+^\?\}yx{ucٿk6/6յݣͧ{o~o| thg0zxd?/v<ܟݖz9`2~i6r[zce$_em>\yj#W_Q^Ot1ʩJ }3"lg2ڼG+-C_DՋ= =G"G=Vێ֞W|y]|*[;_9uhk]泥9N/|Ζ1vi=':j9P.;wzWf?N8qէwALt}e;9֏sa^6un=lu~N_t|:9<\]|{pnk)>%744ߝ)"]%^oHVkqU+=f)̋Bx1~fE>B>OI{C70+md%lu^J k)[4Wi=HZ6E둎p3M;QOKw6txxpշ?yǵ'}[xR狭#EYgga>x#T{xw޵'{wG~z:ܻbG{Ǐ %x<sQv|zT񫽊Ke֫HW31JZzH/*%R/Ֆh|55m/[$m_F7ˑJq2K?|5o?+Wv^6+;/h'Yfam !˿8ڼ[$-S7yl})oWЬ<曵cz=gr#y'x= I?N?8GV~VN>'W׆o?[g;݈_ϭ/ οnK{mK'p_M~z: Oh8yulɵZ+O8 U>wnqvOFn?;?zʏQse */̛{^((ݓܸv:޺޵id- z}8^I/SR/d[ԥv #k_a Ul<>eyBIjdo'owT_)"vmK@'WVh>kߢZ׃rYX?ݣ^G%wm;M {8 k"ۊʶ||Nɷz|uw"g{_>~ORkw4e3*W{ίl]Z2}^3xAU脪̸.Ym󣂭q=t ?HV9kS+GMS?ǣ7"͒RO~Xק\9֣^󼾍Ow%ǶjG.|{>ϛ_zlIj=sl-[[A]O/OB'[ӳUpi:o=/={G}%WX_ok^j]Gn'|6Wm{\8΋߈g?I۳;=?z<-M[[xقrUjedx{k1{% L~o'6~ꓵSѺ?(/M8^|#wq'/ȷKtuCv_sr̿]5o18:a~ݝ mIʪ%+_z3'x٧=:kS@ڪ:?g#7'r8}Ο뇑fuH~#YldKϵu{jd_FO{3j[y#ɗpOo?FlkWϤu4ċ괰 }W։lgw"o@7ʱSr=~e-ț1Kl,:inuxۨsGlyYhK;f|v=CF=K-n/#[r'yqUS{ѭo|re[Ns`^<`__{tuc_޺ny9zmm }4vƈ o=+y\HE+od0?^tCZäAn }(g7Ͼo?2. qke>`?}:c$'ac=Pwv6,'Fj5k$K406HwN.rTog3ڳô$l>o͏s`:?ϯW]gwnd_tԝ]xZ us2Nxpqum -a/uJxټh7ylU{ya=A:A^ehlN=_$w+ >ub7 }ڀ({5'Gz٩3'njy"pl3սُK۷nl>ruhospGOo~rxh|a1~ana)7,H%)Oh^q7J#g ތGKo4zFG|Xtt+WV\1=>ɋ#[<8fGnz$ED 9l1c=D$(oZ?-ʳQQߏa8ʧ[NroY97Ѹrg}>+7z&%{ߛ~dvcKᅣʼn|K97y/H>=g=[uG:Oh䭮ϻ) pޢ'ߥOU~>@WyO\j-.<,|~O'~\oѹ2~>֥0~E|J<hk=^Ez0>-<Y?};g,ZG# tdwe? ⧼!. 
7\e=ŕK֛y!AtOP~Z e7_w:SkZz x%Otxn.yFrgݸ_)ڥ%ɾokOW.|8W呝ؖ]ŷC~JP~% m/NmzJn[|7ɏ8)'d0/DF~ŋݹ>Gޜ|}u/>^Zi='9 PQPq-ܲV~?2.Z_~]۬/~7[8My4&׾U~zZo5Kayߢ/{eOkHѓZ?~7{].{hRgJ'[= /S?CAy03@?_tx ۏ15]G+3~yZoK/Cvơ_o99[>_o/џCT\Zq]ŏyMtyK_A<_wӱ|7Z֡K9d?7!4_<?+~> ޝ_rtc>̃?] 8_B-91/Z|uȯ?HٕH֏Zr^> &;JN_8:*.e_`(Uih(?)?~S^̓Z5T?U. zNPuH9QفuIr)ym~/g~,}.O0./qo}U's=&Gʝ1]y5=,Жh}ӱ>W<%<8w~,z7O7Y tE7 KQ|q/mn-ʹGa[m+/so{Z kV+ݬZ˥٥_f#p)o7KQ?n~}dqn>*y{㳟FǫO77Gr_|bԯw77}-+sZdjA}ӺY?TٺٽniŏZ~m}PI:Y9y#:q\ܖO3H~>W`Y'!n|v0K&+|yܼu'yynߚjoy9jWoe<[럵v\*oF~ZKGs~Mg);ͷ|ZtnܺfnTim[Rȍg8noךg?=OG{g\|b*ڙ_}pד_n{S ~XM oBw6v0߀~/!~C"i==TM6=#='ziOKoRz: }'~}f7_cbo =d#?zn_)!o+/ /2BWj=} q#?xxq["_yA8 _0O|4q$<͗Ԗj3j3S?}3.zCrreGӏh}v1%%y[m o?^C?\ Z_ E z7ySWu#go$旿n||A3%yDr0иGڕu1e7>yc~$%=92_Ň%-y/_rNqyN}InIx̷_iG~)YGބS^~OPzf9(9eWm"q? 3ga~)3?. _AB?a|g''<楫<_п>Ʃח~Tv\<0nW˼+> _֫Oc\s&4yJ|ȼI:AgisySOw'S?%GE#c޽yoc>9nߡYr`=C=9a\:!ƙ;rn1.3o\|P^ek<Żg;c*ƕאzmVo_~7N>70WA76_8P/{\ y(lOm{~#Wr<_\l2.+.oɲ= ~E|_|Ux0_~sZO-\y/a>?C~C;пwxỤm^/ڃs/<~#uz㤻}(w_a5θѸ:b>=2oH~o:r=W/;t5.zcvg\oŗ?EP{'ѡ=o2}N\O̗'\vD~.=޴ x\/;=7ig]M;0|/WL?Au)/ggޣo(%7W/n>Ÿd[8bGH}is˿k3YH^ {S </~:M|Wv"㹚Jwu7Rnݙ8(huO3@un_\dڢCeJ}俬cO9dc>sʸqu'ϛNb]]6S>wwx]ar.Et_dn_y sk>5Ps8yݹٕʸ&ڍqF?qu#v>t 3.p\=-|9{Gv#9tyu0L?$?w>]1E95׍\}ud?ӯ}c]H.|s\MʟÂyB'}gv;zmwl%O2]2ߺ{<^Sy#y9̘~-@!b_|ڑv7v/ :2;0`~K'sv>?}j^wv^.U} _O+9hmw} ϯ㮎wN;gcwp>;qĽgϼGݜ|FyCGyw>tŗ{n LsuoszoQrw~pu9K}Y;yב~9ǣM\1\tujGuGdwrg}ףs~Wq^;;[eqxwoo?xIn<̠T!02_ 3Y|9--9fZ+%rݞOwq7yC'?W=^󝬣i_'kԓ)|=Y^i~2џտc'@k7)]>>M>rM~+1EyK._c3-Ӟ.nWGoOC8aqXkxw_8?+/Vgsybޡ;9t}1{|"|ܿkW߽.y?Wz=)|^FF}ǟwpr3/Ǽf` ?EƺOv:_tײyG={oꗞy+RN=KvdRaŁzJ% hx>ղ3w0y=.aAs.9<oSx<*._~ui]WЇ_'Cɟyc/ o=vr&~7=ɗ3Ty|_c=~x 9<+~6%<%W:Dk_wp|N<ڼ'~O=X2_puwoځwncJxܟd:/Gre\ODGy/FgtG#zԏ<Wx=Eɟtytksu;܌ϣxB]\B'/W7V{76W&3}*݇џtxsJi]~H7u2McՌ'Пfe>ۢms^;?:O.o~h?rhϲٹVߵ_l܏Շ~j]dX K.?y˵zq&y>=w_繥?p80Zow>o͇/&N--\}wEGYz6>y}j]>Zw;>a[^~tO=ãE5ҟm~x~:9Nn3M:du9>9ڻvwx0d?bF'7yyxǿ<66| ?cUOcXCI?k>\r{#wzB??o=|}aO瑾>r"΢?LyîOwrkZƥw=`;ƹgOeW>?Z:wo=Sظi=!]kW6] 8^dZ=|ܼ}֭&~$G6~D6l>'v#|~_~%~Z~SM =G7+'.ټT~D5`m}\Ol{׋ڻZY:s[yqލvuz#:}rMo#?gUi[o9dv7,q~ާm6.w,ݹ뜥Ϛ08B\eWdS6nRugxɺjB|3[ɞgܼzn 
߻9^8kHoğ]GH{sc'9O ΍DA<__wm]q|#;dOvϞkΝ=H} '}'_'KJK]Z;z}j7"'[OFi=Om|ϞsO30eT=ۓT#?o[D=':czig g=7+v ?Or|8x._c|kE%eO&{޻~=YxFKgӟv>x*rѶwE>QY {aq{qY?6gO=KC^Fބ[6|QvqvrLrx[zj^/FFGyr\U&v(<nݢuO>Dkտ>O"y9//ʇv6Nz/ҋ|s5RQYg˻|#{8ΞV`}ks-}g.Q=ЪeNX~VʇKom>qtjjK!~]!jo_{;̷3F٫v=<5x=[Ng,kn|q"!ͻN^]-0̇s풥ptn=E@N(.UG@WQ^7k0 #[ŮKwm>ΛOO/ǣ8~Þ>Yt~I菾,tD\{񥛇ӛ_9 oo^|x^tSZ_Uҵ[/>h|҃Ż͗4_ۼ+mkom^om@Wwu <}Jy}+nӟ<99| [}~;@\o.o}|}Y/~ljOkt'/8;_0sjmisc/man/0000755000176200001440000000000014153357271012325 5ustar liggesuserssjmisc/man/row_sums.Rd0000644000176200001440000000731613676323120014474 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/row_sums.R \name{row_sums} \alias{row_sums} \alias{row_sums.default} \alias{row_sums.mids} \alias{row_means} \alias{total_mean} \alias{row_means.default} \alias{row_means.mids} \title{Row sums and means for data frames} \usage{ row_sums(x, ...) \method{row_sums}{default}(x, ..., n, var = "rowsums", append = TRUE) \method{row_sums}{mids}(x, ..., var = "rowsums", append = TRUE) row_means(x, ...) total_mean(x, ...) \method{row_means}{default}(x, ..., n, var = "rowmeans", append = TRUE) \method{row_means}{mids}(x, ..., var = "rowmeans", append = TRUE) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{n}{May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to calculate the row mean or sum; \item a value between 0 and 1, indicating a proportion of valid values per row to calculate the row mean or sum (see 'Details'). \item or \code{Inf}. If \code{n = Inf}, all values per row must be non-missing to compute row mean or sum. 
} If a row's sum of valid (i.e. non-\code{NA}) values is less than \code{n}, \code{NA} will be returned as value for the row mean or sum.} \item{var}{Name of new the variable with the row sums or means.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} } \value{ For \code{row_sums()}, a data frame with a new variable: the row sums from \code{x}; for \code{row_means()}, a data frame with a new variable: the row means from \code{x}. If \code{append = FALSE}, only the new variable with row sums resp. row means is returned. \code{total_mean()} returns the mean of all values from all specified columns in a data frame. } \description{ \code{row_sums()} and \code{row_means()} compute row sums or means for at least \code{n} valid values per row. The functions are designed to work nicely within a pipe-workflow and allow select-helpers for selecting variables. } \details{ For \code{n}, must be a numeric value from \code{0} to \code{ncol(x)}. If a \emph{row} in \code{x} has at least \code{n} non-missing values, the row mean or sum is returned. If \code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the proportion of necessary non-missing values per row. E.g., if \code{n = .75}, a row must have at least \code{ncol(x) * n} non-missing values for the row mean or sum to be calculated. See 'Examples'. 
} \examples{ data(efc) efc \%>\% row_sums(c82cop1:c90cop9, n = 3, append = FALSE) library(dplyr) row_sums(efc, contains("cop"), n = 2, append = FALSE) dat <- data.frame( c1 = c(1,2,NA,4), c2 = c(NA,2,NA,5), c3 = c(NA,4,NA,NA), c4 = c(2,3,7,8), c5 = c(1,7,5,3) ) dat row_means(dat, n = 4) row_sums(dat, n = 4) row_means(dat, c1:c4, n = 4) # at least 40\% non-missing row_means(dat, c1:c4, n = .4) row_sums(dat, c1:c4, n = .4) # total mean of all values in the data frame total_mean(dat) # create sum-score of COPE-Index, and append to data efc \%>\% select(c82cop1:c90cop9) \%>\% row_sums(n = 1) # if data frame has only one column, this column is returned row_sums(dat[, 1, drop = FALSE], n = 0) } sjmisc/man/find_var.Rd0000644000176200001440000001071714620403337014404 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/find_var.R \name{find_var} \alias{find_var} \alias{find_in_data} \title{Find variable by name or label} \usage{ find_var( data, pattern, ignore.case = TRUE, search = c("name_label", "name_value", "label_value", "name", "label", "value", "all"), out = c("table", "df", "index"), fuzzy = FALSE, regex = FALSE ) find_in_data( data, pattern, ignore.case = TRUE, search = c("name_label", "name_value", "label_value", "name", "label", "value", "all"), out = c("table", "df", "index"), fuzzy = FALSE, regex = FALSE ) } \arguments{ \item{data}{A data frame.} \item{pattern}{Character string to be matched in \code{data}. May also be a character vector of length > 1 (see 'Examples'). \code{pattern} is searched for in column names and variable label attributes of \code{data} (see \code{\link[sjlabelled]{get_label}}). \code{pattern} might also be a regular-expression object, as returned by \code{stringr::regex()}. Alternatively, use \code{regex = TRUE} to treat \code{pattern} as a regular expression rather than a fixed string.} \item{ignore.case}{Logical, whether matching should be case sensitive or not. 
\code{ignore.case} is ignored when \code{pattern} is no regular expression or \code{regex = FALSE}.} \item{search}{Character string, indicating where \code{pattern} is sought. Use one of following options: \describe{ \item{\code{"name_label"}}{The default, searches for \code{pattern} in variable names and variable labels.} \item{\code{"name_value"}}{Searches for \code{pattern} in variable names and value labels.} \item{\code{"label_value"}}{Searches for \code{pattern} in variable and value labels.} \item{\code{"name"}}{Searches for \code{pattern} in variable names.} \item{\code{"label"}}{Searches for \code{pattern} in variable labels} \item{\code{"value"}}{Searches for \code{pattern} in value labels.} \item{\code{"all"}}{Searches for \code{pattern} in variable names, variable and value labels.} }} \item{out}{Output (return) format of the search results. May be abbreviated and must be one of: \describe{ \item{\code{"table"}}{A tabular overview (as data frame) with column indices, variable names and labels of matching variables. } \item{\code{"df"}}{A data frame with all matching variables.} \item{\code{"index"}}{ A named vector with column indices of all matching variables. } }} \item{fuzzy}{Logical, if \code{TRUE}, "fuzzy matching" (partial and close distance matching) will be used to find \code{pattern} in \code{data} if no exact match was found.} \item{regex}{Logical, if \code{TRUE}, \code{pattern} is treated as a regular expression rather than a fixed string.} } \value{ By default (i.e. \code{out = "table"}, returns a data frame with three columns: column number, variable name and variable label. If \code{out = "index"}, returns a named vector with column indices of matching variables (variable names are used as names-attribute); if \code{out = "df"}, returns the matching variables as data frame } \description{ This functions finds variables in a data frame, which variable names or variable (and value) label attribute match a specific pattern. 
Regular expression for the pattern is supported. } \details{ This function searches for \code{pattern} in \code{data}'s column names and - for labelled data - in all variable and value labels of \code{data}'s variables (see \code{\link[sjlabelled]{get_label}} for details on variable labels and labelled data). Regular expressions are supported as well, by simply using \code{pattern = stringr::regex(...)} or \code{regex = TRUE}. } \examples{ data(efc) # find variables with "cop" in variable name find_var(efc, "cop") # return data frame with matching variables find_var(efc, "cop", out = "df") # or return column numbers find_var(efc, "cop", out = "index") # find variables with "dependency" in names and variable labels library(sjlabelled) find_var(efc, "dependency") get_label(efc$e42dep) # find variables with "level" in names and value labels res <- find_var(efc, "level", search = "name_value", out = "df") res get_labels(res, attr.only = FALSE) # use sjPlot::view_df() to view results \dontrun{ library(sjPlot) view_df(res)} } sjmisc/man/dicho.Rd0000644000176200001440000001207514046746443013713 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dicho.R \name{dicho} \alias{dicho} \alias{dicho_if} \title{Dichotomize variables} \usage{ dicho( x, ..., dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d" ) dicho_if( x, predicate, dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d" ) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. 
See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{dich.by}{Indicates the split criterion where a variable is dichotomized. Must be one of the following values (may be abbreviated): \describe{ \item{\code{"median"} or \code{"md"}}{by default, \code{x} is split into two groups at the median.} \item{\code{"mean"} or \code{"m"}}{splits \code{x} into two groups at the mean of \code{x}.} \item{numeric value}{splits \code{x} into two groups at the specific value. Note that the value is inclusive, i.e. \code{dich.by = 10} will split \code{x} into one group with values from lowest to 10 and another group with values greater than 10.} }} \item{as.num}{Logical, if \code{TRUE}, return value will be numeric, not a factor.} \item{var.label}{Optional string, to set variable label attribute for the returned variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). If \code{NULL} (default), variable label attribute of \code{x} will be used (if present). If empty, variable label attributes will be removed.} \item{val.labels}{Optional character vector (of length two), to set value label attributes of dichotomized variable (see \code{\link[sjlabelled]{set_labels}}). If \code{NULL} (default), no value labels will be set.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} \item{predicate}{A predicate function to be applied to the columns. 
The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ \code{x}, dichotomized. If \code{x} is a data frame, for \code{append = TRUE}, \code{x} including the dichotomized. variables as new columns is returned; if \code{append = FALSE}, only the dichotomized variables will be returned. If \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace (overwrite) existing variables. } \description{ Dichotomizes variables into dummy variables (0/1). Dichotomization is either done by median, mean or a specific value (see \code{dich.by}). \code{dicho_if()} is a scoped variant of \code{dicho()}, where recoding will be applied only to those variables that match the logical condition of \code{predicate}. } \details{ \code{dicho()} also works on grouped data frames (see \code{\link[dplyr]{group_by}}). In this case, dichotomization is applied to the subsets of variables in \code{x}. See 'Examples'. } \note{ Variable label attributes are preserved (unless changed via \code{var.label}-argument). 
} \examples{ data(efc) summary(efc$c12hour) # split at median table(dicho(efc$c12hour)) # split at mean table(dicho(efc$c12hour, dich.by = "mean")) # split between value lowest to 30, and above 30 table(dicho(efc$c12hour, dich.by = 30)) # sample data frame, values from 1-4 head(efc[, 6:10]) # dichtomized values (1 to 2 = 0, 3 to 4 = 1) library(dplyr) efc \%>\% select(6:10) \%>\% dicho(dich.by = 2) \%>\% head() # dichtomize several variables in a data frame dicho(efc, c12hour, e17age, c160age, append = FALSE) # dichotomize and set labels frq(dicho( efc, e42dep, var.label = "Dependency (dichotomized)", val.labels = c("lower", "higher"), append = FALSE )) # works also with gouped data frames mtcars \%>\% dicho(disp, append = FALSE) \%>\% table() mtcars \%>\% group_by(cyl) \%>\% dicho(disp, append = FALSE) \%>\% table() # dichotomizing grouped data frames leads to different # results for a dichotomized variable, because the split # value is different for each group. # compare: mtcars \%>\% group_by(cyl) \%>\% summarise(median = median(disp)) median(mtcars$disp) # dichotomize only variables with more than 10 unique values p <- function(x) dplyr::n_distinct(x) > 10 dicho_if(efc, predicate = p, append = FALSE) } sjmisc/man/str_contains.Rd0000644000176200001440000000637514046746443015341 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/str_contains.R \name{str_contains} \alias{str_contains} \title{Check if string contains pattern} \usage{ str_contains(x, pattern, ignore.case = FALSE, logic = NULL, switch = FALSE) } \arguments{ \item{x}{Character string where matches are sought. May also be a character vector of length > 1 (see 'Examples').} \item{pattern}{Character string to be matched in \code{x}. 
May also be a character vector of length > 1 (see 'Examples').} \item{ignore.case}{Logical, whether matching should be case sensitive or not.} \item{logic}{Indicates whether a logical combination of multiple search pattern should be made. \itemize{ \item Use \code{"or"}, \code{"OR"} or \code{"|"} for a logical or-combination, i.e. at least one element of \code{pattern} is in \code{x}. \item Use \code{"and"}, \code{"AND"} or \code{"&"} for a logical AND-combination, i.e. all elements of \code{pattern} are in \code{x}. \item Use \code{"not"}, \code{"NOT"} or \code{"!"} for a logical NOT-combination, i.e. no element of \code{pattern} is in \code{x}. \item By default, \code{logic = NULL}, which means that \code{TRUE} or \code{FALSE} is returned for each element of \code{pattern} separately. }} \item{switch}{Logical, if \code{TRUE}, \code{x} will be sought in each element of \code{pattern}. If \code{switch = TRUE}, \code{x} needs to be of length 1.} } \value{ \code{TRUE} if \code{x} contains \code{pattern}. } \description{ This functions checks whether a string or character vector \code{x} contains the string \code{pattern}. By default, this function is case sensitive. } \details{ This function iterates all elements in \code{pattern} and looks for each of these elements if it is found in \emph{any} element of \code{x}, i.e. which elements of \code{pattern} are found in the vector \code{x}. \cr \cr Technically, it iterates \code{pattern} and calls \code{grep(x, pattern[i], fixed = TRUE)} for each element of \code{pattern}. If \code{switch = TRUE}, it iterates \code{pattern} and calls \code{grep(pattern[i], x, fixed = TRUE)} for each element of \code{pattern}. Hence, in the latter case (if \code{switch = TRUE}), \code{x} must be of length 1. } \examples{ str_contains("hello", "hel") str_contains("hello", "hal") str_contains("hello", "Hel") str_contains("hello", "Hel", ignore.case = TRUE) # which patterns are in "abc"? 
str_contains("abc", c("a", "b", "e")) # is pattern in any element of 'x'? str_contains(c("def", "abc", "xyz"), "abc") # is "abcde" in any element of 'x'? str_contains(c("def", "abc", "xyz"), "abcde") # no... # is "abc" in any of pattern? str_contains("abc", c("defg", "abcde", "xyz12"), switch = TRUE) str_contains(c("def", "abcde", "xyz"), c("abc", "123")) # any pattern in "abc"? str_contains("abc", c("a", "b", "e"), logic = "or") # all patterns in "abc"? str_contains("abc", c("a", "b", "e"), logic = "and") str_contains("abc", c("a", "b"), logic = "and") # no patterns in "abc"? str_contains("abc", c("a", "b", "e"), logic = "not") str_contains("abc", c("d", "e", "f"), logic = "not") } sjmisc/man/descr.Rd0000644000176200001440000000623214046746443013723 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/descr.R \name{descr} \alias{descr} \title{Basic descriptive statistics} \usage{ descr( x, ..., max.length = NULL, weights = NULL, show = "all", out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL ) } \arguments{ \item{x}{A vector or a data frame. May also be a grouped data frame (see 'Note' and 'Examples').} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{max.length}{Numeric, indicating the maximum length of variable labels in the output. If variable names are longer than \code{max.length}, they will be shortened to the last whole word within the first \code{max.length} chars.} \item{weights}{Bare name, or name as string, of a variable in \code{x} that indicates the vector of weights, which will be applied to weight all observations. 
Default is \code{NULL}, so no weights are used.} \item{show}{Character vector, indicating which information (columns) that describe the data should be returned. May be one or more of \code{"type", "label", "n", "NA.prc", "mean", "sd", "se", "md", "trimmed", "range", "iqr", "skew"}. There are two shortcuts: \code{show = "all"} (default) shows all information, \code{show = "short"} just shows n, missing percentage, mean and standard deviation.} \item{out}{Character vector, indicating whether the results should be printed to console (\code{out = "txt"}) or as HTML-table in the viewer-pane (\code{out = "viewer"}) or browser (\code{out = "browser"}).} \item{encoding}{Character vector, indicating the charset encoding used for variable and value labels. Default is \code{"UTF-8"}. Only used when \code{out} is not \code{"txt"}.} \item{file}{Destination file, if the output should be saved as file. Only used when \code{out} is not \code{"txt"}.} } \value{ A data frame with basic descriptive statistics. } \description{ This function prints a basic descriptive statistic, including variable labels. } \note{ \code{data} may also be a grouped data frame (see \code{\link[dplyr]{group_by}}) with up to two grouping variables. Descriptive tables are created for each subgroup then. 
} \examples{ data(efc) descr(efc, e17age, c160age) efc$weights <- abs(rnorm(nrow(efc), 1, .3)) descr(efc, c12hour, barthtot, weights = weights) library(dplyr) efc \%>\% select(e42dep, e15relat, c172code) \%>\% descr() # show just a few elements efc \%>\% select(e42dep, e15relat, c172code) \%>\% descr(show = "short") # with grouped data frames efc \%>\% group_by(e16sex) \%>\% select(e16sex, e42dep, e15relat, c172code) \%>\% descr() # you can select variables also inside 'descr()' efc \%>\% group_by(e16sex, c172code) \%>\% descr(e16sex, c172code, e17age, c160age) # or even use select-helpers descr(efc, contains("cop"), max.length = 20) } sjmisc/man/round_num.Rd0000644000176200001440000000107013567234676014633 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/round_num.R \name{round_num} \alias{round_num} \title{Round numeric variables in a data frame} \usage{ round_num(x, digits = 0) } \arguments{ \item{x}{A vector or data frame.} \item{digits}{Numeric, number of decimals to round to.} } \value{ \code{x} with all numeric variables rounded. } \description{ \code{round_num()} rounds numeric variables in a data frame that also contains non-numeric variables. Non-numeric variables are ignored. } \examples{ data(iris) round_num(iris) } sjmisc/man/count_na.Rd0000644000176200001440000000336013676323120014417 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/count_na.R \name{count_na} \alias{count_na} \title{Frequency table of tagged NA values} \usage{ count_na(x, ...) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. 
See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} } \value{ A data frame with counted tagged NA values. } \description{ This method counts tagged NA values (see \code{\link[haven]{tagged_na}}) in a vector and prints a frequency table of counted tagged NAs. } \examples{ if (require("haven")) { x <- labelled( x = c(1:3, tagged_na("a", "c", "z"), 4:1, tagged_na("a", "a", "c"), 1:3, tagged_na("z", "c", "c"), 1:4, tagged_na("a", "c", "z")), labels = c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), "Refused" = tagged_na("a"), "Not home" = tagged_na("z")) ) count_na(x) y <- labelled( x = c(1:3, tagged_na("e", "d", "f"), 4:1, tagged_na("f", "f", "d"), 1:3, tagged_na("f", "d", "d"), 1:4, tagged_na("f", "d", "f")), labels = c("Agreement" = 1, "Disagreement" = 4, "An E" = tagged_na("e"), "A D" = tagged_na("d"), "The eff" = tagged_na("f")) ) # create data frame dat <- data.frame(x, y) # possible count()-function calls count_na(dat) count_na(dat$x) count_na(dat, x) count_na(dat, x, y) } } sjmisc/man/numeric_to_factor.Rd0000644000176200001440000000201314046746443016316 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/num_to_fac.R \name{numeric_to_factor} \alias{numeric_to_factor} \title{Convert numeric vectors into factors associated value labels} \usage{ numeric_to_factor(x, n = 4) } \arguments{ \item{x}{A data frame.} \item{n}{Numeric, indicating the maximum amount of unique values in \code{x} to be considered as "factor". Variables with more unique values than \code{n} are not converted to factor.} } \value{ \code{x}, with numeric values with a maximum of \code{n} unique values being converted to factors. } \description{ This function converts numeric variables into factors, and uses associated value labels as factor levels. } \details{ If \code{x} is a labelled vector, associated value labels will be used as level. Else, the numeric vector is simply coerced using \code{as.factor()}. 
} \examples{ library(dplyr) data(efc) efc \%>\% select(e42dep, e16sex, c12hour, c160age, c172code) \%>\% numeric_to_factor() } sjmisc/man/is_empty.Rd0000644000176200001440000000345114046746443014454 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_empty.R \name{is_empty} \alias{is_empty} \title{Check whether string, list or vector is empty} \usage{ is_empty(x, first.only = TRUE, all.na.empty = TRUE) } \arguments{ \item{x}{String, character vector, list, data.frame or numeric vector or factor.} \item{first.only}{Logical, if \code{FALSE} and \code{x} is a character vector, each element of \code{x} will be checked if empty. If \code{TRUE}, only the first element of \code{x} will be checked.} \item{all.na.empty}{Logical, if \code{x} is a vector with \code{NA}-values only, \code{is_empty} will return \code{FALSE} if \code{all.na.empty = FALSE}, and will return \code{TRUE} if \code{all.na.empty = TRUE} (default).} } \value{ Logical, \code{TRUE} if \code{x} is a character vector or string and is empty, \code{TRUE} if \code{x} is a vector or list and of length 0, \code{FALSE} otherwise. } \description{ This function checks whether a string or character vector (of length 1), a list or any vector (numeric, atomic) is empty or not. } \note{ \code{NULL}- or \code{NA}-values are also considered as "empty" (see 'Examples') and will return \code{TRUE}, unless \code{all.na.empty==FALSE}. 
} \examples{ is_empty("test") is_empty("") is_empty(NA) is_empty(NULL) # string is not empty is_empty(" ") # however, this trimmed string is is_empty(trim(" ")) # numeric vector x <- 1 is_empty(x) x <- x[-1] is_empty(x) # check multiple elements of character vectors is_empty(c("", "a")) is_empty(c("", "a"), first.only = FALSE) # empty data frame d <- data.frame() is_empty(d) # empty list is_empty(list(NULL)) # NA vector x <- rep(NA,5) is_empty(x) is_empty(x, all.na.empty = FALSE) } sjmisc/man/var_rename.Rd0000644000176200001440000000257714046746443014752 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/var_labels.R \name{var_rename} \alias{var_rename} \alias{rename_variables} \alias{rename_columns} \title{Rename variables} \usage{ var_rename(x, ..., verbose = TRUE) rename_variables(x, ..., verbose = TRUE) rename_columns(x, ..., verbose = TRUE) } \arguments{ \item{x}{A data frame.} \item{...}{A named vector, or pairs of named vectors, where the name (lhs) equals the column name that should be renamed, and the value (rhs) is the new column name.} \item{verbose}{Logical, if \code{TRUE}, a warning is displayed when variable names do not exist in \code{x}.} } \value{ \code{x}, with new column names for those variables specified in \code{...}. } \description{ This function renames variables in a data frame, i.e. it renames the columns of the data frame. 
} \examples{ dummy <- data.frame( a = sample(1:4, 10, replace = TRUE), b = sample(1:4, 10, replace = TRUE), c = sample(1:4, 10, replace = TRUE) ) rename_variables(dummy, a = "first.col", c = "3rd.col") # using quasi-quotation library(rlang) v1 <- "first.col" v2 <- "3rd.col" rename_variables(dummy, a = !!v1, c = !!v2) x1 <- "a" x2 <- "b" rename_variables(dummy, !!x1 := !!v1, !!x2 := !!v2) # using a named vector new_names <- c(a = "first.col", c = "3rd.col") rename_variables(dummy, new_names) } sjmisc/man/rec_pattern.Rd0000644000176200001440000000305113567234676015134 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rec_pattern.R \name{rec_pattern} \alias{rec_pattern} \title{Create recode pattern for 'rec' function} \usage{ rec_pattern(from, to, width = 5, other = NULL) } \arguments{ \item{from}{Minimum value that should be recoded.} \item{to}{Maximum value that should be recoded.} \item{width}{Numeric, indicating the range of each group.} \item{other}{String token, indicating how to deal with all other values that have not been captured by the recode pattern. See 'Details' on the \code{else}-token in \code{\link{rec}}.} } \value{ A list with two values: \describe{ \item{\code{pattern}}{string pattern that can be used as \code{rec} argument for the \code{\link{rec}}-function.} \item{\code{labels}}{the associated values labels that can be used with \code{\link[sjlabelled]{set_labels}}.} } } \description{ Convenient function to create a recode pattern for the \code{\link{rec}} function, which recodes (numeric) vectors into smaller groups. 
} \examples{ rp <- rec_pattern(1, 100) rp # sample data, inspect age of carers data(efc) table(efc$c160age, exclude = NULL) table(rec(efc$c160age, rec = rp$pattern), exclude = NULL) # recode carers age into groups of width 5 x <- rec( efc$c160age, rec = rp$pattern, val.labels = rp$labels ) # watch result frq(x) } \seealso{ \code{\link{group_var}} for recoding variables into smaller groups, and \code{\link{group_labels}} to create the associated value labels. } sjmisc/man/tidy_values.Rd0000644000176200001440000000227414046746443015153 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tidy_values.R \name{tidy_values} \alias{tidy_values} \alias{clean_values} \title{Clean values of character vectors.} \usage{ tidy_values(x, ...) clean_values(x, ...) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} } \value{ \code{x}, with "cleaned" values or levels. } \description{ This function "cleans" values of a character vector or levels of a factor by removing space and punctuation characters.
} \examples{ f1 <- sprintf("Char \%s", sample(LETTERS[1:5], size = 10, replace = TRUE)) f2 <- as.factor(sprintf("F / \%s", sample(letters[1:5], size = 10, replace = TRUE))) f3 <- sample(1:5, size = 10, replace = TRUE) x <- data.frame(f1, f2, f3, stringsAsFactors = FALSE) clean_values(f1) clean_values(f2) clean_values(x) } sjmisc/man/add_columns.Rd0000644000176200001440000001116714620403337015104 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/add_columns.R \name{add_columns} \alias{add_columns} \alias{replace_columns} \alias{add_id} \title{Add or replace data frame columns} \usage{ add_columns(data, ..., replace = TRUE) replace_columns(data, ..., add.unique = TRUE) add_id(data, var = "ID") } \arguments{ \item{data}{A data frame. For \code{add_columns()}, will be bound after data frames specified in \code{...}. For \code{replace_columns()}, duplicated columns in \code{data} will be replaced by columns in \code{...}.} \item{...}{More data frames to combine, resp. more data frames with columns that should replace columns in \code{data}.} \item{replace}{Logical, if \code{TRUE} (default), columns in \code{...} with identical names in \code{data} will replace the columns in \code{data}. The order of columns after replacing is preserved.} \item{add.unique}{Logical, if \code{TRUE} (default), remaining columns in \code{...} that did not replace any column in \code{data}, are appended as new columns to \code{data}.} \item{var}{Name of new the ID-variable.} } \value{ For \code{add_columns()}, a data frame, where columns of \code{data} are appended after columns of \code{...}. \cr \cr For \code{replace_columns()}, a data frame where columns in \code{data} will be replaced by identically named columns in \code{...}, and remaining columns from \code{...} will be appended to \code{data} (if \code{add.unique = TRUE}). \cr \cr For \code{add_id()}, a new column with ID numbers. 
This column is always the first column in the returned data frame. } \description{ \code{add_columns()} combines two or more data frames, but unlike \code{\link{cbind}} or \code{\link[dplyr:bind]{dplyr::bind_cols()}}, this function binds \code{data} as last columns of a data frame (i.e., behind columns specified in \code{...}). This can be useful in a "pipe"-workflow, where a data frame returned by a previous function should be appended \emph{at the end} of another data frame that is processed in \code{add_colums()}. \cr \cr \code{replace_columns()} replaces all columns in \code{data} with identically named columns in \code{...}, and adds remaining (non-duplicated) columns from \code{...} to \code{data}. \cr \cr \code{add_id()} simply adds an ID-column to the data frame, with values from 1 to \code{nrow(data)}, respectively for grouped data frames, values from 1 to group size. See 'Examples'. } \note{ For \code{add_columns()}, by default, columns in \code{data} with identical names like columns in one of the data frames in \code{...} will be dropped (i.e. variables with identical names in \code{...} will replace existing variables in \code{data}). Use \code{replace = FALSE} to keep all columns. Identical column names will then be renamed, to ensure unique column names (which happens by default when using \code{\link[dplyr:bind]{dplyr::bind_cols()}}). When replacing columns, replaced columns are not added to the end of the data frame. Rather, the original order of columns will be preserved. 
} \examples{ data(efc) d1 <- efc[, 1:3] d2 <- efc[, 4:6] if (require("dplyr") && require("sjlabelled")) { head(bind_cols(d1, d2)) add_columns(d1, d2) \%>\% head() d1 <- efc[, 1:3] d2 <- efc[, 2:6] add_columns(d1, d2, replace = TRUE) \%>\% head() add_columns(d1, d2, replace = FALSE) \%>\% head() # use case: we take the original data frame, select specific # variables and do some transformations or recodings # (standardization in this example) and add the new, transformed # variables *to the end* of the original data frame efc \%>\% select(e17age, c160age) \%>\% std() \%>\% add_columns(efc) \%>\% head() # new variables with same name will overwrite old variables # in "efc". order of columns is not changed. efc \%>\% select(e16sex, e42dep) \%>\% to_factor() \%>\% add_columns(efc) \%>\% head() # keep both old and new variables, automatically # rename variables with identical name efc \%>\% select(e16sex, e42dep) \%>\% to_factor() \%>\% add_columns(efc, replace = FALSE) \%>\% head() # create sample data frames d1 <- efc[, 1:10] d2 <- efc[, 2:3] d3 <- efc[, 7:8] d4 <- efc[, 10:12] # show original head(d1) library(sjlabelled) # slightly change variables, to see effect d2 <- as_label(d2) d3 <- as_label(d3) # replace duplicated columns, append remaining replace_columns(d1, d2, d3, d4) \%>\% head() # replace duplicated columns, omit remaining replace_columns(d1, d2, d3, d4, add.unique = FALSE) \%>\% head() # add ID to dataset library(dplyr) data(mtcars) add_id(mtcars) mtcars \%>\% group_by(gear) \%>\% add_id() \%>\% arrange(gear, ID) \%>\% print(n = 100) } } sjmisc/man/zap_inf.Rd0000644000176200001440000000250214046746443014245 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/zap_inf.R \name{zap_inf} \alias{zap_inf} \title{Convert infinite or NaN values into regular NA} \usage{ zap_inf(x, ...)
} \arguments{ \item{x}{A vector or a data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} } \value{ \code{x}, where all \code{Inf}, \code{-Inf} and \code{NaN} are converted to \code{NA}. } \description{ Replaces all infinite (\code{Inf} and \code{-Inf}) or \code{NaN} values with regular \code{NA}. } \examples{ x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) zap_inf(x) data(efc) # produce some NA and NaN values efc$e42dep[1] <- NaN efc$e42dep[2] <- NA efc$c12hour[1] <- NaN efc$c12hour[2] <- NA efc$e17age[2] <- NaN efc$e17age[1] <- NA # only zap NaN for c12hour zap_inf(efc$c12hour) # only zap NaN for c12hour and e17age, not for e42dep, # but return complete data frame zap_inf(efc, c12hour, e17age) # zap NaN for complete data frame zap_inf(efc) } sjmisc/man/spread_coef.Rd0000644000176200001440000000723014620405300015062 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/spread_coef.R \name{spread_coef} \alias{spread_coef} \title{Spread model coefficients of list-variables into columns} \usage{ spread_coef(data, model.column, model.term, se, p.val, append = TRUE) } \arguments{ \item{data}{A (nested) data frame with a list-variable that contains fitted model objects (see 'Details').} \item{model.column}{Name or index of the list-variable that contains the fitted model objects.} \item{model.term}{Optional, name of a model term.
If specified, only this model term (including p-value) will be extracted from each model and added as new column.} \item{se}{Logical, if \code{TRUE}, standard errors for estimates will also be extracted.} \item{p.val}{Logical, if \code{TRUE}, p-values for estimates will also be extracted.} \item{append}{Logical, if \code{TRUE} (default), this function returns \code{data} with new columns for the model coefficients; else, a new data frame with model coefficients only is returned.} } \value{ A data frame with columns for each coefficient of the models that are stored in the list-variable of \code{data}; or, if \code{model.term} is given, a data frame with the term's estimate. If \code{se = TRUE} or \code{p.val = TRUE}, the returned data frame also contains columns for the coefficients' standard error and p-value. If \code{append = TRUE}, the columns are appended to \code{data}, i.e. \code{data} is also returned. } \description{ This function extracts coefficients (and standard error and p-values) of fitted model objects from (nested) data frames, which are saved in a list-variable, and spreads the coefficients into new columns. } \details{ This function requires a (nested) data frame (e.g. created by the \code{\link[tidyr]{nest}}-function of the \pkg{tidyr}-package), where several fitted models are saved in a list-variable (see 'Examples'). Since nested data frames with fitted models stored as list-variable are typically fit with an identical formula, all models have the same dependent and independent variables and only differ in their subsets of data. The function then extracts all coefficients from each model and saves each estimate in a new column. The result is a data frame, where each \emph{row} is a model with each model's coefficients in its own \emph{column}. } \examples{ if (require("dplyr") && require("tidyr") && require("purrr")) { data(efc) # create nested data frame, grouped by dependency (e42dep) # and fit linear model for each group.
These models are # stored in the list variable "models". model.data <- efc \%>\% filter(!is.na(e42dep)) \%>\% group_by(e42dep) \%>\% nest() \%>\% mutate( models = map(data, ~lm(neg_c_7 ~ c12hour + c172code, data = .x)) ) # spread coefficients, so we can easily access and compare the # coefficients over all models. arguments `se` and `p.val` default # to `FALSE`, when `model.term` is not specified spread_coef(model.data, models) spread_coef(model.data, models, se = TRUE) # select only specific model term. `se` and `p.val` default to `TRUE` spread_coef(model.data, models, c12hour) # spread_coef can be used directly within a pipe-chain efc \%>\% filter(!is.na(e42dep)) \%>\% group_by(e42dep) \%>\% nest() \%>\% mutate( models = map(data, ~lm(neg_c_7 ~ c12hour + c172code, data = .x)) ) \%>\% spread_coef(models) } } sjmisc/man/to_dummy.Rd0000644000176200001440000000402013676323120014440 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_dummy.R \name{to_dummy} \alias{to_dummy} \title{Split (categorical) vectors into dummy variables} \usage{ to_dummy(x, ..., var.name = "name", suffix = c("numeric", "label")) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{var.name}{Indicates how the new dummy variables are named. Use \code{"name"} to use the variable name or any other string that will be used as is. Only applies, if \code{x} is a vector. See 'Examples'.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. 
\emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} } \value{ A data frame with dummy variables for each category of \code{x}. The dummy coded variables are of type \code{\link{atomic}}. } \description{ This function splits categorical or numeric vectors with more than two categories into 0/1-coded dummy variables. } \note{ \code{NA} values will be copied from \code{x}, so each dummy variable has the same amount of \code{NA}'s at the same position as \code{x}. } \examples{ data(efc) head(to_dummy(efc$e42dep)) # add value label as suffix to new variable name head(to_dummy(efc$e42dep, suffix = "label")) # use "dummy" as new variable name head(to_dummy(efc$e42dep, var.name = "dummy")) # create multiple dummies, append to data frame to_dummy(efc, c172code, e42dep) # pipe-workflow library(dplyr) efc \%>\% select(e42dep, e16sex, c172code) \%>\% to_dummy() } sjmisc/man/has_na.Rd0000644000176200001440000000572614046746443014063 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/has_na.R \name{has_na} \alias{has_na} \alias{incomplete_cases} \alias{complete_cases} \alias{complete_vars} \alias{incomplete_vars} \title{Check if variables or cases have missing / infinite values} \usage{ has_na(x, ..., by = c("col", "row"), out = c("table", "df", "index")) incomplete_cases(x, ...) complete_cases(x, ...) complete_vars(x, ...) incomplete_vars(x, ...) } \arguments{ \item{x}{A data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{by}{Whether to check column- or row-wise for missing and infinite values. 
If \code{by = "col"}, \code{has_na()} checks for \code{NA}/\code{NaN}/\code{Inf} in \emph{columns}; If \code{by = "row"}, \code{has_na()} checks each row for these values.} \item{out}{Output (return) format of the results. May be abbreviated.} } \value{ If \code{x} is a vector, returns \code{TRUE} if \code{x} has any missing or infinite values. If \code{x} is a data frame, returns \code{TRUE} for each variable (if \code{by = "col"}) or observation (if \code{by = "row"}) that has any missing or infinite values. If \code{out = "table"}, results are returned as data frame, with column number, variable name and label, and a logical vector indicating if a variable has missing values or not. However, it's printed in colors, with green rows indicating that a variable has no missings, while red rows indicate the presence of missings or infinite values. If \code{out = "index"}, a named vector is returned. } \description{ This functions checks if variables or observations in a data frame have \code{NA}, \code{NaN} or \code{Inf} values. } \note{ \code{complete_cases()} and \code{incomplete_cases()} are convenient shortcuts for \code{has_na(by = "row", out = "index")}, where the first only returns case-id's for all complete cases, and the latter only for non-complete cases. \cr \cr \code{complete_vars()} and \code{incomplete_vars()} are convenient shortcuts for \code{has_na(by = "col", out = "index")}, and again only return those column-id's for variables which are (in-)complete. 
} \examples{ data(efc) has_na(efc$e42dep) has_na(efc, e42dep, tot_sc_e, c161sex) has_na(efc) has_na(efc, e42dep, tot_sc_e, c161sex, out = "index") has_na(efc, out = "df") has_na(efc, by = "row") has_na(efc, e42dep, tot_sc_e, c161sex, by = "row", out = "index") has_na(efc, by = "row", out = "df") complete_cases(efc, e42dep, tot_sc_e, c161sex) incomplete_cases(efc, e42dep, tot_sc_e, c161sex) complete_vars(efc, e42dep, tot_sc_e, c161sex) incomplete_vars(efc, e42dep, tot_sc_e, c161sex) } sjmisc/man/ref_lvl.Rd0000644000176200001440000000523514046746443014256 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/ref_lvl.R \name{ref_lvl} \alias{ref_lvl} \title{Change reference level of (numeric) factors} \usage{ ref_lvl(x, ..., lvl = NULL) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{lvl}{Either numeric, indicating the new reference level, or a string, indicating the value label from the new reference level. If \code{x} is a factor with non-numeric factor levels, \code{relevel(x, ref = lvl)} is returned. See 'Examples'.} } \value{ \code{x} with new reference level. If \code{x} is a data frame, the complete data frame \code{x} will be returned, where variables specified in \code{...} will be re-leveled; if \code{...} is not specified, applies to all variables in the data frame. } \description{ Changes the reference level of (numeric) factor. } \details{ Unlike \code{\link[stats]{relevel}}, this function behaves differently for factor with numeric factor levels or for labelled data, i.e. factors with value labels for the values. 
\code{ref_lvl()} changes the reference level by recoding the factor's values using the \code{\link{rec}} function. Hence, all values from lowest up to the reference level indicated by \code{lvl} are recoded, with \code{lvl} starting as lowest factor value. For factors with non-numeric factor levels, the function simply returns \code{relevel(x, ref = lvl)}. See 'Examples'. } \examples{ data(efc) x <- to_factor(efc$e42dep) str(x) frq(x) # see column "val" in frq()-output, which indicates # how values/labels were recoded after using ref_lvl() x <- ref_lvl(x, lvl = 3) str(x) frq(x) library(dplyr) dat <- efc \%>\% select(c82cop1, c83cop2, c84cop3) \%>\% to_factor() frq(dat) ref_lvl(dat, c82cop1, c83cop2, lvl = 2) \%>\% frq() # compare numeric and string value for "lvl"-argument x <- to_factor(efc$e42dep) frq(x) ref_lvl(x, lvl = 2) \%>\% frq() ref_lvl(x, lvl = "slightly dependent") \%>\% frq() # factors with non-numeric factor levels data(iris) levels(iris$Species) levels(ref_lvl(iris$Species, lvl = 3)) levels(ref_lvl(iris$Species, lvl = "versicolor")) } \seealso{ \code{\link{to_factor}} to convert numeric vectors into factors; \code{\link{rec}} to recode variables. } sjmisc/man/std.Rd0000644000176200001440000001253613676323120013410 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/std.R \name{std} \alias{std} \alias{std_if} \alias{center} \alias{center_if} \title{Standardize and center variables} \usage{ std( x, ..., robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z" ) std_if( x, predicate, robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z" ) center(x, ..., include.fac = FALSE, append = TRUE, suffix = "_c") center_if(x, predicate, include.fac = FALSE, append = TRUE, suffix = "_c") } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. 
Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{robust}{Character vector, indicating the method applied when standardizing variables with \code{std()}. By default, standardization is achieved by dividing the centered variables by their standard deviation (\code{robust = "sd"}). However, for skewed distributions, the median absolute deviation (MAD, \code{robust = "mad"}) or Gini's mean difference (\code{robust = "gmd"}) might be more robust measures of dispersion. For the latter option, \CRANpkg{sjstats} needs to be installed. \code{robust = "2sd"} divides the centered variables by two standard deviations, following a suggestion by \emph{Gelman (2008)}, so the rescaled input is comparable to binary variables.} \item{include.fac}{Logical, if \code{TRUE}, factors will be converted to numeric vectors and also standardized or centered.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} \item{predicate}{A predicate function to be applied to the columns. The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ If \code{x} is a vector, returns a vector with standardized or centered variables. If \code{x} is a data frame, for \code{append = TRUE}, \code{x} including the transformed variables as new columns is returned; if \code{append = FALSE}, only the transformed variables will be returned. 
If \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace (overwrite) existing variables. } \description{ \code{std()} computes a z-transformation (standardized and centered) on the input. \code{center()} centers the input. \code{std_if()} and \code{center_if()} are scoped variants of \code{std()} and \code{center()}, where transformation will be applied only to those variables that match the logical condition of \code{predicate}. } \details{ \code{std()} and \code{center()} also work on grouped data frames (see \code{\link[dplyr]{group_by}}). In this case, standardization or centering is applied to the subsets of variables in \code{x}. See 'Examples'. \cr \cr For more complicated models with many predictors, Gelman and Hill (2007) suggest leaving binary inputs as is and only standardize continuous predictors by dividing by two standard deviations. This ensures a rough comparability in the coefficients. } \note{ \code{std()} and \code{center()} only return a vector, if \code{x} is a vector. If \code{x} is a data frame and only one variable is specified in the \code{...}-ellipses argument, both functions do return a data frame (see 'Examples'). } \examples{ data(efc) std(efc$c160age) \%>\% head() std(efc, e17age, c160age, append = FALSE) \%>\% head() center(efc$c160age) \%>\% head() center(efc, e17age, c160age, append = FALSE) \%>\% head() # NOTE! 
std(efc$e17age) # returns a vector std(efc, e17age) # returns a data frame # with quasi-quotation x <- "e17age" center(efc, !!x, append = FALSE) \%>\% head() # works with mutate() library(dplyr) efc \%>\% select(e17age, neg_c_7) \%>\% mutate(age_std = std(e17age), burden = center(neg_c_7)) \%>\% head() # works also with grouped data frames mtcars \%>\% std(disp) # compare new column "disp_z" w/ output above mtcars \%>\% group_by(cyl) \%>\% std(disp) data(iris) # also standardize factors std(iris, include.fac = TRUE, append = FALSE) # don't standardize factors std(iris, include.fac = FALSE, append = FALSE) # standardize only variables with more than 10 unique values p <- function(x) dplyr::n_distinct(x) > 10 std_if(efc, predicate = p, append = FALSE) } \references{ Gelman A (2008) Scaling regression inputs by dividing by two standard deviations. \emph{Statistics in Medicine 27: 2865-2873.} \url{http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf} \cr \cr Gelman A, Hill J (2007) Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge, Cambridge University Press: 55-57 } sjmisc/man/group_str.Rd0000644000176200001440000000451514046746443014645 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group_str.R \name{group_str} \alias{group_str} \title{Group near elements of string vectors} \usage{ group_str( strings, precision = 2, strict = FALSE, trim.whitespace = TRUE, remove.empty = TRUE, verbose = FALSE, maxdist ) } \arguments{ \item{strings}{Character vector with string elements.} \item{precision}{Maximum distance ("precision") between two string elements, which is allowed to treat them as similar or equal. Smaller values mean less tolerance in matching.} \item{strict}{Logical; if \code{TRUE}, value matching is more strict.
See 'Examples'.} \item{trim.whitespace}{Logical; if \code{TRUE} (default), leading and trailing white spaces will be removed from string values.} \item{remove.empty}{Logical; if \code{TRUE} (default), empty string values will be removed from the character vector \code{strings}.} \item{verbose}{Logical; if \code{TRUE}, the progress bar is displayed when computing the distance matrix. Default is \code{FALSE}, hence the bar is hidden.} \item{maxdist}{Deprecated. Please use \code{precision} now.} } \value{ A character vector where similar string elements (values) are recoded into a new, single value. The return value is of same length as \code{strings}, i.e. grouped elements appear multiple times, so the count for each grouped string is still available (see 'Examples'). } \description{ This function groups elements of a string vector (character or string variable) according to the element's distance ('similarity'). The more similar two string elements are, the higher is the chance to be combined into a group. } \examples{ oldstring <- c("Hello", "Helo", "Hole", "Apple", "Ape", "New", "Old", "System", "Systemic") newstring <- group_str(oldstring) # see result newstring # count for each group table(newstring) # print table to compare original and grouped string frq(oldstring) frq(newstring) # larger groups newstring <- group_str(oldstring, precision = 3) frq(oldstring) frq(newstring) # be more strict with matching pairs newstring <- group_str(oldstring, precision = 3, strict = TRUE) frq(oldstring) frq(newstring) } \seealso{ \code{\link{str_find}} } sjmisc/man/typical_value.Rd0000644000176200001440000000555513567234676015475 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/typical.R \name{typical_value} \alias{typical_value} \title{Return the typical value of a vector} \usage{ typical_value(x, fun = "mean", weights = NULL, ...)
} \arguments{ \item{x}{A variable.} \item{fun}{Character vector, naming the function to be applied to \code{x}. Currently, \code{"mean"}, \code{"weighted.mean"}, \code{"median"} and \code{"mode"} are supported, which call the corresponding R functions (except \code{"mode"}, which calls an internal function to compute the most common value). \code{"zero"} simply returns 0. \strong{Note:} By default, if \code{x} is a factor, only \code{fun = "mode"} is applicable; for all other functions (including the default, \code{"mean"}) the reference level of \code{x} is returned. For character vectors, only the mode is returned. You can use a named vector to apply other different functions to integer, numeric and categorical \code{x}, where factors are first converted to numeric vectors, e.g. \code{fun = c(numeric = "median", factor = "mean")}. See 'Examples'.} \item{weights}{Name of variable in \code{x} that indicated the vector of weights that will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} \item{...}{Further arguments, passed down to \code{fun}.} } \value{ The "typical" value of \code{x}. } \description{ This function returns the "typical" value of a variable. } \details{ By default, for numeric variables, \code{typical_value()} returns the mean value of \code{x} (unless changed with the \code{fun}-argument). \cr \cr For factors, the reference level is returned or the most common value (if \code{fun = "mode"}), unless \code{fun} is a named vector. If \code{fun} is a named vector, specify the function for integer, numeric and categorical variables as element names, e.g. \code{fun = c(integer = "median", factor = "mean")}. In this case, factors are converted to numeric values (using \code{\link{to_value}}) and the related function is applied. You may abbreviate the names \code{fun = c(i = "median", f = "mean")}. See also 'Examples'. \cr \cr For character vectors the most common value (mode) is returned. 
} \examples{ data(iris) typical_value(iris$Sepal.Length) library(purrr) map(iris, ~ typical_value(.x)) # example from ?stats::weighted.mean wt <- c(5, 5, 4, 1) / 15 x <- c(3.7, 3.3, 3.5, 2.8) typical_value(x, fun = "weighted.mean") typical_value(x, fun = "weighted.mean", weights = wt) # for factors, return either reference level or mode value set.seed(123) x <- sample(iris$Species, size = 30, replace = TRUE) typical_value(x) typical_value(x, fun = "mode") # for factors, use a named vector to apply other functions than "mode" map(iris, ~ typical_value(.x, fun = c(n = "median", f = "mean"))) } sjmisc/man/remove_var.Rd0000644000176200001440000000157114046746443014771 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/remove_vars.R \name{remove_var} \alias{remove_var} \alias{remove_cols} \title{Remove variables from a data frame} \usage{ remove_var(x, ...) remove_cols(x, ...) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Character vector with variable names, or unquoted names of variables that should be removed from the data frame. You may also use functions like \code{:} or tidyselect's select-helpers.} } \value{ \code{x}, with variables specified in \code{...} removed. } \description{ This function removes variables from a data frame, and is intended to use within a pipe-workflow. \code{remove_cols()} is an alias for \code{remove_var()}. } \examples{ mtcars \%>\% remove_var("disp", "cyl") mtcars \%>\% remove_var(c("wt", "vs")) mtcars \%>\% remove_var(drat:am) } sjmisc/man/set_na_if.Rd0000644000176200001440000000463314046746443014555 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/set_na.R \name{set_na_if} \alias{set_na_if} \title{Replace specific values in vector with NA} \usage{ set_na_if(x, predicate, na, drop.levels = TRUE, as.tag = FALSE) } \arguments{ \item{x}{A vector or data frame.} \item{predicate}{A predicate function to be applied to the columns. 
The variables for which \code{predicate} returns \code{TRUE} are selected.} \item{na}{Numeric vector with values that should be replaced with NA values, or a character vector if values of factors or character vectors should be replaced. For labelled vectors, may also be the name of a value label. In this case, the associated values for the value labels in each vector will be replaced with \code{NA}. \code{na} can also be a named vector. If \code{as.tag = FALSE}, values will be replaced only in those variables that are indicated by the value names (see 'Examples').} \item{drop.levels}{Logical, if \code{TRUE}, factor levels of values that have been replaced with \code{NA} are dropped. See 'Examples'.} \item{as.tag}{Logical, if \code{TRUE}, values in \code{x} will be replaced by \code{tagged_na}, else by usual \code{NA} values. Use a named vector to assign the value label to the tagged NA value (see 'Examples').} } \value{ \code{x}, with all values in \code{na} being replaced by \code{NA}. If \code{x} is a data frame, the complete data frame \code{x} will be returned, with NA's set for variables specified in \code{...}; if \code{...} is not specified, applies to all variables in the data frame. } \description{ \code{set_na_if()} is a scoped variant of \code{\link[sjlabelled]{set_na}}, where values will be replaced only with NA's for those variables that match the logical condition of \code{predicate}. } \examples{ dummy <- data.frame(var1 = sample(1:8, 100, replace = TRUE), var2 = sample(1:10, 100, replace = TRUE), var3 = sample(1:6, 100, replace = TRUE)) p <- function(x) max(x, na.rm = TRUE) > 7 tmp <- set_na_if(dummy, predicate = p, na = 8:9) head(tmp) } \seealso{ \code{\link{replace_na}} to replace \code{\link{NA}}'s with specific values, \code{\link{rec}} for general recoding of variables and \code{\link{recode_to}} for re-shifting value ranges. See \code{\link[sjlabelled]{get_na}} to get values of missing values in labelled vectors. 
} sjmisc/man/replace_na.Rd0000644000176200001440000000745013676323120014706 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/replace_na.R \name{replace_na} \alias{replace_na} \title{Replace NA with specific values} \usage{ replace_na(x, ..., value, na.label = NULL, tagged.na = NULL) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{value}{Value that will replace the \code{\link{NA}}'s.} \item{na.label}{Optional character vector, used to label the the former NA-value (i.e. adding a \code{labels} attribute for \code{value} to \code{x}).} \item{tagged.na}{Optional single character, specifies a \code{\link[haven]{tagged_na}} value that will be replaced by \code{value}. Herewith it is possible to replace only specific \code{NA} values of \code{x}.} } \value{ \code{x}, where \code{NA}'s are replaced with \code{value}. If \code{x} is a data frame, the complete data frame \code{x} will be returned, with replaced NA's for variables specified in \code{...}; if \code{...} is not specified, applies to all variables in the data frame. } \description{ This function replaces (tagged) NA's of a variable, data frame or list of variables with \code{value}. } \details{ While regular \code{NA} values can only be \emph{completely} replaced with a single value, \code{\link[haven]{tagged_na}} allows to differentiate between different qualitative values of \code{NA}s. Tagged \code{NA}s work exactly like regular R missing values except that they store one additional byte of information: a tag, which is usually a letter ("a" to "z") or character number ("0" to "9"). 
Therewith it is possible to replace only specific NA values, while other NA values are preserved. } \note{ Value and variable label attributes are preserved. } \examples{ library(sjlabelled) data(efc) table(efc$e42dep, useNA = "always") table(replace_na(efc$e42dep, value = 99), useNA = "always") # the original labels get_labels(replace_na(efc$e42dep, value = 99)) # NA becomes "99", and is labelled as "former NA" get_labels( replace_na(efc$e42dep, value = 99, na.label = "former NA"), values = "p" ) dummy <- data.frame( v1 = efc$c82cop1, v2 = efc$c83cop2, v3 = efc$c84cop3 ) # show original distribution lapply(dummy, table, useNA = "always") # show variables, NA's replaced with 99 lapply(replace_na(dummy, v2, v3, value = 99), table, useNA = "always") if (require("haven")) { x <- labelled(c(1:3, tagged_na("a", "c", "z"), 4:1), c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), "Refused" = tagged_na("a"), "Not home" = tagged_na("z"))) # get current NA values x get_na(x) # replace only the NA, which is tagged as NA(c) replace_na(x, value = 2, tagged.na = "c") get_na(replace_na(x, value = 2, tagged.na = "c")) table(x) table(replace_na(x, value = 2, tagged.na = "c")) # tagged NA also works for non-labelled class # init vector x <- c(1, 2, 3, 4) # set values 2 and 3 as tagged NA x <- set_na(x, na = c(2, 3), as.tag = TRUE) # see result x # now replace only NA tagged with 2 with value 5 replace_na(x, value = 5, tagged.na = "2") } } \seealso{ \code{\link[sjlabelled]{set_na}} for setting \code{NA} values, \code{\link{rec}} for general recoding of variables and \code{\link{recode_to}} for re-shifting value ranges. 
} sjmisc/man/row_count.Rd0000644000176200001440000000537013676323120014633 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/row_count.R \name{row_count} \alias{row_count} \alias{col_count} \title{Count row or column indices} \usage{ row_count(x, ..., count, var = "rowcount", append = TRUE) col_count(x, ..., count, var = "colcount", append = TRUE) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{count}{The value for which the row or column sum should be computed. May be a numeric value, a character string (for factors or character vectors), \code{NA}, \code{Inf} or \code{NULL} to count missing or infinite values, or null-values.} \item{var}{Name of new the variable with the row or column counts.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} } \value{ For \code{row_count()}, a data frame with one variable: the sum of \code{count} appearing in each row of \code{x}; for \code{col_count()}, a data frame with one row and the same number of variables as in \code{x}: each variable holds the sum of \code{count} appearing in each variable of \code{x}. If \code{append = TRUE}, \code{x} including this variable will be returned. } \description{ \code{row_count()} mimics base R's \code{rowSums()}, with sums for a specific value indicated by \code{count}. Hence, it is equivalent to \code{rowSums(x == count, na.rm = TRUE)}. 
However, this function is designed to work nicely within a pipe-workflow and allows select-helpers for selecting variables and the return value is always a data frame (with one variable). \cr \cr \code{col_count()} does the same for columns. The return value is a data frame with one row (the column counts) and the same number of columns as \code{x}. } \examples{ dat <- data.frame( c1 = c(1, 2, 3, 1, 3, NA), c2 = c(3, 2, 1, 2, NA, 3), c3 = c(1, 1, 2, 1, 3, NA), c4 = c(1, 1, 3, 2, 1, 2) ) row_count(dat, count = 1, append = FALSE) row_count(dat, count = NA, append = FALSE) row_count(dat, c1:c3, count = 2, append = TRUE) col_count(dat, count = 1, append = FALSE) col_count(dat, count = NA, append = FALSE) col_count(dat, c1:c3, count = 2, append = TRUE) } sjmisc/man/trim.Rd0000644000176200001440000000160113567234676013600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/trim.R \name{trim} \alias{trim} \title{Trim leading and trailing whitespaces from strings} \usage{ trim(x) } \arguments{ \item{x}{Character vector or string, or a list or data frame with such vectors. Function is vectorized, i.e. vector may have a length greater than 1. See 'Examples'.} } \value{ Trimmed \code{x}, i.e. with leading and trailing spaces removed. } \description{ Trims leading and trailing whitespaces from strings or character vectors. 
} \examples{ trim("white space at end ") trim(" white space at start and end ") trim(c(" string1 ", " string2", "string 3 ")) tmp <- data.frame(a = c(" string1 ", " string2", "string 3 "), b = c(" strong one ", " string two", " third string "), c = c(" str1 ", " str2", "str3 ")) tmp trim(tmp) } sjmisc/man/is_crossed.Rd0000644000176200001440000000450113777355176014766 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_crossed.R \name{is_crossed} \alias{is_crossed} \alias{is_nested} \alias{is_cross_classified} \title{Check whether two factors are crossed or nested} \usage{ is_crossed(f1, f2) is_nested(f1, f2) is_cross_classified(f1, f2) } \arguments{ \item{f1}{Numeric vector or \code{\link{factor}}.} \item{f2}{Numeric vector or \code{\link{factor}}.} } \value{ Logical. For \code{is_crossed()}, \code{TRUE} if factors are (fully) crossed, \code{FALSE} otherwise. For \code{is_nested()}, \code{TRUE} if factors are nested, \code{FALSE} otherwise. For \code{is_cross_classified()}, \code{TRUE}, if one factor level occurs in some, but not all levels of another factor. } \description{ These functions checks whether two factors are (fully) crossed or nested, i.e. if each level of one factor occurs in combination with each level of the other factor (\code{is_crossed()}) resp. if each category of the first factor co-occurs with only one category of the other (\code{is_nested()}). \code{is_cross_classified()} checks if one factor level occurs in some, but not all levels of another factor. } \note{ If factors are nested, a message is displayed to tell whether \code{f1} is nested within \code{f2} or vice versa. 
} \examples{ # crossed factors, each category of # x appears in each category of y x <- c(1,4,3,2,3,2,1,4) y <- c(1,1,1,2,2,1,2,2) # show distribution table(x, y) # check if crossed is_crossed(x, y) # not crossed factors x <- c(1,4,3,2,3,2,1,4) y <- c(1,1,1,2,1,1,2,2) # show distribution table(x, y) # check if crossed is_crossed(x, y) # nested factors, each category of # x appears in one category of y x <- c(1,2,3,4,5,6,7,8,9) y <- c(1,1,1,2,2,2,3,3,3) # show distribution table(x, y) # check if nested is_nested(x, y) is_nested(y, x) # not nested factors x <- c(1,2,3,4,5,6,7,8,9,1,2) y <- c(1,1,1,2,2,2,3,3,3,2,3) # show distribution table(x, y) # check if nested is_nested(x, y) is_nested(y, x) # also not fully crossed is_crossed(x, y) # but partially crossed is_cross_classified(x, y) } \references{ Grace, K. The Difference Between Crossed and Nested Factors. \href{https://www.theanalysisfactor.com/the-difference-between-crossed-and-nested-factors/}{(web)} } sjmisc/man/to_long.Rd0000644000176200001440000000774013567234676014300 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_long.R \name{to_long} \alias{to_long} \title{Convert wide data to long format} \usage{ to_long(data, keys, values, ..., labels = NULL, recode.key = FALSE) } \arguments{ \item{data}{A \code{data.frame} that should be tansformed from wide to long format.} \item{keys}{Character vector with name(s) of key column(s) to create in output. Either one key value per column group that should be gathered, or a single string. In the latter case, this name will be used as key column, and only one key column is created. See 'Examples'.} \item{values}{Character vector with names of value columns (variable names) to create in output. Must be of same length as number of column groups that should be gathered. See 'Examples'.} \item{...}{Specification of columns that should be gathered. 
Must be one character vector with variable names per column group, or a numeric vector with column indices indicating those columns that should be gathered. See 'Examples'.} \item{labels}{Character vector of same length as \code{values} with variable labels for the new variables created from gathered columns. See 'Examples' and 'Details'.} \item{recode.key}{Logical, if \code{TRUE}, the values of the \code{key} column will be recoded to numeric values, in sequential ascending order.} } \description{ This function converts wide data into long format. It allows to transform multiple key-value pairs to be transformed from wide to long format in one single step. } \details{ This function reshapes data from wide to long format, however, you can gather multiple column groups at once. Value and variable labels for non-gathered variables are preserved. Attributes from gathered variables, such as information about the variable labels, are lost during reshaping. Hence, the new created variables from gathered columns don't have any variable label attributes. In such cases, use \code{labels} argument to set back variable label attributes. } \examples{ # create sample mydat <- data.frame(age = c(20, 30, 40), sex = c("Female", "Male", "Male"), score_t1 = c(30, 35, 32), score_t2 = c(33, 34, 37), score_t3 = c(36, 35, 38), speed_t1 = c(2, 3, 1), speed_t2 = c(3, 4, 5), speed_t3 = c(1, 8, 6)) # gather multiple columns. both time and speed are gathered. to_long( data = mydat, keys = "time", values = c("score", "speed"), c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3") ) # alternative syntax, using "reshape_longer()" reshape_longer( mydat, columns = list( c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3") ), names.to = "time", values.to = c("score", "speed") ) # or ... 
reshape_longer( mydat, list(3:5, 6:8), names.to = "time", values.to = c("score", "speed") ) # gather multiple columns, use numeric key-value to_long( data = mydat, keys = "time", values = c("score", "speed"), c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3"), recode.key = TRUE ) # gather multiple columns by colum names and colum indices to_long( data = mydat, keys = "time", values = c("score", "speed"), c("score_t1", "score_t2", "score_t3"), 6:8, recode.key = TRUE ) # gather multiple columns, use separate key-columns # for each value-vector to_long( data = mydat, keys = c("time_score", "time_speed"), values = c("score", "speed"), c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3") ) # gather multiple columns, label columns mydat <- to_long( data = mydat, keys = "time", values = c("score", "speed"), c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3"), labels = c("Test Score", "Time needed to finish") ) library(sjlabelled) str(mydat$score) get_label(mydat$speed) } \seealso{ \code{\link{reshape_longer}} } sjmisc/man/all_na.Rd0000644000176200001440000000106414046746443014047 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_na.R \name{all_na} \alias{all_na} \title{Check if vector only has NA values} \usage{ all_na(x) } \arguments{ \item{x}{A vector or data frame.} } \value{ Logical, \code{TRUE} if \code{x} has only NA values, \code{FALSE} if \code{x} has at least one non-missing value. } \description{ Check if all values in a vector are \code{NA}. 
} \examples{ x <- c(NA, NA, NA) y <- c(1, NA, NA) all_na(x) all_na(y) all_na(data.frame(x, y)) all_na(list(x, y)) } sjmisc/man/empty_cols.Rd0000644000176200001440000000242413567234676015007 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_empty.R \name{empty_cols} \alias{empty_cols} \alias{empty_rows} \alias{remove_empty_cols} \alias{remove_empty_rows} \title{Return or remove variables or observations that are completely missing} \usage{ empty_cols(x) empty_rows(x) remove_empty_cols(x) remove_empty_rows(x) } \arguments{ \item{x}{A data frame.} } \value{ For \code{empty_cols} and \code{empty_rows}, a numeric (named) vector with row or column indices of those variables that completely have missing values. \cr \cr For \code{remove_empty_cols} and \code{remove_empty_rows}, a data frame with "empty" columns or rows removed. } \description{ These functions check which rows or columns of a data frame completely contain missing values, i.e. which observations or variables completely have missing values, and either 1) returns their indices; or 2) removes them from the data frame. } \examples{ tmp <- data.frame(a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA , 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5)) tmp empty_cols(tmp) empty_rows(tmp) remove_empty_cols(tmp) remove_empty_rows(tmp) } sjmisc/man/is_float.Rd0000644000176200001440000000207413567234676014432 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_float.R \name{is_float} \alias{is_float} \alias{is_whole} \title{Check if a variable is of (non-integer) double type or a whole number} \usage{ is_float(x) is_whole(x) } \arguments{ \item{x}{A value, vector or data frame.} } \value{ For \code{is_float()}, \code{TRUE} if \code{x} is a floating value (non-integer double), \code{FALSE} otherwise (also returns \code{FALSE} for character vectors and factors). 
For \code{is_whole()}, \code{TRUE} if \code{x} is a vector with whole numbers only, \code{FALSE} otherwise (returns \code{TRUE} for character vectors and factors). } \description{ \code{is_float()} checks whether an input vector or value is a numeric non-integer (double), depending on fractional parts of the value(s). \code{is_whole()} does the opposite and checks whether an input vector is a whole number (without fractional parts). } \examples{ data(mtcars) data(iris) is.double(4) is_float(4) is_float(4.2) is_float(iris) is_whole(4) is_whole(4.2) is_whole(mtcars) } sjmisc/man/figures/0000755000176200001440000000000013451124270013760 5ustar liggesuserssjmisc/man/figures/logo.png0000644000176200001440000004235613451124270015440 0ustar liggesusersPNG  IHDRxb]e iCCPICC Profile8U]hU>sg#$Sl4t? % V46nI6"dΘ83OEP|1Ŀ (>/ % (>P苦;3ie|{g蹪X-2s=+WQ+]L6O w[C{_F qb Uvz?Zb1@/zcs>~if,ӈUSjF 1_Mjbuݠpamhmçϙ>a\+5%QKFkm}ۖ?ޚD\!~6,-7SثŜvķ5Z;[rmS5{yDyH}r9|-ăFAJjI.[/]mK 7KRDrYQO-Q||6 (0 MXd(@h2_f<:”_δ*d>e\c?~,7?& ك^2Iq2"y@g|UP`o pHYs  YiTXtXML:com.adobe.xmp 1 L'Y?"IDATx|Ź߭{rq$ؤZ r%!QrrN M )`ܫ\p/EY}zmٖ-+3og>ʜsl͝닩k:*~6n,8?|+げ9/-[馢קYQFfVf>FT04ۮwϼyB78-+NgLW32 77x<{n䕂Te" ̚K̈NsXlgOy@<AV:uKN^AAMUcѠ|(tlr<666,/ه}÷\1y0]0,-];wcN ([ JBs5hy|uNNnS/4[}}]o@t#F{Ɩ]f>z3ηbQ5шEُo?sc۩2^ӽt+5Ň毜BW UWÝ!Zymm.e*lcmzBو`jD!zlv؊y`d-k勚OZw߭LX{;N{Ŏ)'ϭ 27fAHC8lkH_nk*, O;ʍ|)A*GؤLڧvϵPbϴ}̙yo|䖫'TDұ1q_EbEџE>ޕ=AYWW9d{˫mr[yƪG0`"Z+lzCј6[M~]rg ј23sz^}]j8}ky gD,YYO͟_R2'^\p:<㾯edf!gVVcLJ됳K+m_C̺BY~^ '꤀#k ~A#tL~<m#ٹPfO^A+. 
ж8U6D}nYoW|_OfeX, ~u}6٭@Ejnk;J5Gd>^Ykv E.$ۺ п綹Ӷ^'ab7c bЭ;sF" }q̞GlfϺ:Gy;nmcەkYS8 ^pV+凫[ ѝv6Uf*Y<[MYHcl"kq{<{S޳"_iw=?;(uW@S:8~4΄2ER h3z+AξyDV5IC N9&5@ L2(/^Vػ `L@9rNgK&%Y>gY]XX{>_.rNlX#Xǿ_ٳP nn.+\:Rzln߷uK=cˏjcUXR} ĭRs.n7ipţ_6w\oߴg!g{]lY覢caG 1_ιgU1Ba|r/_fUYKO\YӡH̗SNx&{Dı uկvmW%%S\,SX:v?"ZV"QQ"=I>_YMm*-%=֗Y&6㠁 I8:3deV̷.gϑX,+vw}}Ңv߭s'.WE.ɠfksm&U?-h%7u^[S{"6_-*[{q%fO>qkXl[EO[sc9>zqrh¹am ӜU3W\ے=; -OPaq=r/#5V9ĽrtN&ܞ >ks{$0.I"r=3hF/td "k16رr܋>FK`bYq/Bb67ڄ{9+b/(6޵Y4rP6hvܞ@)% j0MohX$/ۮ531ܞ vM?q;6[x` }}";'?GaBVD݈{q9?oD=0x :9r UNF+FG>qDpVV3̫7ۯ^MH8If$9$3Oy <~e=3wPMy#z܋ dA1{zXÕhϺzBXTrƍIܞ6cfmaE}dQ$%hhc(Bb fĂ|~#P3~ X|Fii*(Yxf܋{R Ɠi\ܞҶ# LD&Ҕ6GB)gY]}Z|c)1’[`eϬ`Òd(~ <>zg.%W͘Jl~7coO)e ::/GIPtל wH*la?p{b {Iy8][p*_G;Ψ#E dlF.A;VOtrF|]'ntܞ#Jb$6`?Ǟѭ'nVϭSN &A)nBVN^?ːS4 Dh0ު:^T|Ί>Plg>] _5¸=φ?OΨ0%I:ZZo<({51_)=P_ph~"ᾃ=q/yڙ "Zcu;G@/ՎfcϽr[֕!IԯYoz3^'{+G3L pe*H?XHO7^MbYhXM&;Zu|zVh~~D =taID@SgK8C/,)o˛95Ar~/C6ySN~\Ʋg#hʖInYٳzsٳX x~@Ϧ.s䍒EE-նysKϒƭCSKUb?G@H|UyC.ܩ։@v`m`NgNfˈH}*' g %=b1Bf0\"rEQh%ϵٚP{ l)Xf֑ɻ|Y5DXҭƀo221j*b+9,ټ#?*߾HVPoDB`ùIr#_{k+lXQw,e tzvASh!+4LLgV9gVcDeZ-?u?~:ZQDĎ0^ٹc&[% (rvr[vbرV X lR%ӕe6aHw;D/دw,2m0Zj8~g 92b}֯IP_϶-=d.ODTdjQY!+B6GajFñr~˄'>̚r8KYU, f6Dl+a<XK/0~M`ݘZla fLm/M5\';n n6f`w7XVh2dSp[Uߪ_z7֙ @PPdu&v1Heaƨ{bV}x${j7|s;b+%Wk]UrJLy7=O?*ޱcW%'iZ{<"҅2>I7 GG hU `/kDd/դ˸jt˪8]vNIa=5y gYym]fl+0ru+#Ytuzw>`i` d}ӡw׳s٩EvxL6ړ|DUI\OaɣLHBZ1o:#Ճ0^ {6Si55Jla'j7wv+ XLn)V\/zQaq;TӗwOvust&$+S/UMʁ3]ffl$NEu(:rPиԛǨSL/]Zqm>Ѝ{[/|Z W"VмFq{:wJjf]Þ(b!AD#.pB[yn&e}bY첢6@ P-ݿn[(01sp9V(joʓ6ݯna0 OF׋ƤrV ,)ط/d_zkn9p"z4m$ Z>ʊ@Las1Ϣb CΨgw ڬ&;vPwIhM|iXl+&9eWMX ǑЗGQDTGv܇{Yu]āӿ{&i+"vqlޜ)vި>i đqIolR9{NC]h"'|Nw d_/ZVvBf ie_sf` w/ICsWObfE"/)VjrpY&w=rsCN\8][w޶%6j79IiO"m(~v{F?Rm{m-~&n}0ӇUN`; [G; ݁;lII+n8rP얃w;gȱpsceT 'Dwأ*Ulw ,WNh&l FkokZlZvL|dڕݖIŎ%O锔_m9bn=٭t-@6K[9F$?'"_0q[w.J{uwIwyR]FaYƶczV6-~dRʹ;gnu7] d8f+|ٴ*Wӟ:rN:;2:bLU!xWt2T\5מ);}yeSN<]՟ hS׫Aƚu^7Mh@/s'5hY`ԑ̉.4M?ާw5Sȴkͪ})>9 #m?{y sDE Xrt䧖raO{A>"ev ml^96ʛPJ`Bce|o;6OaPrQTFj]Y &߬g4q볅y)-iCqy`7`ާN-ﻎ8=34 UF߽pud;H}/1zRWa. 0y3QD_I|O=0B=l8hjc"Zie7ijf\&.] 
teYvnn_ڦ䂦45Ugۤ|uVr,I;/w] EL߇1/297kX84Wu.ytN]" LdhW 7a=OZbO1VĽW=yW|kΝg%}&;WSnGKp%--%l}#<Pl\ q֛<} '! J4B?c4ZuOH/Ф @ |L`(VR _ޕO05^eV4fBovjN$PM}gWGm:^vx:O8-kE# ebn8iBVVN?+ ;ST~8*EٲӍSNnׅ?5r?=D<"AGSnYxSXVIw]:L6{b*%…!"ua 무SPwƶ=2=ӥYwRCi5qib}XtF,V(HI$H p?3s}! p$1^޸VÂ歂z֯ehKITsӠE8#a4`e;V@ZG;q%P@kSm@0izhs6M-܏ή)6ce/#cCC:TѮ6z0Y]O|A赱Xv\oۈiYdś8jLzͼ-S;)2[x {1eF&]0Ò)C{X!u9d\̲˦fP6T)u;/֟5JaϑL"Ks$JNJvxuwg wdSϖR ܉׎nbm7/s&5ձ٤[Mad83Fo\fv_JP7Yۺ2E;i4) \C,pL⮟9ʆWa0[[~usJpj)}^0u(yuf֫21iĖOrXM'>0;i"GtG+7}y _8} ]ufC,&7xg{3mIwabQ䮣$Z;X㴅wLQbkAY{Uz碨3څJZ~*7=l Wf4ik)7dg))wlefh)z6.:{m㕼T |K?ⴺNNW6a, ӷ`rMgi+'b[n` iZb>Sd~] {wʠCO.r٧ԤO^fk#gubp&varu$S;KHshJm~SnI?SX (ʇd)p.qخ6l u7$8}~_}$Zz~+ FO ^`e|--4,CLPOdRhF/ZN# 6n=]FXڳ\5@e[|b0ja˻9Q=uf(rZw8mf2mNY-AkPp"5X~-d$Ȉ浬#Ccm(zA <^/!~PLC:m ~>vo Sr{^8vȫpPJʩk&4Toc6+Ku3R>|?uzj@>=|vomZ0V/dlZCI5{J|in钝n%bl}bW흫`)ʰ<7oMV*^7 p)U:ߺk#QfnA(=F+J<@(/FA¢Kd=R-ev8$w;uijjS(81h:WmPV珛3Į{(dJb'? Zm=Ϋ]3'Fo7h ]O,wCpE'%)]tw(M٣W2(1oalZ%= kی< T9&~))rymyE=1vuR倢T˰V_,#:V g֮Cp‡ a1aZKX5š`ĦǬ1 XīFglVѯ];h Ti{:P{L~_ĝ8 ){w7 _3:3< پ,++D 1$YCnQb«_[ot'PYWk;/Uty vm0 vaH!Sw_SMi~I',;]b .} $F.[c9z4kLJ^}%F`WTWd}2;`VZ}*LJM@ $1Jؗg OEylk@y ::Dݢ5;/>F U_{q!sV듀UEGƶM^?D[D5jz ;'4X~a/~WO~Zs*k|# j yOYm/8ba QU-$x'f "^ [*k;~++#1S:C1(B J1V㍙0k+71#}E57gtS۫g7 ǣR9y] j+ؠ$af{No޼e}7N5wxXI6 d}~̬nj~ *ymPfEu dɢr Y|Jre, 6Ǥu[m>v8Bͥn%%gAF?s%j֞s|f\ܤ`j߼˃7%YzfLfe瞯Uu518ضe>5Pbs-5DhI2IM3\fT{PܲhX ppJycvd& 􌉼װ'Yqرefz7 z{lY覢ijY ~sr25Ĵg!]Ve`+k-Vq 6֫vdK-XL Xs#;1J*TkXԷ `&i< UIІ@FfTbL~~GSTÓx ͜Ķ c2kkFNނ/-mVm&i @Q s_zJm;NjhqT^˻1h*WU?i6wIէxoZ/.u0IV%heo-Z(u!)բ> Qe|R䥖[J>t&So"pCHΖ#bzNgl[ JMm1wo3^{D|3gƨL՞`ba7}sV]ŏM)S[؄c) Wc@?X+؛6˨m3!:~٨$r\w#̚@xDV؊zhXr^fX;ڪسY(PUu߯aqči{UY<ÍW6 |)ORinO̪Uڭ;&M&ʘ}7mj%aO%09G(Et 槯R'Uc)>ϳ8oB xƵw(7Hsldg nO3nZ:ܣ~cLJgSp/N6YYXy~ƥz]_x'g,'WZ3NmpQR^~ys<2Ux0 YWW|4`Qfv6\Ed<3BN TڶǎˡZd&\Tc,(26`AHn;ƾ^Vjw:s3xjE{9y2Ld3 l͸=,’՛Q~KbyQME86 ;XOr>{ I&OaxmRڔS[ps{*,RfVeVu`>"#QAxa7azc#ʴHW-`e)]F{̳$L8ӝ) ȴ]#RP x&7a>^iN F>ʀɘ%,)y)h0xPSڲgZlZ \meŽas3l:0BNY9VSUQ vՄsɎNl7&4D(Kyvt7_٨2t8 PJc?-/'V^(= @Ύ`ϊDqӖmc,-hbvE~O ӱG]=7L ۰Q8wVmPe[S~ZVlE@<^e79E@<^$ 
f)ٌ߽eg/yFdrh),9昧=O"Zl$p3( u$@a%|!Ag!:i %w" ܴ!I fddIԠ6 HqW:Xy{\`Μ9g\ă3D%e| '2vͺ@͊҈Nǿ-`=Gؿ㾀YX7^ك{QwyBUµOq_{^55 DOaao.}8UO׷GI&6Ƕ9{5kw7 O{Kj[9Оsa*(OiCDQrN X={1I2Yً~0?nC? g{oz^Ŏ̱%J-Z~~SV_܎^u#j򹁂 (\ ZdgضuO,9+d m gd j3*<~n f4oں-^G%li)M%?|"_ٹH.$^̪ͤ -#z#)ݡȞ +v Ž g'*)bٰRk*yۤşdrwՓ֩mDzPu,K Zk>bw(g5$,h’뱝ܗCI߸?AIҟ4ƋD^"k[U.:6t,K "lxAIx}Yßy6չVcMڐ’KR$y G"ݢ-*m[*gSkߝ`srWOEٹs:g70T&J* a>}{ƭ-2U:JX-mXѴJzjͯxŶ1}r88jB}*.w#T;NԿ;=^Sme\% select(1:5) \%>\% slice(1:10) x2 <- efc \%>\% select(3:7) \%>\% slice(11:20) mydf <- add_rows(x1, x2) mydf str(mydf) \dontrun{ library(sjPlot) view_df(mydf)} x3 <- efc \%>\% select(5:9) \%>\% slice(21:30) x4 <- efc \%>\% select(11:14) \%>\% slice(31:40) mydf <- add_rows(x1, x2, x3, x4, id = "subsets") mydf str(mydf) } sjmisc/man/reexports.Rd0000644000176200001440000000163014153357271014647 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/S3-methods.R, R/re-exports.R \docType{import} \name{reexports} \alias{reexports} \alias{print_md} \alias{print_html} \alias{to_character} \alias{to_label} \alias{to_numeric} \alias{to_factor} \alias{\%>\%} \alias{set_na} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ \item{insight}{\code{\link[insight:display]{print_html}}, \code{\link[insight:display]{print_md}}} \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} \item{sjlabelled}{\code{\link[sjlabelled]{set_na}}, \code{\link[sjlabelled:as_label]{to_character}}, \code{\link[sjlabelled:as_factor]{to_factor}}, \code{\link[sjlabelled:as_label]{to_label}}, \code{\link[sjlabelled:as_numeric]{to_numeric}}} }} sjmisc/man/move_columns.Rd0000644000176200001440000000415313751046775015334 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/move_column.R \name{move_columns} \alias{move_columns} \title{Move columns to other positions in a data frame} \usage{ move_columns(data, ..., .before, .after) } \arguments{ \item{data}{A data frame.} \item{...}{Unquoted names or character vector with names of variables that should be move to another position. You may also use functions like \code{:} or tidyselect's select-helpers.} \item{.before}{Optional, column name or numeric index of the position where \code{col} should be moved to. If not missing, \code{col} is moved to the position \emph{before} the column indicated by \code{.before}.} \item{.after}{Optional, column name or numeric index of the position where \code{col} should be moved to. If not missing, \code{col} is moved to the position \emph{after} the column indicated by \code{.after}.} } \value{ \code{data}, with resorted columns. } \description{ \code{move_columns()} moves one or more columns in a data frame to another position. } \note{ If neither \code{.before} nor \code{.after} are specified, the column is moved to the end of the data frame by default. \code{.before} and \code{.after} are evaluated in a non-standard fashion, so you need quasi-quotation when the value for \code{.before} or \code{.after} is a vector with the target-column value. See 'Examples'. 
} \examples{ \dontrun{ data(iris) iris \%>\% move_columns(Sepal.Width, .after = "Species") \%>\% head() iris \%>\% move_columns(Sepal.Width, .before = Sepal.Length) \%>\% head() iris \%>\% move_columns(Species, .before = 1) \%>\% head() iris \%>\% move_columns("Species", "Petal.Length", .after = 1) \%>\% head() library(dplyr) iris \%>\% move_columns(contains("Width"), .after = "Species") \%>\% head()} # using quasi-quotation target <- "Petal.Width" # does not work, column is moved to the end iris \%>\% move_columns(Sepal.Width, .after = target) \%>\% head() # using !! works iris \%>\% move_columns(Sepal.Width, .after = !!target) \%>\% head() } sjmisc/man/rec.Rd0000644000176200001440000002425214046746443013376 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rec.R \name{rec} \alias{rec} \alias{rec_if} \title{Recode variables} \usage{ rec( x, ..., rec, as.num = TRUE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_r", to.factor = !as.num ) rec_if( x, predicate, rec, as.num = TRUE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_r", to.factor = !as.num ) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{rec}{String with recode pairs of old and new values. See 'Details' for examples. 
\code{\link{rec_pattern}} is a convenient function to create recode strings for grouping variables.} \item{as.num}{Logical, if \code{TRUE}, return value will be numeric, not a factor.} \item{var.label}{Optional string, to set variable label attribute for the returned variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). If \code{NULL} (default), variable label attribute of \code{x} will be used (if present). If empty, variable label attributes will be removed.} \item{val.labels}{Optional character vector, to set value label attributes of recoded variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). If \code{NULL} (default), no value labels will be set. Value labels can also be directly defined in the \code{rec}-syntax, see 'Details'.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{String value, will be appended to variable (column) names of \code{x}, if \code{x} is a data frame. If \code{x} is not a data frame, this argument will be ignored. 
The default value to suffix column names in a data frame depends on the function call: \itemize{ \item recoded variables (\code{rec()}) will be suffixed with \code{"_r"} \item recoded variables (\code{recode_to()}) will be suffixed with \code{"_r0"} \item dichotomized variables (\code{dicho()}) will be suffixed with \code{"_d"} \item grouped variables (\code{split_var()}) will be suffixed with \code{"_g"} \item grouped variables (\code{group_var()}) will be suffixed with \code{"_gr"} \item standardized variables (\code{std()}) will be suffixed with \code{"_z"} \item centered variables (\code{center()}) will be suffixed with \code{"_c"} } If \code{suffix = ""} and \code{append = TRUE}, existing variables that have been recoded/transformed will be overwritten.} \item{to.factor}{Logical, alias for \code{as.num}. If \code{TRUE}, return value will be a factor, not numeric.} \item{predicate}{A predicate function to be applied to the columns. The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ \code{x} with recoded categories. If \code{x} is a data frame, for \code{append = TRUE}, \code{x} including the recoded variables as new columns is returned; if \code{append = FALSE}, only the recoded variables will be returned. If \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace (overwrite) existing variables. } \description{ \code{rec()} recodes values of variables, where variable selection is based on variable names or column position, or on select helpers (see documentation on \code{...}). \code{rec_if()} is a scoped variant of \code{rec()}, where recoding will be applied only to those variables that match the logical condition of \code{predicate}. } \details{ The \code{rec} string has following syntax: \describe{ \item{recode pairs}{each recode pair has to be separated by a \code{;}, e.g. 
\code{rec = "1=1; 2=4; 3=2; 4=3"}} \item{multiple values}{multiple old values that should be recoded into a new single value may be separated with comma, e.g. \code{"1,2=1; 3,4=2"}} \item{value range}{a value range is indicated by a colon, e.g. \code{"1:4=1; 5:8=2"} (recodes all values from 1 to 4 into 1, and from 5 to 8 into 2)} \item{value range for doubles}{for double vectors (with fractional part), all values within the specified range are recoded; e.g. \code{1:2.5=1;2.6:3=2} recodes 1 to 2.5 into 1 and 2.6 to 3 into 2, but 2.55 would not be recoded (since it's not included in any of the specified ranges)} \item{\code{"min"} and \code{"max"}}{minimum and maximum values are indicates by \emph{min} (or \emph{lo}) and \emph{max} (or \emph{hi}), e.g. \code{"min:4=1; 5:max=2"} (recodes all values from minimum values of \code{x} to 4 into 1, and from 5 to maximum values of \code{x} into 2)} \item{\code{"else"}}{all other values, which have not been specified yet, are indicated by \emph{else}, e.g. \code{"3=1; 1=2; else=3"} (recodes 3 into 1, 1 into 2 and all other values into 3)} \item{\code{"copy"}}{the \code{"else"}-token can be combined with \emph{copy}, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. \code{"3=1; 1=2; else=copy"} (recodes 3 into 1, 1 into 2 and all other values like 2, 4 or 5 etc. will not be recoded, but copied, see 'Examples')} \item{\code{NA}'s}{\code{\link{NA}} values are allowed both as old and new value, e.g. \code{"NA=1; 3:5=NA"} (recodes all NA into 1, and all values from 3 to 5 into NA in the new variable)} \item{\code{"rev"}}{\code{"rev"} is a special token that reverses the value order (see 'Examples')} \item{direct value labelling}{value labels for new values can be assigned inside the recode pattern by writing the value label in square brackets after defining the new value in a recode pair, e.g. 
\code{"15:30=1 [young aged]; 31:55=2 [middle aged]; 56:max=3 [old aged]"}. See 'Examples'.} } } \note{ Please note following behaviours of the function: \itemize{ \item the \code{"else"}-token should always be the last argument in the \code{rec}-string. \item Non-matching values will be set to \code{NA}, unless captured by the \code{"else"}-token. \item Tagged NA values (see \code{\link[haven]{tagged_na}}) and their value labels will be preserved when copying NA values to the recoded vector with \code{"else=copy"}. \item Variable label attributes (see, for instance, \code{\link[sjlabelled]{get_label}}) are preserved (unless changed via \code{var.label}-argument), however, value label attributes are removed (except for \code{"rev"}, where present value labels will be automatically reversed as well). Use \code{val.labels}-argument to add labels for recoded values. \item If \code{x} is a data frame, all variables should have the same categories resp. value range (else, see second bullet, \code{NA}s are produced). } } \examples{ data(efc) table(efc$e42dep, useNA = "always") # replace NA with 5 table(rec(efc$e42dep, rec = "1=1;2=2;3=3;4=4;NA=5"), useNA = "always") # recode 1 to 2 into 1 and 3 to 4 into 2 table(rec(efc$e42dep, rec = "1,2=1; 3,4=2"), useNA = "always") # keep value labels. 
variable label is automatically preserved library(dplyr) efc \%>\% select(e42dep) \%>\% rec(rec = "1,2=1; 3,4=2", val.labels = c("low dependency", "high dependency")) \%>\% frq() # works with mutate efc \%>\% select(e42dep, e17age) \%>\% mutate(dependency_rev = rec(e42dep, rec = "rev")) \%>\% head() # recode 1 to 3 into 1 and 4 into 2 table(rec(efc$e42dep, rec = "min:3=1; 4=2"), useNA = "always") # recode 2 to 1 and all others into 2 table(rec(efc$e42dep, rec = "2=1; else=2"), useNA = "always") # reverse value order table(rec(efc$e42dep, rec = "rev"), useNA = "always") # recode only selected values, copy remaining table(efc$e15relat) table(rec(efc$e15relat, rec = "1,2,4=1; else=copy")) # recode variables with same category in a data frame head(efc[, 6:9]) head(rec(efc[, 6:9], rec = "1=10;2=20;3=30;4=40")) # recode multiple variables and set value labels via recode-syntax dummy <- rec( efc, c160age, e17age, rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]", append = FALSE ) frq(dummy) # recode variables with same value-range lapply( rec( efc, c82cop1, c83cop2, c84cop3, rec = "1,2=1; NA=9; else=copy", append = FALSE ), table, useNA = "always" ) # recode character vector dummy <- c("M", "F", "F", "X") rec(dummy, rec = "M=Male; F=Female; X=Refused") # recode numeric to character rec(efc$e42dep, rec = "1=first;2=2nd;3=third;else=hi") \%>\% head() # recode non-numeric factors data(iris) table(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE)) # recode floating points table(rec( iris, Sepal.Length, rec = "lo:5=1;5.01:6.5=2;6.501:max=3", append = FALSE )) # preserve tagged NAs if (require("haven")) { x <- labelled(c(1:3, tagged_na("a", "c", "z"), 4:1), c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), "Refused" = tagged_na("a"), "Not home" = tagged_na("z"))) # get current value labels x # recode 2 into 5; Values of tagged NAs are preserved rec(x, rec = "2=5;else=copy") } # use select-helpers from dplyr-package out <- rec( efc, 
contains("cop"), c161sex:c175empl, rec = "0,1=0; else=1", append = FALSE ) head(out) # recode only variables that have a value range from 1-4 p <- function(x) min(x, na.rm = TRUE) > 0 && max(x, na.rm = TRUE) < 5 out <- rec_if(efc, predicate = p, rec = "1:3=1;4=2;else=copy") head(out) } \seealso{ \code{\link[sjlabelled]{set_na}} for setting \code{NA} values, \code{\link{replace_na}} to replace \code{NA}'s with specific value, \code{\link{recode_to}} for re-shifting value ranges and \code{\link{ref_lvl}} to change the reference level of (numeric) factors. } sjmisc/man/group_var.Rd0000644000176200001440000001511314046746443014625 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group_var.R \name{group_var} \alias{group_var} \alias{group_var_if} \alias{group_labels} \alias{group_labels_if} \title{Recode numeric variables into equal-ranged groups} \usage{ group_var( x, ..., size = 5, as.num = TRUE, right.interval = FALSE, n = 30, append = TRUE, suffix = "_gr" ) group_var_if( x, predicate, size = 5, as.num = TRUE, right.interval = FALSE, n = 30, append = TRUE, suffix = "_gr" ) group_labels(x, ..., size = 5, right.interval = FALSE, n = 30) group_labels_if(x, predicate, size = 5, right.interval = FALSE, n = 30) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{size}{Numeric; group-size, i.e. the range for grouping. By default, for each 5 categories of \code{x} a new group is defined, i.e. \code{size = 5}. Use \code{size = "auto"} to automatically resize a variable into a maximum of 30 groups (which is the ggplot-default grouping when plotting histograms). 
Use \code{n} to determine the amount of groups.} \item{as.num}{Logical, if \code{TRUE}, return value will be numeric, not a factor.} \item{right.interval}{Logical; if \code{TRUE}, grouping starts with the lower bound of \code{size}. See 'Details'.} \item{n}{Sets the maximum number of groups that are defined when auto-grouping is on (\code{size = "auto"}). Default is 30. If \code{size} is not set to \code{"auto"}, this argument will be ignored.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} \item{predicate}{A predicate function to be applied to the columns. The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ \itemize{ \item For \code{group_var()}, a grouped variable, either as numeric or as factor (see paramter \code{as.num}). If \code{x} is a data frame, only the grouped variables will be returned. \item For \code{group_labels()}, a string vector or a list of string vectors containing labels based on the grouped categories of \code{x}, formatted as "from lower bound to upper bound", e.g. \code{"10-19" "20-29" "30-39"} etc. See 'Examples'. } } \description{ Recode numeric variables into equal ranged, grouped factors, i.e. a variable is cut into a smaller number of groups, where each group has the same value range. \code{group_labels()} creates the related value labels. \code{group_var_if()} and \code{group_labels_if()} are scoped variants of \code{group_var()} and \code{group_labels()}, where grouping will be applied only to those variables that match the logical condition of \code{predicate}. 
} \details{ If \code{size} is set to a specific value, the variable is recoded into several groups, where each group has a maximum range of \code{size}. Hence, the amount of groups differ depending on the range of \code{x}. \cr \cr If \code{size = "auto"}, the variable is recoded into a maximum of \code{n} groups. Hence, independent from the range of \code{x}, always the same amount of groups are created, so the range within each group differs (depending on \code{x}'s range). \cr \cr \code{right.interval} determins which boundary values to include when grouping is done. If \code{TRUE}, grouping starts with the \strong{lower bound} of \code{size}. For example, having a variable ranging from 50 to 80, groups cover the ranges from 50-54, 55-59, 60-64 etc. If \code{FALSE} (default), grouping starts with the \code{upper bound} of \code{size}. In this case, groups cover the ranges from 46-50, 51-55, 56-60, 61-65 etc. \strong{Note:} This will cover a range from 46-50 as first group, even if values from 46 to 49 are not present. See 'Examples'. \cr \cr If you want to split a variable into a certain amount of equal sized groups (instead of having groups where values have all the same range), use the \code{\link{split_var}} function! \cr \cr \code{group_var()} also works on grouped data frames (see \code{\link[dplyr]{group_by}}). In this case, grouping is applied to the subsets of variables in \code{x}. See 'Examples'. } \note{ Variable label attributes (see, for instance, \code{\link[sjlabelled]{set_label}}) are preserved. Usually you should use the same values for \code{size} and \code{right.interval} in \code{group_labels()} as used in the \code{group_var} function if you want matching labels for the related recoded variable. 
} \examples{ age <- abs(round(rnorm(100, 65, 20))) age.grp <- group_var(age, size = 10) hist(age) hist(age.grp) age.grpvar <- group_labels(age, size = 10) table(age.grp) print(age.grpvar) # histogram with EUROFAMCARE sample dataset # variable not grouped library(sjlabelled) data(efc) hist(efc$e17age, main = get_label(efc$e17age)) # bar plot with EUROFAMCARE sample dataset # grouped variable ageGrp <- group_var(efc$e17age) ageGrpLab <- group_labels(efc$e17age) barplot(table(ageGrp), main = get_label(efc$e17age), names.arg = ageGrpLab) # within a pipe-chain library(dplyr) efc \%>\% select(e17age, c12hour, c160age) \%>\% group_var(size = 20) # create vector with values from 50 to 80 dummy <- round(runif(200, 50, 80)) # labels with grouping starting at lower bound group_labels(dummy) # labels with grouping startint at upper bound group_labels(dummy, right.interval = TRUE) # works also with gouped data frames mtcars \%>\% group_var(disp, size = 4, append = FALSE) \%>\% table() mtcars \%>\% group_by(cyl) \%>\% group_var(disp, size = 4, append = FALSE) \%>\% table() } \seealso{ \code{\link{split_var}} to split variables into equal sized groups, \code{\link{group_str}} for grouping string vectors or \code{\link{rec_pattern}} and \code{\link{rec}} for another convenient way of recoding variables into smaller groups. 
} sjmisc/man/de_mean.Rd0000644000176200001440000000616014046746443014213 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/de_mean.R \name{de_mean} \alias{de_mean} \title{Compute group-meaned and de-meaned variables} \usage{ de_mean(x, ..., grp, append = TRUE, suffix.dm = "_dm", suffix.gm = "_gm") } \arguments{ \item{x}{A data frame.} \item{...}{Names of variables that should be group- and de-meaned.} \item{grp}{Quoted or unquoted name of the variable that indicates the group- or cluster-ID.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix.dm, suffix.gm}{String value, will be appended to the names of the group-meaned and de-meaned variables of \code{x}. By default, de-meaned variables will be suffixed with \code{"_dm"} and grouped-meaned variables with \code{"_gm"}.} } \value{ For \code{append = TRUE}, \code{x} including the group-/de-meaned variables as new columns is returned; if \code{append = FALSE}, only the group-/de-meaned variables will be returned. } \description{ \code{de_mean()} computes group- and de-meaned versions of a variable that can be used in regression analysis to model the between- and within-subject effect. } \details{ \code{de_mean()} is intended to create group- and de-meaned variables for complex random-effect-within-between models (see \cite{Bell et al. 2018}), where group-effects (random effects) and fixed effects correlate (see \cite{Bafumi and Gelman 2006)}). This violation of one of the \emph{Gauss-Markov-assumptions} can happen, for instance, when analysing panel data. To control for correlating predictors and group effects, it is recommended to include the group-meaned and de-meaned version of \emph{time-varying covariates} in the model. 
By this, one can fit complex multilevel models for panel data, including time-varying, time-invariant predictors and random effects. This approach is superior to simple fixed-effects models, which lack information of variation in the group-effects or between-subject effects. \cr \cr A description of how to translate the formulas described in \emph{Bell et al. 2018} into R using \code{lmer()} from \pkg{lme4} or \code{glmmTMB()} from \pkg{glmmTMB} can be found here: \href{https://strengejacke.github.io/mixed-models-snippets/random-effects-within-between-effects-model.html}{for lmer()} and \href{https://strengejacke.github.io/mixed-models-snippets/random-effects-within-between-effects-model-glmmtmb.html}{for glmmTMB()}. } \examples{ data(efc) efc$ID <- sample(1:4, nrow(efc), replace = TRUE) # fake-ID de_mean(efc, c12hour, barthtot, grp = ID, append = FALSE) } \references{ Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the American Political Science Association. \cr \cr Bell A, Fairbrother M, Jones K. 2018. Fixed and Random Effects Models: Making an Informed Choice. Quality & Quantity. \doi{10.1007/s11135-018-0802-x} } sjmisc/man/str_find.Rd0000644000176200001440000000733714046746443014442 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/str_pos.R \name{str_find} \alias{str_find} \title{Find partial matching and close distance elements in strings} \usage{ str_find(string, pattern, precision = 2, partial = 0, verbose = FALSE) } \arguments{ \item{string}{Character vector with string elements.} \item{pattern}{String that should be matched against the elements of \code{string}.} \item{precision}{Maximum distance ("precision") between two string elements, which is allowed to treat them as similar or equal. 
Smaller values mean less tolerance in matching.} \item{partial}{Activates similar matching (close distance strings) for parts (substrings) of the \code{string}. Following values are accepted: \itemize{ \item 0 for no partial distance matching \item 1 for one-step matching, which means, only substrings of same length as \code{pattern} are extracted from \code{string} matching \item 2 for two-step matching, which means, substrings of same length as \code{pattern} as well as strings with a slightly wider range are extracted from \code{string} matching } Default value is 0. See 'Details' for more information.} \item{verbose}{Logical; if \code{TRUE}, the progress bar is displayed when computing the distance matrix. Default in \code{FALSE}, hence the bar is hidden.} } \value{ A numeric vector with index position of elements in \code{string} that partially match or are similar to \code{pattern}. Returns \code{-1} if no match was found. } \description{ This function finds the element indices of partial matching or similar strings in a character vector. Can be used to find exact or slightly mistyped elements in a string vector. } \details{ \strong{Computation Details} \cr \cr Fuzzy string matching is based on regular expressions, in particular \code{grep(pattern = "(){~}", x = string)}. This means, \code{precision} indicates the number of chars inside \code{pattern} that may differ in \code{string} to cosinder it as "matching". The higher \code{precision} is, the more tolerant is the search (i.e. yielding more possible matches). Furthermore, the higher the value for \code{partial} is, the more matches may be found. \cr \cr \strong{Partial Distance Matching} \cr \cr For \code{partial = 1}, a substring of \code{length(pattern)} is extracted from \code{string}, starting at position 0 in \code{string} until the end of \code{string} is reached. Each substring is matched against \code{pattern}, and results with a maximum distance of \code{precision} are considered as "matching". 
If \code{partial = 2}, the range of the extracted substring is increased by 2, i.e. the extracted substring is two chars longer and so on. } \note{ This function does \emph{not} return the position of a matching string \emph{inside} another string, but the element's index of the \code{string} vector, where a (partial) match with \code{pattern} was found. Thus, searching for "abc" in a string "this is abc" will not return 9 (the start position of the substring), but 1 (the element index, which is always 1 if \code{string} only has one element). } \examples{ string <- c("Hello", "Helo", "Hole", "Apple", "Ape", "New", "Old", "System", "Systemic") str_find(string, "hel") # partial match str_find(string, "stem") # partial match str_find(string, "R") # no match str_find(string, "saste") # similarity to "System" # finds two indices, because partial matching now # also applies to "Systemic" str_find(string, "sytsme", partial = 1) # finds partial matching of similarity str_find("We are Sex Pistols!", "postils") } \seealso{ \code{\link{group_str}} } sjmisc/man/rotate_df.Rd0000644000176200001440000000264114046746443014572 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rotate_df.R \name{rotate_df} \alias{rotate_df} \title{Rotate a data frame} \usage{ rotate_df(x, rn = NULL, cn = FALSE) } \arguments{ \item{x}{A data frame.} \item{rn}{Character vector (optional). If not \code{NULL}, the data frame's rownames will be added as (first) column to the output, with \code{rn} being the name of this column.} \item{cn}{Logical (optional), if \code{TRUE}, the values of the first column in \code{x} will be used as column names in the rotated data frame.} } \value{ A (rotated) data frame. } \description{ This function rotates a data frame, i.e. columns become rows and vice versa. } \examples{ x <- mtcars[1:3, 1:4] rotate_df(x) rotate_df(x, rn = "property") # use values in 1. 
column as column name rotate_df(x, cn = TRUE) rotate_df(x, rn = "property", cn = TRUE) # also works on list-results library(purrr) dat <- mtcars[1:3, 1:4] tmp <- purrr::map(dat, function(x) { sdev <- stats::sd(x, na.rm = TRUE) ulsdev <- mean(x, na.rm = TRUE) + c(-sdev, sdev) names(ulsdev) <- c("lower_sd", "upper_sd") ulsdev }) tmp as.data.frame(tmp) rotate_df(tmp) tmp <- purrr::map_df(dat, function(x) { sdev <- stats::sd(x, na.rm = TRUE) ulsdev <- mean(x, na.rm = TRUE) + c(-sdev, sdev) names(ulsdev) <- c("lower_sd", "upper_sd") ulsdev }) tmp rotate_df(tmp) } sjmisc/man/efc.Rd0000644000176200001440000000063213641627014013346 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/efc.R \docType{data} \name{efc} \alias{efc} \title{Sample dataset from the EUROFAMCARE project} \description{ A SPSS sample data set, imported with the \code{\link[sjlabelled]{read_spss}} function. } \examples{ # Attach EFC-data data(efc) # Show structure str(efc) # show first rows head(efc) } \keyword{data} sjmisc/man/add_variables.Rd0000644000176200001440000000373414046746443015407 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/add_cases.R \name{add_variables} \alias{add_variables} \alias{add_case} \title{Add variables or cases to data frames} \usage{ add_variables(data, ..., .after = Inf, .before = NULL) add_case(data, ..., .after = Inf, .before = NULL) } \arguments{ \item{data}{A data frame.} \item{...}{One or more named vectors that indicate the variables or values, which will be added as new column or row to \code{data}. For \code{add_case()}, non-matching columns in \code{data} will be filled with \code{NA}.} \item{.after, .before}{Numerical index of row or column, where after or before the new variable or case should be added. If \code{.after = -1}, variables or cases are added at the beginning; if \code{.after = Inf}, variables and cases are added at the end. 
In case of \code{add_variables()}, \code{.after} and \code{.before} may also be a character name indicating the column in \code{data}, after or infront of what \code{...} should be inserted.} } \value{ \code{data}, including the new variables or cases from \code{...}. } \description{ \code{add_variables()} adds a new column to a data frame, while \code{add_case()} adds a new row to a data frame. These are convenient functions to add columns or rows not only at the end of a data frame, but at any column or row position. Furthermore, they allow easy integration into a pipe-workflow. } \note{ For \code{add_case()}, if variable does not exist, a new variable is created and existing cases for this new variable get the value \code{NA}. See 'Examples'. } \examples{ d <- data.frame( a = c(1, 2, 3), b = c("a", "b", "c"), c = c(10, 20, 30), stringsAsFactors = FALSE ) add_case(d, b = "d") add_case(d, b = "d", a = 5, .before = 1) # adding a new case for a new variable add_case(d, e = "new case") add_variables(d, new = 5) add_variables(d, new = c(4, 4, 4), new2 = c(5, 5, 5), .after = "b") } sjmisc/man/is_num_fac.Rd0000644000176200001440000000201313567234676014726 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_num_fac.R \name{is_num_fac} \alias{is_num_fac} \alias{is_num_chr} \title{Check whether a factor has numeric levels only} \usage{ is_num_fac(x) is_num_chr(x) } \arguments{ \item{x}{A factor for \code{is_num_fac()} and a character vector for \code{is_num_chr()}} } \value{ Logical, \code{TRUE} if factor has numeric factor levels only, or if character vector has numeric strings only, \code{FALSE} otherwise. } \description{ \code{is_num_fac()} checks whether a factor has only numeric or any non-numeric factor levels, while \code{is_num_chr()} checks whether a character vector has only numeric strings. 
} \examples{ # numeric factor levels f1 <- factor(c(NA, 1, 3, NA, 2, 4)) is_num_fac(f1) # not completeley numeric factor levels f2 <- factor(c(NA, "C", 1, 3, "A", NA, 2, 4)) is_num_fac(f2) # not completeley numeric factor levels f3 <- factor(c("Justus", "Bob", "Peter")) is_num_fac(f3) is_num_chr(c("a", "1")) is_num_chr(c("2", "1")) } sjmisc/man/str_start.Rd0000644000176200001440000000402114046746443014642 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/str_start_end.R \name{str_start} \alias{str_start} \alias{str_end} \title{Find start and end index of pattern in string} \usage{ str_start(x, pattern, ignore.case = TRUE, regex = FALSE) str_end(x, pattern, ignore.case = TRUE, regex = FALSE) } \arguments{ \item{x}{A character vector.} \item{pattern}{Character string to be matched in \code{x}. \code{pattern} might also be a regular-expression object, as returned by \code{stringr::regex()}. Alternatively, use \code{regex = TRUE} to treat \code{pattern} as a regular expression rather than a fixed string.} \item{ignore.case}{Logical, whether matching should be case sensitive or not. \code{ignore.case} is ignored when \code{pattern} is no regular expression or \code{regex = FALSE}.} \item{regex}{Logical, if \code{TRUE}, \code{pattern} is treated as a regular expression rather than a fixed string.} } \value{ A numeric vector with index of start/end position(s) of \code{pattern} found in \code{x}, or \code{-1}, if \code{pattern} was not found in \code{x}. } \description{ \code{str_start()} finds the beginning position of \code{pattern} in each element of \code{x}, while \code{str_end()} finds the stopping position of \code{pattern} in each element of \code{x}. 
} \examples{ path <- "this/is/my/fileofinterest.csv" str_start(path, "/") path <- "this//is//my//fileofinterest.csv" str_start(path, "//") str_end(path, "//") x <- c("my_friend_likes me", "your_friend likes_you") str_start(x, "_") # pattern "likes" starts at position 11 in first, and # position 13 in second string str_start(x, "likes") # pattern "likes" ends at position 15 in first, and # position 17 in second string str_end(x, "likes") x <- c("I like to move it, move it", "You like to move it") str_start(x, "move") str_end(x, "move") x <- c("test1234testagain") str_start(x, "\\\\d+4") str_start(x, "\\\\d+4", regex = TRUE) str_end(x, "\\\\d+4", regex = TRUE) } sjmisc/man/frq.Rd0000644000176200001440000001550514153357271013412 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/frq.R \name{frq} \alias{frq} \title{Frequency table of labelled variables} \usage{ frq( x, ..., sort.frq = c("none", "asc", "desc"), weights = NULL, auto.grp = NULL, show.strings = TRUE, show.na = TRUE, grp.strings = NULL, min.frq = 0, out = c("txt", "viewer", "browser"), title = NULL, encoding = "UTF-8", file = NULL ) } \arguments{ \item{x}{A vector or a data frame. May also be a grouped data frame (see 'Note' and 'Examples').} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{sort.frq}{Determines whether categories should be sorted according to their frequencies or not. Default is \code{"none"}, so categories are not sorted by frequency. 
Use \code{"asc"} or \code{"desc"} for sorting categories ascending or descending order.} \item{weights}{Bare name, or name as string, of a variable in \code{x} that indicates the vector of weights, which will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} \item{auto.grp}{Numeric value, indicating the minimum amount of unique values in a variable, at which automatic grouping into smaller units is done (see \code{\link{group_var}}). Default value for \code{auto.group} is \code{NULL}, i.e. auto-grouping is off.} \item{show.strings}{Logical, if \code{TRUE}, frequency tables for character vectors will not be printed. This is useful when printing frequency tables of all variables from a data frame, and due to computational reasons character vectors should not be printed.} \item{show.na}{Logical, or \code{"auto"}. If \code{TRUE}, the output always contains information on missing values, even if variables have no missing values. If \code{FALSE}, information on missing values are removed from the output. If \code{show.na = "auto"}, information on missing values is only shown when variables actually have missing values, else it's not shown.} \item{grp.strings}{Numeric, if not \code{NULL}, groups string values in character vectors, based on their similarity. See \code{\link{group_str}} and \code{\link{str_find}} for details on grouping, and their \code{precision}-argument to get more details on the distance of strings to be treated as equal.} \item{min.frq}{Numeric, indicating the minimum frequency for which a value will be shown in the output (except for the missing values, prevailing \code{show.na}). Default value for \code{min.frq} is \code{0}, so all value frequencies are shown. 
All values or categories that have less than \code{min.frq} occurences in the data will be summarized in a \code{"n < 100"} category.} \item{out}{Character vector, indicating whether the results should be printed to console (\code{out = "txt"}) or as HTML-table in the viewer-pane (\code{out = "viewer"}) or browser (\code{out = "browser"}).} \item{title}{String, will be used as alternative title to the variable label. If \code{x} is a grouped data frame, \code{title} must be a vector of same length as groups.} \item{encoding}{Character vector, indicating the charset encoding used for variable and value labels. Default is \code{"UTF-8"}. Only used when \code{out} is not \code{"txt"}.} \item{file}{Destination file, if the output should be saved as file. Only used when \code{out} is not \code{"txt"}.} } \value{ A list of data frames with values, value labels, frequencies, raw, valid and cumulative percentages of \code{x}. } \description{ This function returns a frequency table of labelled vectors, as data frame. } \details{ The \dots-argument not only accepts variable names or expressions from select-helpers. You can also use logical conditions, math operations, or combining variables to produce "crosstables". See 'Examples' for more details. } \note{ \code{x} may also be a grouped data frame (see \code{\link[dplyr]{group_by}}) with up to two grouping variables. Frequency tables are created for each subgroup then. \cr \cr The \code{print()}-method adds a table header with information on the variable label, variable type, total and valid N, and mean and standard deviations. Mean and SD are \emph{always} printed, even for categorical variables (factors) or character vectors. In this case, values are coerced into numeric vector to calculate the summary statistics. \cr \cr To print tables in markdown or HTML format, use \code{print_md()} or \code{print_html()}. 
} \examples{ # simple vector data(efc) frq(efc$e42dep) # with grouped data frames, in a pipe library(dplyr) efc \%>\% group_by(e16sex, c172code) \%>\% frq(e42dep) # show only categories with a minimal amount of frequencies frq(mtcars$gear) frq(mtcars$gear, min.frq = 10) frq(mtcars$gear, min.frq = 15) # with select-helpers: all variables from the COPE-Index # (which all have a "cop" in their name) frq(efc, contains("cop")) # all variables from column "c161sex" to column "c175empl" frq(efc, c161sex:c175empl) # for non-labelled data, variable name is printed, # and "label" column is removed from output data(iris) frq(iris, Species) # also works on grouped data frames efc \%>\% group_by(c172code) \%>\% frq(is.na(nur_pst)) # group variables with large range and with weights efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) frq(efc, c160age, auto.grp = 5, weights = weights) # different weight options frq(efc, c172code, weights = weights) frq(efc, c172code, weights = "weights") frq(efc, c172code, weights = efc$weights) frq(efc$c172code, weights = efc$weights) # group string values dummy <- efc[1:50, 3, drop = FALSE] dummy$words <- sample( c("Hello", "Helo", "Hole", "Apple", "Ape", "New", "Old", "System", "Systemic"), size = nrow(dummy), replace = TRUE ) frq(dummy) frq(dummy, grp.strings = 2) #### other expressions than variables # logical conditions frq(mtcars, cyl ==6) frq(efc, is.na(nur_pst), contains("cop")) iris \%>\% frq(starts_with("Petal"), Sepal.Length > 5) # computation of variables "on the fly" frq(mtcars, (gear + carb) / cyl) # crosstables set.seed(123) d <- data.frame( var_x = sample(letters[1:3], size = 30, replace = TRUE), var_y = sample(1:2, size = 30, replace = TRUE), var_z = sample(LETTERS[8:10], size = 30, replace = TRUE) ) table(d$var_x, d$var_z) frq(d, paste0(var_x, var_z)) frq(d, paste0(var_x, var_y, var_z)) } \seealso{ \code{\link{flat_table}} for labelled (proportional) tables. 
} sjmisc/man/reshape_longer.Rd0000644000176200001440000000543314046746443015622 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reshape_longer.R \name{reshape_longer} \alias{reshape_longer} \title{Reshape data into long format} \usage{ reshape_longer( x, columns = colnames(x), names.to = "key", values.to = "value", labels = NULL, numeric.timevar = FALSE, id = ".id" ) } \arguments{ \item{x}{A data frame.} \item{columns}{Names of variables (as character vector), or column index of variables, that should be reshaped. If multiple column groups should be reshaped, use a list of vectors (see 'Examples').} \item{names.to}{Character vector with name(s) of key column(s) to create in output. Either one name per column group that should be gathered, or a single string. In the latter case, this name will be used as key column, and only one key column is created.} \item{values.to}{Character vector with names of value columns (variable names) to create in output. Must be of same length as number of column groups that should be gathered. See 'Examples'.} \item{labels}{Character vector of same length as \code{values.to} with variable labels for the new variables created from gathered columns. See 'Examples'.} \item{numeric.timevar}{Logical, if \code{TRUE}, the values of the \code{names.to} column will be recoded to numeric values, in sequential ascending order.} \item{id}{Name of ID-variable.} } \value{ A reshaped data frame. } \description{ \code{reshape_longer()} reshapes one or more columns from wide into long format. 
} \examples{ # Reshape one column group into long format mydat <- data.frame( age = c(20, 30, 40), sex = c("Female", "Male", "Male"), score_t1 = c(30, 35, 32), score_t2 = c(33, 34, 37), score_t3 = c(36, 35, 38) ) reshape_longer( mydat, columns = c("score_t1", "score_t2", "score_t3"), names.to = "time", values.to = "score" ) # Reshape multiple column groups into long format mydat <- data.frame( age = c(20, 30, 40), sex = c("Female", "Male", "Male"), score_t1 = c(30, 35, 32), score_t2 = c(33, 34, 37), score_t3 = c(36, 35, 38), speed_t1 = c(2, 3, 1), speed_t2 = c(3, 4, 5), speed_t3 = c(1, 8, 6) ) reshape_longer( mydat, columns = list( c("score_t1", "score_t2", "score_t3"), c("speed_t1", "speed_t2", "speed_t3") ), names.to = "time", values.to = c("score", "speed") ) # or ... reshape_longer( mydat, list(3:5, 6:8), names.to = "time", values.to = c("score", "speed") ) # gather multiple columns, label columns x <- reshape_longer( mydat, list(3:5, 6:8), names.to = "time", values.to = c("score", "speed"), labels = c("Test Score", "Time needed to finish") ) library(sjlabelled) str(x$score) get_label(x$speed) } \seealso{ \code{\link{to_long}} } sjmisc/man/word_wrap.Rd0000644000176200001440000000220314046746443014621 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/word_wrap.R \name{word_wrap} \alias{word_wrap} \title{Insert line breaks in long labels} \usage{ word_wrap(labels, wrap, linesep = NULL) } \arguments{ \item{labels}{Label(s) as character string, where a line break should be inserted. Several strings may be passed as vector (see 'Examples').} \item{wrap}{Maximum amount of chars per line (i.e. line length). If \code{wrap = Inf} or \code{wrap = 0}, no word wrap will be performed (i.e. \code{labels} will be returned as is).} \item{linesep}{By default, this argument is \code{NULL} and a regular new line string (\code{"\\n"}) is used. For HTML-purposes, for instance, \code{linesep} could be \code{"
"}.} } \value{ New label(s) with line breaks inserted at every \code{wrap}'s position. } \description{ Insert line breaks in long character strings. Useful if you want to wordwrap labels / titles for plots or tables. } \examples{ word_wrap(c("A very long string", "And another even longer string!"), 10) message(word_wrap("Much too long string for just one line!", 15)) } sjmisc/man/seq_col.Rd0000644000176200001440000000106013567234676014251 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/seq_col.R \name{seq_col} \alias{seq_col} \alias{seq_row} \title{Sequence generation for column or row counts of data frames} \usage{ seq_col(x) seq_row(x) } \arguments{ \item{x}{A data frame.} } \value{ A numeric sequence from 1 to number of columns or rows. } \description{ \code{seq_col(x)} is a convenient wrapper for \code{seq_len(ncol(x))}, while \code{seq_row(x)} is a convenient wrapper for \code{seq_len(nrow(x))}. } \examples{ data(iris) seq_col(iris) seq_row(iris) } sjmisc/man/merge_imputations.Rd0000644000176200001440000001047613627417615016363 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/merge_imputations.R \name{merge_imputations} \alias{merge_imputations} \title{Merges multiple imputed data frames into a single data frame} \usage{ merge_imputations( dat, imp, ori = NULL, summary = c("none", "dens", "hist", "sd"), filter = NULL ) } \arguments{ \item{dat}{The data frame that was imputed and used as argument in the \code{\link[mice]{mice}}-function call.} \item{imp}{The \code{\link[mice:mids-class]{mice::mids()}}-object with the imputed data frames from \code{dat}.} \item{ori}{Optional, if \code{ori} is specified, the imputed variables are appended to this data frame; else, a new data frame with the imputed variables is returned.} \item{summary}{After merging multiple imputed data, \code{summary} displays a graphical summary of the "quality" of the merged values, compared to the 
original imputed values. \describe{ \item{\code{"dens"}}{ Creates a density plot, which shows the distribution of the mean of the imputed values for each variable at each observation. The larger the areas overlap, the better is the fit of the merged value compared to the imputed value. } \item{\code{"hist"}}{ Similar to \code{summary = "dens"}, however, mean and merged values are shown as histogram. Bins should have almost equal height for both groups (mean and merged). } \item{\code{"sd"}}{ Creates a dot plot, where data points indicate the standard deviation for all imputed values (y-axis) at each merged value (x-axis) for all imputed variables. The higher the standard deviation, the less precise is the imputation, and hence the merged value. } }} \item{filter}{A character vector with variable names that should be plotted. All non-defined variables will not be shown in the plot.} } \value{ A data frame with (merged) imputed variables; or \code{ori} with appended imputed variables, if \code{ori} was specified. If \code{summary} is included, returns a list with the data frame \code{data} with (merged) imputed variables and some other summary information, including the \code{plot} as ggplot-object. } \description{ This function merges multiple imputed data frames from \code{\link[mice:mids-class]{mice::mids()}}-objects into a single data frame by computing the mean or selecting the most likely imputed value. } \details{ This method merges multiple imputations of variables into a single variable by computing the (rounded) mean of all imputed values of missing values. By this, each missing value is replaced by those values that have been imputed the most times. \cr \cr \code{imp} must be a \code{mids}-object, which is returned by the \code{mice()}-function of the \pkg{mice}-package. 
\code{merge_imputations()} than creates a data frame for each imputed variable, by combining all imputations (as returned by the \code{\link[mice]{complete}}-function) of each variable, and computing the row means of this data frame. The mean value is then rounded for integer values (and not for numerical values with fractional part), which corresponds to the most frequent imputed value (mode) for a missing value. Missings in the original variable are replaced by the most frequent imputed value. } \note{ Typically, further analyses are conducted on pooled results of multiple imputed data sets (see \code{\link[mice]{pool}}), however, sometimes (in social sciences) it is also feasible to compute the mean or mode of multiple imputed variables (see \cite{Burns et al. 2011}). } \examples{ if (require("mice")) { imp <- mice(nhanes) # return data frame with imputed variables merge_imputations(nhanes, imp) # append imputed variables to original data frame merge_imputations(nhanes, imp, nhanes) # show summary of quality of merging imputations merge_imputations(nhanes, imp, summary = "dens", filter = c("chl", "hyp")) } } \references{ Burns RA, Butterworth P, Kiely KM, Bielak AAM, Luszcz MA, Mitchell P, et al. 2011. Multiple imputation was an efficient method for harmonizing the Mini-Mental State Examination with missing item-level data. Journal of Clinical Epidemiology;64:787-93 \doi{10.1016/j.jclinepi.2010.10.011} } sjmisc/man/sjmisc-package.Rd0000644000176200001440000000370013451124270015464 0ustar liggesusers\encoding{UTF-8} \name{sjmisc-package} \alias{sjmisc-package} \alias{sjmisc} \docType{package} \title{Data and Variable Transformation Functions} \description{ \strong{Purpose of this package} Collection of miscellaneous utility functions, supporting data transformation tasks like recoding, dichotomizing or grouping variables, setting and replacing missing values. 
The data transformation functions also support labelled data, and all integrate seamlessly into a 'tidyverse'-workflow. \strong{Design philosophy - consistent api} The design of this package follows, where appropriate, the \emph{tidyverse-approach}, with the first argument of a function always being the data (either a data frame or vector), followed by variable names that should be processed by the function. If no variables are specified as argument, the function applies to the complete data that was indicated as first function argument. There are two types of function designs: \describe{ \item{\emph{transformation/recoding functions}}{ Functions like \code{rec()} or \code{dicho()}, which transform or recode variables, typically return the complete data frame that was given as first argument, \emph{additionally including} the transformed and recoded variables specified in the \code{...}-ellipses argument. The variables usually get a suffix, so original variables are preserved in the data. } \item{\emph{coercing/converting functions}}{ Functions like \code{to_factor()} or \code{to_label()}, which convert variables into other types or add additional information like variable or value labels as attribute, also typically return the complete data frame that was given as first argument. However, the variables specified in the \code{...}-ellipses argument are converted ("overwritten"), all other variables remain unchanged. Hence, these functions do not return any new, additional variables. 
} } } \author{ Daniel Lüdecke \email{d.luedecke@uke.de} } sjmisc/man/grapes-nin-grapes.Rd0000644000176200001440000000151213567234676016150 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/notin.R \name{\%nin\%} \alias{\%nin\%} \title{Value matching} \usage{ x \%nin\% y } \arguments{ \item{x}{Vector with values to be matched.} \item{y}{Vector with values to be matched against.} } \value{ A logical vector, indicating if a match was \emph{not} located for each element of \code{x}, thus the values are \code{TRUE} or \code{FALSE} and never \code{NA}. } \description{ \%nin\% is the complement to \%in\%. It looks which values in \code{x} do \emph{not} match (hence, are \emph{not in}) values in \code{y}. } \details{ See 'Details' in \code{\link{match}}. } \examples{ c("a", "B", "c") \%in\% letters c("a", "B", "c") \%nin\% letters c(1, 2, 3, 4) \%in\% c(3, 4, 5, 6) c(1, 2, 3, 4) \%nin\% c(3, 4, 5, 6) } sjmisc/man/var_type.Rd0000644000176200001440000000250114046746443014447 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/var_type.R \name{var_type} \alias{var_type} \title{Determine variable type} \usage{ var_type(x, ..., abbr = FALSE) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{abbr}{Logical, if \code{TRUE}, returns a shortened, abbreviated value for the variable type (as returned by \code{pillar::type_sum()}). If \code{FALSE} (default), a longer "description" is returned.} } \value{ The variable type of \code{x}, as character. } \description{ This function returns the type of a variable as character. 
It is similar to \code{pillar::type_sum()}, however, the return value is not truncated, and \code{var_type()} works on data frames and within pipe-chains. } \examples{ data(efc) var_type(1) var_type(1L) var_type("a") var_type(efc$e42dep) var_type(to_factor(efc$e42dep)) library(dplyr) var_type(efc, contains("cop")) } sjmisc/man/split_var.Rd0000644000176200001440000001274514046746443014634 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/split_var.R \name{split_var} \alias{split_var} \alias{split_var_if} \title{Split numeric variables into smaller groups} \usage{ split_var( x, ..., n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g" ) split_var_if( x, predicate, n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g" ) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{n}{The new number of groups that \code{x} should be split into.} \item{as.num}{Logical, if \code{TRUE}, return value will be numeric, not a factor.} \item{val.labels}{Optional character vector, to set value label attributes of recoded variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). If \code{NULL} (default), no value labels will be set. 
Value labels can also be directly defined in the \code{rec}-syntax, see 'Details'.} \item{var.label}{Optional string, to set variable label attribute for the returned variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). If \code{NULL} (default), variable label attribute of \code{x} will be used (if present). If empty, variable label attributes will be removed.} \item{inclusive}{Logical; if \code{TRUE}, cut point value are included in the preceding group. This may be necessary if cutting a vector into groups does not define proper ("equal sized") group sizes. See 'Note' and 'Examples'.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} \item{predicate}{A predicate function to be applied to the columns. The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ A grouped variable with equal sized groups. If \code{x} is a data frame, for \code{append = TRUE}, \code{x} including the grouped variables as new columns is returned; if \code{append = FALSE}, only the grouped variables will be returned. If \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace (overwrite) existing variables. } \description{ Recode numeric variables into equal sized groups, i.e. a variable is cut into a smaller number of groups at specific cut points. 
\code{split_var_if()} is a scoped variant of \code{split_var()}, where transformation will be applied only to those variables that match the logical condition of \code{predicate}. } \details{ \code{split_var()} splits a variable into equal sized groups, where the amount of groups depends on the \code{n}-argument. Thus, this functions \code{\link{cut}s} a variable into groups at the specified \code{\link[stats]{quantile}s}. \cr \cr By contrast, \code{\link{group_var}} recodes a variable into groups, where groups have the same value range (e.g., from 1-5, 6-10, 11-15 etc.). \cr \cr \code{split_var()} also works on grouped data frames (see \code{\link[dplyr]{group_by}}). In this case, splitting is applied to the subsets of variables in \code{x}. See 'Examples'. } \note{ In case a vector has only few number of unique values, splitting into equal sized groups may fail. In this case, use the \code{inclusive}-argument to shift a value at the cut point into the lower, preceeding group to get equal sized groups. See 'Examples'. } \examples{ data(efc) # non-grouped table(efc$neg_c_7) # split into 3 groups table(split_var(efc$neg_c_7, n = 3)) # split multiple variables into 3 groups split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE) frq(split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE)) # original table(efc$e42dep) # two groups, non-inclusive cut-point # vector split leads to unequal group sizes table(split_var(efc$e42dep, n = 2)) # two groups, inclusive cut-point # group sizes are equal table(split_var(efc$e42dep, n = 2, inclusive = TRUE)) # Unlike dplyr's ntile(), split_var() never splits a value # into two different categories, i.e. 
you always get a clean # separation of original categories library(dplyr) x <- dplyr::ntile(efc$neg_c_7, n = 3) table(efc$neg_c_7, x) x <- split_var(efc$neg_c_7, n = 3) table(efc$neg_c_7, x) # works also with gouped data frames mtcars \%>\% split_var(disp, n = 3, append = FALSE) \%>\% table() mtcars \%>\% group_by(cyl) \%>\% split_var(disp, n = 3, append = FALSE) \%>\% table() } \seealso{ \code{\link{group_var}} to group variables into equal ranged groups, or \code{\link{rec}} to recode variables. } sjmisc/man/flat_table.Rd0000644000176200001440000000444214046746443014721 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/prop_table.R \name{flat_table} \alias{flat_table} \title{Flat (proportional) tables} \usage{ flat_table( data, ..., margin = c("counts", "cell", "row", "col"), digits = 2, show.values = FALSE, weights = NULL ) } \arguments{ \item{data}{A data frame. May also be a grouped data frame (see 'Note' and 'Examples').} \item{...}{One or more variables of \code{data} that should be printed as table.} \item{margin}{Specify the table margin that should be computed for proportional tables. By default, counts are printed. Use \code{margin = "cell"}, \code{margin = "col"} or \code{margin = "row"} to print cell, column or row percentages of the table margins.} \item{digits}{Numeric; for proportional tables, \code{digits} indicates the number of decimal places.} \item{show.values}{Logical, if \code{TRUE}, value labels are prefixed by the associated value.} \item{weights}{Bare name, or name as string, of a variable in \code{x} that indicates the vector of weights, which will be applied to weight all observations. Default is \code{NULL}, so no weights are used.} } \value{ An object of class \code{\link[stats]{ftable}}. } \description{ This function creates a labelled flat table or flat proportional (marginal) table. 
} \note{ \code{data} may also be a grouped data frame (see \code{\link[dplyr]{group_by}}) with up to two grouping variables. Cross tables are created for each subgroup then. } \examples{ data(efc) # flat table with counts flat_table(efc, e42dep, c172code, e16sex) # flat table with proportions flat_table(efc, e42dep, c172code, e16sex, margin = "row") # flat table from grouped data frame. You need to select # the grouping variables and at least two more variables for # cross tabulation. library(dplyr) efc \%>\% group_by(e16sex) \%>\% select(e16sex, c172code, e42dep) \%>\% flat_table() efc \%>\% group_by(e16sex, e42dep) \%>\% select(e16sex, e42dep, c172code, n4pstu) \%>\% flat_table() # now it gets weird... efc \%>\% group_by(e16sex, e42dep) \%>\% select(e16sex, e42dep, c172code, n4pstu, c161sex) \%>\% flat_table() } \seealso{ \code{\link{frq}} for simple frequency table of labelled vectors. } sjmisc/man/shorten_string.Rd0000644000176200001440000000217413567234676015703 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/shorten_string.R \name{shorten_string} \alias{shorten_string} \title{Shorten character strings} \usage{ shorten_string(s, max.length = NULL, abbr = "...") } \arguments{ \item{s}{A string.} \item{max.length}{Maximum length of chars for the string.} \item{abbr}{String that will be used as suffix, if \code{s} was shortened.} } \value{ A shortened string. } \description{ This function shortens strings that are longer than \code{max.length} chars, without cropping words. } \details{ If the string length defined in \code{max.length} happens to be inside a word, this word is removed from the returned string (see 'Examples'), so the returned string has a \emph{maximum length} of \code{max.length}, but might be shorter. } \examples{ s <- "This can be considered as very long string!" 
# string is shorter than max.length, so returned as is shorten_string(s, 60) # string is shortened to as many words that result in # a string of maximum 20 chars shorten_string(s, 20) # string including "considered" is exactly of length 22 chars shorten_string(s, 22) } sjmisc/man/to_value.Rd0000644000176200001440000000445414046746443014445 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_value.R \name{to_value} \alias{to_value} \title{Convert factors to numeric variables} \usage{ to_value(x, ..., start.at = NULL, keep.labels = TRUE, use.labels = FALSE) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{start.at}{Starting index, i.e. the lowest numeric value of the variable's value range. By default, this argument is \code{NULL}, hence the lowest value of the returned numeric variable corresponds to the lowest factor level (if factor levels are numeric) or to \code{1} (if factor levels are not numeric).} \item{keep.labels}{Logical, if \code{TRUE}, former factor levels will be added as value labels. For numeric factor levels, values labels will be used, if present. See 'Examples' and \code{\link{set_labels}} for more details.} \item{use.labels}{Logical, if \code{TRUE} and \code{x} has numeric value labels, these value labels will be set as numeric values.} } \value{ A numeric variable with values ranging either from \code{start.at} to \code{start.at} + length of factor levels, or to the corresponding factor levels (if these were numeric). 
If \code{x} is a data frame, the complete data frame \code{x} will be returned, where variables specified in \code{...} are coerced to numeric; if \code{...} is not specified, applies to all variables in the data frame. } \description{ This function converts (replaces) factor levels with the related factor level index number, thus the factor is converted to a numeric variable. \code{to_value()} and \code{to_numeric()} are aliases. } \note{ This function is kept for backwards-compatibility. It is preferred to use \code{\link[sjlabelled]{as_numeric}}. } \examples{ library(sjlabelled) data(efc) test <- as_label(efc$e42dep) table(test) table(to_value(test)) # Find more examples at '?sjlabelled::as_numeric' } sjmisc/man/recode_to.Rd0000644000176200001440000000714413676323120014560 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/recode_to.R \name{recode_to} \alias{recode_to} \alias{recode_to_if} \title{Recode variable categories into new values} \usage{ recode_to(x, ..., lowest = 0, highest = -1, append = TRUE, suffix = "_r0") recode_to_if( x, predicate, lowest = 0, highest = -1, append = TRUE, suffix = "_r0" ) } \arguments{ \item{x}{A vector or data frame.} \item{...}{Optional, unquoted names of variables that should be selected for further processing. Required, if \code{x} is a data frame (and no vector) and only selected variables from \code{x} should be processed. You may also use functions like \code{:} or tidyselect's select-helpers. See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}.} \item{lowest}{Indicating the lowest category value for recoding. Default is 0, so the new variable starts with value 0.} \item{highest}{If specified and greater than \code{lowest}, all category values larger than \code{highest} will be set to \code{NA}. Default is \code{-1}, i.e. 
this argument is ignored and no NA's will be produced.} \item{append}{Logical, if \code{TRUE} (the default) and \code{x} is a data frame, \code{x} including the new variables as additional columns is returned; if \code{FALSE}, only the new variables are returned.} \item{suffix}{Indicates which suffix will be added to each dummy variable. Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated.} \item{predicate}{A predicate function to be applied to the columns. The variables for which \code{predicate} returns \code{TRUE} are selected.} } \value{ \code{x} with recoded category values, where \code{lowest} indicates the lowest value; If \code{x} is a data frame, for \code{append = TRUE}, \code{x} including the recoded variables as new columns is returned; if \code{append = FALSE}, only the recoded variables will be returned. If \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace (overwrite) existing variables. } \description{ Recodes (or "renumbers") the categories of variables into new category values, beginning with the lowest value specified by \code{lowest}. Useful when recoding dummy variables with 1/2 values to 0/1 values, or recoding scales from 1-4 to 0-3 etc. \code{recode_to_if()} is a scoped variant of \code{recode_to()}, where recoding will be applied only to those variables that match the logical condition of \code{predicate}. } \note{ Value and variable label attributes are preserved. 
} \examples{ # recode 1-4 to 0-3 dummy <- sample(1:4, 10, replace = TRUE) recode_to(dummy) # recode 3-6 to 0-3 # note that numeric type is returned dummy <- as.factor(3:6) recode_to(dummy) # lowest value starting with 1 dummy <- sample(11:15, 10, replace = TRUE) recode_to(dummy, lowest = 1) # lowest value starting with 1, highest with 3 # all others set to NA dummy <- sample(11:15, 10, replace = TRUE) recode_to(dummy, lowest = 1, highest = 3) # recode multiple variables at once data(efc) recode_to(efc, c82cop1, c83cop2, c84cop3, append = FALSE) library(dplyr) efc \%>\% select(c82cop1, c83cop2, c84cop3) \%>\% mutate( c82new = recode_to(c83cop2, lowest = 5), c83new = recode_to(c84cop3, lowest = 3) ) \%>\% head() } \seealso{ \code{\link{rec}} for general recoding of variables and \code{\link[sjlabelled]{set_na}} for setting \code{\link{NA}} values. } sjmisc/man/is_even.Rd0000644000176200001440000000134313567234676014260 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/is_even.R \name{is_even} \alias{is_even} \alias{is_odd} \title{Check whether value is even or odd} \usage{ is_even(x) is_odd(x) } \arguments{ \item{x}{Numeric vector or single numeric value, or a data frame or list with such vectors.} } \value{ \code{is_even()} returns \code{TRUE} for each even value of \code{x}, \code{FALSE} for odd values. \code{is_odd()} returns \code{TRUE} for each odd value of \code{x} and \code{FALSE} for even values. } \description{ Checks whether \code{x} is an even or odd number. Only accepts numeric vectors. } \examples{ is_even(4) is_even(5) is_even(1:4) is_odd(4) is_odd(5) is_odd(1:4) } sjmisc/man/big_mark.Rd0000644000176200001440000000265314046746443014401 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/big_mark.R \name{big_mark} \alias{big_mark} \alias{prcn} \title{Format numbers} \usage{ big_mark(x, big.mark = ",", ...) prcn(x) } \arguments{ \item{x}{A vector or data frame. 
All numeric inputs (including numeric character) vectors) will be prettified. For \code{prcn()}, a number between 0 and 1, or a vector or data frame with such numbers.} \item{big.mark}{Character, used as mark between every 3 decimals before the decimal point.} \item{...}{Other arguments passed down to the \code{\link{prettyNum}}-function.} } \value{ For \code{big_mark()}, a prettified \code{x} as character, with big marks. For \code{prcn}, a character vector with a percentage number. } \description{ \code{big_mark()} formats large numbers with big marks, while \code{prcn()} converts a numeric scalar between 0 and 1 into a character vector, representing the percentage-value. } \examples{ # simple big mark big_mark(1234567) # big marks for several values at once, mixed numeric and character big_mark(c(1234567, "55443322")) # pre-defined width of character output big_mark(c(1234567, 55443322), width = 15) # convert numbers into percentage, as character prcn(0.2389) prcn(c(0.2143887, 0.55443, 0.12345)) dat <- data.frame( a = c(.321, .121, .64543), b = c("a", "b", "c"), c = c(.435, .54352, .234432) ) prcn(dat) } sjmisc/DESCRIPTION0000644000176200001440000000304514620415212013247 0ustar liggesusersPackage: sjmisc Type: Package Encoding: UTF-8 Title: Data and Variable Transformation Functions Version: 2.8.10 Authors@R: c(person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")), person("Iago", "Giné-Vázquez", role = c("ctb"), email = "i.gine@pssjd.org"), person("Alexander", "Bartel", role = "ctb", email = "alexander.bartel@fu-berlin.de", comment = c(ORCID = "0000-0002-1280-6138"))) Maintainer: Daniel Lüdecke Description: Collection of miscellaneous utility functions, supporting data transformation tasks like recoding, dichotomizing or grouping variables, setting and replacing missing values. 
The data transformation functions also support labelled data, and all integrate seamlessly into a 'tidyverse'-workflow. License: GPL-3 Depends: R (>= 3.4) Imports: dplyr, insight, datawizard, magrittr, methods, purrr, rlang, sjlabelled (>= 1.1.1), stats, tidyselect, utils Suggests: ggplot2, graphics, haven (>= 2.0.0), mice, nnet, sjPlot, sjstats, knitr, rmarkdown, stringdist, testthat, tidyr URL: https://strengejacke.github.io/sjmisc/ BugReports: https://github.com/strengejacke/sjmisc/issues RoxygenNote: 7.3.1 VignetteBuilder: knitr NeedsCompilation: no Packaged: 2024-05-13 13:03:33 UTC; mail Author: Daniel Lüdecke [aut, cre] (), Iago Giné-Vázquez [ctb], Alexander Bartel [ctb] () Repository: CRAN Date/Publication: 2024-05-13 13:50:02 UTC sjmisc/build/0000755000176200001440000000000014620407644012650 5ustar liggesuserssjmisc/build/vignette.rds0000644000176200001440000000045114620407644015207 0ustar liggesusersQO0ǻ-11}0| ŗe.7nԔܛ_\`=2\{o!dJb 0IHHqNTƙ2?ī(Je;*DNE+2AdVG:4it!Cg4Q{/4,}nWI_zΓWBu8Щ:Uh1FBiĭ>|=BxąfET*ֲ`~;&/Q 'q y4f#ɾ!usQÎhCsjmisc/build/partial.rdb0000644000176200001440000000007414620407633014774 0ustar liggesusersb```b`a 00 FN ͚Z d@$$7sjmisc/tests/0000755000176200001440000000000013451124270012703 5ustar liggesuserssjmisc/tests/testthat/0000755000176200001440000000000014620415212014541 5ustar liggesuserssjmisc/tests/testthat/test-std.R0000644000176200001440000000162313612252477016451 0ustar liggesusersif (require("testthat") && require("sjmisc") && require("dplyr")) { data(efc) data(mtcars) efc$c172code <- as.factor(efc$c172code) efc$e42dep <- as.factor(efc$e42dep) test_that("std, vector", { expect_is(std(efc$c12hour), "numeric") }) test_that("std, data.frame", { expect_is(std(efc, c12hour), "data.frame") }) test_that("std, robust", { expect_is(std(efc, c12hour, c160age, robust = "2sd"), "data.frame") }) test_that("std, robust", { expect_is(std(efc, c12hour, c160age, robust = "gmd", append = FALSE), "data.frame") }) test_that("std, factors", { tmp <- std(efc, 
append = FALSE) expect_is(tmp$c172code_z, "factor") tmp <- std(efc, append = FALSE, include.fac = TRUE) expect_is(tmp$c172code_z, "numeric") }) test_that("std, factors", { mtcars %>% dplyr::group_by(cyl) %>% std(disp) }) } sjmisc/tests/testthat/test-colcnt.R0000644000176200001440000000102013612252477017130 0ustar liggesusersif (require("testthat") && require("sjmisc")) { dat <- data.frame( c1 = c(1, 2, 3, 1, 3, NA), c2 = c(3, 2, 1, 2, NA, 3), c3 = c(1, 1, 2, 1, 3, NA), c4 = c(1, 1, 3, 2, 1, 2) ) test_that("col_count", { expect_equal(sum(col_count(dat, count = 1, append = FALSE)), 9) }) test_that("col_count", { expect_equal(sum(col_count(dat, count = NA, append = FALSE)), 3) }) test_that("col_count", { expect_equal(sum(col_count(dat, c2:c4, count = 2, append = FALSE)), 5) }) } sjmisc/tests/testthat/test-rowsums.R0000644000176200001440000000177413451124270017373 0ustar liggesuserscontext("sjmisc, row_sums") library(sjmisc) dat <- data.frame( c1 = c(1,2,NA,4), c2 = c(NA,2,NA,5), c3 = c(NA,4,NA,NA), c4 = c(2,3,7,8), c5 = c(1,7,5,3) ) test_that("row_sums", { tmp <- row_sums(dat, n = 4, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 2) expect_equal(sum(tmp[[1]], na.rm = TRUE), 38) tmp <- row_sums(dat, n = .4, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 0) expect_equal(sum(tmp[[1]], na.rm = TRUE), 54) # this one is R-behaviour, because round(2.5) = 2 tmp <- row_sums(dat, n = .5, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 0) expect_equal(sum(tmp[[1]], na.rm = TRUE), 54) tmp <- row_sums(dat, n = .51, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 1) expect_equal(sum(tmp[[1]], na.rm = TRUE), 42) tmp <- row_sums(dat, n = 3, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 1) expect_equal(sum(tmp[[1]], na.rm = TRUE), 42) expect_message(row_sums(dat[, 1, drop = FALSE], n = 0)) }) sjmisc/tests/testthat/test-rowcnt.R0000644000176200001440000000076113451124270017163 0ustar liggesuserscontext("sjmisc, row_count") library(sjmisc) dat <- data.frame( c1 = c(1, 
2, 3, 1, 3, NA), c2 = c(3, 2, 1, 2, NA, 3), c3 = c(1, 1, 2, 1, 3, NA), c4 = c(1, 1, 3, 2, 1, 2) ) test_that("row_count", { expect_equal(sum(row_count(dat, count = 1, append = FALSE)), 9) }) test_that("row_count", { expect_equal(sum(row_count(dat, count = NA, append = FALSE)), 3) }) test_that("row_count", { expect_equal(sum(row_count(dat, c1:c3, count = 2, append = FALSE)), 4) }) sjmisc/tests/testthat/test-to_dummy.R0000644000176200001440000000050013451124270017473 0ustar liggesuserscontext("sjmisc, to_dummy") library(sjmisc) x <- to_dummy(data.frame(x = c("yes", "no", "yes", "maybe"), stringsAsFactors = FALSE), suffix = "label") y <- to_dummy(data.frame(x = c("yes", "no", "yes", "maybe"), stringsAsFactors = TRUE), suffix = "label") test_that("to_dummy", { expect_identical(x, y) }) sjmisc/tests/testthat/test-setna.R0000644000176200001440000000157514272453251016773 0ustar liggesusersif (requireNamespace("haven", quietly = TRUE) && requireNamespace("sjlabelled", quietly = TRUE)) { library(sjmisc) test_that("set_na", { x <- factor(c("a", "b", "c")) expect_equal(nlevels(set_na(x, na = "b", as.tag = TRUE)), 2) expect_equal(nlevels(set_na(x, na = "b", drop.levels = FALSE, as.tag = TRUE)), 3) }) test_that("set_na", { x <- c(1, 2, 3) expect_true(is.null(attr(set_na(x, na = 1, as.tag = FALSE), "labels"))) expect_true(!is.null(attr(set_na(x, na = 1, as.tag = TRUE), "labels"))) expect_true(all_na(set_na(x, na = 1:3))) }) test_that("set_na", { x <- c(1, 2, NA, 3) expect_warning(expect_equal(set_na(x, na = c(1, NA), as.tag = FALSE), c(1, 2, NA, 3))) expect_warning(expect_equal(set_na(x, na = NA), c(1, 2, NA, 3))) expect_warning(expect_equal(set_na(x, na = NULL), c(1, 2, NA, 3))) }) } sjmisc/tests/testthat/test-removevar.R0000644000176200001440000000034013451124270017646 0ustar liggesuserscontext("sjmisc, remove_var") library(sjmisc) data(efc) test_that("remove_var", { x <- remove_var(efc, 1:3) expect_equal(ncol(x), 23) x <- remove_var(efc, c82cop1:c90cop9) expect_equal(ncol(x), 
17) }) sjmisc/tests/testthat/test-group_str.R0000644000176200001440000000132413451124270017667 0ustar liggesuserscontext("sjmisc, group_str") library(sjmisc) library(dplyr) test_that("group_str", { data(efc) efc$e15relat <- to_label(efc$e15relat) levels(efc$e15relat) <- c("Hello", "Helo", "Hole", "Apple", "Ape", "System", "Systemic", "new") efc$e15relat <- to_character(efc$e15relat) group_str(efc$e15relat) x <- efc$e15relat group_str(x) oldstring <- c("Hello", "Helo", "Hole", "Apple", "Ape", "New", "Old", "System", "Systemic") group_str(oldstring) oldstring <- c("Hello", "Helo", "Hole", NA_character_, "Apple", "Ape", "New", "Old", "System", "Systemic", NA_character_) expect_equal(group_str(oldstring)[4], NA_character_) expect_equal(group_str(oldstring)[11], NA_character_) }) sjmisc/tests/testthat/test-to_value.R0000644000176200001440000000144013451124270017460 0ustar liggesuserscontext("sjmisc, to_value") library(sjmisc) test_that("to_value", { expect_equal(to_value(factor(c(0,1,2)), keep.labels = FALSE), c(0,1,2)) expect_equal(to_value(factor(c(2,3,4)), keep.labels = FALSE), c(2,3,4)) expect_equal(to_value(factor(c("a", "b", "c")), keep.labels = FALSE), c(1,2,3)) expect_equal(to_value(factor(c("d", "e", "f")), keep.labels = FALSE), c(1,2,3)) }) test_that("to_value", { expect_equal(to_value(factor(c(0,1,2)), start.at = 4, keep.labels = FALSE), c(4,5,6)) expect_equal(to_value(factor(c(2,3,4)), start.at = 4, keep.labels = FALSE), c(4,5,6)) expect_equal(to_value(factor(c("a", "b", "c")), start.at = 4, keep.labels = FALSE), c(4,5,6)) expect_equal(to_value(factor(c("d", "e", "f")), start.at = 4, keep.labels = FALSE), c(4,5,6)) }) sjmisc/tests/testthat/test-word_wrap.R0000644000176200001440000000100713542421717017654 0ustar liggesuserscontext("sjmisc, word_wrap") library(sjmisc) test_that("word_wrap", { x <- c("my friend likes me", "your friend likes you") x_5 <- c("my\nfriend\nlikes\nme", "your\nfriend\nlikes\nyou") x_20 <- c("my friend likes me", "your friend 
likes\nyou") expect_equal(word_wrap(x, 5), x_5) expect_equal(word_wrap(x, 20), x_20) expect_equal(word_wrap(x, 0), x) expect_equal(word_wrap(x, Inf), x) x <- c(expression(paste(italic("Index"), ""^"®™"))) expect_warning(word_wrap(x, 10)) }) sjmisc/tests/testthat/test-dicho.R0000644000176200001440000000136013451124270016731 0ustar liggesuserscontext("sjmisc, dicho") library(sjmisc) data(efc) test_that("dicho", { tmp <- dicho(efc$c12hour) expect_equal(sum(tmp == 1, na.rm = T), 438) }) test_that("dicho", { tmp <- dicho(efc, c12hour, c160age, append = FALSE) expect_equal(sum(diag(table(tmp))), 574) }) test_that("dicho", { tmp <- dicho(efc, c12hour, c160age, dich.by = "mean", append = FALSE, as.num = TRUE) expect_equal(sum(diag(table(tmp))), 556) }) test_that("dicho", { tmp <- dicho( efc, c12hour, c160age, dich.by = "mean", append = FALSE, as.num = TRUE, var.label = "variable labels", val.labels = c("zero", "one") ) expect_equal(names(lapply(tmp, attributes)[[2]]$labels), c("zero", "one")) }) sjmisc/tests/testthat/test-frq_whitespace.R0000644000176200001440000000177314272453251020665 0ustar liggesuserslibrary(testthat) library(sjmisc) if (requireNamespace("haven", quietly = TRUE) && requireNamespace("sjlabelled", quietly = TRUE)) { x.char <- c("Category 1", "Category 2", "Category 3", NA) # with spaces y.char <- c("Category1", "Category2", "Category3", NA) # w/o spaces dat <- data.frame(x.char, y.char, stringsAsFactors = FALSE) # make a data frame x <- sjmisc::rec( dat, y.char, suffix = "_r", as.num = TRUE, rec = "Category1 = 1 [Label 1]; Category2 = 2 [Label 2]; Category3 = 3 [Label 3];" ) expect_equal(as.vector(x$y.char_r), c(1, 2, 3, NA)) expect_equal(attributes(x$y.char_r)$labels, c(`Label 1` = 1, `Label 2` = 2, `Label 3` = 3)) x <- sjmisc::rec( dat, x.char, suffix = "_r", as.num = TRUE, rec = "Category 1 = 1 [Label 1]; Category 2 = 2 [Label 2]; Category 3 = 3 [Label 3];" ) expect_equal(as.vector(x$x.char_r), c(1, 2, 3, NA)) 
expect_equal(attributes(x$x.char_r)$labels, c(`Label 1` = 1, `Label 2` = 2, `Label 3` = 3)) } sjmisc/tests/testthat/test-prcn.R0000644000176200001440000000031613604103360016602 0ustar liggesuserscontext("sjmisc, prcn") library(sjmisc) test_that("prcn", { expect_equal(prcn(0.2389), "23.89%") }) test_that("prcn", { expect_equal(prcn(c(0.2143887, 0.55443)), c("21.44%", "55.44%")) }) sjmisc/tests/testthat/test-frq.R0000644000176200001440000001376113627435140016451 0ustar liggesuserscontext("sjmisc, frq") library(sjmisc) library(dplyr) data(efc) test_that("frq", { expect_is(frq(efc$e42dep), class = "list") }) test_that("frq", { expect_is(frq(efc$e42dep, sort.frq = "asc"), class = "list") }) test_that("frq", { expect_is(frq(efc, e42dep, e16sex), class = "list") }) test_that("frq", { expect_is(frq(efc, e42dep, e16sex, sort.frq = "asc"), class = "list") }) test_that("frq", { expect_is(frq(efc$c12hour, auto.grp = 5), class = "list") }) test_that("frq", { expect_is(frq(efc, c12hour, e17age, auto.grp = 5), class = "list") }) test_that("frq", { expect_is(frq(efc, c12hour, e17age, auto.grp = 5, sort.frq = "desc"), class = "list") }) test_that("frq", { frq(efc$e42dep, title = "test") efc %>% dplyr::group_by(c172code) %>% frq(e16sex, title = c("1", "2", "3")) }) test_that("frq", { efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) frq(efc, c160age, auto.grp = 5, weights = weights) frq(efc, e42dep, weights = weights) frq(efc, e42dep, weights = "weights") frq(efc, e42dep, weights = efc$weights) test.weight <- function(x, y, w) { frq(x, y, weights = w) } test.weight(efc, "neg_c_7", "weights") }) v1 <- c(1, 2, 1, 2, 1, 1) v2 <- c(1, 2, 1, NA, 1, 1) test_that("frq, show.na", { expect_equal(nrow(frq(v1, show.na = TRUE)[[1]]), 3) expect_equal(nrow(frq(v1, show.na = FALSE)[[1]]), 2) expect_equal(nrow(frq(v1, show.na = "auto")[[1]]), 2) expect_equal(nrow(frq(v2, show.na = TRUE)[[1]]), 3) expect_equal(nrow(frq(v2, show.na = FALSE)[[1]]), 2) expect_equal(nrow(frq(v2, show.na = 
"auto")[[1]]), 3) }) test_that("frq", { data(efc) efc$e15relat <- to_label(efc$e15relat) levels(efc$e15relat) <- c("Hello", "Helo", "Hole", "Apple", "Ape", "System", "Systemic", "new") efc$e15relat <- to_character(efc$e15relat) frq(efc$e15relat) frq(efc, e15relat) frq(efc$e15relat, grp.strings = 2) frq(efc, e15relat, grp.strings = 2) x <- efc$e15relat frq(x) frq(x, grp.strings = 2) efc %>% dplyr::group_by(c172code) %>% frq(c161sex, e15relat, grp.strings = 2) }) #' # with grouped data frames for which some groups are completely missing, #' # and also choosing minimum frequencies #' efc %>% #' slice(1:4) %>% #' group_by(c12hour) %>% #' frq(nur_pst) #' #' efc %>% #' slice(1:4) %>% #' group_by(c12hour) %>% #' frq(nur_pst, min.frq = 1) #' test_that("frq, string", { dat <- data.frame( x = c("", "", "a", "a", "b"), stringsAsFactors = FALSE ) expect_equal( frq(dat$x), structure(list(structure(list( val = structure(c(1L, 2L, 3L, NA), .Label = c("", "a", "b"), class = "factor"), label = c("", "", "", NA), frq = c(2L, 2L, 1L, 0L), raw.prc = c(40, 40, 20, 0), valid.prc = c(40, 40, 20, NA), cum.prc = c(40, 80, 100, NA) ), row.names = c(NA, -4L), class = "data.frame", label = "x", vartype = "character", mean = 1.8, sd = 0.836660026534076, ci = structure( list( lower = c(-0.147032972460588, -0.147032972460588, -0.753045081153163, 0), upper = c(4.14703297246059, 4.14703297246059, 2.75304508115316, 0) ), class = "data.frame", row.names = c(NA, -4L) ), relative.ci = structure( list( lower = c(-0.0294065944921176, -0.0294065944921176, -0.150609016230633, 0), upper = c(0.829406594492118, 0.829406594492118, 0.550609016230633, 0) ), class = "data.frame", row.names = c(NA, -4L)))), class = c("sjmisc_frq", "list"), print = "txt", encoding = "UTF-8" ), tolerance = 1e-4 ) expect_equal( frq(dat[["x"]]), structure(list(structure(list( val = structure(c(1L, 2L, 3L, NA), .Label = c("", "a", "b"), class = "factor"), label = c("", "", "", NA), frq = c(2L, 2L, 1L, 0L), raw.prc = c(40, 40, 20, 0), 
valid.prc = c(40, 40, 20, NA), cum.prc = c(40, 80, 100, NA) ), row.names = c(NA, -4L), class = "data.frame", label = "x", vartype = "character", mean = 1.8, sd = 0.836660026534076, ci = structure( list( lower = c(-0.147032972460588, -0.147032972460588, -0.753045081153163, 0), upper = c(4.14703297246059, 4.14703297246059, 2.75304508115316, 0) ), class = "data.frame", row.names = c(NA, -4L) ), relative.ci = structure( list( lower = c(-0.0294065944921176, -0.0294065944921176, -0.150609016230633, 0), upper = c(0.829406594492118, 0.829406594492118, 0.550609016230633, 0) ), class = "data.frame", row.names = c(NA, -4L)))), class = c("sjmisc_frq", "list"), print = "txt", encoding = "UTF-8" ), tolerance = 1e-4 ) expect_equal( frq(dat["x"]), structure(list(structure(list( val = structure(c(1L, 2L, 3L, NA), .Label = c("", "a", "b"), class = "factor"), label = c("", "", "", NA), frq = c(2L, 2L, 1L, 0L), raw.prc = c(40, 40, 20, 0), valid.prc = c(40, 40, 20, NA), cum.prc = c(40, 80, 100, NA) ), row.names = c(NA, -4L), class = "data.frame", label = "x", vartype = "character", mean = 1.8, sd = 0.836660026534076, ci = structure( list( lower = c(-0.147032972460588, -0.147032972460588, -0.753045081153163, 0), upper = c(4.14703297246059, 4.14703297246059, 2.75304508115316, 0) ), class = "data.frame", row.names = c(NA, -4L) ), relative.ci = structure( list( lower = c(-0.0294065944921176, -0.0294065944921176, -0.150609016230633, 0), upper = c(0.829406594492118, 0.829406594492118, 0.550609016230633, 0) ), class = "data.frame", row.names = c(NA, -4L)))), class = c("sjmisc_frq", "list"), print = "txt", encoding = "UTF-8" ), tolerance = 1e-4 ) }) sjmisc/tests/testthat/test-tidy_values.R0000644000176200001440000000075213451124270020177 0ustar liggesuserscontext("sjmisc, tidy_values") library(sjmisc) f1 <- "A B C" f2 <- "A.BC" f3 <- "A/B" f4 <- "A_B" f5 <- "AB" test_that("tidy_values", { expect_equal(tidy_values(f1), "A_B_C") }) test_that("tidy_values", { expect_equal(tidy_values(f2), "A_BC") 
}) test_that("tidy_values", { expect_equal(tidy_values(f3), "A_B") }) test_that("tidy_values", { expect_equal(tidy_values(f4), "A_B") }) test_that("tidy_values", { expect_equal(tidy_values(f5), "AB") }) sjmisc/tests/testthat/test-reflvl.R0000644000176200001440000000113214272453251017140 0ustar liggesusersif (requireNamespace("haven", quietly = TRUE) && requireNamespace("sjlabelled", quietly = TRUE)) { library(sjmisc) data(iris) data(efc) test_that("ref_lvl", { x <- to_factor(efc$e42dep) tmp <- ref_lvl(x, lvl = 2) expect_match(names(attr(tmp, "labels"))[1], "slightly dependent", fixed = T) }) test_that("ref_lvl", { tmp <- levels(ref_lvl(iris$Species, lvl = 3)) expect_match(tmp[1], levels(iris$Species)[3], fixed = T) tmp <- levels(ref_lvl(iris$Species, lvl = "versicolor")) expect_match(tmp[1], levels(iris$Species)[2], fixed = T) }) } sjmisc/tests/testthat/test-rec.R0000644000176200001440000000410714272453251016424 0ustar liggesuserslibrary(sjmisc) data(iris) data(efc) if (requireNamespace("haven", quietly = TRUE) && requireNamespace("sjlabelled", quietly = TRUE)) { test_that("rec", { expect_equal(length(unique(rec(iris$Sepal.Length, rec = "lo:5=1;5.01:6.5=2;6.501:max=3", append = TRUE))), 3) }) test_that("rec", { expect_equal(length(unique(rec(efc$c12hour, rec = "5=10;else=2"))), 3) expect_equal(length(unique(rec(efc$c12hour, rec = "5=10;else=2;NA=2"))), 2) expect_equal(length(unique(rec(efc$c82cop1, rec = "1,2=1; NA=9; else=copy"))), 4) }) test_that("rec", { efc$c172code <- as.factor(efc$c172code) expect_is(rec(efc$c12hour, rec = "5=10;else=2", as.num = FALSE), "factor") expect_is(rec(efc$c172code, rec = "rev", as.num = TRUE), "numeric") }) test_that("rec", { x <- c(1,2,3,NA) expect_equal(rec(x, rec = "1:3=NA;NA=1;else=2"), c(NA, NA, NA, 1)) expect_equal(rec(x, rec = "1=NA;NA=1;else=copy"), c(NA, 2, 3, 1)) expect_equal(rec(x, rec = "min=10;max=5;NA=9;else=copy"), c(10, 2, 5, 9)) }) test_that("rec", { skip_on_cran() expect_equal( unique(rec(iris, Petal.Length, rec = 
"lo:3=1;3.01:4.5=2;4.501:max=3", append = T, suffix = "")$Petal.Length), c(1, 3, 2) ) }) test_that("rec rev", { x <- c(1, 2, 3, 4, 1, 2, 3, 4) x <- sjlabelled::set_labels(x, labels = c("1" = "a", "2" = "b", "3" = "c")) expect_equal( rec(x, rec = "rev"), structure(c(4, 3, 2, 1, 4, 3, 2, 1), labels = c(c = 2, b = 3, a = 4)) ) x <- c(1, 2, 3, 4, 1, 2, 3, 4) x <- sjlabelled::set_labels(x, labels = c("1" = "a", "2" = "b", "3" = "c", "4" = "d")) expect_equal( rec(x, rec = "rev"), structure(c(4, 3, 2, 1, 4, 3, 2, 1), labels = c(d = 1, c = 2, b = 3, a = 4)) ) x <- c(1, 2, 3, 4, 1, 2, 3, 4) x <- sjlabelled::set_labels(x, labels = c("1" = "a", "2" = "b", "3" = "c", "4" = "d", "5" = "e")) expect_equal( rec(x, rec = "rev"), structure(c(5, 4, 3, 2, 5, 4, 3, 2), labels = c(e = 1, d = 2, c = 3, b = 4, a = 5)) ) } ) } sjmisc/tests/testthat/test-countna.R0000644000176200001440000000217513612252477017331 0ustar liggesusersif (require("testthat") && require("sjmisc") && require("haven")) { x <- labelled( x = c(1:3, tagged_na("a", "c", "z"), 4:1, tagged_na("a", "a", "c"), 1:3, tagged_na("z", "c", "c"), 1:4, tagged_na("a", "c", "z")), labels = c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), "Refused" = tagged_na("a"), "Not home" = tagged_na("z")) ) y <- labelled( x = c(1:3, tagged_na("e", "d", "f"), 4:1, tagged_na("f", "f", "d"), 1:3, tagged_na("f", "d", "d"), 1:4, tagged_na("f", "d", "f")), labels = c("Agreement" = 1, "Disagreement" = 4, "An E" = tagged_na("e"), "A D" = tagged_na("d"), "The eff" = tagged_na("f")) ) test_that("count_na, general", { dat <- data.frame(x, y) # possible count()-function calls count_na(dat) count_na(dat$x) count_na(dat, x) count_na(dat, x, y) }) test_that("count_na, labels", { tmp <- count_na(x) tmp$label <- as.character(tmp$label) expect_match(tmp$label, "First", fixed = T, all = F) expect_match(tmp$label, "Refused", fixed = T, all = F) expect_match(tmp$label, "Not home", fixed = T, all = F) }) } 
sjmisc/tests/testthat/test-splitvar.R0000644000176200001440000000057713612252477017532 0ustar liggesusersif (require("testthat") && require("sjmisc") && require("dplyr")) { data(mtcars) test_that("std, split_var", { tmp <- split_var(mtcars, disp, n = 3, append = FALSE) expect_equal(sum(tmp$disp_g == 1), 11) tmp <- mtcars %>% dplyr::group_by(cyl) %>% split_var(disp, n = 3, append = FALSE) expect_equal(sum(tmp$disp_g == 1), 10) }) } sjmisc/tests/testthat/test-add_case.R0000644000176200001440000000254513612252477017406 0ustar liggesusersif (require("testthat") && require("sjmisc")) { d <- data.frame( a = c(1, 2, 3), b = c("a", "b", "c"), c = c(10, 20, 30), stringsAsFactors = FALSE ) attr(d, "test") <- "abc" test_that("add_case", { expect_equal(nrow(add_case(d, b = "d")), 4) expect_equal(add_case(d, b = "d", .after = -1)[1, 1], as.numeric(NA)) expect_equal(add_case(d, b = "d", a = 5, .before = 1)[1, 1], 5) expect_equal(add_case(d, b = "d", a = 5, .after = Inf)[4, 1], 5) expect_equal(add_case(d, b = "d", .after = 2)[3, 1], as.numeric(NA)) expect_equal(add_case(d, b = "d", .after = 5)[4, 1], as.numeric(NA)) expect_equal(add_case(d, b = "d", a = 5, .after = 2, .before = 2)[2, 1], 5) }) test_that("add_variable", { expect_equal(ncol(add_variables(d, new = 5)), 4) expect_equal(colnames(add_variables(d, new = 5, .after = 3))[4], "new") expect_equal(colnames(add_variables(d, new = 5, .after = Inf))[4], "new") expect_equal(colnames(add_variables(d, new = c(4, 4, 4), new2 = c(5, 5, 5), .after = "b"))[3:4], c("new", "new2")) add_variables(d, new = c(4, 4, 4), new2 = c(5, 5, 5), .after = Inf) add_variables(d, new = c(4, 4, 4), new2 = c(5, 5, 5), .after = -1) }) test_that("add_variable", { x <- add_variables(d, new = 5) expect_equal(attr(x, "test", exact = TRUE), "abc") }) } sjmisc/tests/testthat/test-is_num_fac.R0000644000176200001440000000156713451124270017757 0ustar liggesuserscontext("sjmisc, to_dummy") library(sjmisc) test_that("is_num_fac", { 
expect_true(is_num_fac(factor(c(NA, 1, 3, NA, 2, 4)))) }) test_that("is_num_fac", { expect_false(is_num_fac(c(NA, 1, 3, NA, 2, 4))) }) test_that("is_num_fac", { expect_false(is_num_fac(factor(c(NA, "C", 1, 3, "A", NA, 2, 4)))) }) test_that("is_num_fac", { expect_false(is_num_fac(factor(c("Justus", "Bob", "Peter")))) }) test_that("is_num_chr", { expect_false(is_num_chr(c("a", "1"))) }) test_that("is_num_chr", { expect_false(is_num_chr(c("a", NA, "1"))) }) test_that("is_num_chr", { expect_true(is_num_chr(c("2", "1"))) }) test_that("is_num_chr", { expect_true(is_num_chr(c("2", NA_character_, "1"))) }) test_that("is_num_chr", { expect_true(is_num_chr(c("2", NA, "1"))) }) test_that("is_num_chr", { expect_false(is_num_chr(c(2, NA, 1))) }) sjmisc/tests/testthat/test-allna.R0000644000176200001440000000047613612252477016753 0ustar liggesusersif (require("testthat") && require("sjmisc")) { x <- c(NA, NA, NA) y <- c(1, NA, NA) test_that("all_na", { expect_true(all_na(x)) }) test_that("all_na", { expect_false(all_na(y)) }) test_that("all_na, data.frame", { expect_is(all_na(data.frame(x, y)), "data.frame") }) } sjmisc/tests/testthat/test-zapinf.R0000644000176200001440000000030713451124270017132 0ustar liggesuserscontext("sjmisc, zap_inf") library(sjmisc) test_that("zap_inf", { x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) tmp <- zap_inf(x) expect_equal(any(is.infinite(tmp)), FALSE) }) sjmisc/tests/testthat/test-replacena.R0000644000176200001440000000121413612252477017605 0ustar liggesusersif (require("testthat") && require("sjmisc") && require("haven")) { data(efc) x <- labelled( c(1:3, tagged_na("a", "z"), 4:1), c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), "Not home" = tagged_na("z")) ) test_that("replace_na", { expect_true(sum(is.na(replace_na(efc$e42dep, value = 99))) == 0) }) test_that("replace_na", { expect_true(sum(is.na(replace_na(x, value = 99))) == 0) expect_true(sum(is.na(replace_na(x, value = 99, tagged.na = "a"))) == 1) 
expect_equal(names(attr(replace_na(x, value = 99, tagged.na = "a", na.label = "test"), "labels") == 99)[3], "test") }) } sjmisc/tests/testthat/test-merge_imputations.R0000644000176200001440000000037214153357271021411 0ustar liggesusersif (require("testthat") && require("sjmisc") && require("mice") && require("nnet")) { data(iris) iris$Species[round(runif(5, 1, 150))] <- NA imp <- mice(iris) test_that("merge_imputations", { merge_imputations(iris, imp) }) } sjmisc/tests/testthat/test-demean.R0000644000176200001440000000077213612252477017114 0ustar liggesusersif (require("testthat") && require("sjmisc")) { data(efc) efc$ID <- sample(1:4, nrow(efc), replace = TRUE) # fake-ID test_that("de_mean", { de_mean(efc, c12hour, barthtot, grp = ID) de_mean(efc, c12hour, barthtot, grp = ID, append = FALSE) de_mean(efc, c12hour, barthtot, grp = ID, append = FALSE, suffix.dm = "dm", suffix.gm = "gm") de_mean(efc, c12hour, barthtot, grp = ID, suffix.dm = "dm", suffix.gm = "gm") de_mean(efc, c12hour, barthtot, grp = "ID") }) } sjmisc/tests/testthat/test-empty.R0000644000176200001440000000056313451124270017005 0ustar liggesuserscontext("sjmisc, empty_cols") library(sjmisc) tmp <- data.frame(a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA , 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5)) test_that("empty_cols", { expect_equal(unname(empty_cols(tmp)), 3) }) test_that("empty_rows", { expect_equal(empty_rows(tmp), 4) }) sjmisc/tests/testthat/test-strstartend.R0000644000176200001440000000124513451124270020222 0ustar liggesuserscontext("sjmisc, str_start") library(sjmisc) test_that("str_start", { x <- c("my_friend_likes me", "your_friend likes_you") expect_equal(str_start(x, "_"), list(c(3, 10), c(5, 18))) expect_equal(str_start(x, "likes"), list(c(11), c(13))) x <- c("my_friend_likes me", "your_friend likes_you") expect_equal(str_start(x, "ho"), list(-1, -1)) }) test_that("str_end", { x <- c("my_friend_likes me", "your_friend likes_you") expect_equal(str_end(x, "_"), list(c(3, 10), c(5, 
18))) expect_equal(str_end(x, "likes"), list(c(15), c(17))) x <- c("my_friend_likes me", "your_friend likes_you") expect_equal(str_end(x, "ho"), list(-1, -1)) }) sjmisc/tests/testthat/test-findvar.R0000644000176200001440000000056713451124270017304 0ustar liggesuserscontext("sjmisc, find_var") library(sjmisc) data(efc) test_that("find_var", { x <- find_var(efc, pattern = "cop", search = "label_value") expect_equal(nrow(x), 1) x <- find_var(efc, pattern = "cop", search = "name_label") expect_equal(nrow(x), 9) x <- find_var(efc, pattern = "cop", search = "all", fuzzy = TRUE) expect_equal(nrow(x), 10) }) sjmisc/tests/testthat/test-vartype.R0000644000176200001440000000063513451124270017341 0ustar liggesuserscontext("sjmisc, var_type") library(sjmisc) data(efc) test_that("var_type", { expect_equal(var_type(1, abbr = FALSE), "numeric") expect_equal(var_type(1, abbr = TRUE), "dbl") expect_equal(var_type(1L, abbr = FALSE), "integer") expect_equal(var_type(1L, abbr = TRUE), "int") expect_equal(var_type("1", abbr = FALSE), "character") expect_equal(var_type("1", abbr = TRUE), "chr") }) sjmisc/tests/testthat/test-isempty.R0000644000176200001440000000074313451124270017341 0ustar liggesuserscontext("sjmisc, is_empty") library(sjmisc) test_that("is_empty", { skip_on_cran() expect_true(is_empty(NULL)) expect_true(is_empty(NA)) expect_true(is_empty("")) expect_false(is_empty(" ")) x <- 1 x <- x[-1] expect_true(is_empty(x)) expect_true(is_empty(c("", " "), first.only = TRUE)) expect_equal(is_empty(c("", " "), first.only = FALSE), c(TRUE, FALSE)) expect_true(is_empty(data.frame())) expect_true(is_empty(list(NULL))) }) sjmisc/tests/testthat/test-frq-weights.R0000644000176200001440000000112713451124270020104 0ustar liggesuserscontext("sjmisc, frq-weights") library(sjmisc) data(efc) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) efc$w <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) test_that("frq-weights", { frq(efc, c172code, weights = weights) frq(efc, c172code, weights = 
"weights") frq(efc, c172code, weights = efc$weights) frq(efc$c172code, weights = efc$weights) }) test_that("frq-weights", { frq(efc, e16sex) frq(efc$e16sex) frq(efc, e16sex, weights = w) frq(efc, e16sex, weights = "w") frq(efc, e16sex, weights = efc$w) frq(efc$e16sex, weights = efc$w) }) sjmisc/tests/testthat/test-rowmeans.R0000644000176200001440000000127113451124270017477 0ustar liggesuserscontext("sjmisc, row_means") library(sjmisc) dat <- data.frame( c1 = c(1,2,NA,4), c2 = c(NA,2,NA,5), c3 = c(NA,4,NA,NA), c4 = c(2,3,7,8), c5 = c(1,7,5,3) ) test_that("std, row_means", { tmp <- row_means(dat, n = 4, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 2) tmp <- row_means(dat, n = .4, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 0) # this one is R-behaviour, because round(2.5) = 2 tmp <- row_means(dat, n = .5, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 0) tmp <- row_means(dat, n = .51, append = FALSE) expect_equal(sum(is.na(tmp[[1]])), 1) expect_message(row_means(dat[, 1, drop = FALSE], n = 0)) }) sjmisc/tests/testthat/test-move_columns.R0000644000176200001440000000144213451124270020352 0ustar liggesuserscontext("sjmisc, to_dummy") library(sjmisc) data(iris) test_that("move_columns", { skip_on_cran() expect_equal( colnames(move_columns(iris, Sepal.Width, .after = "Species")), c("Sepal.Length", "Petal.Length", "Petal.Width", "Species", "Sepal.Width") ) expect_equal( colnames(move_columns(iris, Species, .before = 1)), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_equal( colnames(move_columns(iris, Sepal.Width, .before = Sepal.Length)), c("Sepal.Width", "Sepal.Length", "Petal.Length", "Petal.Width", "Species") ) expect_equal( colnames(move_columns(iris, "Species", "Petal.Length", .after = 1)), c("Sepal.Length", "Species", "Petal.Length", "Sepal.Width", "Petal.Width") ) }) sjmisc/tests/testthat.R0000644000176200001440000000007413451124270014667 0ustar liggesuserslibrary(testthat) library(sjmisc) test_check("sjmisc") 
sjmisc/vignettes/0000755000176200001440000000000014620407644013561 5ustar liggesuserssjmisc/vignettes/recodingvariables.Rmd0000644000176200001440000002530714620405065017712 0ustar liggesusers--- title: "Recoding Variables" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Recoding Variables} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ``` Data preparation is a common task in research, which usually takes the most amount of time in the analytical process. **sjmisc** is a package with special focus on transformation of _variables_ that fits into the workflow and design-philosophy of the so-called "tidyverse". Basically, this package complements the **dplyr** package in that **sjmisc** takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. A distinctive feature of **sjmisc** is the support for labelled data, which is especially useful for users who often work with data sets from other statistical software packages like _SPSS_ or _Stata_. This vignette demonstrate some of the important recoding-functions in **sjmisc**. The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set `efc` is part of this package. ```{r message=FALSE} library(sjmisc) data(efc) ``` To show the results after recoding variables, the `frq()` function is used to print frequency tables. ## Dichotomization: dividing variables into two groups `dicho()` dichotomizes variables into "dummy" variables (with 0/1 coding). Dichotomization is either done by median, mean or a specific value (see argument `dich.by`). 
Like all recoding-functions in **sjmisc**, `dicho()` returns the complete data frame _including_ the recoded variables, if the first argument is a `data.frame`. If the first argument is a vector, only the recoded variable is returned. See [this vignette](design_philosophy.html) for details about the function-design. If `dicho()` returns a data frame, the recoded variables have the same name as the original variable, including a suffix `_d`. ```{r} # age, ranged from 65 to 104, in this output # grouped to get a shorter table frq(efc, e17age, auto.grp = 5) # splitting is done at the median by default: median(efc$e17age, na.rm = TRUE) # the recoded variable is now named "e17age_d" efc <- dicho(efc, e17age) frq(efc, e17age_d) ``` As `dicho()`, like all recoding-functions, supports [labelled data](https://cran.r-project.org/package=sjlabelled), the variable preserves it variable label (but not the value labels). You can directly define value labels inside the function: ```{r} x <- dicho(efc$e17age, val.labels = c("young age", "old age")) frq(x) ``` To split a variable at a different value, use the `dich.by`-argument. The value specified in `dich.by` is _inclusive_, i.e. all values from lowest to and including `dich.by` are recoded into the lower category, while all values _above_ `dich.by` are recoded into the higher category. ```{r} # split at upper quartile x <- dicho( efc$e17age, dich.by = quantile(efc$e17age, probs = .75, na.rm = TRUE), val.labels = c("younger three quarters", "oldest quarter") ) frq(x) ``` Since the distribution of values in a dataset may differ for different subgroups, all recoding-functions also work on grouped data frames. In the following example, first, the age-variable `e17age` is dichotomized at the median. Then, the data is grouped by gender (`c161sex`) and the dichotomization is done for each subgroup, i.e. it once relates to the median age in the subgroup of female, and once to the median age in the subgroup of male family carers. 
```{r} data(efc) x1 <- dicho(efc$e17age) x2 <- efc %>% dplyr::group_by(c161sex) %>% dicho(e17age) %>% dplyr::pull(e17age_d) # median age of total sample frq(x1) # median age of total sample, with median-split applied # to distribution of age by subgroups of gender frq(x2) ``` ## Splitting variables into several groups `split_var()` recodes numeric variables into equal sized groups, i.e. a variable is cut into a smaller number of groups at specific cut points. The amount of groups depends on the `n`-argument and cuts a variable into `n` quantiles. Similar to `dicho()`, if the first argument in `split_var()` is a data frame, the complete data frame including the new recoded variable(s), with suffix `_g`, is returned. ```{r} x <- split_var(efc$e17age, n = 3) frq(x) ``` Unlike dplyr's `ntile()`, `split_var()` never splits a value into two different categories, i.e. you always get a "clean" separation of original categories. In other words: cases that have identical values in a variable will always be recoded into the same group. The following example demonstrates the differences: ```{r} x <- dplyr::ntile(efc$neg_c_7, n = 3) # for some cases, value "10" is recoded into category "1", # for other cases into category "2". Same is true for value "13" table(efc$neg_c_7, x) x <- split_var(efc$neg_c_7, n = 3) # no separation of cases with identical values. table(efc$neg_c_7, x) ``` `split_var()`, unlike `ntile()`, does therefor not always return exactly equal-sized groups: ```{r} x <- dplyr::ntile(efc$neg_c_7, n = 3) frq(x) x <- split_var(efc$neg_c_7, n = 3) frq(x) ``` ## Recode variables into equal-ranged groups With `group_var()`, variables can be grouped into equal ranged categories, i.e. a variable is cut into a smaller number of groups, where each group has the same value range. `group_labels()` creates the related value labels. The range of the groups is defined in the `size`-argument. 
At the same time, the `size`-argument also defines the _lower bound_ of one of the groups. For instance, if the lowest value of a variable is 1 and the maximum is 10, and `size = 5`, then a) each group will have a range of 5, and b) one of the groups will start with the value 5. This means, that an equal-ranged grouping will define groups from _0 to 4_, _5 to 9_ and _10-14_. Each of these groups has a range of 5, and one of the groups starts with the value 5. The group assignment becomes clearer, when `group_labels()` is used in parallel: ```{r} set.seed(123) x <- round(runif(n = 150, 1, 10)) frq(x) frq(group_var(x, size = 5)) group_labels(x, size = 5) dummy <- group_var(x, size = 5, as.num = FALSE) levels(dummy) <- group_labels(x, size = 5) frq(dummy) dummy <- group_var(x, size = 3, as.num = FALSE) levels(dummy) <- group_labels(x, size = 3) frq(dummy) ``` The argument `right.interval` can be used when `size` should indicate the _upper bound_ of a group-range. ```{r} dummy <- group_var(x, size = 4, as.num = FALSE) levels(dummy) <- group_labels(x, size = 4) frq(dummy) dummy <- group_var(x, size = 4, as.num = FALSE, right.interval = TRUE) levels(dummy) <- group_labels(x, size = 4, right.interval = TRUE) frq(dummy) ``` ## Flexible recoding of variables `rec()` recodes old values of variables into new values, and can be considered as a "classical" recode-function. The recode-pattern, i.e. which new values should replace the old values, is defined in the `rec`-argument. This argument has a specific "syntax": * **recode pairs**: Each recode pair has to be separated by a ;, e.g. `rec = "1=1; 2=4; 3=2; 4=3"` * **multiple values**: Multiple old values that should be recoded into a new single value may be separated with comma, e.g. `rec = "1,2=1; 3,4=2"` * **value range**: A value range is indicated by a colon, e.g. 
`rec = "1:4=1; 5:8=2"` (recodes all values from 1 to 4 into 1, and from 5 to 8 into 2) * **value range for doubles**: For double vectors (with fractional part), all values within the specified range are recoded; e.g. `rec = "1:2.5=1;2.6:3=2"` recodes 1 to 2.5 into 1 and 2.6 to 3 into 2, but 2.55 would not be recoded (since it's not included in any of the specified ranges) * **"min" and "max"**: Minimum and maximum values are indicates by `min` (or `lo`) and `max` (or `hi`), e.g. `rec = "min:4=1; 5:max=2"` (recodes all values from minimum values of x to 4 into 1, and from 5 to maximum values of x into 2) You can also use `min` or `max` to recode a value into the minimum or maximum value of a variable, e.g. `rec = "min:4=1; 5:7=max"` (recodes all values from minimum values of x to 4 into 1, and from 5 to 7 into the maximum value of x). * **"else"**: All other values, which have not been specified yet, are indicated by else, e.g. `rec = "3=1; 1=2; else=3"` (recodes 3 into 1, 1 into 2 and all other values into 3) * **"copy"**: The `"else"`-token can be combined with `"copy"`, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. `rec = "3=1; 1=2; else=copy"` (recodes 3 into 1, 1 into 2 and all other values like 2, 4 or 5 etc. will not be recoded, but copied. * **NA's**: `NA` values are allowed both as old and new value, e.g. `rec = "NA=1; 3:5=NA"` (recodes all `NA` into 1, and all values from 3 to 5 into NA in the new variable) * **"rev"**: `"rev"` is a special token that reverses the value order. * **direct value labelling**: Value labels for new values can be assigned inside the recode pattern by writing the value label in square brackets after defining the new value in a recode pair, e.g. `rec = "15:30=1 [young aged]; 31:55=2 [middle aged]; 56:max=3 [old aged]"` * **non-captured values**: Non-matching values will be set to `NA`, unless captured by the `"else"`- or `"copy"`-token. 
Here are some examples: ```{r} frq(efc$e42dep) # replace NA with 5 frq(rec(efc$e42dep, rec = "NA=5;else=copy")) # recode 1 to 2 into 1 and 3 to 4 into 2 frq(rec(efc$e42dep, rec = "1,2=1; 3,4=2")) # recode 1 to 3 into 4 into 2 frq(rec(efc$e42dep, rec = "min:3=1; 4=2")) # recode numeric to character, and remaining values # into the highest value (="hi") of e42dep frq(rec(efc$e42dep, rec = "1=first;2=2nd;else=hi")) data(iris) frq(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE)) # works with mutate efc %>% dplyr::select(e42dep, e17age) %>% dplyr::mutate(dependency_rev = rec(e42dep, rec = "rev")) %>% head() # recode multiple variables and set value labels via recode-syntax dummy <- rec( efc, c160age, e17age, rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]", append = FALSE ) frq(dummy) ``` ## Scoped variants Where applicable, the recoding-functions in **sjmisc** have "scoped" versions as well, e.g. `dicho_if()` or `split_var_if()`, where transformation will be applied only to those variables that match the logical condition of `predicate`. sjmisc/vignettes/exploringdatasets.Rmd0000644000176200001440000001657714046746443020012 0ustar liggesusers--- title: "Exploring Data Sets" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Exploring Data Sets} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE) || !requireNamespace("tidyr", quietly = TRUE) || !requireNamespace("purrr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ``` Tidying up, transforming and exploring data is an important part of data analysis, and you can manage many common tasks in this process with the *tidyverse* or related packages. 
The **sjmisc**-package fits into this workflow, especially when you work with [labelled data](https://cran.r-project.org/package=sjlabelled), because it offers functions for data transformation and labelled data utility functions. This vignette describes typical steps when beginning with data exploration. The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set `efc` is part of this package. Let us see how the family carer's gender and subjective perception of negative impact of care as well as the cared-for person's dependency are associated with the family carer's quality of life. ```{r message=FALSE} library(sjmisc) library(dplyr) data(efc) ``` ## Print frequencies with labels The first thing that may be of interest is probably the distribution of gender. You can plot frequencies for labelled data with `frq()`. This function requires either a vector or data frame as input and prints the variable label as first line, followed by a frequency-table with values, labels, counts and percentages of the vector. ```{r} frq(efc$c161sex) ``` ## Find variables in a data frame Next, let’s look at the distribution of gender by the cared-for person's dependency. To compute cross tables, you can use `flat_table()`. It requires the data as first argument, followed by any number of variable names. But first, we need to know the name of the dependency-variable. This is where `find_var()` comes into play. It searches for variables in a data frame by 1. variable names, 2. variable labels, 3. value labels 4. or any combination of these. By default, it looks for variable name and labels. The function also supports regex-patterns. By default, `find_var()` returns the column-indices, but you can also print a small "summary"" with the `out`-argument. 
```{r} # find all variables with "dependency" in name or label find_var(efc, "dependency", out = "table") ``` Variable in column 5, named _e42dep_, is what we are looking for. ## Print crosstables with labels Now we can look at the distribution of gender by dependency: ```{r} flat_table(efc, e42dep, c161sex) ``` Since the distribution of male and female carers is skewed, let's see the proportions. To compute crosstables with row or column percentages, use the `margin`-argument: ```{r} flat_table(efc, e42dep, c161sex, margin = "col") ``` ## Recoding variables Next, we need the negatice impact of care (*neg_c_7*) and want to create three groups: low, middle and high negative impact. We can easily recode and label vectors with `rec()`. This function does not only recode vectors, it also allows direct labelling of categories inside the recode-syntax (this is optional, you can also use the `val.labels`-argument). We now recode *neg_c_7* into a new variable _burden_. The cut-points are a bit arbitrary, for the sake of demonstration. ```{r} efc$burden <- rec( efc$neg_c_7, rec = c("min:9=1 [low]; 10:12=2 [moderate]; 13:max=3 [high]; else=NA"), var.label = "Subjective burden", as.num = FALSE # we want a factor ) # print frequencies frq(efc$burden) ``` You can see the variable _burden_ has a variable label ("Subjective burden"), which was set inside `rec()`, as well as three values with labels ("low", "moderate" and "high"). From the lowest value in *neg_c_7* to 9 were recoded into 1, values 10 to 12 into 2 and values 13 to the highest value in *neg_c_7* into 3. All remaining values are set to missing (`else=NA` – for details on the recode-syntax, see `?rec`). ## Grouped data frames How is burden distributed by gender? We can group the data and print frequencies using `frq()` for this as well, as this function also accepts grouped data frames. Frequencies for grouped data frames first print the group-details (variable name and category), followed by the frequency table. 
Thanks to labelled data, the output is easy to understand. ```{r} efc %>% select(burden, c161sex) %>% group_by(c161sex) %>% frq() ``` ## Nested data frames Let's investigate the association between quality of life and burden across the different dependency categories, by fitting linear models for each category of _e42dep_. We can do this using _nested data frames_. `nest()` from the **tidyr**-package can create subsets of a data frame, based on grouping criteria, and create a new _list-variable_, where each element itself is a data frame (so it’s nested, because we have data frames inside a data frame). In the following example, we group the data by _e42dep_, and "nest" the groups. Now we get a data frame with two columns: First, the grouping variable (_e42dep_) and second, the datasets (subsets) for each country as data frame, stored in the list-variable _data_. The data frames in the subsets (in _data_) all contain the selected variables _burden_, _c161sex_ and *quol_5* (quality of life). ```{r} # convert variable to labelled factor, because we then # have the labels as factor levels in the output efc$e42dep <- to_label(efc$e42dep, drop.levels = TRUE) efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() ``` ## Get coefficients of nested models Using `map()` from the **purrr**-package, we can iterate this list and apply any function on each data frame in the list-variable "data". We want to apply the `lm()`-function to the list-variable, to run linear models for all "dependency-datasets". The results of these linear regressions are stored in another list-variable, _models_ (created with `mutate()`). To quickly access and look at the coefficients, we can use `spread_coef()`. 
```{r} efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models) ``` We see that higher burden is associated with lower quality of life, for all dependency-groups. The `se` and `p.val`-arguments add standard errors and p-values to the output. `model.term` returns the statistics only for a specific term. If you specify a `model.term`, arguments `se` and `p.val` automatically default to `TRUE`. ```{r} efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models, burden3) ``` sjmisc/vignettes/design_philosophy.Rmd0000644000176200001440000001253014046746443017762 0ustar liggesusers--- title: "The Design Philosophy of Functions in sjmisc" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{The Design Philosophy of Functions in sjmisc} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } options(max.print = 1000) suppressPackageStartupMessages(library(sjmisc)) ``` Basically, this package complements the _dplyr_ package in that _sjmisc_ takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. The data transformation functions also support labelled data. 
# The design of data transformation functions The design of data transformation functions in this package follows, where appropriate, the _tidyverse-approach_, with the first argument of a function always being the data (either a data frame or vector), followed by variable names that should be processed by the function. If no variables are specified as argument, the function applies to the complete data that was indicated as first function argument. ## The data-argument A major difference to dplyr-functions like `select()` or `filter()` is that the data-argument (the first argument of each function), may either be a _data frame_ or a _vector_. The returned object for each function _equals the type of the data-argument_: * If the data-argument is a vector, the function returns a vector. * If the data-argument is a data frame, the function returns a data frame. ```{r} library(sjmisc) data(efc) # returns a vector x <- rec(efc$e42dep, rec = "1,2=1; 3,4=2") str(x) # returns a data frame rec(efc, e42dep, rec = "1,2=1; 3,4=2", append = FALSE) %>% head() ``` This design-choice is mainly due to compatibility- and convenience-reasons. It does not affect the usual "tidyverse-workflow" or when using pipe-chains. ## The ...-ellipses-argument The selection of variables specified in the `...`-ellipses-argument is powered by dplyr's `select()` and tidyselect's `select_helpers()`. This means, you can use existing functions like `:` to select a range of variables, or also use tidyselect's `select_helpers`, like `contains()` or `one_of()`. 
```{r echo=FALSE, message=FALSE} library(dplyr) ``` ```{r collapse=TRUE} # select all variables with "cop" in their names, and also # the range from c161sex to c175empl rec( efc, contains("cop"), c161sex:c175empl, rec = "0,1=0; else=1", append = FALSE ) %>% head() # center all variables with "age" in name, variable c12hour # and all variables from column 19 to 21 center(efc, c12hour, contains("age"), 19:21, append = FALSE) %>% head() ``` ## The function-types There are two types of function designs: ### coercing/converting functions Functions like `to_factor()` or `to_label()`, which convert variables into other types or add additional information like variable or value labels as attribute, typically _return the complete data frame_ that was given as first argument _without any new variables_. The variables specified in the `...`-ellipses argument are converted (overwritten), all other variables remain unchanged. ```{r} x <- efc[, 3:5] x %>% str() to_factor(x, e42dep, e16sex) %>% str() ``` ### transformation/recoding functions Functions like `rec()` or `dicho()`, which transform or recode variables, by default add _the transformed or recoded variables_ to the data frame, so they return the new variables _and_ the original data as combined data frame. To return _only the transformed and recoded variables_ specified in the `...`-ellipses argument, use argument `append = FALSE`. ```{r} # complete data, including new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE) %>% head() # only new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% head() ``` These variables usually get a suffix, so you can bind these variables as new columns to a data frame, for instance with `add_columns()`. The function `add_columns()` is useful if you want to bind/add columns within a pipe-chain _to the end_ of a data frame. 
```{r} efc %>% rec(c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% add_columns(efc) %>% head() ``` If `append = TRUE` and `suffix = ""`, recoded variables will replace (overwrite) existing variables. ```{r} # complete data, existing columns c82cop1 and c83cop2 are replaced rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE, suffix = "") %>% head() ``` ## sjmisc and dplyr The functions of **sjmisc** are designed to work together seamlessly with other packages from the tidyverse, like **dplyr**. For instance, you can use the functions from **sjmisc** both within a pipe-worklflow to manipulate data frames, or to create new variables with `mutate()`: ```{r} efc %>% select(c82cop1, c83cop2) %>% rec(rec = "1,2=0; 3:4=2") %>% head() efc %>% select(c82cop1, c83cop2) %>% mutate( c82cop1_dicho = rec(c82cop1, rec = "1,2=0; 3:4=2"), c83cop2_dicho = rec(c83cop2, rec = "1,2=0; 3:4=2") ) %>% head() ``` This makes it easy to adapt the **sjmisc** functions to your own workflow. sjmisc/R/0000755000176200001440000000000014272453251011750 5ustar liggesuserssjmisc/R/ref_lvl.R0000644000176200001440000001125414046746443013536 0ustar liggesusers#' @title Change reference level of (numeric) factors #' @name ref_lvl #' #' @description Changes the reference level of (numeric) factor. #' #' @seealso \code{\link{to_factor}} to convert numeric vectors into factors; #' \code{\link{rec}} to recode variables. #' #' @param lvl Either numeric, indicating the new reference level, or a string, #' indicating the value label from the new reference level. If \code{x} is a #' factor with non-numeric factor levels, \code{relevel(x, ref = lvl)} is #' returned. See 'Examples'. #' #' @inheritParams to_dummy #' #' @return \code{x} with new reference level. If \code{x} #' is a data frame, the complete data frame \code{x} will be returned, #' where variables specified in \code{...} will be re-leveled; #' if \code{...} is not specified, applies to all variables in the #' data frame. 
#' #' @details Unlike \code{\link[stats]{relevel}}, this function behaves differently #' for factor with numeric factor levels or for labelled data, i.e. factors #' with value labels for the values. \code{ref_lvl()} changes the reference #' level by recoding the factor's values using the \code{\link{rec}} function. #' Hence, all values from lowest up to the reference level indicated by #' \code{lvl} are recoded, with \code{lvl} starting as lowest factor value. #' For factors with non-numeric factor levels, the function simply returns #' \code{relevel(x, ref = lvl)}. See 'Examples'. #' #' @examples #' data(efc) #' x <- to_factor(efc$e42dep) #' str(x) #' frq(x) #' #' # see column "val" in frq()-output, which indicates #' # how values/labels were recoded after using ref_lvl() #' x <- ref_lvl(x, lvl = 3) #' str(x) #' frq(x) #' #' library(dplyr) #' dat <- efc %>% #' select(c82cop1, c83cop2, c84cop3) %>% #' to_factor() #' #' frq(dat) #' ref_lvl(dat, c82cop1, c83cop2, lvl = 2) %>% frq() #' #' # compare numeric and string value for "lvl"-argument #' x <- to_factor(efc$e42dep) #' frq(x) #' ref_lvl(x, lvl = 2) %>% frq() #' ref_lvl(x, lvl = "slightly dependent") %>% frq() #' #' # factors with non-numeric factor levels #' data(iris) #' levels(iris$Species) #' levels(ref_lvl(iris$Species, lvl = 3)) #' levels(ref_lvl(iris$Species, lvl = "versicolor")) #' @export ref_lvl <- function(x, ..., lvl = NULL) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) if (is.data.frame(x)) { # iterate variables of data frame for (i in colnames(.dat)) { x[[i]] <- ref_lvl_helper(.dat[[i]], value = lvl) } } else { x <- ref_lvl_helper(.dat, value = lvl) } x } ref_lvl_helper <- function(x, value) { # check correct arguments if (is.null(x)) { warning("`x` is NULL.", call. = FALSE) return(x) } if (!is.factor(x)) { warning("`x` needs to be a factor.", call. 
= FALSE) return(x) } if (!is_num_fac(x)) { return(stats::relevel(x, ref = value)) } if (is.numeric(value)) { # get values from factor vals <- as.numeric(levels(x)) } else { # get value labels, check if we have a label instead of number as lab.values <- sjlabelled::get_labels( x, attr.only = TRUE, values = "n", drop.na = TRUE ) vals <- as.numeric(names(lab.values)) value <- as.numeric(names(lab.values[lab.values == value])) } # check if ref-lvl exists in values if (!value %in% vals) { warning("`x` has no factor level indicated by the reference level `value`.", call. = FALSE) return(x) } # get value labels val.labs <- sjlabelled::get_labels(x) # get variable label var.lab <- sjlabelled::get_label(x) # find position of reference level refpos <- which(vals == value) # new order of factor levels, if reference level # is on first position neword <- c(vals[refpos], vals[-refpos]) # now recode variable. therefore, we need a string pattern # for the recoding rec.pattern <- paste0(sprintf("%i=%i;", neword, vals), collapse = "") # recode now x <- rec(x, rec = rec.pattern, as.num = FALSE, append = FALSE) # set back labels if (!is.null(var.lab) && !sjmisc::is_empty(var.lab)) { sjlabelled::set_label(x) <- var.lab } if (!is.null(val.labs)) { # we need "order" twice here, because "neword" refers to the actual # values of "x", so "neword" might have negative values, or zero. # so we first need the "order" function to have numeric values from # 1 to length(x) - and a second "order" call to get the correct order # of these values. x <- sjlabelled::set_labels(x, labels = val.labs[order(order(neword))]) } x } sjmisc/R/is_num_fac.R0000644000176200001440000000240614046746443014207 0ustar liggesusers#' @title Check whether a factor has numeric levels only #' @name is_num_fac #' @description \code{is_num_fac()} checks whether a factor has only numeric or #' any non-numeric factor levels, while \code{is_num_chr()} checks whether #' a character vector has only numeric strings. 
#' #' @param x A factor for \code{is_num_fac()} and a character vector for #' \code{is_num_chr()} #' #' @return Logical, \code{TRUE} if factor has numeric factor levels only, or #' if character vector has numeric strings only, \code{FALSE} otherwise. #' #' @examples #' # numeric factor levels #' f1 <- factor(c(NA, 1, 3, NA, 2, 4)) #' is_num_fac(f1) #' #' # not completeley numeric factor levels #' f2 <- factor(c(NA, "C", 1, 3, "A", NA, 2, 4)) #' is_num_fac(f2) #' #' # not completeley numeric factor levels #' f3 <- factor(c("Justus", "Bob", "Peter")) #' is_num_fac(f3) #' #' is_num_chr(c("a", "1")) #' is_num_chr(c("2", "1")) #' #' @export is_num_fac <- function(x) { # check if we have numeric levels is.factor(x) && !anyNA(suppressWarnings(as.numeric(levels(x)))) } #' @rdname is_num_fac #' @export is_num_chr <- function(x) { # check if we have numeric bvalues is.character(x) && !anyNA(suppressWarnings(as.numeric(stats::na.omit(x)))) } sjmisc/R/is_empty.R0000644000176200001440000001143614046746443013740 0ustar liggesusers#' @title Check whether string, list or vector is empty #' @name is_empty #' @description This function checks whether a string or character vector (of #' length 1), a list or any vector (numeric, atomic) is empty or not. #' #' #' @param x String, character vector, list, data.frame or numeric vector or factor. #' @param first.only Logical, if \code{FALSE} and \code{x} is a character #' vector, each element of \code{x} will be checked if empty. If #' \code{TRUE}, only the first element of \code{x} will be checked. #' @param all.na.empty Logical, if \code{x} is a vector with \code{NA}-values #' only, \code{is_empty} will return \code{FALSE} if \code{all.na.empty = FALSE}, #' and will return \code{TRUE} if \code{all.na.empty = TRUE} (default). #' @return Logical, \code{TRUE} if \code{x} is a character vector or string and #' is empty, \code{TRUE} if \code{x} is a vector or list and of length 0, #' \code{FALSE} otherwise. 
#' #' @note \code{NULL}- or \code{NA}-values are also considered as "empty" (see #' 'Examples') and will return \code{TRUE}, unless \code{all.na.empty==FALSE}. #' #' @examples #' is_empty("test") #' is_empty("") #' is_empty(NA) #' is_empty(NULL) #' #' # string is not empty #' is_empty(" ") #' #' # however, this trimmed string is #' is_empty(trim(" ")) #' #' # numeric vector #' x <- 1 #' is_empty(x) #' x <- x[-1] #' is_empty(x) #' #' # check multiple elements of character vectors #' is_empty(c("", "a")) #' is_empty(c("", "a"), first.only = FALSE) #' #' # empty data frame #' d <- data.frame() #' is_empty(d) #' #' # empty list #' is_empty(list(NULL)) #' #' # NA vector #' x <- rep(NA,5) #' is_empty(x) #' is_empty(x, all.na.empty = FALSE) #' @export is_empty <- function(x, first.only = TRUE, all.na.empty = TRUE) { # do we have a valid vector? if (!is.null(x)) { # if it's a character, check if we have only one element in that vector if (is.character(x)) { # characters may also be of length 0 if (length(x) == 0) return(TRUE) # else, check all elements of x zero_len <- nchar(x) == 0 # return result for multiple elements of character vector if (first.only) { zero_len <- .is_true(zero_len[1]) if (length(x) > 0) x <- x[1] } else { return(unname(zero_len)) } # we have a non-character vector here. check for length } else if (is.list(x)) { x <- purrr::compact(x) zero_len <- length(x) == 0 } else { zero_len <- length(x) == 0 } } any(is.null(x) || zero_len || (all.na.empty && all(is.na(x)))) } .is_true <- function(x) { is.logical(x) && length(x) == 1L && !is.na(x) && x } #' @title Return or remove variables or observations that are completely missing #' @name empty_cols #' #' @description These functions check which rows or columns of a data frame completely #' contain missing values, i.e. which observations or variables #' completely have missing values, and either 1) returns their #' indices; or 2) removes them from the data frame. #' #' #' @param x A data frame. 
#' #' @return For \code{empty_cols} and \code{empty_rows}, a numeric (named) vector #' with row or column indices of those variables that completely have #' missing values. #' \cr \cr #' For \code{remove_empty_cols} and \code{remove_empty_rows}, a #' data frame with "empty" columns or rows removed. #' #' @examples #' tmp <- data.frame(a = c(1, 2, 3, NA, 5), #' b = c(1, NA, 3, NA , 5), #' c = c(NA, NA, NA, NA, NA), #' d = c(1, NA, 3, NA, 5)) #' #' tmp #' #' empty_cols(tmp) #' empty_rows(tmp) #' #' remove_empty_cols(tmp) #' remove_empty_rows(tmp) #' #' @export empty_cols <- function(x) { if ((!is.matrix(x) && !is.data.frame(x)) || ncol(x) < 2) vector("numeric") else which(colSums(is.na(x)) == nrow(x)) } #' @rdname empty_cols #' @export empty_rows <- function(x) { if ((!is.matrix(x) && !is.data.frame(x)) || nrow(x) < 2) vector("numeric") else which(rowSums(is.na(x)) == ncol(x)) } #' @rdname empty_cols #' @export remove_empty_cols <- function(x) { # check if we have any empty columns at all ec <- empty_cols(x) # if yes, removing works, else an empty df would be returned if (!sjmisc::is_empty(ec)) dplyr::select(x, !! -ec) else x } #' @rdname empty_cols #' @export remove_empty_rows <- function(x) { # check if we have any empty rows at all er <- empty_rows(x) # if yes, removing works, else an empty df would be returned if (!sjmisc::is_empty(er)) dplyr::slice(x, !! -er) else x } sjmisc/R/rotate_df.R0000644000176200001440000000476114046746443014061 0ustar liggesusers#' @title Rotate a data frame #' @name rotate_df #' @description This function rotates a data frame, i.e. columns become rows #' and vice versa. #' #' @param x A data frame. #' @param rn Character vector (optional). If not \code{NULL}, the data frame's #' rownames will be added as (first) column to the output, with #' \code{rn} being the name of this column. #' @param cn Logical (optional), if \code{TRUE}, the values of the first column #' in \code{x} will be used as column names in the rotated data frame. 
#'
#' @return A (rotated) data frame.
#'
#' @examples
#' x <- mtcars[1:3, 1:4]
#' rotate_df(x)
#' rotate_df(x, rn = "property")
#'
#' # use values in 1. column as column name
#' rotate_df(x, cn = TRUE)
#' rotate_df(x, rn = "property", cn = TRUE)
#'
#' # also works on list-results
#' library(purrr)
#'
#' dat <- mtcars[1:3, 1:4]
#' tmp <- purrr::map(dat, function(x) {
#'   sdev <- stats::sd(x, na.rm = TRUE)
#'   ulsdev <- mean(x, na.rm = TRUE) + c(-sdev, sdev)
#'   names(ulsdev) <- c("lower_sd", "upper_sd")
#'   ulsdev
#' })
#' tmp
#' as.data.frame(tmp)
#' rotate_df(tmp)
#'
#' tmp <- purrr::map_df(dat, function(x) {
#'   sdev <- stats::sd(x, na.rm = TRUE)
#'   ulsdev <- mean(x, na.rm = TRUE) + c(-sdev, sdev)
#'   names(ulsdev) <- c("lower_sd", "upper_sd")
#'   ulsdev
#' })
#' tmp
#' rotate_df(tmp)
#' @export
rotate_df <- function(x, rn = NULL, cn = FALSE) {
  # when requested, take the rotated data frame's column names
  # from the first column of `x`, and drop that column
  if (cn) {
    new_names <- x[[1]]
    x <- dplyr::select(x, -1)
  } else {
    new_names <- NULL
  }

  # remember the input's attributes, to restore them later
  preserved <- attributes(x)

  # rotate the data frame by 90 degrees: columns become rows
  x <- as.data.frame(t(as.data.frame(x)))

  # set column names, if requested and of matching length
  if (!is.null(new_names)) {
    if (length(new_names) == ncol(x)) {
      colnames(x) <- new_names
    } else {
      warning("Length of provided column names does not match number of columns. No column names changed.", call. = FALSE)
    }
  }

  # add rownames as first column, if requested
  if (!is.null(rn)) {
    x <- rownames_as_column(x, var = rn)
  }

  # restore the remembered attributes, but keep those the rotated data
  # frame has already re-created (names, row.names, class, ...)
  preserved[names(preserved) %in% names(attributes(x))] <- NULL
  attributes(x) <- c(attributes(x), preserved)

  x
}

# ---- sjmisc/R/all_na.R ----
#' @title Check if vector only has NA values
#' @name all_na
#'
#' @description Check if all values in a vector are \code{NA}.
#'
#' @param x A vector or data frame.
#'
#' @return Logical, \code{TRUE} if \code{x} has only NA values, \code{FALSE} if
#'   \code{x} has at least one non-missing value.
#'
#' @examples
#' x <- c(NA, NA, NA)
#' y <- c(1, NA, NA)
#'
#' all_na(x)
#' all_na(y)
#' all_na(data.frame(x, y))
#' all_na(list(x, y))
#' @export
all_na <- function(x) {
  UseMethod("all_na")
}

#' @export
all_na.default <- function(x) {
  # TRUE when not a single value is non-missing
  all(is.na(x))
}

#' @export
all_na.data.frame <- function(x) {
  # one logical result per column, returned as a one-row data frame
  as.data.frame(lapply(x, function(column) all(is.na(column))))
}

#' @export
all_na.list <- function(x) {
  # one logical result per list element
  lapply(x, function(element) all(is.na(element)))
}

# ---- sjmisc/R/find_var.R ----
#' @title Find variable by name or label
#' @name find_var
#'
#' @description This functions finds variables in a data frame, which variable
#'   names or variable (and value) label attribute match a specific
#'   pattern. Regular expression for the pattern is supported.
#'
#' @param data A data frame.
#' @param pattern Character string to be matched in \code{data}. May also be a
#'   character vector of length > 1 (see 'Examples'). \code{pattern} is
#'   searched for in column names and variable label attributes of
#'   \code{data} (see \code{\link[sjlabelled]{get_label}}). \code{pattern}
#'   might also be a regular-expression object, as returned by \code{stringr::regex()}.
#'   Alternatively, use \code{regex = TRUE} to treat \code{pattern} as a regular
#'   expression rather than a fixed string.
#' @param ignore.case Logical, whether matching should be case sensitive or not.
#'   \code{ignore.case} is ignored when \code{pattern} is no regular expression or
#'   \code{regex = FALSE}.
#' @param search Character string, indicating where \code{pattern} is sought.
#'   Use one of following options:
#'   \describe{
#'     \item{\code{"name_label"}}{The default, searches for \code{pattern} in
#'       variable names and variable labels.}
#'     \item{\code{"name_value"}}{Searches for \code{pattern} in
#'       variable names and value labels.}
#'     \item{\code{"label_value"}}{Searches for \code{pattern} in
#'       variable and value labels.}
#'     \item{\code{"name"}}{Searches for \code{pattern} in
#'       variable names.}
#'     \item{\code{"label"}}{Searches for \code{pattern} in
#'       variable labels}
#'     \item{\code{"value"}}{Searches for \code{pattern} in
#'       value labels.}
#'     \item{\code{"all"}}{Searches for \code{pattern} in
#'       variable names, variable and value labels.}
#'   }
#' @param out Output (return) format of the search results. May be abbreviated
#'   and must be one of:
#'   \describe{
#'     \item{\code{"table"}}{A tabular overview (as data frame) with
#'       column indices, variable names and labels of matching variables.}
#'     \item{\code{"df"}}{A data frame with all matching variables.}
#'     \item{\code{"index"}}{A named vector with column indices of all matching
#'       variables.}
#'   }
#' @param fuzzy Logical, if \code{TRUE}, "fuzzy matching" (partial and
#'   close distance matching) will be used to find \code{pattern}
#'   in \code{data} if no exact match was found.
#' @param regex Logical, if \code{TRUE}, \code{pattern} is treated as a regular
#'   expression rather than a fixed string.
#'
#' @return By default (i.e. \code{out = "table"}, returns a data frame with three
#'   columns: column number, variable name and variable label. If
#'   \code{out = "index"}, returns a named vector with column indices
#'   of matching variables (variable names are used as names-attribute);
#'   if \code{out = "df"}, returns the matching variables as data frame
#'
#' @details This function searches for \code{pattern} in \code{data}'s column names
#'   and - for labelled data - in all variable and value labels of \code{data}'s
#'   variables (see \code{\link[sjlabelled]{get_label}} for details on variable labels and
#'   labelled data). Regular expressions are supported as well, by simply using
#'   \code{pattern = stringr::regex(...)} or \code{regex = TRUE}.
#'
#' @examples
#' data(efc)
#'
#' # find variables with "cop" in variable name
#' find_var(efc, "cop")
#'
#' # return data frame with matching variables
#' find_var(efc, "cop", out = "df")
#'
#' # or return column numbers
#' find_var(efc, "cop", out = "index")
#'
#' # find variables with "dependency" in names and variable labels
#' library(sjlabelled)
#' find_var(efc, "dependency")
#' get_label(efc$e42dep)
#'
#' # find variables with "level" in names and value labels
#' res <- find_var(efc, "level", search = "name_value", out = "df")
#' res
#' get_labels(res, attr.only = FALSE)
#'
#' # use sjPlot::view_df() to view results
#' \dontrun{
#' library(sjPlot)
#' view_df(res)}
#' @export
find_var <- function(data,
                     pattern,
                     ignore.case = TRUE,
                     search = c("name_label", "name_value", "label_value", "name", "label", "value", "all"),
                     out = c("table", "df", "index"),
                     fuzzy = FALSE,
                     regex = FALSE) {
  # `data` must be a data frame, otherwise columns/labels cannot be resolved
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }

  search <- match.arg(search)
  out <- match.arg(out)

  # `regex = TRUE` is a shortcut for passing a stringr::regex()-classed pattern
  if (regex) class(pattern) <- c("regex", class(pattern))

  fixed <- !inherits(pattern, "regex")
  # grepl() warns when ignore.case is combined with fixed = TRUE, so drop it
  if (.is_true(fixed)) ignore.case <- FALSE

  in_names <- in_var_labels <- in_val_labels <- c()

  # --- match pattern against variable (column) names ---
  if (search %in% c("name", "name_label", "name_value", "all")) {
    in_names <- which(grepl(pattern = pattern, x = colnames(data), ignore.case = ignore.case, fixed = fixed))
    # if nothing found, fall back to close-distance matching
    if (sjmisc::is_empty(in_names) && fuzzy && !inherits(pattern, "regex")) {
      in_names <- fuzzy_grep(x = colnames(data), pattern = pattern)
    }
  }

  # --- match pattern against variable labels ---
  if (search %in% c("label", "name_label", "label_value", "all")) {
    var_labels <- sjlabelled::get_label(data)
    in_var_labels <- which(grepl(pattern, x = var_labels, ignore.case = ignore.case, fixed = fixed))
    # if nothing found, fall back to close-distance matching
    if (sjmisc::is_empty(in_var_labels) && fuzzy && !inherits(pattern, "regex")) {
      in_var_labels <- fuzzy_grep(x = var_labels, pattern = pattern)
    }
  }

  # --- match pattern against value labels ---
  if (search %in% c("value", "name_value", "label_value", "all")) {
    val_labels <- sjlabelled::get_labels(data, attr.only = FALSE)
    in_val_labels <- which(sapply(
      val_labels,
      function(.x) any(grepl(pattern, x = .x, ignore.case = ignore.case, fixed = fixed)),
      simplify = TRUE
    ))
    # if nothing found, fall back to close-distance matching
    if (sjmisc::is_empty(in_val_labels) && fuzzy && !inherits(pattern, "regex")) {
      in_val_labels <- which(sapply(
        val_labels,
        function(.x) {
          hit <- fuzzy_grep(x = .x, pattern = pattern)
          !sjmisc::is_empty(hit[1])
        },
        simplify = TRUE
      ))
    }
  }

  # collect unique column indices; fuzzy_grep() may return -1 for "no match"
  pos <- unique(c(in_names, in_var_labels, in_val_labels))
  pos <- pos[which(pos != -1)]

  # return matching variables as data frame?
  if (out == "df") {
    return(data[, pos, drop = FALSE])
  }

  # return tabular overview with names and labels?
  if (out == "table") {
    return(data_frame(
      col.nr = pos,
      var.name = colnames(data)[pos],
      var.label = sjlabelled::get_label(data[, pos, drop = FALSE], def.value = colnames(data)[pos])
    ))
  }

  # out = "index": named vector of column positions
  names(pos) <- colnames(data)[pos]
  pos
}

#' @rdname find_var
#' @export
find_in_data <- find_var

# ---- sjmisc/R/dicho.R ----
#' @title Dichotomize variables
#' @name dicho
#'
#' @description Dichotomizes variables into dummy variables (0/1). Dichotomization is
#'   either done by median, mean or a specific value (see \code{dich.by}).
#'   \code{dicho_if()} is a scoped variant of \code{dicho()}, where recoding
#'   will be applied only to those variables that match the logical condition
#'   of \code{predicate}.
#'
#' @param dich.by Indicates the split criterion where a variable is dichotomized.
#'   Must be one of the following values (may be abbreviated):
#'   \describe{
#'     \item{\code{"median"} or \code{"md"}}{by default, \code{x} is split into two groups at the median.}
#'     \item{\code{"mean"} or \code{"m"}}{splits \code{x} into two groups at the mean of \code{x}.}
#'     \item{numeric value}{splits \code{x} into two groups at the specific value. Note that the value is inclusive, i.e. \code{dich.by = 10} will split \code{x} into one group with values from lowest to 10 and another group with values greater than 10.}
#'   }
#' @param val.labels Optional character vector (of length two), to set value label
#'   attributes of dichotomized variable (see \code{\link[sjlabelled]{set_labels}}).
#'   If \code{NULL} (default), no value labels will be set.
#'
#' @inheritParams to_dummy
#' @inheritParams rec
#'
#' @return \code{x}, dichotomized. If \code{x} is a data frame,
#'   for \code{append = TRUE}, \code{x} including the dichotomized variables
#'   as new columns is returned; if \code{append = FALSE}, only
#'   the dichotomized variables will be returned.
If \code{append = TRUE} and #' \code{suffix = ""}, recoded variables will replace (overwrite) existing #' variables. #' #' @note Variable label attributes are preserved (unless changed via #' \code{var.label}-argument). #' #' @details \code{dicho()} also works on grouped data frames (see \code{\link[dplyr]{group_by}}). #' In this case, dichotomization is applied to the subsets of variables #' in \code{x}. See 'Examples'. #' #' @examples #' data(efc) #' summary(efc$c12hour) #' # split at median #' table(dicho(efc$c12hour)) #' # split at mean #' table(dicho(efc$c12hour, dich.by = "mean")) #' # split between value lowest to 30, and above 30 #' table(dicho(efc$c12hour, dich.by = 30)) #' #' # sample data frame, values from 1-4 #' head(efc[, 6:10]) #' #' # dichtomized values (1 to 2 = 0, 3 to 4 = 1) #' library(dplyr) #' efc %>% #' select(6:10) %>% #' dicho(dich.by = 2) %>% #' head() #' #' # dichtomize several variables in a data frame #' dicho(efc, c12hour, e17age, c160age, append = FALSE) #' #' # dichotomize and set labels #' frq(dicho( #' efc, e42dep, #' var.label = "Dependency (dichotomized)", #' val.labels = c("lower", "higher"), #' append = FALSE #' )) #' #' # works also with gouped data frames #' mtcars %>% #' dicho(disp, append = FALSE) %>% #' table() #' #' mtcars %>% #' group_by(cyl) %>% #' dicho(disp, append = FALSE) %>% #' table() #' #' # dichotomizing grouped data frames leads to different #' # results for a dichotomized variable, because the split #' # value is different for each group. 
#' # compare: #' mtcars %>% #' group_by(cyl) %>% #' summarise(median = median(disp)) #' #' median(mtcars$disp) #' #' # dichotomize only variables with more than 10 unique values #' p <- function(x) dplyr::n_distinct(x) > 10 #' dicho_if(efc, predicate = p, append = FALSE) #' @export dicho <- function(x, ..., dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d") { UseMethod("dicho") } #' @export dicho.default <- function(x, ..., dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d") { # check for correct dichotome types if (!is.numeric(dich.by) && !dich.by %in% c("median", "mean", "md", "m")) { stop("argument `dich.by` must either be `median`, `mean` or a numerical value." , call. = FALSE) } # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) recode_fun( x = x, .dat = .dat, fun = get("dicho_helper", asNamespace("sjmisc")), suffix = suffix, append = append, dich.by = dich.by, as.num = as.num, var.label = var.label, val.labels = val.labels ) } #' @export dicho.mids <- function(x, ..., dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d") { vars <- dplyr::quos(...) ndf <- prepare_mids_recode(x) # select variable and compute rowsums. add this variable # to each imputed ndf$data <- purrr::map( ndf$data, function(.x) { dat <- dplyr::select(.x, !!! 
vars) dplyr::bind_cols( .x, recode_fun( x = dat, .dat = dat, fun = get("dicho_helper", asNamespace("sjmisc")), suffix = suffix, append = FALSE, dich.by = dich.by, as.num = as.num, var.label = var.label, val.labels = val.labels )) } ) final_mids_recode(ndf) } #' @rdname dicho #' @export dicho_if <- function(x, predicate, dich.by = "median", as.num = FALSE, var.label = NULL, val.labels = NULL, append = TRUE, suffix = "_d") { # check for correct dichotome types if (!is.numeric(dich.by) && !dich.by %in% c("median", "mean", "md", "m")) { stop("argument `dich.by` must either be `median`, `mean` or a numerical value." , call. = FALSE) } # select variables that match logical conditions .dat <- dplyr::select_if(x, .predicate = predicate) # if no variable matches the condition specified # in predicate, return original data if (sjmisc::is_empty(.dat)) { if (append) return(x) else return(.dat) } recode_fun( x = x, .dat = .dat, fun = get("dicho_helper", asNamespace("sjmisc")), suffix = suffix, append = append, dich.by = dich.by, as.num = as.num, var.label = var.label, val.labels = val.labels ) } dicho_helper <- function(x, dich.by, as.num, var.label, val.labels) { # do we have labels? if not, try to # automatically get variable labels if (is.null(var.label)) varlab <- sjlabelled::get_label(x) else varlab <- var.label # check if factor. 
factors need conversion # to numeric before dichtomizing if (is.factor(x)) { # non-numeric-factor cannot be converted if (is_num_fac(x)) { # try to convert to numeric x <- as.numeric(as.character(x)) } else { # convert non-numeric factor to numeric # factor levels are replaced by numeric values x <- sjlabelled::as_numeric(x, keep.labels = FALSE) message("Trying to dichotomize non-numeric factor.") } } # split at specific value if (is.numeric(dich.by)) { x <- ifelse(x <= dich.by, 0, 1) } else if (dich.by == "median" || dich.by == "md") { x <- ifelse(x <= stats::median(x, na.rm = TRUE), 0, 1) # split at mean } else if (dich.by == "mean" || dich.by == "m") { x <- ifelse(x <= mean(x, na.rm = TRUE), 0, 1) } if (!as.num) x <- as.factor(x) # set back variable labels if (!is.null(varlab)) x <- suppressWarnings(sjlabelled::set_label(x, label = varlab)) # set value labels if (!is.null(val.labels)) x <- suppressWarnings(sjlabelled::set_labels(x, labels = val.labels)) x } recode_fun <- function(x, .dat, fun, suffix, append, ...) { if (is.data.frame(x)) { # remember original data, if user wants to bind columns orix <- x # do we have a grouped data frame? if (inherits(.dat, "grouped_df")) { # get grouping indices and variables grps <- dplyr::group_indices(.dat) grp.vars <- dplyr::group_vars(.dat) # names of grouping variables vars <- colnames(.dat)[colnames(.dat) %nin% grp.vars] .dat <- as.data.frame(.dat) # iterate all groups for (i in unique(grps)) { # slice cases for each group keep <- which(grps == i) group <- dplyr::slice(.dat, !! keep) # now iterate all variables of interest for (j in vars) { group[[j]] <- fun(x = group[[j]], ...) } # write back data .dat[keep, ] <- group } # remove grouping column x <- .dat[colnames(.dat) %nin% grp.vars] } else { # iterate variables of data frame for (i in colnames(.dat)) { x[[i]] <- fun(x = .dat[[i]], ...) 
} # select only recoded variables x <- x[colnames(.dat)] } # add suffix to recoded variables and combine data x <- append_columns(x, orix, suffix, append) } else { x <- fun(x = .dat, ...) } x } append_columns <- function(x, orix, suffix, append) { append.replace <- !is.null(suffix) && sjmisc::is_empty(suffix) # add suffix to recoded variables? if (!is.null(suffix) && !sjmisc::is_empty(suffix)) { colnames(x) <- sprintf("%s%s", colnames(x), suffix) } # combine data if (append) { if (append.replace) x <- add_columns(x, orix, replace = TRUE) else x <- dplyr::bind_cols(orix, x) } x } sjmisc/R/rec_pattern.R0000644000176200001440000000454413451124270014402 0ustar liggesusers#' @title Create recode pattern for 'rec' function #' @name rec_pattern #' #' @description Convenient function to create a recode pattern for the #' \code{\link{rec}} function, which recodes (numeric) #' vectors into smaller groups. #' #' @param from Minimum value that should be recoded. #' @param to Maximum value that should be recoded. #' @param width Numeric, indicating the range of each group. #' @param other String token, indicating how to deal with all other values #' that have not been captured by the recode pattern. See 'Details' #' on the \code{else}-token in \code{\link{rec}}. #' @return A list with two values: #' \describe{ #' \item{\code{pattern}}{string pattern that can be used as \code{rec} argument for the \code{\link{rec}}-function.} #' \item{\code{labels}}{the associated values labels that can be used with \code{\link[sjlabelled]{set_labels}}.} #' } #' #' @seealso \code{\link{group_var}} for recoding variables into smaller groups, and #' \code{\link{group_labels}} to create the asssociated value labels. 
#'
#' @examples
#' rp <- rec_pattern(1, 100)
#' rp
#'
#' # sample data, inspect age of carers
#' data(efc)
#' table(efc$c160age, exclude = NULL)
#' table(rec(efc$c160age, rec = rp$pattern), exclude = NULL)
#'
#' # recode carers age into groups of width 5
#' x <- rec(
#'   efc$c160age,
#'   rec = rp$pattern,
#'   val.labels = rp$labels
#' )
#' # watch result
#' frq(x)
#'
#' @export
rec_pattern <- function(from, to, width = 5, other = NULL) {
  # init variables
  rec.pat <- c()
  rec.labels <- c()

  # create sequence of recode-groups; one extra step beyond `to`,
  # so the last group always covers `to`
  values <- seq(from, to + width, by = width)

  # create pattern "lower:upper=group;" for each group
  for (i in seq_len(length(values) - 1)) {
    # NOTE: this previously used `paste0(..., sep = ";")` - but paste0() has
    # no `sep` argument, so ";" was silently pasted as just another element.
    # Append the separator explicitly instead (same result, no reliance on
    # that quirk).
    rec.pat <- paste0(rec.pat, sprintf("%i:%i=%i", values[i], values[i + 1] - 1, i), ";")
    # also create associated labels
    rec.labels <- c(rec.labels, sprintf("%i-%i", values[i], values[i + 1] - 1))
  }

  # do we have an "else"-token? if so, append it to the pattern
  if (!is.null(other) && !sjmisc::is_empty(other))
    rec.pat <- paste0(rec.pat, "else=", other)

  # name labels with their group number
  names(rec.labels) <- seq_len(length(values) - 1)

  # return results
  list(pattern = rec.pat, labels = rec.labels)
}

# ---- sjmisc/R/recode_mids_fun.R ----

# Convert a mids-object (from mice) into a long data frame that is grouped
# and nested by imputation number, so recode-functions can be applied to
# each imputed data set separately.
prepare_mids_recode <- function(x) {
  # check if suggested package is available
  if (!requireNamespace("mice", quietly = TRUE))
    stop("Package `mice` needed for this function to work. Please install it.", call. = FALSE)

  # check classes
  if (!inherits(x, "mids"))
    stop("`x` must be a `mids`-object, as returned by the `mice()`-function.", call. = FALSE)

  # convert mids into long-data.frame
  long <- mice::complete(x, action = "long", include = TRUE)

  # group by imputation, so we can easily iterate each imputed dataset
  long %>%
    dplyr::group_by(.data$.imp) %>%
    .nest()
}

# Re-assemble the nested/recoded data into a mids-object.
final_mids_recode <- function(x) {
  # check if suggested package is available
  if (!requireNamespace("mice", quietly = TRUE))
    stop("Package `mice` needed for this function to work. Please install it.", call. = FALSE)

  # return mids-object.
need to use "as.data.frame()", # because "as.mids()" can't cope with tibbles x %>% .unnest() %>% as.data.frame() %>% mice::as.mids() } sjmisc/R/count_na.R0000644000176200001440000000644214046746443013716 0ustar liggesusers#' @title Frequency table of tagged NA values #' @name count_na #' #' @description This method counts tagged NA values (see \code{\link[haven]{tagged_na}}) #' in a vector and prints a frequency table of counted tagged NAs. #' #' @inheritParams to_dummy #' #' @return A data frame with counted tagged NA values. #' #' @examples #' if (require("haven")) { #' x <- labelled( #' x = c(1:3, tagged_na("a", "c", "z"), #' 4:1, tagged_na("a", "a", "c"), #' 1:3, tagged_na("z", "c", "c"), #' 1:4, tagged_na("a", "c", "z")), #' labels = c("Agreement" = 1, "Disagreement" = 4, #' "First" = tagged_na("c"), "Refused" = tagged_na("a"), #' "Not home" = tagged_na("z")) #' ) #' count_na(x) #' #' y <- labelled( #' x = c(1:3, tagged_na("e", "d", "f"), #' 4:1, tagged_na("f", "f", "d"), #' 1:3, tagged_na("f", "d", "d"), #' 1:4, tagged_na("f", "d", "f")), #' labels = c("Agreement" = 1, "Disagreement" = 4, "An E" = tagged_na("e"), #' "A D" = tagged_na("d"), "The eff" = tagged_na("f")) #' ) #' #' # create data frame #' dat <- data.frame(x, y) #' #' # possible count()-function calls #' count_na(dat) #' count_na(dat$x) #' count_na(dat, x) #' count_na(dat, x, y) #' } #' @export count_na <- function(x, ...) 
{ # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) # return values dataframes <- list() if (is.data.frame(x)) { # iterate variables of data frame for (i in colnames(.dat)) { # print freq dummy <- count_na_helper(.dat[[i]], cn = i) cat(sprintf("# %s\n\n", sjlabelled::get_label(.dat[[i]], def.value = i))) print(dummy) cat("\n\n") # save data frame for return value dataframes[[length(dataframes) + 1]] <- dummy } # return list invisible(dataframes) } else { # get counts dummy <- count_na_helper(.dat, cn = names(.dat)) # check if we have variable label and print, if yes vl <- sjlabelled::get_label(.dat) if (!is.null(vl)) cat(sprintf("# %s\n\n", vl)) # print count table print(dummy) cat("\n") # return data frame invisible(dummy) } } count_na_helper <- function(x, cn) { if (!requireNamespace("haven", quietly = TRUE)) { stop("Package `haven` needed for this function to work. Please install it.", call. = FALSE) } # check if x has any tagged NA values if (sum(haven::is_tagged_na(x)) < 1) { message("`x` has no tagged NA values.") return(NULL) } # get NA as tagged NA nav <- haven::na_tag(sjlabelled::get_na(x, as.tag = FALSE)) nav.labels <- names(sjlabelled::get_na(x, as.tag = TRUE)) # get values from x, including different NA tags values <- haven::na_tag(x) # only keep missing values values <- values[values %in% nav] # replace NA tag with label for (i in seq_len(length(nav))) { values[values == nav[i]] <- nav.labels[i] } # now compute frequency, and return a proper data frame frq_helper(values, sort.frq = "none", weight.by = NULL, cn = cn, auto.grp = NULL) %>% dplyr::mutate(label = .data$val) %>% dplyr::select(-.data$val) %>% dplyr::filter(.data$label != "NA") } sjmisc/R/zzz.R0000644000176200001440000000075213542342535012735 0ustar liggesusers.onAttach <- function(libname, pkgname) { if (format(Sys.time(), "%m%d") == "0504") { packageStartupMessage("May the fourth be with you!") } else if (stats::runif(1) > .9) { packageStartupMessage("Learn more 
about sjmisc with 'browseVignettes(\"sjmisc\")'.") } else if (stats::runif(1) > .9) { packageStartupMessage("Install package \"strengejacke\" from GitHub (`devtools::install_github(\"strengejacke/strengejacke\")`) to load all sj-packages at once!") } } sjmisc/R/seq_col.R0000644000176200001440000000154314046746443013532 0ustar liggesusers#' @title Sequence generation for column or row counts of data frames #' @name seq_col #' #' @description \code{seq_col(x)} is a convenient wrapper for \code{seq_len(ncol(x))}, #' while \code{seq_row(x)} is a convenient wrapper for \code{seq_len(nrow(x))}. #' #' @param x A data frame. #' #' @return A numeric sequence from 1 to number of columns or rows. #' #' @examples #' data(iris) #' seq_col(iris) #' seq_row(iris) #' #' @export seq_col <- function(x) { if (!is.matrix(x) && !is.data.frame(x)) { warning("`x` needs to be a matrix or data frame.", call. = FALSE) return(NULL) } seq_len(ncol(x)) } #' @rdname seq_col #' @export seq_row <- function(x) { if (!is.matrix(x) && !is.data.frame(x)) { warning("`x` needs to be a matrix or data frame.", call. = FALSE) return(NULL) } seq_len(nrow(x)) } sjmisc/R/remove_vars.R0000644000176200001440000000171214153357271014427 0ustar liggesusers#' @title Remove variables from a data frame #' @name remove_var #' #' @description This function removes variables from a data frame, and is #' intended to use within a pipe-workflow. \code{remove_cols()} is an #' alias for \code{remove_var()}. #' #' @param ... Character vector with variable names, or unquoted names #' of variables that should be removed from the data frame. #' You may also use functions like \code{:} or tidyselect's #' select-helpers. #' #' @inheritParams to_dummy #' #' @return \code{x}, with variables specified in \code{...} removed. #' #' @examples #' mtcars %>% remove_var("disp", "cyl") #' mtcars %>% remove_var(c("wt", "vs")) #' mtcars %>% remove_var(drat:am) #' @export remove_var <- function(x, ...) 
{ vars_to_remove <- tidyselect::vars_select(colnames(x), ...) x[colnames(x) %nin% vars_to_remove] } #' @rdname remove_var #' @export remove_cols <- function(x, ...) { remove_var(x, ...) } sjmisc/R/big_mark.R0000644000176200001440000000432414046746443013660 0ustar liggesusers#' @title Format numbers #' @name big_mark #' #' @description \code{big_mark()} formats large numbers with big marks, while #' \code{prcn()} converts a numeric scalar between 0 and 1 into a character #' vector, representing the percentage-value. #' #' @param x A vector or data frame. All numeric inputs (including numeric character) #' vectors) will be prettified. For \code{prcn()}, a number between #' 0 and 1, or a vector or data frame with such numbers. #' @param big.mark Character, used as mark between every 3 decimals before the decimal point. #' @param ... Other arguments passed down to the \code{\link{prettyNum}}-function. #' #' @return For \code{big_mark()}, a prettified \code{x} as character, with big marks. #' For \code{prcn}, a character vector with a percentage number. #' #' @examples #' # simple big mark #' big_mark(1234567) #' #' # big marks for several values at once, mixed numeric and character #' big_mark(c(1234567, "55443322")) #' #' # pre-defined width of character output #' big_mark(c(1234567, 55443322), width = 15) #' #' # convert numbers into percentage, as character #' prcn(0.2389) #' prcn(c(0.2143887, 0.55443, 0.12345)) #' #' dat <- data.frame( #' a = c(.321, .121, .64543), #' b = c("a", "b", "c"), #' c = c(.435, .54352, .234432) #' ) #' prcn(dat) #' @export big_mark <- function(x, big.mark = ",", ...) { UseMethod("big_mark") } #' @export big_mark.data.frame <- function(x, big.mark = ",", ...) { as.data.frame(lapply(x, FUN = big_mark_helper, big.mark, ...)) } #' @export big_mark.list <- function(x, big.mark = ",", ...) { lapply(x, FUN = big_mark_helper, big.mark, ...) } #' @export big_mark.default <- function(x, big.mark = ",", ...) { big_mark_helper(x, big.mark, ...) 
} big_mark_helper <- function(x, big.mark, ...) { prettyNum(x, big.mark = big.mark, ...) } #' @export #' @rdname big_mark prcn <- function(x) { UseMethod("prcn") } #' @export prcn.default <- function(x) sprintf("%.2f%%", round(x * 100, 2)) #' @export prcn.data.frame <- function(x) { as.data.frame( purrr::map_if(x, is.numeric, ~ sprintf("%.2f%%", round(.x * 100, 2))), stringsAsFactors = FALSE ) } sjmisc/R/descr.R0000644000176200001440000002102714046746443013204 0ustar liggesusers#' @title Basic descriptive statistics #' @name descr #' #' @description This function prints a basic descriptive statistic, including #' variable labels. #' #' @param x A vector or a data frame. May also be a grouped data frame #' (see 'Note' and 'Examples'). #' @param max.length Numeric, indicating the maximum length of variable labels #' in the output. If variable names are longer than \code{max.length}, #' they will be shortened to the last whole word within the first #' \code{max.length} chars. #' @param show Character vector, indicating which information (columns) that describe #' the data should be returned. May be one or more of \code{"type", "label", "n", #' "NA.prc", "mean", "sd", "se", "md", "trimmed", "range", "iqr", "skew"}. There are #' two shortcuts: \code{show = "all"} (default) shows all information, #' \code{show = "short"} just shows n, missing percentage, mean and standard #' deviation. #' @param out Character vector, indicating whether the results should be printed #' to console (\code{out = "txt"}) or as HTML-table in the viewer-pane #' (\code{out = "viewer"}) or browser (\code{out = "browser"}). #' #' @inheritParams to_dummy #' @inheritParams frq #' #' @return A data frame with basic descriptive statistics. #' #' @note \code{data} may also be a grouped data frame (see \code{\link[dplyr]{group_by}}) #' with up to two grouping variables. Descriptive tables are created for each #' subgroup then. 
#' #' @examples #' data(efc) #' descr(efc, e17age, c160age) #' #' efc$weights <- abs(rnorm(nrow(efc), 1, .3)) #' descr(efc, c12hour, barthtot, weights = weights) #' #' library(dplyr) #' efc %>% select(e42dep, e15relat, c172code) %>% descr() #' #' # show just a few elements #' efc %>% select(e42dep, e15relat, c172code) %>% descr(show = "short") #' #' # with grouped data frames #' efc %>% #' group_by(e16sex) %>% #' select(e16sex, e42dep, e15relat, c172code) %>% #' descr() #' #' # you can select variables also inside 'descr()' #' efc %>% #' group_by(e16sex, c172code) %>% #' descr(e16sex, c172code, e17age, c160age) #' #' # or even use select-helpers #' descr(efc, contains("cop"), max.length = 20) #' @importFrom rlang .data #' @export descr <- function(x, ..., max.length = NULL, weights = NULL, show = "all", out = c("txt", "viewer", "browser"), encoding = "UTF-8", file = NULL) { out <- match.arg(out) if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) { message("Package `sjPlot` needs to be loaded to print HTML tables.") out <- "txt" } # select elements that should be shown if ("all" %in% show) show <- c("type", "label", "n", "NA.prc", "mean", "sd", "se", "md", "trimmed", "range", "iqr", "skew") else if ("short" %in% show) show <- c("n", "NA.prc", "mean", "sd") show <- c("var", show) # get dot data dd <- get_dot_data(x, dplyr::quos(...)) w.name <- deparse(substitute(weights)) if (w.name != "NULL") { w.name <- gsub("\"", "", w.name, fixed = FALSE) if (!is.null(x[[w.name]])) { dd[[w.name]] <- NULL dd$.weights <- x[[w.name]] } else { dd$.weights <- eval(substitute(weights)) } } else { w.name <- NULL } # return values dataframes <- list() # do we have a grouped data frame? 
if (inherits(dd, "grouped_df")) { # get grouped data grps <- get_grouped_data(dd) # now plot everything for (i in seq_len(nrow(grps))) { # copy back labels to grouped data frame tmp <- sjlabelled::copy_labels(grps$data[[i]], dd) dummy <- descr_helper(tmp, max.length)[, show] attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = ", ", long = FALSE) # save data frame for return value dataframes[[length(dataframes) + 1]] <- dummy } # add class-attr for print-method() if (out == "txt") class(dataframes) <- c("sjmisc_grpdescr", "list") else class(dataframes) <- c("sjt_grpdescr", "list") } else { dataframes <- descr_helper(dd, max.length) dataframes <- dataframes[, intersect(colnames(dataframes), show)] # add class-attr for print-method() if (out == "txt") class(dataframes) <- c("sjmisc_descr", class(dataframes)) else class(dataframes) <- c("sjt_descr", class(dataframes)) } # save how to print output attr(dataframes, "print") <- out attr(dataframes, "encoding") <- encoding attr(dataframes, "file") <- file dataframes } descr_helper <- function(dd, max.length) { # check if we have a single vector, because purrr would return # a result for each *value*, instead one result for the complete vector if (!is.data.frame(dd)) dd <- as.data.frame(dd) if (obj_has_name(dd, ".weights")) { weights <- dd$.weights dd <- dplyr::select(dd, -.data$.weights) } else { weights <- NULL } ff <- function(x) is.numeric(x) | is.factor(x) dd <- dplyr::select_if(dd, ff) # get default variable name var.name <- colnames(dd) if (is.null(var.name)) var.name <- NA type <- var_type(dd) label <- unname(sjlabelled::get_label(dd, def.value = var.name)) dd <- to_value(dd, keep.labels = FALSE) if (is.null(weights)) { x <- suppressWarnings( dd %>% dplyr::select_if(is.numeric) %>% .gather(key = "var", value = "val") %>% dplyr::group_by(.data$var) %>% dplyr::summarise_all( dplyr::funs( n = length(stats::na.omit(.data$val)), NA.prc = 100 * sum(is.na(.data$val)) / length(.data$val), mean = mean(.data$val, 
na.rm = TRUE), sd = stats::sd(.data$val, na.rm = TRUE), se = sqrt(stats::var(.data$val, na.rm = TRUE) / length(stats::na.omit(.data$val))), md = stats::median(.data$val, na.rm = TRUE), trimmed = mean(.data$val, na.rm = TRUE, trim = .1), range = sprintf( "%s (%s-%s)", as.character(round(diff(range(.data$val, na.rm = TRUE)), 2)), as.character(round(min(.data$val, na.rm = TRUE), 2)), as.character(round(max(.data$val, na.rm = TRUE), 2)) ), iqr = stats::IQR(.data$val, na.rm = TRUE), skew = sjmisc.skew(.data$val) )) ) %>% as.data.frame() } else { dd$.weights <- weights tmp <- dplyr::select_if(dd, is.numeric) tmp <- .gather(tmp, key = "var", value = "val", setdiff(colnames(tmp), ".weights")) x <- suppressWarnings( tmp %>% dplyr::group_by(.data$var) %>% dplyr::summarise( n = round(sum(.data$.weights[!is.na(.data$val)], na.rm = TRUE)), NA.prc = 100 * sum(is.na(.data$val)) / length(.data$val), mean = stats::weighted.mean(.data$val, w = .data$.weights, na.rm = TRUE), sd = wtd_sd_helper(.data$val, weights = .data$.weights), se = wtd_se_helper(.data$val, weights = .data$.weights), range = sprintf( "%s (%s-%s)", as.character(round(diff(range(.data$val, na.rm = TRUE)), 2)), as.character(round(min(.data$val, na.rm = TRUE), 2)), as.character(round(max(.data$val, na.rm = TRUE), 2)) ), iqr = stats::IQR(.data$val, na.rm = TRUE), skew = sjmisc.skew(.data$val) ) ) %>% as.data.frame() } # summarise_all() sorts variables, so restore order x <- x[match(var.name, x$var), ] %>% add_variables(type, label, .after = 1) # check if labels should be truncated x$label <- shorten_string(x$label, max.length) if (!is.null(weights)) attr(x, "weights") <- "TRUE" x } wtd_se_helper <- function(x, weights) { sqrt(wtd_var(x, weights) / length(stats::na.omit(x))) } wtd_sd_helper <- function(x, weights = NULL) { sqrt(wtd_var(x, weights)) } wtd_var <- function(x, w) { if (is.null(w)) w <- rep(1, length(x)) x[is.na(w)] <- NA w[is.na(x)] <- NA w <- stats::na.omit(w) x <- stats::na.omit(x) xbar <- sum(w * x) / 
sum(w) sum(w * ((x - xbar)^2)) / (sum(w) - 1) } sjmisc.skew <- function(x) { if (any(ina <- is.na(x))) x <- x[!ina] n <- length(x) x <- x - mean(x) if (n < 3) return(NA) # type-1 skewness out <- (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5 # type-2 skewness out * sqrt(n * (n - 1)) / (n - 2) } sjmisc/R/str_start_end.R0000644000176200001440000000500514046746443014755 0ustar liggesusers#' @title Find start and end index of pattern in string #' @name str_start #' @description \code{str_start()} finds the beginning position of \code{pattern} #' in each element of \code{x}, while \code{str_end()} finds the stopping position #' of \code{pattern} in each element of \code{x}. #' #' @param x A character vector. #' @param pattern Character string to be matched in \code{x}. \code{pattern} might also #' be a regular-expression object, as returned by \code{stringr::regex()}. #' Alternatively, use \code{regex = TRUE} to treat \code{pattern} as a regular #' expression rather than a fixed string. #' #' @inheritParams find_var #' #' @return A numeric vector with index of start/end position(s) of \code{pattern} #' found in \code{x}, or \code{-1}, if \code{pattern} was not found #' in \code{x}. 
#' #' @examples #' path <- "this/is/my/fileofinterest.csv" #' str_start(path, "/") #' #' path <- "this//is//my//fileofinterest.csv" #' str_start(path, "//") #' str_end(path, "//") #' #' x <- c("my_friend_likes me", "your_friend likes_you") #' str_start(x, "_") #' #' # pattern "likes" starts at position 11 in first, and #' # position 13 in second string #' str_start(x, "likes") #' #' # pattern "likes" ends at position 15 in first, and #' # position 17 in second string #' str_end(x, "likes") #' #' x <- c("I like to move it, move it", "You like to move it") #' str_start(x, "move") #' str_end(x, "move") #' #' x <- c("test1234testagain") #' str_start(x, "\\d+4") #' str_start(x, "\\d+4", regex = TRUE) #' str_end(x, "\\d+4", regex = TRUE) #' @export str_start <- function(x, pattern, ignore.case = TRUE, regex = FALSE) { if (regex) class(pattern) <- c("regex", class(pattern)) str_start_end(x, pattern, ignore.case, index = "start") } #' @rdname str_start #' @export str_end <- function(x, pattern, ignore.case = TRUE, regex = FALSE) { if (regex) class(pattern) <- c("regex", class(pattern)) str_start_end(x, pattern, ignore.case, index = "end") } str_start_end <- function(x, pattern, ignore.case, index) { # get all locations of pattern pos <- gregexpr(pattern, text = x, fixed = !inherits(pattern, "regex")) # add end index if required if (index == "end") { pos <- lapply(pos, function(i) { if (i[1] != -1) i <- i + attr(i, "match.length", exact = TRUE) - 1 i }) } # remove attributes l <- lapply(pos, as.vector) if (length(l) == 1) unlist(l) else l } sjmisc/R/S3-methods.R0000644000176200001440000001633714153357271014036 0ustar liggesusers# Reexports ------------------------ #' @importFrom insight print_md #' @export insight::print_md #' @importFrom insight print_html #' @export insight::print_html #' @export print.sjmisc_frq2 <- function(x, ...) 
{ purrr::walk(x, function(dat) { # get variable label lab <- attr(dat, "label", exact = TRUE) vt <- attr(dat, "vartype", exact = TRUE) # fix variable type string if (!sjmisc::is_empty(vt)) vt <- sprintf(" <%s>", vt) else vt <- "" cat("\n") # print label if (!is.null(lab)) { insight::print_color(sprintf("%s", lab), "red") insight::print_color(sprintf("%s\n", vt), "blue") } # get grouping title label grp <- attr(dat, "group", exact = TRUE) # print title for grouping if (!is.null(grp)) { insight::print_color("# grouped by: ", "blue") insight::print_color(sprintf("%s\n", grp), "cyan") } # add Total N insight::print_color(sprintf( "# total N=%i valid N=%i mean=%.2f sd=%.2f\n\n", sum(dat$frq, na.rm = TRUE), sum(dat$frq[0:(nrow(dat) - 1)], na.rm = TRUE), attr(dat, "mean", exact = TRUE), attr(dat, "sd", exact = TRUE) ), "blue") # don't print labels, if all except for the NA value are "none" if ((dplyr::n_distinct(dat$label[!is.na(dat$val)]) == 1 && unique(dat$label[!is.na(dat$val)]) == "") || (length(dat$val) == 1 && is.na(dat$val))) dat <- dplyr::select(dat, -.data$label) # fix colnames colnames(dat)[names(dat) == "val"] <- "Value" colnames(dat)[names(dat) == "label"] <- "Label" colnames(dat)[names(dat) == "frq"] <- "N" colnames(dat)[names(dat) == "raw.prc"] <- "Raw %" colnames(dat)[names(dat) == "valid.prc"] <- "Valid %" colnames(dat)[names(dat) == "cum.prc"] <- "Cum. %" # print frq-table cat(insight::export_table(dat, missing = "")) cat("\n") }) } #' @export format.sjmisc_frq <- function(x, format = NULL, ...) 
{ lapply(x, function(dat) { # get variable label lab <- attr(dat, "label", exact = TRUE) vt <- attr(dat, "vartype", exact = TRUE) # fix variable type string if (!sjmisc::is_empty(vt) && !identical(format, "html")) vt <- sprintf(" <%s>", vt) else vt <- "" title <- NULL subtitle <- NULL footer <- NULL # print label if (!is.null(lab)) { if (is.null(format) || identical(format, "text")) { title <- paste0(insight::color_text(lab, "red"), insight::color_text(vt, "blue")) } else { title <- paste0(lab, vt) } } # get grouping title label grp <- attr(dat, "group", exact = TRUE) # print title for grouping if (!is.null(grp)) { if (is.null(format) || identical(format, "text")) { subtitle <- paste0(insight::color_text("# grouped by: ", "blue"), insight::color_text(grp, "cyan")) } else if (identical(format, "markdown")) { subtitle <- paste0("grouped by: ", grp) } else { title <- paste0(title, ", grouped by: ", grp) } } # add Total N if (is.null(format) || identical(format, "text")) { footer <- insight::color_text(sprintf( "# total N=%i valid N=%i mean=%.2f sd=%.2f", sum(dat$frq, na.rm = TRUE), sum(dat$frq[0:(nrow(dat) - 1)], na.rm = TRUE), attr(dat, "mean", exact = TRUE), attr(dat, "sd", exact = TRUE) ), "blue") } else { footer <- sprintf( "total N=%i valid N=%i mean=%.2f sd=%.2f", sum(dat$frq, na.rm = TRUE), sum(dat$frq[0:(nrow(dat) - 1)], na.rm = TRUE), attr(dat, "mean", exact = TRUE), attr(dat, "sd", exact = TRUE) ) } # don't print labels, if all except for the NA value are "none" if ((dplyr::n_distinct(dat$label[!is.na(dat$val)]) == 1 && unique(dat$label[!is.na(dat$val)]) == "") || (length(dat$val) == 1 && is.na(dat$val))) dat <- dplyr::select(dat, -.data$label) # fix colnames colnames(dat)[names(dat) == "val"] <- "Value" colnames(dat)[names(dat) == "label"] <- "Label" colnames(dat)[names(dat) == "frq"] <- "N" colnames(dat)[names(dat) == "raw.prc"] <- "Raw %" colnames(dat)[names(dat) == "valid.prc"] <- "Valid %" colnames(dat)[names(dat) == "cum.prc"] <- "Cum. 
%" if (is.null(format) || identical(format, "text")) { if (!is.null(subtitle)) { subtitle <- paste0("\n", subtitle) } if (!is.null(footer)) { subtitle <- paste0(subtitle, "\n", footer) footer <- NULL } } if (identical(format, "html")) { footer <- NULL } attr(dat, "table_title") <- title attr(dat, "table_subtitle") <- subtitle attr(dat, "table_footer") <- footer dat }) } #' @export print.sjmisc_frq <- function(x, ...) { out <- format(x, format = "text", ...) cat(insight::export_table(out, missing = "")) } #' @export print_md.sjmisc_frq <- function(x, ...) { out <- format(x, format = "markdown", ...) insight::export_table(out, format = "markdown", missing = "") } #' @export print_html.sjmisc_frq <- function(x, ...) { out <- format(x, format = "html", ...) insight::export_table(out, format = "html", missing = "", title = attr(x[[1]], "label", exact = TRUE)) } #' @export print.sjmisc_descr <- function(x, ...) { cat("\n") insight::print_color("## Basic descriptive statistics\n\n", "blue") print_descr_helper(x, ...) } print_descr_helper <- function(x, ...) { digits <- 2 # do we have digits argument? add.args <- lapply(match.call(expand.dots = FALSE)$`...`, function(x) x) if ("digits" %in% names(add.args)) digits <- eval(add.args[["digits"]]) # round values to.round <- c("NA.prc", "mean", "sd", "se", "md", "trimmed") if (is.null(attr(x, "weights", exact = TRUE))) to.round <- c(to.round, "skew") to.round <- intersect(to.round, colnames(x)) x[, to.round] <- round(x[, to.round], digits = digits) # print frq-table print.data.frame(x, ..., row.names = FALSE) } #' @export print.sjmisc_grpdescr <- function(x, ...) { cat("\n") insight::print_color("## Basic descriptive statistics\n", "blue") purrr::walk(x, function(.x) { # print title for grouping insight::print_color("\n\nGrouped by: ", "red") insight::print_color(sprintf("%s\n\n", attr(.x, "group", exact = TRUE)), "cyan") print_descr_helper(.x, ...) }) } #' @export print.sj_merge.imp <- function(x, ...) 
{ graphics::plot(x$plot, ...) } #' @export print.sj_has_na <- function(x, ...) { insight::print_color("## Variables with missing or infinite values (in red)\n\n", "blue") s1 <- max(c(nchar(x$name), nchar("Name"))) s2 <- max(c(nchar(x$label), nchar("Variable Label"))) cat(sprintf(" Column %*s %*s\n\n", s1, "Name", s2, "Variable Label")) for (i in 1:nrow(x)) { row <- sprintf(" %*i %*s %*s\n", 6, x[i, "col"], s1, x[i, "name"], s2, x[i, "label"]) if (.is_true(x[i, "has.na"])) insight::print_color(row, "red") else insight::print_color(row, "green") } cat("\n") } sjmisc/R/set_na.R0000644000176200001440000000567714046746443013372 0ustar liggesusers#' @title Replace specific values in vector with NA #' @name set_na_if #' #' @description \code{set_na_if()} is a scoped variant of #' \code{\link[sjlabelled]{set_na}}, where values will be replaced only #' with NA's for those variables that match the logical condition of #' \code{predicate}. #' #' @seealso \code{\link{replace_na}} to replace \code{\link{NA}}'s with specific #' values, \code{\link{rec}} for general recoding of variables and #' \code{\link{recode_to}} for re-shifting value ranges. See #' \code{\link[sjlabelled]{get_na}} to get values of missing values in #' labelled vectors. #' #' @param na Numeric vector with values that should be replaced with NA values, #' or a character vector if values of factors or character vectors should be #' replaced. For labelled vectors, may also be the name of a value label. In #' this case, the associated values for the value labels in each vector #' will be replaced with \code{NA}. \code{na} can also be a named vector. #' If \code{as.tag = FALSE}, values will be replaced only in those variables #' that are indicated by the value names (see 'Examples'). #' @param drop.levels Logical, if \code{TRUE}, factor levels of values that have #' been replaced with \code{NA} are dropped. See 'Examples'. 
#' @param as.tag Logical, if \code{TRUE}, values in \code{x} will be replaced #' by \code{tagged_na}, else by usual \code{NA} values. Use a named #' vector to assign the value label to the tagged NA value (see 'Examples'). #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return \code{x}, with all values in \code{na} being replaced by \code{NA}. #' If \code{x} is a data frame, the complete data frame \code{x} will #' be returned, with NA's set for variables specified in \code{...}; #' if \code{...} is not specified, applies to all variables in the #' data frame. #' #' @examples #' dummy <- data.frame(var1 = sample(1:8, 100, replace = TRUE), #' var2 = sample(1:10, 100, replace = TRUE), #' var3 = sample(1:6, 100, replace = TRUE)) #' #' p <- function(x) max(x, na.rm = TRUE) > 7 #' tmp <- set_na_if(dummy, predicate = p, na = 8:9) #' head(tmp) #' @export set_na_if <- function(x, predicate, na, drop.levels = TRUE, as.tag = FALSE) { # select variables that match logical conditions .dat <- dplyr::select_if(x, .predicate = predicate) # if no variable matches the condition specified # in predicate, return original data if (sjmisc::is_empty(.dat)) { return(x) } if (is.data.frame(x)) { # iterate variables of data frame for (i in colnames(.dat)) { x[[i]] <- sjlabelled::set_na( x = .dat[[i]], na = na, drop.levels = drop.levels, as.tag = as.tag ) } } else { x <- sjlabelled::set_na( x = x, na = na, drop.levels = drop.levels, as.tag = as.tag ) } x } sjmisc/R/tidy_values.R0000644000176200001440000000237314046746443014437 0ustar liggesusers#' @title Clean values of character vectors. #' @name tidy_values #' #' @description This function "cleans" values of a character vector or levels of #' a factor by removing space and punctuation characters. #' #' @inheritParams to_dummy #' #' @return \code{x}, with "cleaned" values or levels. 
#' #' @examples #' f1 <- sprintf("Char %s", sample(LETTERS[1:5], size = 10, replace = TRUE)) #' f2 <- as.factor(sprintf("F / %s", sample(letters[1:5], size = 10, replace = TRUE))) #' f3 <- sample(1:5, size = 10, replace = TRUE) #' #' x <- data.frame(f1, f2, f3, stringsAsFactors = FALSE) #' #' clean_values(f1) #' clean_values(f2) #' clean_values(x) #' @export tidy_values <- function(x, ...) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) if (is.data.frame(x)) { purrr::map_df(.dat, ~ tidy_value_helper(.x)) } else { tidy_value_helper(x) } } tidy_value_helper <- function(x) { pattern <- "[[:space:][:punct:]]+" if (is.character(x)) { x <- gsub(pattern = pattern, replacement = "_", x = x) } else if (is.factor(x)) { levels(x) <- gsub(pattern = pattern, replacement = "_", x = levels(x)) } x } #' @rdname tidy_values #' @export clean_values <- tidy_values sjmisc/R/recode_to.R0000644000176200001440000001340214046746443014045 0ustar liggesusers#' @title Recode variable categories into new values #' @name recode_to #' #' @description Recodes (or "renumbers") the categories of variables into new #' category values, beginning with the lowest value specified by \code{lowest}. #' Useful when recoding dummy variables with 1/2 values to 0/1 values, or #' recoding scales from 1-4 to 0-3 etc. #' \code{recode_to_if()} is a scoped variant of \code{recode_to()}, where #' recoding will be applied only to those variables that match the #' logical condition of \code{predicate}. #' #' @seealso \code{\link{rec}} for general recoding of variables and \code{\link[sjlabelled]{set_na}} #' for setting \code{\link{NA}} values. #' #' @param lowest Indicating the lowest category value for recoding. Default is 0, so the new #' variable starts with value 0. #' @param highest If specified and greater than \code{lowest}, all category values larger than #' \code{highest} will be set to \code{NA}. Default is \code{-1}, i.e. 
this argument is ignored #' and no NA's will be produced. #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return \code{x} with recoded category values, where \code{lowest} indicates #' the lowest value; If \code{x} is a data frame, for \code{append = TRUE}, #' \code{x} including the recoded variables as new columns is returned; if #' \code{append = FALSE}, only the recoded variables will be returned. If #' \code{append = TRUE} and \code{suffix = ""}, recoded variables will replace #' (overwrite) existing variables. #' #' @note Value and variable label attributes are preserved. #' #' @examples #' # recode 1-4 to 0-3 #' dummy <- sample(1:4, 10, replace = TRUE) #' recode_to(dummy) #' #' # recode 3-6 to 0-3 #' # note that numeric type is returned #' dummy <- as.factor(3:6) #' recode_to(dummy) #' #' # lowest value starting with 1 #' dummy <- sample(11:15, 10, replace = TRUE) #' recode_to(dummy, lowest = 1) #' #' # lowest value starting with 1, highest with 3 #' # all others set to NA #' dummy <- sample(11:15, 10, replace = TRUE) #' recode_to(dummy, lowest = 1, highest = 3) #' #' # recode multiple variables at once #' data(efc) #' recode_to(efc, c82cop1, c83cop2, c84cop3, append = FALSE) #' #' library(dplyr) #' efc %>% #' select(c82cop1, c83cop2, c84cop3) %>% #' mutate( #' c82new = recode_to(c83cop2, lowest = 5), #' c83new = recode_to(c84cop3, lowest = 3) #' ) %>% #' head() #' #' #' @export recode_to <- function(x, ..., lowest = 0, highest = -1, append = TRUE, suffix = "_r0") { UseMethod("recode_to") } #' @export recode_to.default <- function(x, ..., lowest = 0, highest = -1, append = TRUE, suffix = "_r0") { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) rec_to_fun( x = x, .dat = .dat, lowest = lowest, highest = highest, append = append, suffix = suffix ) } #' @export recode_to.mids <- function(x, ..., lowest = 0, highest = -1, append = TRUE, suffix = "_r0") { vars <- dplyr::quos(...) 
ndf <- prepare_mids_recode(x) # select variable and compute rowsums. add this variable # to each imputed ndf$data <- purrr::map( ndf$data, function(.x) { dat <- dplyr::select(.x, !!! vars) dplyr::bind_cols( .x, rec_to_fun( x = dat, .dat = dat, lowest = lowest, highest = highest, append = FALSE, suffix = suffix )) } ) final_mids_recode(ndf) } #' @rdname recode_to #' @export recode_to_if <- function(x, predicate, lowest = 0, highest = -1, append = TRUE, suffix = "_r0") { # select variables that match logical conditions .dat <- dplyr::select_if(x, .predicate = predicate) # if no variable matches the condition specified # in predicate, return original data if (sjmisc::is_empty(.dat)) { if (append) return(x) else return(.dat) } rec_to_fun( x = x, .dat = .dat, lowest = lowest, highest = highest, append = append, suffix = suffix ) } rec_to_fun <- function(x, .dat, lowest, highest, append, suffix) { if (is.data.frame(x)) { # remember original data, if user wants to bind columns orix <- x # iterate variables of data frame for (i in colnames(.dat)) { x[[i]] <- rec_to_helper( x = .dat[[i]], lowest = lowest, highest = highest ) } # select only recoded variables x <- x[colnames(.dat)] # add suffix to recoded variables and combine data x <- append_columns(x, orix, suffix, append) } else { x <- rec_to_helper( x = .dat, lowest = lowest, highest = highest ) } x } rec_to_helper <- function(x, lowest, highest) { # retrieve value labels val_lab <- sjlabelled::get_labels( x, attr.only = TRUE, values = NULL, non.labelled = TRUE ) # retrieve variable label var_lab <- sjlabelled::get_label(x) # check if factor if (is.factor(x)) { # try to convert to numeric x <- as.numeric(as.character(x)) } # retrieve lowest category minval <- min(x, na.rm = TRUE) # check substraction difference between current lowest value # and requested lowest value downsize <- minval - lowest x <- sapply(x, function(y) y - downsize) # check for highest range # set NA to all values out of range if (highest > lowest) 
x[x > highest] <- NA # set back labels, if we have any if (!is.null(val_lab)) x <- suppressWarnings(sjlabelled::set_labels(x, labels = val_lab)) if (!is.null(var_lab)) x <- suppressWarnings(sjlabelled::set_label(x, label = var_lab)) # return recoded x x } sjmisc/R/row_count.R0000644000176200001440000001020714046746443014121 0ustar liggesusers#' @title Count row or column indices #' @name row_count #' #' @description \code{row_count()} mimics base R's \code{rowSums()}, with sums #' for a specific value indicated by \code{count}. Hence, it is equivalent #' to \code{rowSums(x == count, na.rm = TRUE)}. However, this function #' is designed to work nicely within a pipe-workflow and allows select-helpers #' for selecting variables and the return value is always a data frame #' (with one variable). #' \cr \cr #' \code{col_count()} does the same for columns. The return value is #' a data frame with one row (the column counts) and the same number #' of columns as \code{x}. #' #' @param count The value for which the row or column sum should be computed. May #' be a numeric value, a character string (for factors or character vectors), #' \code{NA}, \code{Inf} or \code{NULL} to count missing or infinite values, #' or null-values. #' @param var Name of new the variable with the row or column counts. #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return For \code{row_count()}, a data frame with one variable: the sum of \code{count} #' appearing in each row of \code{x}; for \code{col_count()}, a data frame with #' one row and the same number of variables as in \code{x}: each variable #' holds the sum of \code{count} appearing in each variable of \code{x}. #' If \code{append = TRUE}, \code{x} including this variable will be returned. 
#' #' @examples #' dat <- data.frame( #' c1 = c(1, 2, 3, 1, 3, NA), #' c2 = c(3, 2, 1, 2, NA, 3), #' c3 = c(1, 1, 2, 1, 3, NA), #' c4 = c(1, 1, 3, 2, 1, 2) #' ) #' #' row_count(dat, count = 1, append = FALSE) #' row_count(dat, count = NA, append = FALSE) #' row_count(dat, c1:c3, count = 2, append = TRUE) #' #' col_count(dat, count = 1, append = FALSE) #' col_count(dat, count = NA, append = FALSE) #' col_count(dat, c1:c3, count = 2, append = TRUE) #' #' @export row_count <- function(x, ..., count, var = "rowcount", append = TRUE) { UseMethod("row_count") } #' @export row_count.default <- function(x, ..., count, var = "rowcount", append = TRUE) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) # remember original data, if user wants to bind columns orix <- x if (is.data.frame(x)) { rc <- row.count(.dat, count) } else { stop("`x` must be a data frame.", call. = FALSE) } # rename variable rc <- as.data.frame(rc) colnames(rc) <- var # combine data if (append) rc <- dplyr::bind_cols(orix, rc) rc } #' @export row_count.mids <- function(x, ..., count, var = "rowcount", append = TRUE) { rfun <- row.count row_mids(x = x, ..., var = var, append = append, rfun = rfun, count = count) } row.count <- function(.dat, count) { if (is.na(count)) rc <- apply(.dat, 1, function(x) sum(is.na(x), na.rm = TRUE)) else if (is.infinite(count)) rc <- apply(.dat, 1, function(x) sum(is.infinite(x), na.rm = TRUE)) else if (is.null(count)) rc <- apply(.dat, 1, function(x) sum(is.null(x), na.rm = TRUE)) else rc <- apply(.dat, 1, function(x) sum(x == count, na.rm = TRUE)) rc } #' @rdname row_count #' @export col_count <- function(x, ..., count, var = "colcount", append = TRUE) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) # remember original data, if user wants to bind columns orix <- x if (is.data.frame(x)) { if (is.na(count)) rc <- purrr::map_df(.dat, function(x) sum(is.na(x), na.rm = TRUE)) else if (is.infinite(count)) rc <- 
purrr::map_df(.dat, function(x) sum(is.infinite(x), na.rm = TRUE)) else if (is.null(count)) rc <- purrr::map_df(.dat, function(x) sum(is.null(x), na.rm = TRUE)) else rc <- purrr::map_df(.dat, function(x) sum(x == count, na.rm = TRUE)) } else { stop("`x` must be a data frame.", call. = FALSE) } # combine data if (append) rc <- dplyr::bind_rows(orix, rc) rc } sjmisc/R/to_dummy.R0000644000176200001440000001126014046746443013737 0ustar liggesusers#' @title Split (categorical) vectors into dummy variables #' @name to_dummy #' #' @description This function splits categorical or numeric vectors with #' more than two categories into 0/1-coded dummy variables. #' #' @param x A vector or data frame. #' @param ... Optional, unquoted names of variables that should be selected for #' further processing. Required, if \code{x} is a data frame (and no #' vector) and only selected variables from \code{x} should be processed. #' You may also use functions like \code{:} or tidyselect's #' select-helpers. #' See 'Examples' or \href{../doc/design_philosophy.html}{package-vignette}. #' @param var.name Indicates how the new dummy variables are named. Use #' \code{"name"} to use the variable name or any other string that will #' be used as is. Only applies, if \code{x} is a vector. See 'Examples'. #' @param suffix Indicates which suffix will be added to each dummy variable. #' Use \code{"numeric"} to number dummy variables, e.g. \emph{x_1}, #' \emph{x_2}, \emph{x_3} etc. Use \code{"label"} to add value label, #' e.g. \emph{x_low}, \emph{x_mid}, \emph{x_high}. May be abbreviated. #' #' @return A data frame with dummy variables for each category of \code{x}. #' The dummy coded variables are of type \code{\link{atomic}}. #' #' @note \code{NA} values will be copied from \code{x}, so each dummy variable #' has the same amount of \code{NA}'s at the same position as \code{x}. 
#' #' @examples #' data(efc) #' head(to_dummy(efc$e42dep)) #' #' # add value label as suffix to new variable name #' head(to_dummy(efc$e42dep, suffix = "label")) #' #' # use "dummy" as new variable name #' head(to_dummy(efc$e42dep, var.name = "dummy")) #' #' # create multiple dummies, append to data frame #' to_dummy(efc, c172code, e42dep) #' #' # pipe-workflow #' library(dplyr) #' efc %>% #' select(e42dep, e16sex, c172code) %>% #' to_dummy() #' #' @export to_dummy <- function(x, ..., var.name = "name", suffix = c("numeric", "label")) { # check for abbr suffix <- match.arg(suffix) # save variable name varname <- deparse(substitute(x)) # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) if (is.data.frame(x)) { # iterate variables of data frame x <- dplyr::bind_cols( purrr::map(colnames(.dat), ~ to_dummy_helper( x = .dat[[.x]], varname = .x, suffix = suffix )) ) } else { # remove "data frame name" dollar_pos <- regexpr("$", varname, fixed = TRUE)[1] if (dollar_pos != -1) varname <- substr(varname, start = dollar_pos + 1, stop = nchar(varname)) # set default variable name if (var.name != "name") varname <- var.name # convert to dummy x <- to_dummy_helper(.dat, varname, suffix) } as.data.frame(x) } to_dummy_helper <- function(x, varname, suffix) { # make sure we have a factor, so order of values is correct if (is.character(x)) x <- to_factor(x) # check whether we have labels labels <- sjlabelled::get_labels(x, attr.only = FALSE, values = "n", non.labelled = TRUE) # get resp. set variable label for new dummy variables # get variable label label <- sjlabelled::get_label(x, def.value = varname) # get unique values values <- sort(unique(x)) # find which labels / categories were # actually used if (is.null(names(labels))) { # find labels index numbers labels.nr <- seq_len(length(labels))[labels %in% values] # copy only used labels labels <- labels[labels %in% values] } else { # get label value labels label.names <- names(labels) # numeric? 
if (!anyNA(as.numeric(label.names))) label.names <- as.numeric(label.names) # find labels index numbers labels.nr <- seq_len(length(labels))[label.names %in% values] # copy only used labels labels <- labels[label.names %in% values] } # return value mydf <- data.frame() # create all dummy variables for (i in seq_len(length(values))) { # create dummy var dummy <- rep(0, length(x)) # set NA dummy[is.na(x)] <- NA # copy dummy level dummy[which(x == values[i])] <- 1 # set variable name sjlabelled::set_label(dummy) <- sprintf("%s: %s", label, labels[i]) # bind to df if (nrow(mydf) == 0) mydf <- data.frame(dummy) else mydf <- cbind(mydf, dummy) } # prepare col.names col.nam <- rep(varname, ncol(mydf)) if (suffix == "numeric") col.nam <- sprintf("%s_%i", col.nam, labels.nr) else col.nam <- sprintf("%s_%s", col.nam, labels) colnames(mydf) <- col.nam mydf } sjmisc/R/merge_df.R0000644000176200001440000000603414046746443013655 0ustar liggesusers#' @title Merge labelled data frames #' @name add_rows #' #' @description Merges (full join) data frames and preserve value and variable labels. #' #' @param ... Two or more data frames to be merged. #' @param id Optional name for ID column that will be created to indicate the #' source data frames for appended rows. #' #' @return A full joined data frame. #' #' @details This function works like \code{\link[dplyr:bind]{dplyr::bind_rows()}}, but preserves #' variable and value label attributes. \code{add_rows()} row-binds all data #' frames in \code{...}, even if these have different numbers of columns. #' Non-matching columns will be column-bound and filled with \code{NA}-values #' for rows in those data frames that do not have this column. #' \cr \cr #' Value and variable labels are preserved. If matching columns have #' different value label attributes, attributes from first data frame #' will be used. #' \cr \cr #' \code{merge_df()} is an alias for \code{add_rows()}. 
#' #' @examples #' library(dplyr) #' data(efc) #' x1 <- efc %>% select(1:5) %>% slice(1:10) #' x2 <- efc %>% select(3:7) %>% slice(11:20) #' #' mydf <- add_rows(x1, x2) #' mydf #' str(mydf) #' #' \dontrun{ #' library(sjPlot) #' view_df(mydf)} #' #' x3 <- efc %>% select(5:9) %>% slice(21:30) #' x4 <- efc %>% select(11:14) %>% slice(31:40) #' #' mydf <- add_rows(x1, x2, x3, x4, id = "subsets") #' mydf #' str(mydf) #' @export add_rows <- function(..., id = NULL) { # get column names of all data frames and make sure that ID # variable has unique column name cnames <- purrr::map(list(...), ~ colnames(.x)) %>% purrr::flatten_chr() if (!is.null(id) && id %in% cnames) { id <- make.unique(c(cnames, id))[length(cnames) + 1] warning(sprintf("Value of `id` already exists as column name. ID column was renamed to `%s`.", id), call. = FALSE) } # remove variables with duplicated names dat <- lapply(list(...), function(d) { d[, unique(names(d)), drop = FALSE] }) # bind all data frames x <- dplyr::bind_rows(dat, .id = id) # get attributes from all variables of original data frame # and restore these attributes to the final merged data frame # (bind_rows() currently drops attributes) at <- purrr::map(list(...), function(x) { purrr::map(x, ~ attributes(.x)) }) %>% purrr::flatten() %>% purrr::compact() if (!sjmisc::is_empty(at)) { # make sure attributes from duplicated variables # are removed at <- at[!duplicated(at)] for (i in names(at)) { attr(x[[i]], "labels") <- at[[i]][["labels"]] attr(x[[i]], "label") <- at[[i]][["label"]] attr(x[[i]], "na_values") <- at[[i]][["na_values"]] attr(x[[i]], "na.values") <- at[[i]][["na.values"]] attr(x[[i]], "na_range") <- at[[i]][["na_range"]] attr(x[[i]], "na.range") <- at[[i]][["na.range"]] } } x } #' @rdname add_rows #' @export merge_df <- function(..., id = NULL) { add_rows(..., id = id) } sjmisc/R/var_labels.R0000644000176200001440000000500514046746443014214 0ustar liggesusers#' @title Rename variables #' @name var_rename #' #' @description 
#' @export
var_rename <- function(x, ..., verbose = TRUE) {
  # capture the unevaluated ...-arguments; name-value pairs arrive as a
  # pairlist, a single (named) vector argument as a plain list
  .dots <- match.call(expand.dots = FALSE)$`...`

  if (inherits(.dots, "pairlist")) {
    # convert symbols (including !!-unquoted values) to character strings
    d <- lapply(rlang::ensyms(...), rlang::as_string) %>% unlist()
    # we might have a simple named vector instead of name-value pairs
    # NOTE(review): eval() without an explicit envir evaluates in this
    # function's frame; presumably the caller's vector is still reachable
    # here - verify for callers in non-global environments
    if (sjmisc::is_empty(names(d)) && length(.dots) == 1) d <- eval(.dots[[1]])
    .dots <- d
  } else {
    .dots <- unlist(.dots)
  }

  # old column names are the names of the vector (lhs) ...
  old_names <- names(.dots)
  # ... and the new column names are its values (rhs)
  new_names <- unname(.dots)

  # indices of requested names that are not columns of `x`
  non.match <- which(old_names %nin% colnames(x))
  # check if all variables exist in data frame
  if (!sjmisc::is_empty(non.match)) {
    if (verbose) {
      # tell user which names could not be matched
      warning(sprintf(
        "Following elements are no valid column names in `x`: %s",
        paste(old_names[non.match], collapse = ",")
      ), call. = FALSE)
    }
    # drop the invalid pairs and continue with the remaining ones
    old_names <- old_names[-non.match]
    new_names <- new_names[-non.match]
  }

  # find column indices of variables that should be renamed
  name_pos <- match(old_names, colnames(x))

  # rename columns in place; all other columns stay untouched
  colnames(x)[name_pos] <- new_names

  # return data
  x
}

#' @rdname var_rename
#' @export
rename_variables <- var_rename

#' @rdname var_rename
#' @export
rename_columns <- var_rename

#' @export
"%nin%" <- function(x, y) {
  # complement of %in%: TRUE for elements of x that are NOT found in y
  !(x %in% y)
}
#' #' @examples #' # Attach EFC-data #' data(efc) #' #' # Show structure #' str(efc) #' #' # show first rows #' head(efc) NULL sjmisc/R/is_float.R0000644000176200001440000000351714046746443013710 0ustar liggesusers#' @title Check if a variable is of (non-integer) double type or a whole number #' @name is_float #' #' @description \code{is_float()} checks whether an input vector or value is a #' numeric non-integer (double), depending on fractional parts of the value(s). #' \code{is_whole()} does the opposite and checks whether an input vector #' is a whole number (without fractional parts). #' #' @param x A value, vector or data frame. #' #' @return For \code{is_float()}, \code{TRUE} if \code{x} is a floating value #' (non-integer double), \code{FALSE} otherwise (also returns \code{FALSE} #' for character vectors and factors). For \code{is_whole()}, \code{TRUE} #' if \code{x} is a vector with whole numbers only, \code{FALSE} otherwise #' (returns \code{TRUE} for character vectors and factors). 
#' @export
is_float <- function(x) {
  UseMethod("is_float")
}

# Shared predicate: TRUE if `x` is numeric and contains at least one
# non-integer (fractional) value; FALSE for non-numeric input. An all-NA
# or empty numeric vector yields FALSE (all(..., na.rm = TRUE) is TRUE).
ifl <- function(x) {
  is.numeric(x) && !all(x %% 1 == 0, na.rm = TRUE)
}

#' @export
is_float.default <- function(x) {
  ifl(x)
}

#' @export
is_float.data.frame <- function(x) {
  # apply the predicate column-wise; returns a named logical vector,
  # one element per column (base vapply instead of purrr::map_lgl)
  vapply(x, ifl, logical(1))
}

#' @export
is_float.list <- function(x) {
  # same element-wise check for plain lists
  vapply(x, ifl, logical(1))
}

#' @rdname is_float
#' @export
is_whole <- function(x) {
  UseMethod("is_whole")
}

# Shared predicate: TRUE for numeric vectors containing only whole
# numbers, and (by definition of this function) for character vectors
# and factors.
iwh <- function(x) {
  (is.numeric(x) && all(floor(x) == x, na.rm = TRUE)) || is.character(x) || is.factor(x)
}

#' @export
is_whole.default <- function(x) {
  iwh(x)
}

#' @export
is_whole.data.frame <- function(x) {
  # column-wise check, named logical result
  vapply(x, iwh, logical(1))
}

#' @export
is_whole.list <- function(x) {
  vapply(x, iwh, logical(1))
}
#' @export
numeric_to_factor <- function(x, n = 4) {
  as.data.frame(lapply(x, function(.x) {
    # only convert numeric columns with at most `n` distinct non-missing
    # values (base equivalent of dplyr::n_distinct(.x, na.rm = TRUE))
    if (is.numeric(.x) && length(unique(.x[!is.na(.x)])) <= n) {
      label <- attr(.x, "label", exact = TRUE)
      labels <- attr(.x, "labels", exact = TRUE)
      labels <- labels[!is.na(labels)]
      if (length(labels) > 0) {
        # replace each labelled value by its label name; use which() so
        # that missing values in `.x` do not produce NA subscripts, which
        # would error in subscripted assignment
        for (i in seq_along(labels)) {
          .x[which(.x == labels[i])] <- names(labels)[i]
        }
      }
      .x <- as.factor(.x)
      # restore the variable label dropped by as.factor()
      attr(.x, "label") <- label
    }
    .x
  }))
}
#' #' @details \code{split_var()} splits a variable into equal sized groups, where #' the amount of groups depends on the \code{n}-argument. Thus, this #' functions \code{\link{cut}s} a variable into groups at the specified #' \code{\link[stats]{quantile}s}. #' \cr \cr #' By contrast, \code{\link{group_var}} recodes a variable into groups, where #' groups have the same value range (e.g., from 1-5, 6-10, 11-15 etc.). #' \cr \cr #' \code{split_var()} also works on grouped data frames #' (see \code{\link[dplyr]{group_by}}). In this case, splitting is applied to #' the subsets of variables in \code{x}. See 'Examples'. #' #' @note In case a vector has only few number of unique values, splitting into #' equal sized groups may fail. In this case, use the \code{inclusive}-argument #' to shift a value at the cut point into the lower, preceeding group to #' get equal sized groups. See 'Examples'. #' #' @examples #' data(efc) #' # non-grouped #' table(efc$neg_c_7) #' #' # split into 3 groups #' table(split_var(efc$neg_c_7, n = 3)) #' #' # split multiple variables into 3 groups #' split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE) #' frq(split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE)) #' #' # original #' table(efc$e42dep) #' #' # two groups, non-inclusive cut-point #' # vector split leads to unequal group sizes #' table(split_var(efc$e42dep, n = 2)) #' #' # two groups, inclusive cut-point #' # group sizes are equal #' table(split_var(efc$e42dep, n = 2, inclusive = TRUE)) #' #' # Unlike dplyr's ntile(), split_var() never splits a value #' # into two different categories, i.e. 
#' @export
split_var <- function(x, ..., n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g") {
  # evaluate arguments, generate data
  .dat <- get_dot_data(x, dplyr::quos(...))

  # delegate to the shared recode-dispatcher, which applies
  # split_var_helper() to each selected variable and handles
  # appending/suffixing the resulting columns
  recode_fun(
    x = x,
    .dat = .dat,
    fun = get("split_var_helper", asNamespace("sjmisc")),
    suffix = suffix,
    append = append,
    groupcount = n,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    inclusive = inclusive
  )
}

#' @rdname split_var
#' @export
split_var_if <- function(x, predicate, n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g") {
  # select variables that match logical conditions
  .dat <- dplyr::select_if(x, .predicate = predicate)

  # if no variable matches the condition specified
  # in predicate, return original data
  if (sjmisc::is_empty(.dat)) {
    if (append)
      return(x)
    else
      return(.dat)
  }

  recode_fun(
    x = x,
    .dat = .dat,
    fun = get("split_var_helper", asNamespace("sjmisc")),
    suffix = suffix,
    append = append,
    groupcount = n,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    inclusive = inclusive
  )
}

# Workhorse for a single vector: cut `x` into `groupcount` groups at its
# quantiles, so that groups are (roughly) equal sized.
split_var_helper <- function(x, groupcount, as.num, val.labels, var.label, inclusive) {
  # retrieve variable label, unless one was supplied explicitly
  if (is.null(var.label))
    var_lab <- sjlabelled::get_label(x)
  else
    var_lab <- var.label

  # do we have any value labels?
  val_lab <- val.labels

  # amount of "cuts" is groupcount - 1
  zaehler <- seq_len(groupcount - 1)

  # prepare division
  nenner <- rep(groupcount, length(zaehler))

  # quantile probabilities, e.g. 1/3 and 2/3 for three groups
  qu_prob <- zaehler / nenner

  # get quantile values, i.e. the cut points on the scale of `x`
  grp_cuts <- stats::quantile(x, qu_prob, na.rm = TRUE)

  # create breaks. need to check if these are non-unique
  # NOTE(review): the lowest break is hard-coded to 0, so this presumably
  # assumes non-negative scales - verify behavior for negative values
  breaks <- unique(c(0, grp_cuts, max(x, na.rm = TRUE)))

  # cut variable into groups; `inclusive` shifts the cut point value into
  # the preceding group (right-closed intervals)
  retval <- cut(
    x = x,
    breaks = breaks,
    include.lowest = !inclusive,
    right = inclusive
  )

  # rename factor levels to plain group numbers 1..groupcount
  levels(retval) <- seq_len(groupcount)

  # to numeric?
  if (as.num) retval <- sjlabelled::as_numeric(retval)

  # set back variable and value labels
  retval <- suppressWarnings(sjlabelled::set_label(retval, label = var_lab))
  retval <- suppressWarnings(sjlabelled::set_labels(retval, labels = val_lab))

  # return value
  retval
}
#' @export
move_columns <- function(data, ..., .before, .after) {
  # remember the original attributes of `data`, so they can be restored on
  # the re-ordered result (cbind() drops custom attributes)
  a <- attributes(data)

  variables <- dplyr::quos(...)

  # `dat` holds the columns that should be moved ...
  dat <- dplyr::select(data, !!! variables)

  # ... and `data` is reduced to the remaining columns
  remaining <- which(!(colnames(data) %in% colnames(dat)))
  data <- dplyr::select(data, !! remaining)

  # `.before`/`.after` are captured unevaluated, so bare column names and
  # (via !!) quasi-quoted values both work; a missing argument yields an
  # empty string here
  pos.before <- rlang::quo_name(rlang::enquo(.before))
  pos.after <- rlang::quo_name(rlang::enquo(.after))

  if (sjmisc::is_empty(pos.before) && sjmisc::is_empty(pos.after)) {
    # neither argument given: move the columns to the end
    pos.after <- Inf
  } else {
    if (!sjmisc::is_empty(pos.before)) {
      # translate ".before" into the equivalent ".after" position, i.e.
      # the index of the preceding column (numeric index or column name)
      if (is_num_chr(pos.before))
        pos.after <- as.numeric(pos.before) - 1
      else
        pos.after <- which(colnames(data) == pos.before) - 1
    } else if (!sjmisc::is_empty(pos.after)) {
      # ".after" may be a numeric index or a column name
      if (is_num_chr(pos.after))
        pos.after <- as.numeric(pos.after)
      else
        pos.after <- which(colnames(data) == pos.after)
    } else {
      pos.after <- Inf
    }
  }

  # final test, to make sure we have a valid value here; an unmatched
  # column name leaves integer(0) above, which means "move to end"
  if (!length(pos.after)) pos.after <- Inf

  if (!is.infinite(pos.after) && pos.after < 1) {
    # insert at the very beginning
    x <- cbind(dat, data)
  } else if (is.infinite(pos.after) || pos.after >= ncol(data)) {
    # insert at the very end
    x <- cbind(data, dat)
  } else {
    # split the remaining columns at the insert position and put the
    # moved columns in between
    c1 <- 1:pos.after
    c2 <- (pos.after + 1):ncol(data)
    x1 <- dplyr::select(data, !! c1)
    x2 <- dplyr::select(data, !! c2)
    x <- cbind(x1, dat, x2)
  }

  # restore attributes that got lost, without overwriting existing ones
  a[names(a) %in% names(attributes(x))] <- NULL
  attributes(x) <- c(attributes(x), a)
  x
}
#' @export
flat_table <- function(data, ..., margin = c("counts", "cell", "row", "col"), digits = 2, show.values = FALSE, weights = NULL) {
  # match arguments
  margin <- match.arg(margin)

  # check whether a plain count table (no proportions) was requested;
  # after match.arg(), margin can no longer be NULL
  no.prop.table <- is.null(margin) || (margin == "counts")

  # map margin keyword to the numeric margin used by prop.table()
  marge <- dplyr::case_when(
    margin == "cell" ~ 0,
    margin == "row" ~ 1,
    margin == "col" ~ 2,
    TRUE ~ 0
  )

  # 0 is a placeholder for "whole table"; prop.table() expects NULL there
  if (marge == 0) marge <- NULL

  # get dot data
  dd <- get_dot_data(data, dplyr::quos(...))

  # resolve `weights`: may be a column name in `data` (bare or quoted),
  # or an expression evaluating to a weight vector
  w <- deparse(substitute(weights))
  if (w != "NULL") {
    w <- gsub("\"", "", w, fixed = FALSE)
    if (!is.null(data[[w]])) {
      w <- data[[w]]
    } else {
      w <- eval(substitute(weights))
    }
  } else {
    w <- NULL
  }

  # do we have a grouped data frame?
  if (inherits(dd, "grouped_df")) {
    # get grouped data
    grps <- get_grouped_data(dd)

    # print one flat table per subgroup
    for (i in seq_len(nrow(grps))) {
      # copy back labels to grouped data frame
      tmp <- sjlabelled::copy_labels(grps$data[[i]], dd)

      # print title for grouping
      insight::print_color("\nGrouped by: ", "red")
      insight::print_color(sprintf("%s\n\n", get_grouped_title(dd, grps, i, sep = ", ", long = FALSE)), "cyan")

      # print frequencies
      print(com_ft(tmp, show.values, no.prop.table, marge, digits, w = w))
      cat("\n")
    }
  } else {
    com_ft(dd, show.values, no.prop.table, marge, digits, w = w)
  }
}

# Build the actual ftable for one (sub-)data frame: convert values to
# labels, optionally weight via xtabs(), optionally convert counts to
# percentages of the requested margin.
com_ft <- function(dd, show.values, no.prop.table, marge, digits, w = NULL) {
  # select variables, convert to label and create ftable object
  x <- sjlabelled::as_label(dd, add.non.labelled = TRUE, prefix = show.values)
  if (!is.null(w)) {
    # weighted counts: build a formula ".weights ~ var1 + var2 + ..." and
    # let xtabs() sum the weights per cell
    f <- paste(colnames(x), collapse = " + ")
    x$.weights <- w
    f <- stats::as.formula(paste(".weights ~ ", f))
    x <- round(stats::xtabs(formula = f, data = x))
  }
  x <- stats::ftable(x)

  # if required, compute table margins; rounding to digits + 2 before
  # multiplying by 100 yields `digits` decimals in the percent scale
  if (!no.prop.table) {
    x <- x %>%
      prop.table(margin = marge) %>%
      round(digits = digits + 2)
    x <- x * 100
  }
  x
}
#' @export
group_str <- function(
  strings,
  precision = 2,
  strict = FALSE,
  trim.whitespace = TRUE,
  remove.empty = TRUE,
  verbose = FALSE,
  maxdist
) {
  # coerce to character, if necessary
  if (!is.character(strings)) strings <- as.character(strings)

  # support the deprecated argument name
  if (!missing(maxdist)) precision <- maxdist

  # trim white spaces
  if (trim.whitespace) strings <- unname(sapply(strings, trim))

  # remove empty values
  if (remove.empty) {
    removers <- which(sjmisc::is_empty(strings, first.only = FALSE))
    if (length(removers) > 0) strings <- strings[-removers]
  }

  # create a pairwise string-distance matrix; prefer the fast C
  # implementation from `stringdist` when it is installed
  if (requireNamespace("stringdist", quietly = TRUE)) {
    m <- stringdist::stringdistmatrix(strings, strings, method = "lv", useNames = "strings")
  } else {
    message("Install the `stringdist`-package to increase performance for grouping strings.")
    m <- string_dist_matrix(strings)
  }

  # init variable that contains "close" pairs
  pairs <- list()

  # create progress bar
  if (verbose) pb <- utils::txtProgressBar(min = 0, max = ncol(m), style = 3)

  # iterate matrix
  for (i in seq_len(nrow(m))) {
    # update progress bar
    if (verbose) utils::setTxtProgressBar(pb, i)

    # check if current element is already grouped
    if (!findInPairs(rownames(m)[i], pairs)) {
      # current row element has not been grouped yet, so go on...
      pairvector <- c()

      for (j in seq_len(ncol(m))) {
        # check if we found a pair's distance that is within the maximum
        # requested distance, i.e. which are "close" enough
        if (!is.na(m[i, j]) && m[i, j] <= precision) {
          # go through all rows of this column and check if there's a
          # better match for the currently compared token
          foundBetterToken <- !strict
          for (cnt in seq_len(nrow(m))) {
            if (!is.na(m[cnt, j]) && !is.na(m[i, cnt])) {
              if (strict) {
                if (m[cnt, j] > 0 && m[cnt, j] < m[i, j]) {
                  foundBetterToken <- TRUE
                  break
                }
              } else {
                if (m[cnt, j] <= precision && m[i, cnt] <= precision) {
                  foundBetterToken <- FALSE
                  break
                }
              }
            }
          }

          # in the current column, there's no better matching of strings,
          # so we pick this value and add it to our results
          if (!foundBetterToken) {
            # remember string value
            token <- colnames(m)[j]
            # check if we already found a string value within this column;
            # if not, add string value to "close" pairs of this column
            if (!any(pairvector == token) && !findInPairs(token, pairs))
              pairvector <- c(pairvector, token)
          }
        }
      }

      # now we have a vector with all "close" string values
      # from the current row's value
      pairvector <- sort(pairvector)

      # check if we already have saved these values to our list;
      # if not, add "close" values as new list element
      if (!any(unlist(lapply(pairs, function(x) length(x) == length(pairvector) && any(x == pairvector)))))
        pairs <- c(pairs, list(pairvector))
    }
  }

  # we now have a list, where each list element is a vector of "close"
  # string values; build the recoded result as a character vector
  strings.new <- rep(NA_character_, length(strings))

  # go through each list element
  for (i in seq_len(length(pairs))) {
    r <- pairs[[i]]
    # find vector indices of "close" values in original string
    indices <- unlist(lapply(r, function(x) which(strings == x)))
    strings.new[indices] <- paste0(pairs[[i]], collapse = ", ")
  }

  if (verbose) close(pb)

  # return new vector, where all single "close" values are replaced by the
  # group of close values, e.g. the three values "hello", "holle" and
  # "hole" will be "recoded" into one value "hello, holle, hole"
  strings.new
}

# Helper: is `curel` already contained in one of the grouped vectors in
# `pairs`? Returns TRUE/FALSE.
findInPairs <- function(curel, pairs) {
  elfound <- FALSE
  if (length(pairs) > 0) {
    for (ll in seq_len(length(pairs))) {
      pel <- pairs[[ll]]
      if (!is.na(curel) && any(pel == curel)) return(TRUE)
    }
  }
  elfound
}

# Approximate grep using TRE's fuzzy-match syntax "(pattern){~k}"; returns
# the matching indices, or NULL if the requested precision exceeds the
# pattern length.
fuzzy_grep <- function(x, pattern, precision = NULL) {
  if (is.null(precision)) precision <- round(nchar(pattern) / 3)
  if (precision > nchar(pattern)) return(NULL)
  p <- sprintf("(%s){~%i}", pattern, precision)
  grep(pattern = p, x = x, ignore.case = FALSE)
}

# Fallback pairwise distance matrix (used when `stringdist` is not
# installed). The diagonal is left as NA; distances that cannot be
# determined (string_dist() returns -1) are capped at 8.
string_dist_matrix <- function(string) {
  l <- length(string)
  m <- matrix(nrow = l, ncol = l)
  # seq_len() (instead of 1:(l - 1)) makes the loop a no-op for a
  # single-element input, where 1:(l - 1) would yield c(1, 0) and crash
  for (i in seq_len(l - 1)) {
    for (j in (i + 1):l) {
      pos <- string_dist(string[i], string[j])
      if (pos == -1) pos <- 8
      m[i, j] <- m[j, i] <- pos
    }
  }
  rownames(m) <- string
  colnames(m) <- string
  m
}

# Approximate distance between two strings: the smallest error count p
# (1..8) for which the shorter string fuzzy-matches the longer one via
# TRE's "(pattern){~p}" syntax; -1 if either input is NA or no match is
# found.
string_dist <- function(s1, s2) {
  if (is.na(s1) || is.na(s2)) return(-1)

  # always search the longer string (pattern) inside the shorter one
  if (nchar(s1) > nchar(s2)) {
    x <- s2
    pattern <- s1
  } else {
    x <- s1
    pattern <- s2
  }

  len <- nchar(pattern)
  if (len > 8) len <- 8

  # seq_len() guards against a zero-length pattern (1:0 would iterate)
  for (p in seq_len(len)) {
    pos <- grep(pattern = sprintf("(%s){~%i}", pattern, p), x = x, ignore.case = FALSE)
    if (length(pos)) {
      return(p)
    }
  }

  return(-1)
}
#' \cr \cr #' \code{replace_columns()} replaces all columns in \code{data} with #' identically named columns in \code{...}, and adds remaining (non-duplicated) #' columns from \code{...} to \code{data}. #' \cr \cr #' \code{add_id()} simply adds an ID-column to the data frame, with values #' from 1 to \code{nrow(data)}, respectively for grouped data frames, values #' from 1 to group size. See 'Examples'. #' #' @param data A data frame. For \code{add_columns()}, will be bound after data #' frames specified in \code{...}. For \code{replace_columns()}, duplicated #' columns in \code{data} will be replaced by columns in \code{...}. #' @param ... More data frames to combine, resp. more data frames with columns #' that should replace columns in \code{data}. #' @param replace Logical, if \code{TRUE} (default), columns in \code{...} with #' identical names in \code{data} will replace the columns in \code{data}. #' The order of columns after replacing is preserved. #' @param add.unique Logical, if \code{TRUE} (default), remaining columns in #' \code{...} that did not replace any column in \code{data}, are appended #' as new columns to \code{data}. #' @param var Name of new the ID-variable. #' #' @return For \code{add_columns()}, a data frame, where columns of \code{data} #' are appended after columns of \code{...}. #' \cr \cr #' For \code{replace_columns()}, a data frame where columns in \code{data} #' will be replaced by identically named columns in \code{...}, and remaining #' columns from \code{...} will be appended to \code{data} (if #' \code{add.unique = TRUE}). #' \cr \cr #' For \code{add_id()}, a new column with ID numbers. This column is always #' the first column in the returned data frame. #' #' @note For \code{add_columns()}, by default, columns in \code{data} with #' identical names like columns in one of the data frames in \code{...} #' will be dropped (i.e. variables with identical names in \code{...} will #' replace existing variables in \code{data}). 
Use \code{replace = FALSE} to #' keep all columns. Identical column names will then be renamed, to ensure #' unique column names (which happens by default when using #' \code{\link[dplyr:bind]{dplyr::bind_cols()}}). When replacing columns, replaced columns #' are not added to the end of the data frame. Rather, the original order of #' columns will be preserved. #' #' @examples #' data(efc) #' d1 <- efc[, 1:3] #' d2 <- efc[, 4:6] #' #' if (require("dplyr") && require("sjlabelled")) { #' head(bind_cols(d1, d2)) #' add_columns(d1, d2) %>% head() #' #' d1 <- efc[, 1:3] #' d2 <- efc[, 2:6] #' #' add_columns(d1, d2, replace = TRUE) %>% head() #' add_columns(d1, d2, replace = FALSE) %>% head() #' #' # use case: we take the original data frame, select specific #' # variables and do some transformations or recodings #' # (standardization in this example) and add the new, transformed #' # variables *to the end* of the original data frame #' efc %>% #' select(e17age, c160age) %>% #' std() %>% #' add_columns(efc) %>% #' head() #' #' # new variables with same name will overwrite old variables #' # in "efc". order of columns is not changed. 
#' @export
add_columns <- function(data, ..., replace = TRUE) {
  # evaluate dots
  .dots <- match.call(expand.dots = FALSE)$`...`

  # if add_columns had no arguments, .dots are NULL
  # this crashes R when calling bind_cols
  if (is.null(.dots)) {
    stop("You must specify at least one more data frame with columns to add.", call. = FALSE)
  }

  # check for identical column names
  tmp <- dplyr::bind_cols(...)
  doubles <- colnames(tmp) %in% colnames(data)

  # keep order? non-duplicated columns first, duplicated ones after;
  # used below to restore the original column positions of `data`
  reihenfolge <- c(which(!doubles), which(doubles))

  # remove duplicate column names, if requested
  if (replace && any(doubles)) tmp <- tmp[, !doubles, drop = FALSE]

  # bind all data; columns from `...` come first, `data` last
  x <- dplyr::bind_cols(tmp, data)

  # restore order
  if (replace) {
    # check for correct length. if "data" had duplicated variables,
    # but not all variable are duplicates, add indices of regular values
    if (ncol(x) > length(reihenfolge)) {
      # get remaining indices
      xl <- seq_len(ncol(x))[-seq_len(length(reihenfolge))]
      # add to "reihenfolge"
      reihenfolge <- c(reihenfolge, xl)
    }
    # sort data frame
    x <- x[, order(reihenfolge)]
  }

  x
}

#' @rdname add_columns
#' @export
replace_columns <- function(data, ..., add.unique = TRUE) {
  # evaluate dots
  .dots <- match.call(expand.dots = FALSE)$`...`

  # if replace_columns had no arguments, .dots are NULL
  # this crashes R when calling bind_cols
  if (is.null(.dots)) {
    stop("You must specify at least one more data frame with columns to add.", call. = FALSE)
  }

  # bind all data frames to one
  tmp <- dplyr::bind_cols(...)

  # check for identical column names
  data.doubles <- colnames(data) %in% colnames(tmp)
  tmp.doubles <- colnames(tmp) %in% colnames(data)

  # replace duplicate variables in "data" with duplicates from "..."
  # NOTE(review): the duplicated columns are matched by position, not by
  # name - if they appear in a different order in `...` than in `data`,
  # values end up in the wrong columns; verify
  data[, data.doubles] <- tmp[, tmp.doubles, drop = FALSE]

  # add remaining columns that were not duplicates
  if (add.unique)
    x <- dplyr::bind_cols(data, tmp[, !tmp.doubles, drop = FALSE])
  else
    x <- data

  x
}

#' @rdname add_columns
#' @export
add_id <- function(data, var = "ID") {
  # `data` must be a (possibly grouped) data frame; row_number() counts
  # within groups for grouped data frames
  if (!is.data.frame(data)) stop("`data` must be a data frame.", call. = FALSE)

  # NOTE(review): if `data` already contains a column named "id",
  # mutate() overwrites that column in place, and the rename below then
  # targets the wrong (last) column - verify
  x <- dplyr::mutate(data, id = dplyr::row_number())
  colnames(x)[ncol(x)] <- var

  # put the ID column first, followed by the original data
  dplyr::bind_cols(x[, ncol(x), drop = FALSE], data)
}
#' #' @examples #' trim("white space at end ") #' trim(" white space at start and end ") #' trim(c(" string1 ", " string2", "string 3 ")) #' #' tmp <- data.frame(a = c(" string1 ", " string2", "string 3 "), #' b = c(" strong one ", " string two", " third string "), #' c = c(" str1 ", " str2", "str3 ")) #' tmp #' trim(tmp) #' #' @export trim <- function(x) { UseMethod("trim") } #' @export trim.data.frame <- function(x) { as.data.frame(lapply(x, FUN = trim_helper)) } #' @export trim.list <- function(x) { lapply(x, FUN = trim_helper) } #' @export trim.default <- function(x) { trim_helper(x) } trim_helper <- function(x) gsub("^\\s+|\\s+$", "", x) sjmisc/R/select_helpers.R0000644000176200001440000000154514046746443015110 0ustar liggesusersstring_starts_with <- function(pattern, x) { pattern <- paste0("^\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_contains <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E") grep(pattern, x, perl = TRUE) } string_ends_with <- function(pattern, x) { pattern <- paste0("\\Q", pattern, "\\E$") grep(pattern, x, perl = TRUE) } string_one_of <- function(pattern, x) { m <- unlist(purrr::map(pattern, ~ grep(., x, fixed = TRUE, useBytes = TRUE))) x[m] } rownames_as_column <- function(x, var = "rowname") { rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE) x <- cbind(rn, x) colnames(x)[1] <- var rownames(x) <- NULL x } obj_has_name <- function(x, name) { name %in% names(x) } obj_has_rownames <- function(x) { !identical(as.character(1:nrow(x)), rownames(x)) } sjmisc/R/zap_inf.R0000644000176200001440000000253014046746443013530 0ustar liggesusers#' @title Convert infiite or NaN values into regular NA #' @name zap_inf #' #' @description Replaces all infinite (\code{Inf} and \code{-Inf}) or \code{NaN} #' values with regular \code{NA}. #' #' @param x A vector or a data frame. #' #' @inheritParams to_dummy #' #' @return \code{x}, where all \code{Inf}, \code{-Inf} and \code{NaN} are converted to \code{NA}. 
#' #' @examples #' x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) #' zap_inf(x) #' #' data(efc) #' # produce some NA and NaN values #' efc$e42dep[1] <- NaN #' efc$e42dep[2] <- NA #' efc$c12hour[1] <- NaN #' efc$c12hour[2] <- NA #' efc$e17age[2] <- NaN #' efc$e17age[1] <- NA #' #' # only zap NaN for c12hour #' zap_inf(efc$c12hour) #' #' # only zap NaN for c12hour and e17age, not for e42dep, #' # but return complete data framee #' zap_inf(efc, c12hour, e17age) #' #' # zap NaN for complete data frame #' zap_inf(efc) #' @export zap_inf <- function(x, ...) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) if (is.data.frame(x)) { # iterate variables of data frame for (i in colnames(.dat)) { # convert NaN and Inf to missing x[[i]][is.nan(x[[i]])] <- NA x[[i]][is.infinite(x[[i]])] <- NA } } else { x[is.nan(x)] <- NA x[is.infinite(x)] <- NA } x } sjmisc/R/str_contains.R0000644000176200001440000001040214046746443014605 0ustar liggesusers#' @title Check if string contains pattern #' @name str_contains #' @description This functions checks whether a string or character vector #' \code{x} contains the string \code{pattern}. By default, #' this function is case sensitive. #' #' @param x Character string where matches are sought. May also be a #' character vector of length > 1 (see 'Examples'). #' @param pattern Character string to be matched in \code{x}. May also be a #' character vector of length > 1 (see 'Examples'). #' @param ignore.case Logical, whether matching should be case sensitive or not. #' @param logic Indicates whether a logical combination of multiple search pattern #' should be made. #' \itemize{ #' \item Use \code{"or"}, \code{"OR"} or \code{"|"} for a logical or-combination, i.e. at least one element of \code{pattern} is in \code{x}. #' \item Use \code{"and"}, \code{"AND"} or \code{"&"} for a logical AND-combination, i.e. all elements of \code{pattern} are in \code{x}. 
#' \item Use \code{"not"}, \code{"NOT"} or \code{"!"} for a logical NOT-combination, i.e. no element of \code{pattern} is in \code{x}. #' \item By default, \code{logic = NULL}, which means that \code{TRUE} or \code{FALSE} is returned for each element of \code{pattern} separately. #' } #' @param switch Logical, if \code{TRUE}, \code{x} will be sought in each element #' of \code{pattern}. If \code{switch = TRUE}, \code{x} needs to be of #' length 1. #' #' @return \code{TRUE} if \code{x} contains \code{pattern}. #' #' @details This function iterates all elements in \code{pattern} and #' looks for each of these elements if it is found in #' \emph{any} element of \code{x}, i.e. which elements #' of \code{pattern} are found in the vector \code{x}. #' \cr \cr #' Technically, it iterates \code{pattern} and calls #' \code{grep(x, pattern[i], fixed = TRUE)} for each element #' of \code{pattern}. If \code{switch = TRUE}, it iterates #' \code{pattern} and calls \code{grep(pattern[i], x, fixed = TRUE)} #' for each element of \code{pattern}. Hence, in the latter case #' (if \code{switch = TRUE}), \code{x} must be of length 1. #' #' #' #' @examples #' str_contains("hello", "hel") #' str_contains("hello", "hal") #' #' str_contains("hello", "Hel") #' str_contains("hello", "Hel", ignore.case = TRUE) #' #' # which patterns are in "abc"? #' str_contains("abc", c("a", "b", "e")) #' #' # is pattern in any element of 'x'? #' str_contains(c("def", "abc", "xyz"), "abc") #' # is "abcde" in any element of 'x'? #' str_contains(c("def", "abc", "xyz"), "abcde") # no... #' # is "abc" in any of pattern? #' str_contains("abc", c("defg", "abcde", "xyz12"), switch = TRUE) #' #' str_contains(c("def", "abcde", "xyz"), c("abc", "123")) #' #' # any pattern in "abc"? #' str_contains("abc", c("a", "b", "e"), logic = "or") #' #' # all patterns in "abc"? #' str_contains("abc", c("a", "b", "e"), logic = "and") #' str_contains("abc", c("a", "b"), logic = "and") #' #' # no patterns in "abc"? 
#' str_contains("abc", c("a", "b", "e"), logic = "not") #' str_contains("abc", c("d", "e", "f"), logic = "not") #' @export str_contains <- function(x, pattern, ignore.case = FALSE, logic = NULL, switch = FALSE) { # check if correct length when switching if (switch && length(x) > 1) { warning("`x` must be of length 1 when `switch = TRUE`. First element will be used.", call. = FALSE) x <- x[1] } # counter for matches cnt <- c() # ignore case for x and pattern if (ignore.case) { x <- tolower(x) pattern <- tolower(pattern) } # iterate patterns for (k in pattern) { # append result if (switch) cnt <- c(cnt, !sjmisc::is_empty(grep(x, k, fixed = TRUE))) else cnt <- c(cnt, !sjmisc::is_empty(grep(k, x, fixed = TRUE))) } # which logical combination? if (is.null(logic)) return(cnt) else if (logic %in% c("or", "OR", "|")) return(any(cnt)) else if (logic %in% c("and", "AND", "&")) return(all(cnt)) else if (logic %in% c("not", "NOT", "!")) return(!any(cnt)) return(cnt) } sjmisc/R/typical.R0000644000176200001440000001332614046746443013554 0ustar liggesusers#' @title Return the typical value of a vector #' @name typical_value #' #' @description This function returns the "typical" value of a variable. #' #' #' @param x A variable. #' @param fun Character vector, naming the function to be applied to #' \code{x}. Currently, \code{"mean"}, \code{"weighted.mean"}, #' \code{"median"} and \code{"mode"} are supported, which call the #' corresponding R functions (except \code{"mode"}, which calls an #' internal function to compute the most common value). \code{"zero"} #' simply returns 0. \strong{Note:} By default, if \code{x} is a factor, #' only \code{fun = "mode"} is applicable; for all other functions (including #' the default, \code{"mean"}) the reference level of \code{x} is returned. #' For character vectors, only the mode is returned. 
You can use a named #' vector to apply other different functions to integer, numeric and categorical #' \code{x}, where factors are first converted to numeric vectors, e.g. #' \code{fun = c(numeric = "median", factor = "mean")}. See 'Examples'. #' @param weights Name of variable in \code{x} that indicated the vector of #' weights that will be applied to weight all observations. Default is #' \code{NULL}, so no weights are used. #' @param ... Further arguments, passed down to \code{fun}. #' #' @return The "typical" value of \code{x}. #' #' @details By default, for numeric variables, \code{typical_value()} returns the #' mean value of \code{x} (unless changed with the \code{fun}-argument). #' \cr \cr #' For factors, the reference level is returned or the most common value #' (if \code{fun = "mode"}), unless \code{fun} is a named vector. If #' \code{fun} is a named vector, specify the function for integer, numeric #' and categorical variables as element names, e.g. #' \code{fun = c(integer = "median", factor = "mean")}. In this case, #' factors are converted to numeric values (using \code{\link{to_value}}) #' and the related function is applied. You may abbreviate the names #' \code{fun = c(i = "median", f = "mean")}. See also 'Examples'. #' \cr \cr #' For character vectors the most common value (mode) is returned. 
#' #' @examples #' data(iris) #' typical_value(iris$Sepal.Length) #' #' library(purrr) #' map(iris, ~ typical_value(.x)) #' #' # example from ?stats::weighted.mean #' wt <- c(5, 5, 4, 1) / 15 #' x <- c(3.7, 3.3, 3.5, 2.8) #' #' typical_value(x, fun = "weighted.mean") #' typical_value(x, fun = "weighted.mean", weights = wt) #' #' # for factors, return either reference level or mode value #' set.seed(123) #' x <- sample(iris$Species, size = 30, replace = TRUE) #' typical_value(x) #' typical_value(x, fun = "mode") #' #' # for factors, use a named vector to apply other functions than "mode" #' map(iris, ~ typical_value(.x, fun = c(n = "median", f = "mean"))) #' @export typical_value <- function(x, fun = "mean", weights = NULL, ...) { # check if we have named vectors and find the requested function # for special functions for factors, convert to numeric first fnames <- names(fun) if (!is.null(fnames)) { if (is.integer(x)) { fun <- fun[which(fnames %in% c("integer", "i"))] x <- as.numeric(x) } else if (is.numeric(x)) { fun <- fun[which(fnames %in% c("numeric", "n"))] } else if (is.factor(x)) { fun <- fun[which(fnames %in% c("factor", "f"))] if (fun != "mode") x <- to_value(x, keep.labels = FALSE) } } if (!(fun %in% c("mean", "median", "mode", "weighted.mean", "zero"))) stop("`fun` must be one of \"mean\", \"median\", \"mode\", \"weighted.mean\" or \"zero\".", call. = FALSE) # for weighted mean, check that weights are of same length as x if (fun == "weighted.mean" && !is.null(weights)) { # make sure weights and x have same length if (length(weights) != length(x)) { # if not, tell user and change function to mean warning("Vector of weights is of different length than `x`. Using `mean` as function for typical value.", call. = FALSE) fun <- "mean" } # make sure weights are differen from 1 if (all(weights == 1)) { # if not, tell user and change function to mean warning("All weight values are `1`. Using `mean` as function for typical value.", call. 
= FALSE) fun <- "mean" } } # no weights, than use normal mean function if (fun == "weighted.mean" && is.null(weights)) fun <- "mean" if (fun == "median") myfun <- get("median", asNamespace("stats")) else if (fun == "weighted.mean") myfun <- get("weighted.mean", asNamespace("stats")) else if (fun == "mode") myfun <- get("mode_value", asNamespace("sjmisc")) else if (fun == "zero") return(0) else myfun <- get("mean", asNamespace("base")) if (is.integer(x)) { stats::median(x, na.rm = TRUE) } else if (is.numeric(x)) { if (fun == "weighted.mean") do.call(myfun, args = list(x = x, na.rm = TRUE, w = weights, ...)) else do.call(myfun, args = list(x = x, na.rm = TRUE, ...)) } else if (is.factor(x)) { if (fun != "mode") levels(x)[1] else mode_value(x) } else { mode_value(x) } } mode_value <- function(x, ...) { # create frequency table, to find most common value counts <- table(x) modus <- names(counts)[max(counts) == counts] # in case all values appear equally often, use first value if (length(modus) > 1) modus <- modus[1] # check if it's numeric if (!is.na(suppressWarnings(as.numeric(modus)))) as.numeric(modus) else modus } sjmisc/R/row_sums.R0000644000176200001440000001704514046746443013767 0ustar liggesusers#' @title Row sums and means for data frames #' @name row_sums #' #' @description \code{row_sums()} and \code{row_means()} compute row sums or means #' for at least \code{n} valid values per row. The functions are designed #' to work nicely within a pipe-workflow and allow select-helpers #' for selecting variables. #' #' @param n May either be #' \itemize{ #' \item a numeric value that indicates the amount of valid values per row to calculate the row mean or sum; #' \item a value between 0 and 1, indicating a proportion of valid values per row to calculate the row mean or sum (see 'Details'). #' \item or \code{Inf}. If \code{n = Inf}, all values per row must be non-missing to compute row mean or sum. #' } #' If a row's sum of valid (i.e. 
non-\code{NA}) values is less than \code{n}, \code{NA} will be returned as value for the row mean or sum. #' @param var Name of new the variable with the row sums or means. #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return For \code{row_sums()}, a data frame with a new variable: the row sums from #' \code{x}; for \code{row_means()}, a data frame with a new variable: the row #' means from \code{x}. If \code{append = FALSE}, only the new variable #' with row sums resp. row means is returned. \code{total_mean()} returns #' the mean of all values from all specified columns in a data frame. #' #' @details For \code{n}, must be a numeric value from \code{0} to \code{ncol(x)}. If #' a \emph{row} in \code{x} has at least \code{n} non-missing values, the #' row mean or sum is returned. If \code{n} is a non-integer value from 0 to 1, #' \code{n} is considered to indicate the proportion of necessary non-missing #' values per row. E.g., if \code{n = .75}, a row must have at least \code{ncol(x) * n} #' non-missing values for the row mean or sum to be calculated. See 'Examples'. #' #' @examples #' data(efc) #' efc %>% row_sums(c82cop1:c90cop9, n = 3, append = FALSE) #' #' library(dplyr) #' row_sums(efc, contains("cop"), n = 2, append = FALSE) #' #' dat <- data.frame( #' c1 = c(1,2,NA,4), #' c2 = c(NA,2,NA,5), #' c3 = c(NA,4,NA,NA), #' c4 = c(2,3,7,8), #' c5 = c(1,7,5,3) #' ) #' dat #' #' row_means(dat, n = 4) #' row_sums(dat, n = 4) #' #' row_means(dat, c1:c4, n = 4) #' # at least 40% non-missing #' row_means(dat, c1:c4, n = .4) #' row_sums(dat, c1:c4, n = .4) #' #' # total mean of all values in the data frame #' total_mean(dat) #' #' # create sum-score of COPE-Index, and append to data #' efc %>% #' select(c82cop1:c90cop9) %>% #' row_sums(n = 1) #' #' # if data frame has only one column, this column is returned #' row_sums(dat[, 1, drop = FALSE], n = 0) #' #' @export row_sums <- function(x, ...) 
{ UseMethod("row_sums") }

#' @rdname row_sums
#' @export
row_sums.default <- function(x, ..., n, var = "rowsums", append = TRUE) {
  # evaluate dots and select the columns to operate on
  .dat <- get_dot_data(x, dplyr::quos(...))
  # remember original data, if user wants to bind columns
  orix <- x

  if (is.data.frame(x)) {
    # for Inf-values, require all columns to be non-missing
    if (is.infinite(n)) n <- ncol(.dat)

    # is 'n' indicating a proportion? (non-integer value between 0 and 1)
    digs <- n %% 1
    if (digs != 0) n <- round(ncol(.dat) * digs)

    # check if we have a data frame with at least two columns
    if (ncol(.dat) < 2) {
      # FIX: message() has no `call.` argument; the former
      # `message(..., call. = TRUE)` pasted "TRUE" into the message text.
      message("No row-sums are returned, because `x` has only one column.")
      colnames(orix) <- var
      return(orix)
    }

    # n may not be larger than the number of columns in the data frame
    if (ncol(.dat) < n) {
      # FIX: use call. = FALSE, consistent with all other warnings in this package
      warning("`n` must be smaller or equal to number of columns in data frame.", call. = FALSE)
      return(orix)
    }

    # row sum, but only if at least `n` values per row are non-missing
    rs <- apply(.dat, 1, function(x) ifelse(sum(!is.na(x)) >= n, sum(x, na.rm = TRUE), NA))
  } else {
    stop("`x` must be a data frame.", call. = FALSE)
  }

  # to data frame, and rename variable
  rs <- as.data.frame(rs)
  colnames(rs) <- var

  # append to original data, if requested
  if (append) rs <- dplyr::bind_cols(orix, rs)

  rs
}

#' @rdname row_sums
#' @export
row_sums.mids <- function(x, ..., var = "rowsums", append = TRUE) {
  rfun <- rowSums
  row_mids(x = x, ..., var = var, append = append, rfun = rfun)
}

#' @rdname row_sums
#' @export
row_means <- function(x, ...) {
  UseMethod("row_means")
}

#' @rdname row_sums
#' @export
total_mean <- function(x, ...) {
  UseMethod("total_mean")
}

#' @export
total_mean.data.frame <- function(x, ...)
{
  # evaluate dots and select the columns to operate on
  .dat <- get_dot_data(x, dplyr::quos(...))
  # grand mean: sum of all non-missing values divided by their count
  sum(colSums(.dat, na.rm = TRUE)) / sum(apply(.dat, 1:2, function(x) !is.na(x)))
}

#' @rdname row_sums
#' @export
row_means.default <- function(x, ..., n, var = "rowmeans", append = TRUE) {
  # evaluate dots and select the columns to operate on
  .dat <- get_dot_data(x, dplyr::quos(...))
  # remember original data, if user wants to bind columns
  orix <- x

  if (is.data.frame(x)) {
    # for Inf-values, require all columns to be non-missing
    if (is.infinite(n)) n <- ncol(.dat)

    # is 'n' indicating a proportion? (non-integer value between 0 and 1)
    digs <- n %% 1
    if (digs != 0) n <- round(ncol(.dat) * digs)

    # check if we have a data frame with at least two columns
    if (ncol(.dat) < 2) {
      # FIX: message() has no `call.` argument; the former
      # `message(..., call. = TRUE)` pasted "TRUE" into the message text.
      message("No row-means are returned, because `x` has only one column.")
      colnames(orix) <- var
      return(orix)
    }

    # n may not be larger than the number of columns in the data frame
    if (ncol(.dat) < n) {
      # FIX: use call. = FALSE, consistent with all other warnings in this package
      warning("`n` must be smaller or equal to number of columns in data frame.", call. = FALSE)
      return(orix)
    }

    # row mean, but only if at least `n` values per row are non-missing
    rm <- apply(.dat, 1, function(x) ifelse(sum(!is.na(x)) >= n, mean(x, na.rm = TRUE), NA))
  } else {
    stop("`x` must be a data frame.", call. = FALSE)
  }

  # to data frame, and rename variable
  rm <- as.data.frame(rm)
  colnames(rm) <- var

  # append to original data, if requested
  if (append) rm <- dplyr::bind_cols(orix, rm)

  rm
}

#' @rdname row_sums
#' @export
row_means.mids <- function(x, ..., var = "rowmeans", append = TRUE) {
  rfun <- rowMeans
  row_mids(x = x, ..., var = var, append = append, rfun = rfun)
}

# shared worker for the `mids`-methods: applies `rfun` (rowSums/rowMeans/...)
# within each imputed dataset of a mice::mids object
row_mids <- function(x, ..., var, append, rfun, count = NULL) {
  # check if suggested package is available
  if (!requireNamespace("mice", quietly = TRUE))
    stop("Package `mice` needed for this function to work. Please install it.", call. = FALSE)

  # check classes
  if (!inherits(x, "mids"))
    stop("`x` must be a `mids`-object, as returned by the `mice()`-function.", call. = FALSE)

  # quote dots and convert mids into long-data.frame
  vars <- dplyr::quos(...)
long <- mice::complete(x, action = "long", include = TRUE) # group by imputation, so we can easily iterate each imputed dataset ndf <- long %>% dplyr::group_by(.data$.imp) %>% .nest() # select variable and compute rowsums. add this variable # to each imputed if (is.null(count)) { ndf$data <- purrr::map(ndf$data, ~ mutate( .x, rowsums = .x %>% dplyr::select(!!! vars) %>% rfun() )) } else { ndf$data <- purrr::map(ndf$data, ~ mutate( .x, rowsums = .x %>% dplyr::select(!!! vars) %>% rfun(count = count) )) } # rename new variable ndf$data <- purrr::map(ndf$data, function(.x) { colnames(.x)[ncol(.x)] <- var .x }) # return mids-object. need to use "as.data.frame()", # because "as.mids()" can't cope with tibbles ndf %>% .unnest() %>% as.data.frame() %>% mice::as.mids() } sjmisc/R/group_var.R0000644000176200001440000002444114046746443014113 0ustar liggesusers#' @title Recode numeric variables into equal-ranged groups #' @name group_var #' #' @description Recode numeric variables into equal ranged, grouped factors, #' i.e. a variable is cut into a smaller number of groups, where each group #' has the same value range. \code{group_labels()} creates the related value #' labels. \code{group_var_if()} and \code{group_labels_if()} are scoped #' variants of \code{group_var()} and \code{group_labels()}, where grouping #' will be applied only to those variables that match the logical condition #' of \code{predicate}. #' #' @seealso \code{\link{split_var}} to split variables into equal sized groups, #' \code{\link{group_str}} for grouping string vectors or #' \code{\link{rec_pattern}} and \code{\link{rec}} for another convenient #' way of recoding variables into smaller groups. #' #' @param size Numeric; group-size, i.e. the range for grouping. By default, #' for each 5 categories of \code{x} a new group is defined, i.e. \code{size = 5}. 
#' Use \code{size = "auto"} to automatically resize a variable into a maximum #' of 30 groups (which is the ggplot-default grouping when plotting #' histograms). Use \code{n} to determine the amount of groups. #' @param right.interval Logical; if \code{TRUE}, grouping starts with the lower #' bound of \code{size}. See 'Details'. #' @param n Sets the maximum number of groups that are defined when auto-grouping is on #' (\code{size = "auto"}). Default is 30. If \code{size} is not set to \code{"auto"}, #' this argument will be ignored. #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return #' \itemize{ #' \item For \code{group_var()}, a grouped variable, either as numeric or as factor (see paramter \code{as.num}). If \code{x} is a data frame, only the grouped variables will be returned. #' \item For \code{group_labels()}, a string vector or a list of string vectors containing labels based on the grouped categories of \code{x}, formatted as "from lower bound to upper bound", e.g. \code{"10-19" "20-29" "30-39"} etc. See 'Examples'. #' } #' #' @note Variable label attributes (see, for instance, #' \code{\link[sjlabelled]{set_label}}) are preserved. Usually you should use #' the same values for \code{size} and \code{right.interval} in #' \code{group_labels()} as used in the \code{group_var} function if you want #' matching labels for the related recoded variable. #' #' @details If \code{size} is set to a specific value, the variable is recoded #' into several groups, where each group has a maximum range of \code{size}. #' Hence, the amount of groups differ depending on the range of \code{x}. #' \cr \cr #' If \code{size = "auto"}, the variable is recoded into a maximum of #' \code{n} groups. Hence, independent from the range of #' \code{x}, always the same amount of groups are created, so the range #' within each group differs (depending on \code{x}'s range). #' \cr \cr #' \code{right.interval} determins which boundary values to include when #' grouping is done. 
If \code{TRUE}, grouping starts with the \strong{lower #' bound} of \code{size}. For example, having a variable ranging from #' 50 to 80, groups cover the ranges from 50-54, 55-59, 60-64 etc. #' If \code{FALSE} (default), grouping starts with the \code{upper bound} #' of \code{size}. In this case, groups cover the ranges from #' 46-50, 51-55, 56-60, 61-65 etc. \strong{Note:} This will cover #' a range from 46-50 as first group, even if values from 46 to 49 #' are not present. See 'Examples'. #' \cr \cr #' If you want to split a variable into a certain amount of equal #' sized groups (instead of having groups where values have all the same #' range), use the \code{\link{split_var}} function! #' \cr \cr #' \code{group_var()} also works on grouped data frames (see \code{\link[dplyr]{group_by}}). #' In this case, grouping is applied to the subsets of variables #' in \code{x}. See 'Examples'. #' #' #' @examples #' age <- abs(round(rnorm(100, 65, 20))) #' age.grp <- group_var(age, size = 10) #' hist(age) #' hist(age.grp) #' #' age.grpvar <- group_labels(age, size = 10) #' table(age.grp) #' print(age.grpvar) #' #' # histogram with EUROFAMCARE sample dataset #' # variable not grouped #' library(sjlabelled) #' data(efc) #' hist(efc$e17age, main = get_label(efc$e17age)) #' #' # bar plot with EUROFAMCARE sample dataset #' # grouped variable #' ageGrp <- group_var(efc$e17age) #' ageGrpLab <- group_labels(efc$e17age) #' barplot(table(ageGrp), main = get_label(efc$e17age), names.arg = ageGrpLab) #' #' # within a pipe-chain #' library(dplyr) #' efc %>% #' select(e17age, c12hour, c160age) %>% #' group_var(size = 20) #' #' # create vector with values from 50 to 80 #' dummy <- round(runif(200, 50, 80)) #' # labels with grouping starting at lower bound #' group_labels(dummy) #' # labels with grouping startint at upper bound #' group_labels(dummy, right.interval = TRUE) #' #' # works also with gouped data frames #' mtcars %>% #' group_var(disp, size = 4, append = FALSE) %>% #' table() 
#' #' mtcars %>% #' group_by(cyl) %>% #' group_var(disp, size = 4, append = FALSE) %>% #' table() #' @export group_var <- function(x, ..., size = 5, as.num = TRUE, right.interval = FALSE, n = 30, append = TRUE, suffix = "_gr") { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) recode_fun( x = x, .dat = .dat, fun = get("g_v_helper", asNamespace("sjmisc")), suffix = suffix, append = append, groupsize = size, as.num = as.num, right.interval = right.interval, groupcount = n ) } #' @rdname group_var #' @export group_var_if <- function(x, predicate, size = 5, as.num = TRUE, right.interval = FALSE, n = 30, append = TRUE, suffix = "_gr") { # select variables that match logical conditions .dat <- dplyr::select_if(x, .predicate = predicate) # if no variable matches the condition specified # in predicate, return original data if (sjmisc::is_empty(.dat)) { if (append) return(x) else return(.dat) } recode_fun( x = x, .dat = .dat, fun = get("g_v_helper", asNamespace("sjmisc")), suffix = suffix, append = append, groupsize = size, as.num = as.num, right.interval = right.interval, groupcount = n ) } g_v_helper <- function(x, groupsize, as.num, right.interval, groupcount) { # do we have labels? varlab <- sjlabelled::get_label(x) # group variable x <- group_helper(x, groupsize, right.interval, groupcount) # set new levels of grouped variable levels(x) <- seq_len(nlevels(x)) # convert to numeric? 
if (as.num) x <- as.numeric(as.character(x)) # set back variable labels if (!is.null(varlab)) x <- sjlabelled::set_label(x, label = varlab) x } #' @rdname group_var #' @export group_labels <- function(x, ..., size = 5, right.interval = FALSE, n = 30) { # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) gl_fun( x = x, .dat = .dat, size = size, right.interval = right.interval, n = n ) } #' @rdname group_var #' @export group_labels_if <- function(x, predicate, size = 5, right.interval = FALSE, n = 30) { # select variables that match logical conditions .dat <- dplyr::select_if(x, .predicate = predicate) # if no variable matches the condition specified # in predicate, return original data if (sjmisc::is_empty(.dat)) { if (append) return(x) else return(.dat) } gl_fun( x = x, .dat = .dat, size = size, right.interval = right.interval, n = n ) } gl_fun <- function(x, .dat, size, right.interval, n) { if (is.data.frame(x)) { # iterate variables of data frame return( purrr::map(.dat, ~ g_l_helper( x = .x, groupsize = size, right.interval = right.interval, groupcount = n ))) } else { x <- g_l_helper( x = .dat, groupsize = size, right.interval = right.interval, groupcount = n ) } x } g_l_helper <- function(x, groupsize, right.interval, groupcount) { # do we have labels? 
varlab <- sjlabelled::get_label(x) # group variable x <- group_helper(x, groupsize, right.interval, groupcount) # Gruppen holen lvl <- levels(x) # rückgabewert init retval <- rep(c(""), length(lvl)) # alle Gruppierungen durchgehen for (i in seq_len(length(lvl))) { # Länge jedes Labels der Gruppeneinteilungen auslesen sublength <- nchar(lvl[i]) # "(" und "]", das bei "cut"-Funktion automatisch erstellt wird, # aus dem Label entfernen lvlstr <- substr(lvl[i], 2, sublength - 1) # Unter- und Obergrenze in jeweils einem string subs <- strsplit(lvlstr, ",") # Untergrenze als Zahlenwert lower <- as.numeric(subs[[1]][1]) # Obergrenze als Zahlenwert upper <- as.numeric(subs[[1]][2]) # Prüfen, welche Intervallgrenze ein- # und welche ausgeschlossen werden soll if (right.interval) { lower <- lower + 1 } else { upper <- upper - 1 } # Rückgabe des Strings retval[i] <- c(paste(lower, "-", upper, sep = "")) } # set back variable labels if (!is.null(varlab)) retval <- sjlabelled::set_label(retval, label = varlab) retval } group_helper <- function(x, groupsize, right.interval, groupcount) { # check if factor. factors need conversion # to numeric before grouped if (is.factor(x)) x <- sjlabelled::as_numeric(x, keep.labels = FALSE) # minimum range. will be changed when autogrouping minval <- 0 multip <- 2 # check for auto-grouping if (groupsize == "auto") { # determine groupsize, which is 1/30 of range size <- ceiling((max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) / groupcount) # reset groupsize groupsize <- as.numeric(size) # change minvalue minval <- min(x, na.rm = TRUE) multip <- 1 } # Einteilung der Variablen in Gruppen. 
Dabei werden unbenutzte # Faktoren gleich entfernt x <- droplevels(cut(x, breaks = c( seq(minval, max(x, na.rm = TRUE) + multip * groupsize, by = groupsize) ), right = right.interval)) x } sjmisc/R/str_pos.R0000644000176200001440000001362214046746443013577 0ustar liggesusers#' @title Find partial matching and close distance elements in strings #' @name str_find #' @description This function finds the element indices of partial matching or #' similar strings in a character vector. Can be used to find exact or #' slightly mistyped elements in a string vector. #' #' @seealso \code{\link{group_str}} #' #' @param string Character vector with string elements. #' @param pattern String that should be matched against the elements of \code{string}. #' @param partial Activates similar matching (close distance strings) for parts (substrings) #' of the \code{string}. Following values are accepted: #' \itemize{ #' \item 0 for no partial distance matching #' \item 1 for one-step matching, which means, only substrings of same length as \code{pattern} are extracted from \code{string} matching #' \item 2 for two-step matching, which means, substrings of same length as \code{pattern} as well as strings with a slightly wider range are extracted from \code{string} matching #' } #' Default value is 0. See 'Details' for more information. #' #' @inheritParams group_str #' #' @return A numeric vector with index position of elements in \code{string} that #' partially match or are similar to \code{pattern}. Returns \code{-1} if no #' match was found. #' #' @note This function does \emph{not} return the position of a matching string \emph{inside} #' another string, but the element's index of the \code{string} vector, where #' a (partial) match with \code{pattern} was found. Thus, searching for "abc" in #' a string "this is abc" will not return 9 (the start position of the substring), #' but 1 (the element index, which is always 1 if \code{string} only has one element). 
#' #' @details \strong{Computation Details} #' \cr \cr #' Fuzzy string matching is based on regular expressions, in particular #' \code{grep(pattern = "(){~}", x = string)}. This #' means, \code{precision} indicates the number of chars inside \code{pattern} #' that may differ in \code{string} to cosinder it as "matching". The higher #' \code{precision} is, the more tolerant is the search (i.e. yielding more #' possible matches). Furthermore, the higher the value for \code{partial} #' is, the more matches may be found. #' \cr \cr #' \strong{Partial Distance Matching} #' \cr \cr #' For \code{partial = 1}, a substring of \code{length(pattern)} is extracted #' from \code{string}, starting at position 0 in \code{string} until #' the end of \code{string} is reached. Each substring is matched against #' \code{pattern}, and results with a maximum distance of \code{precision} #' are considered as "matching". If \code{partial = 2}, the range #' of the extracted substring is increased by 2, i.e. the extracted substring #' is two chars longer and so on. 
#'
#' @examples
#' string <- c("Hello", "Helo", "Hole", "Apple", "Ape", "New", "Old", "System", "Systemic")
#' str_find(string, "hel") # partial match
#' str_find(string, "stem") # partial match
#' str_find(string, "R") # no match
#' str_find(string, "saste") # similarity to "System"
#'
#' # finds two indices, because partial matching now
#' # also applies to "Systemic"
#' str_find(string,
#'          "sytsme",
#'          partial = 1)
#'
#' # finds partial matching of similarity
#' str_find("We are Sex Pistols!", "postils")
#' @export
str_find <- function(string, pattern, precision = 2, partial = 0, verbose = FALSE) {
  # init return value
  indices <- c()

  # find element indices from partial matching of string and find term
  pos <- as.numeric(grep(pattern, string, ignore.case = TRUE))
  if (length(pos) > 0) indices <- c(indices, pos)

  # find element indices from similar strings
  pos <- which(sapply(tolower(string), function(.x) string_dist(tolower(pattern), .x) <= precision))
  if (length(pos) > 0) indices <- c(indices, pos)

  # find element indices from partial similar (distance)
  # string matching
  if (partial > 0) {
    ftlength <- nchar(pattern)

    # create progress bar
    if (verbose) pb <- utils::txtProgressBar(min = 0, max = length(string), style = 3)

    # iterate search string vector
    for (ssl in seq_len(length(string))) {
      # retrieve each element of search string vector
      # we do this step by step instead of vectorizing
      # due to the substring approach
      sst <- string[ssl]

      # we extract substrings of same length as pattern
      # starting from first char of string until end
      # and try to find similar matches
      steps <- nchar(sst) - ftlength + 1
      if (steps > 0) {
        for (pi in seq_len(steps)) {
          # retrieve substring
          sust <- trim(substr(sst, pi, pi + ftlength - 1))
          # find element indices from similar substrings
          pos <- which(string_dist(tolower(pattern), tolower(sust)) <= precision)
          if (length(pos) > 0) indices <- c(indices, ssl)
        }
      }

      if (partial > 1) {
        # 2nd loop picks longer substrings, because similarity
        # may also be present if length of strings differ
        # (e.g. "app" and "apple")
        steps <- nchar(sst) - ftlength
        if (steps > 1) {
          for (pi in 2:steps) {
            # retrieve substring
            sust <- trim(substr(sst, pi - 1, pi + ftlength))
            # find element indices from similar substrings
            pos <- which(string_dist(tolower(pattern), tolower(sust)) <= precision)
            if (length(pos) > 0) indices <- c(indices, ssl)
          }
        }
      }

      # update progress bar
      if (verbose) utils::setTxtProgressBar(pb, ssl)
    }

    # FIX: close the progress bar inside this branch only. The progress bar
    # `pb` is created only when `partial > 0`; the former unconditional
    # `if (verbose) close(pb)` after this block raised
    # "object 'pb' not found" for `verbose = TRUE` with `partial = 0`.
    if (verbose) close(pb)
  }

  # return result
  if (length(indices) > 0) return(sort(unique(indices)))
  return(-1)
}
sjmisc/R/var_type.R0000644000176200001440000000350314046746443013734 0ustar liggesusers#' @title Determine variable type
#' @name var_type
#'
#' @description This function returns the type of a variable as character. It
#' is similar to \code{pillar::type_sum()}, however, the
#' return value is not truncated, and \code{var_type()} works
#' on data frames and within pipe-chains.
#'
#' @param abbr Logical, if \code{TRUE}, returns a shortened, abbreviated value
#' for the variable type (as returned by \code{pillar::type_sum()}).
#' If \code{FALSE} (default), a longer "description" is returned.
#'
#' @inheritParams to_dummy
#'
#' @return The variable type of \code{x}, as character.
#'
#'
#' @examples
#' data(efc)
#'
#' var_type(1)
#' var_type(1L)
#' var_type("a")
#'
#' var_type(efc$e42dep)
#' var_type(to_factor(efc$e42dep))
#'
#' library(dplyr)
#' var_type(efc, contains("cop"))
#' @export
var_type <- function(x, ..., abbr = FALSE) {
  # get dot data: resolve tidyselect-style `...` into the columns to inspect
  x <- get_dot_data(x, dplyr::quos(...))

  # data frame: one type string per column; otherwise type of the vector itself
  if (is.data.frame(x))
    purrr::map_chr(x, ~ get_vt(.x, abbr = abbr))
  else
    get_vt(x, abbr = abbr)
}

# Internal worker: map a single vector to its type string. Factor/ordered/Date
# are checked first because typeof() would report them as "integer"/"double".
get_vt <- function(x, abbr) {
  if (is.ordered(x))
    vt <- "ord"
  else if (is.factor(x))
    vt <- "fct"
  else if (methods::is(x, "Date"))
    vt <- "date"
  else {
    # fall back to the storage mode; unknown types pass through unchanged
    vt <- switch(
      typeof(x),
      logical = "lgl",
      integer = "int",
      double = "dbl",
      character = "chr",
      complex = "cpl",
      closure = "fn",
      environment = "env",
      typeof(x)
    )
  }

  # expand abbreviations into long descriptions unless abbr = TRUE
  if (!abbr) {
    vt <- dplyr::case_when(
      vt == "ord" ~ "ordinal",
      vt == "fct" ~ "categorical",
      vt == "dbl" ~ "numeric",
      vt == "int" ~ "integer",
      vt == "chr" ~ "character",
      vt == "lbl" ~ "labelled",
      vt == "cpl" ~ "complex",
      TRUE ~ vt
    )
  }

  vt
}
sjmisc/R/reshape_helpers.R0000644000176200001440000000567414046746443015263 0ustar liggesusers#' @keywords internal
# Nest a grouped data frame: one row per group, with the non-key columns
# stored as a list-column of data frames (a minimal stand-in for tidyr::nest()).
.nest <- function(x, cn = "data") {
  # not grouped: nothing to nest
  if (!inherits(x, "grouped_df")) return(x)

  # get group indices and group keys from grouped df
  g <- .group_indices(x)
  k <- .group_keys(x)

  # create a factor with group indices, for "split()"
  f <- vector(mode = "numeric", length = nrow(x))
  for (i in 1:length(g)) {
    f[g[[i]]] <- i
  }

  # remove grouping variables (keys) from data frame
  # because these should not be nested
  data_to_group <- x[, setdiff(colnames(x), colnames(k)), drop = FALSE]

  # split data, and create a data frame with list-variable
  l <- split(data_to_group, f)
  dat <- data.frame(data = I(l))
  colnames(dat) <- cn

  # bind keys and nested data frames; keep group/row bookkeeping in
  # attributes so .unnest() can restore the original row order later
  nested_df <- cbind(k, dat)
  attr(nested_df, "groups") <- f
  attr(nested_df, "indices") <- unlist(g)
  nested_df
}

#' @keywords internal
# Inverse of .nest(): unpack the list-column back into a flat data frame.
.unnest <- function(x, cn = NULL, more_list_cols = NULL) {
  # get name of data column; default is the last column
  if (is.null(cn)) cn <- colnames(x)[ncol(x)]

  # iterate all
rows, i.e. all nested data frames # and add values from key-variables as variables, # so the key variables are also present in the final, # unnested data frame keys <- x[, setdiff(colnames(x), c(cn, more_list_cols)), drop = FALSE] for (i in 1:nrow(x)) { for (j in 1:length(keys)) { x[[cn]][[i]][[colnames(keys)[j]]] <- keys[i, j] } if (!is.null(more_list_cols)) x[[cn]][[i]][[more_list_cols]] <- x[[more_list_cols]][[i]] } # bind all data frames, and restore original order unnested_df <- do.call(rbind, x[[cn]]) rows <- attr(x, "indices", exact = TRUE) if (is.null(rows)) rows <- 1:nrow(unnested_df) unnested_df[order(rows), c(colnames(keys), setdiff(colnames(unnested_df), colnames(keys)))] } #' @keywords internal .group_indices <- function(x) { # dplyr >= 0.8.0 returns attribute "indices" grps <- attr(x, "groups", exact = TRUE) # dplyr < 0.8.0? if (is.null(grps)) { grps <- attr(x, "indices", exact = TRUE) } else { grps <- grps[[".rows"]] } grps } #' @keywords internal .group_keys <- function(x) { # dplyr >= 0.8.0 returns attribute "indices" grps <- attr(x, "groups", exact = TRUE) # dplyr < 0.8.0? if (is.null(grps)) { ## TODO fix for dplyr < 0.8 keys <- x[, attr(x, "vars", exact = TRUE), drop = FALSE] } else { keys <- grps[, setdiff(colnames(grps), ".rows")] } keys } #' @keywords internal .gather <- function(x, key = "key", value = "value", columns = colnames(x)) { if (is.numeric(columns)) columns <- colnames(x)[columns] dat <- stats::reshape( as.data.frame(x), idvar = "id", ids = row.names(x), times = columns, timevar = key, v.names = value, varying = list(columns), direction = "long" ) if (is.factor(dat[[value]])) dat[[value]] <- as.character(dat[[value]]) dat[, 1:(ncol(dat) - 1), drop = FALSE] } sjmisc/R/shorten_string.R0000644000176200001440000000327413500376602015146 0ustar liggesusers#' @title Shorten character strings #' @name shorten_string #' #' @description This function shortens strings that are longer than \code{max.length} #' chars, without cropping words. 
#'
#' @param s A string.
#' @param max.length Maximum length of chars for the string.
#' @param abbr String that will be used as suffix, if \code{s} was shortened.
#'
#' @return A shortened string.
#'
#' @details If the string length defined in \code{max.length} happens to be inside
#' a word, this word is removed from the returned string (see 'Examples'), so
#' the returned string has a \emph{maximum length} of \code{max.length}, but
#' might be shorter.
#'
#' @examples
#' s <- "This can be considered as very long string!"
#'
#' # string is shorter than max.length, so returned as is
#' shorten_string(s, 60)
#'
#' # string is shortened to as many words that result in
#' # a string of maximum 20 chars
#' shorten_string(s, 20)
#'
#' # string including "considered" is exactly of length 22 chars
#' shorten_string(s, 22)
#'
#' @export
shorten_string <- function(s, max.length = NULL, abbr = "...") {
  # check if labels should be truncated; NULL means "return unchanged"
  if (!is.null(max.length)) {
    # create pattern to find words up to number of max.length chars in vector
    pattern <- paste('(.{1,', max.length, '})(\\s|$)', sep = "")
    # I *hate* regular expressions and will never understand them...
    # sub() inserts `abbr` after the last complete word that fits; regexec()
    # then locates that suffix, and substr() keeps everything before it,
    # re-appending `abbr` so truncated strings end with the suffix.
    tmp <- paste0(substr(s, 0, unlist(regexec(
      abbr,
      sub(pattern, replacement = paste0("\\1", abbr), s),
      fixed = T
    )) - 1), abbr)
    # only replace strings that are longer than max.length
    too.long <- nchar(s) > max.length
    s[too.long] <- tmp[too.long]
  }
  s
}
sjmisc/R/spread_coef.R0000644000176200001440000002031614620405274014346 0ustar liggesusers#' @title Spread model coefficients of list-variables into columns
#' @name spread_coef
#'
#' @description This function extracts coefficients (and standard error and
#' p-values) of fitted model objects from (nested) data frames,
#' which are saved in a list-variable, and spreads the coefficients
#' into new columns.
#'
#' @param data A (nested) data frame with a list-variable that contains fitted
#' model objects (see 'Details').
#' @param model.column Name or index of the list-variable that contains the #' fitted model objects. #' @param model.term Optional, name of a model term. If specified, only this model #' term (including p-value) will be extracted from each model and #' added as new column. #' @param se Logical, if \code{TRUE}, standard errors for estimates will also be extracted. #' @param p.val Logical, if \code{TRUE}, p-values for estimates will also be extracted. #' @param append Logical, if \code{TRUE} (default), this function returns #' \code{data} with new columns for the model coefficients; else, #' a new data frame with model coefficients only are returned. #' #' @return A data frame with columns for each coefficient of the models #' that are stored in the list-variable of \code{data}; or, if #' \code{model.term} is given, a data frame with the term's estimate. #' If \code{se = TRUE} or \code{p.val = TRUE}, the returned data frame #' also contains columns for the coefficients' standard error and #' p-value. #' If \code{append = TRUE}, the columns are appended to \code{data}, #' i.e. \code{data} is also returned. #' #' @details This function requires a (nested) data frame (e.g. created by the #' \code{\link[tidyr]{nest}}-function of the \pkg{tidyr}-package), #' where several fitted models are saved in a list-variable (see #' 'Examples'). Since nested data frames with fitted models stored as list-variable #' are typically fit with an identical formula, all models have the same #' dependent and independent variables and only differ in their #' subsets of data. The function then extracts all coefficients from #' each model and saves each estimate in a new column. The result #' is a data frame, where each \emph{row} is a model with each #' model's coefficients in an own \emph{column}. 
#' #' @examples #' if (require("dplyr") && require("tidyr") && require("purrr")) { #' data(efc) #' #' # create nested data frame, grouped by dependency (e42dep) #' # and fit linear model for each group. These models are #' # stored in the list variable "models". #' model.data <- efc %>% #' filter(!is.na(e42dep)) %>% #' group_by(e42dep) %>% #' nest() %>% #' mutate( #' models = map(data, ~lm(neg_c_7 ~ c12hour + c172code, data = .x)) #' ) #' #' # spread coefficients, so we can easily access and compare the #' # coefficients over all models. arguments `se` and `p.val` default #' # to `FALSE`, when `model.term` is not specified #' spread_coef(model.data, models) #' spread_coef(model.data, models, se = TRUE) #' #' # select only specific model term. `se` and `p.val` default to `TRUE` #' spread_coef(model.data, models, c12hour) #' #' # spread_coef can be used directly within a pipe-chain #' efc %>% #' filter(!is.na(e42dep)) %>% #' group_by(e42dep) %>% #' nest() %>% #' mutate( #' models = map(data, ~lm(neg_c_7 ~ c12hour + c172code, data = .x)) #' ) %>% #' spread_coef(models) #' } #' @export spread_coef <- function(data, model.column, model.term, se, p.val, append = TRUE) { # check if we have a data frame if (!is.data.frame(data)) stop("`data` needs to be a data frame.", call. = FALSE) # evaluate arguments model.column <- deparse(substitute(model.column)) model.term <- deparse(substitute(model.term)) # check if variable is a list variable if (!is.list(data[[model.column]])) stop(sprintf("%s needs to be a list-variable.", model.column), call. = FALSE) # check for proper defaults, depending on return style if (missing(se)) se <- !sjmisc::is_empty(model.term) if (missing(p.val)) p.val <- !sjmisc::is_empty(model.term) # check if user just wants a specific model term # if yes, select this, and its p-value if (!sjmisc::is_empty(model.term)) { # validate model term, i.e. 
check if coefficient exists in models tmp <- summary(data[[model.column]][[1]])$coefficients %>% as.data.frame() %>% rownames_as_column("term") %>% var_rename( Estimate = "estimate", `Std. Error` = "std.error", `t value` = "statistic", `z value` = "statistic", `Pr(>|t|)` = "p.value", `Pr(>|z|)` = "p.value", verbose = FALSE ) # if term is no valid coefficient name, tell user, and make # suggestions of possibly meant correct terms if (model.term %nin% tmp$term) { pos <- str_find(string = tmp$term, pattern = model.term, partial = 1) if (length(pos) > 1 || pos != -1) { pos_str <- sprintf(" Did you mean (one of) `%s`?", paste(tmp$term[pos], collapse = ", ")) } else { pos_str <- "" } stop( sprintf( "`%s` is no valid model term.%s", model.term, pos_str ), call. = F ) } # select variables for output variables <- "estimate" if (se) variables <- c(variables, "std.error") if (p.val) variables <- c(variables, "p.value") # iterate list variable dat <- purrr::map_df(data[[model.column]], function(x) { # tidy model. for mixed effects, return fixed effects only tmp <- summary(x)$coefficients %>% as.data.frame() %>% rownames_as_column("term") %>% var_rename( Estimate = "estimate", `Std. Error` = "std.error", `t value` = "statistic", `z value` = "statistic", `Pr(>|t|)` = "p.value", `Pr(>|z|)` = "p.value", verbose = FALSE ) %>% # filter term dplyr::filter(.data$term == model.term) # just select estimate and p-value tmp <- dplyr::select( tmp, string_one_of(pattern = variables, x = colnames(tmp)) ) # set colnames colnames(tmp) <- c(model.term, variables[-1]) tmp }) } else { # iterate list variable dat <- purrr::map_df(data[[model.column]], function(x) { # tidy model. for mixed effects, return fixed effects only tmp <- summary(x)$coefficients %>% as.data.frame() %>% rownames_as_column("term") %>% var_rename( Estimate = "estimate", `Std. 
Error` = "std.error", `t value` = "statistic", `z value` = "statistic", `Pr(>|t|)` = "p.value", `Pr(>|z|)` = "p.value", verbose = FALSE ) # just select term name and estimate value df1 <- as.data.frame(t(tmp$estimate)) colnames(df1) <- tmp$term # columns for each data frame cols <- ncol(df1) # standard error also requested? if (se) { # just select term name and estimate value df2 <- as.data.frame(t(tmp$std.error)) colnames(df2) <- sprintf("%s.se", tmp$term) # bind together df1 <- dplyr::bind_cols(df1, df2) } # p-value also requested? if (p.val) { # just select term name and estimate value df3 <- as.data.frame(t(tmp$p.value)) colnames(df3) <- sprintf("%s.p", tmp$term) # bind together df1 <- dplyr::bind_cols(df1, df3) } # return sorted data frame df1[, unlist(lapply(1:cols, function(x) seq(from = 1, to = ncol(df1), by = cols) + x - 1))] }) } # bind result to original data frame if (append) dplyr::bind_cols(data, dat) else dat } sjmisc/R/word_wrap.R0000644000176200001440000000524014046746443014107 0ustar liggesusers#' @title Insert line breaks in long labels #' @name word_wrap #' #' @description Insert line breaks in long character strings. Useful if you want to wordwrap #' labels / titles for plots or tables. #' #' @param labels Label(s) as character string, where a line break should be #' inserted. Several strings may be passed as vector (see 'Examples'). #' @param wrap Maximum amount of chars per line (i.e. line length). If #' \code{wrap = Inf} or \code{wrap = 0}, no word wrap will be performed #' (i.e. \code{labels} will be returned as is). #' @param linesep By default, this argument is \code{NULL} and a regular new line #' string (\code{"\\n"}) is used. For HTML-purposes, for instance, \code{linesep} #' could be \code{"
"}. #' @return New label(s) with line breaks inserted at every \code{wrap}'s position. #' #' @examples #' word_wrap(c("A very long string", "And another even longer string!"), 10) #' #' message(word_wrap("Much too long string for just one line!", 15)) #' @export word_wrap <- function(labels, wrap, linesep = NULL) { # infinite wrap? then return labels if (is.infinite(wrap) | wrap == 0) return(labels) # expressions can't be wrapped if (is.expression(labels)) { warning("Word wrap is not available for expressions.") return(labels) } # check if labels have NA values and remove them if (anyNA(labels)) labels <- as.character(stats::na.omit(labels)) # check for valid value if (is.null(labels) || length(labels) == 0) return(NULL) # coerce to character, if factor if (!is.character(labels)) labels <- as.character(labels) # default line separator is \n if (is.null(linesep)) { linesep <- '\\1\n' lsub <- 0 ori.linesep <- '\n' } else { # however, for html-function we can use "
" # as argument lsub <- nchar(linesep) - 1 ori.linesep <- linesep linesep <- sprintf("\\1%s", linesep) } # create regex pattern for line break pattern <- paste('(.{1,', wrap, '})(\\s|$)', sep = "") # iterate all labels for (n in seq_len(length(labels))) { # check if wrap exceeds lengths of labels if (wrap > 0 && nchar(labels[n]) > wrap) { # insert line breaks labels[n] <- gsub(pattern, linesep, labels[n]) # in case label was short enough, we still have a line break # at the end of the label. here we remove any trailing line breaks l <- nchar(labels[n]) # get last char lc <- substr(labels[n], l - lsub, l) # check if line break if (lc == ori.linesep) { # if yes, remove it labels[n] <- substr(labels[n], 0, l - (lsub + 1)) } } } labels } sjmisc/R/to_value.R0000644000176200001440000000411414046746443013720 0ustar liggesusers#' @title Convert factors to numeric variables #' @name to_value #' #' @description This function converts (replaces) factor levels with the #' related factor level index number, thus the factor is converted to #' a numeric variable. \code{to_value()} and \code{to_numeric()} are aliases. #' #' @param start.at Starting index, i.e. the lowest numeric value of the variable's #' value range. By default, this argument is \code{NULL}, hence the lowest #' value of the returned numeric variable corresponds to the lowest factor #' level (if factor levels are numeric) or to \code{1} (if factor levels #' are not numeric). #' @param keep.labels Logical, if \code{TRUE}, former factor levels will be added as #' value labels. For numeric factor levels, values labels will be used, #' if present. See 'Examples' and \code{\link{set_labels}} for more details. #' @param use.labels Logical, if \code{TRUE} and \code{x} has numeric value labels, #' these value labels will be set as numeric values. 
#' #' @return A numeric variable with values ranging either from \code{start.at} to #' \code{start.at} + length of factor levels, or to the corresponding #' factor levels (if these were numeric). If \code{x} is a data frame, #' the complete data frame \code{x} will be returned, where variables #' specified in \code{...} are coerced to numeric; if \code{...} is #' not specified, applies to all variables in the data frame. #' #' @inheritParams to_dummy #' #' @note This function is kept for backwards-compatibility. It is preferred to #' use \code{\link[sjlabelled]{as_numeric}}. #' #' @examples #' library(sjlabelled) #' data(efc) #' test <- as_label(efc$e42dep) #' table(test) #' table(to_value(test)) #' #' # Find more examples at '?sjlabelled::as_numeric' #' @export to_value <- function(x, ..., start.at = NULL, keep.labels = TRUE, use.labels = FALSE) { sjlabelled::as_numeric(x = x, ..., start.at = start.at, keep.labels = keep.labels, use.labels = use.labels) } sjmisc/R/helpfunctions.R0000644000176200001440000000323314620405202014743 0ustar liggesusersget_dot_data <- function(x, qs) { if (sjmisc::is_empty(qs)) x else # In case dots are not empty, get subdataframe of dots selection in function build_dot_data build_dot_data(x, qs) } build_dot_data <- function(x, qs) { # below lapply goes through each of the expressions in dots data # and the corresponding subdataframes got with build_col are saved in list out out <- lapply(qs, function(dot_expr) build_col(x, dot_expr)) out <- dplyr::bind_cols(out) attr(out, "row.names") <- attr(x, "row.names") out } # depending on the dots expressions, old variables are selected or new ones are computed with transmute build_col <- function(x, qs_expr) { if (check_qs(rlang::get_expr(qs_expr))) { suppressMessages(dplyr::select(x, !!qs_expr)) } else { suppressMessages(dplyr::transmute(x, !!qs_expr)) } } # name or numeric expressions, regular sequences or those looked for with select helpers are selected # otherwise, they are transmuted 
# Decide how a dots-expression should be evaluated: TRUE means it can be
# handled by dplyr::select() (a plain name/index, a `:` sequence, or a
# tidyselect helper such as contains()/starts_with(), optionally namespaced);
# FALSE means it is a computation and must go through dplyr::transmute().
check_qs <- function(is_expr) {
  if (!inherits(is_expr, "call")) {
    # bare symbol or literal -> selectable as-is
    TRUE
  } else if (as.character(is_expr)[1] == ":") {
    # regular sequence like 3:5 -> selectable
    TRUE
  } else if (as.character(is_expr)[1] %in% paste0(c(rep("",length(tidyselect::vars_select_helpers)),rep("dplyr::",length(tidyselect::vars_select_helpers)),rep("tidyselect::",length(tidyselect::vars_select_helpers))),names(tidyselect::vars_select_helpers))) {
    # tidyselect helper, bare or prefixed with dplyr::/tidyselect:: -> selectable
    TRUE
  } else {
    # anything else is treated as an expression to compute
    FALSE
  }
}

# Minimal data.frame() wrapper: never coerces strings to factors and drops
# row names, matching tibble-like behavior without the dependency.
data_frame <- function(...) {
  x <- data.frame(..., stringsAsFactors = FALSE)
  rownames(x) <- NULL
  x
}

# Count distinct values of x, optionally (default) ignoring missing values.
n_unique <- function(x, na.rm = TRUE) {
  x <- as.vector(x)
  if (na.rm) x <- stats::na.omit(x)
  length(unique(x))
}
sjmisc/R/replace_na.R0000644000176200001440000001264614046746443014174 0ustar liggesusers#' @title Replace NA with specific values
#' @name replace_na
#'
#' @description This function replaces (tagged) NA's of a variable, data frame
#' or list of variables with \code{value}.
#'
#' @seealso \code{\link[sjlabelled]{set_na}} for setting \code{NA} values, \code{\link{rec}}
#' for general recoding of variables and \code{\link{recode_to}}
#' for re-shifting value ranges.
#'
#' @param value Value that will replace the \code{\link{NA}}'s.
#' @param na.label Optional character vector, used to label the former NA-value
#' (i.e. adding a \code{labels} attribute for \code{value} to \code{x}).
#' @param tagged.na Optional single character, specifies a \code{\link[haven]{tagged_na}} value
#' that will be replaced by \code{value}. Herewith it is possible
#' to replace only specific \code{NA} values of \code{x}.
#'
#' @inheritParams to_dummy
#' @inheritParams rec
#'
#' @return \code{x}, where \code{NA}'s are replaced with \code{value}. If \code{x}
#' is a data frame, the complete data frame \code{x} will be returned,
#' with replaced NA's for variables specified in \code{...};
#' if \code{...} is not specified, applies to all variables in the
#' data frame.
#'
#' @note Value and variable label attributes are preserved.
#' #' @details While regular \code{NA} values can only be \emph{completely} replaced with #' a single value, \code{\link[haven]{tagged_na}} allows to differentiate #' between different qualitative values of \code{NA}s. #' Tagged \code{NA}s work exactly like regular R missing values #' except that they store one additional byte of information: a tag, #' which is usually a letter ("a" to "z") or character number ("0" to "9"). #' Therewith it is possible to replace only specific NA values, while #' other NA values are preserved. #' #' @examples #' library(sjlabelled) #' data(efc) #' table(efc$e42dep, useNA = "always") #' table(replace_na(efc$e42dep, value = 99), useNA = "always") #' #' # the original labels #' get_labels(replace_na(efc$e42dep, value = 99)) #' # NA becomes "99", and is labelled as "former NA" #' get_labels( #' replace_na(efc$e42dep, value = 99, na.label = "former NA"), #' values = "p" #' ) #' #' dummy <- data.frame( #' v1 = efc$c82cop1, #' v2 = efc$c83cop2, #' v3 = efc$c84cop3 #' ) #' # show original distribution #' lapply(dummy, table, useNA = "always") #' # show variables, NA's replaced with 99 #' lapply(replace_na(dummy, v2, v3, value = 99), table, useNA = "always") #' #' if (require("haven")) { #' x <- labelled(c(1:3, tagged_na("a", "c", "z"), 4:1), #' c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), #' "Refused" = tagged_na("a"), "Not home" = tagged_na("z"))) #' # get current NA values #' x #' get_na(x) #' #' # replace only the NA, which is tagged as NA(c) #' replace_na(x, value = 2, tagged.na = "c") #' get_na(replace_na(x, value = 2, tagged.na = "c")) #' #' table(x) #' table(replace_na(x, value = 2, tagged.na = "c")) #' #' # tagged NA also works for non-labelled class #' # init vector #' x <- c(1, 2, 3, 4) #' # set values 2 and 3 as tagged NA #' x <- set_na(x, na = c(2, 3), as.tag = TRUE) #' # see result #' x #' # now replace only NA tagged with 2 with value 5 #' replace_na(x, value = 5, tagged.na = "2") #' } #' @export replace_na 
<- function(x, ..., value, na.label = NULL, tagged.na = NULL) { # check for valid value if (is.null(value) || is.na(value)) return(x) # evaluate arguments, generate data .dat <- get_dot_data(x, dplyr::quos(...)) if (is.data.frame(x)) { # iterate variables of data frame for (i in colnames(.dat)) { x[[i]] <- replace_na_helper( x = .dat[[i]], value = value, na.label = na.label, tagged.na = tagged.na ) } } else { x <- replace_na_helper( x = .dat, value = value, na.label = na.label, tagged.na = tagged.na ) } x } replace_na_helper <- function(x, value, na.label, tagged.na) { # create named vector, for labelleing if (!is.null(na.label)) { na.vec <- value names(na.vec) <- as.character(na.label) } # check if we have any misisngs at all if (anyNA(x)) { # do we have a factor? then check for levels if (is.factor(x)) { # is value in levels? if (!any(levels(x) %in% as.character(value))) { # if not, add value to levels levels(x) <- c(levels(x), as.character(value)) } } # check if we have tagged NA if (!is.null(tagged.na)) { if (!requireNamespace("haven", quietly = TRUE)) { stop("Package `haven` needed for this function to work. Please install it.", call. = FALSE) } # coerce to tagged NA if (!haven::is_tagged_na(tagged.na)) tagged.na <- haven::tagged_na(tagged.na) # replace tagged NA x[which(haven::na_tag(x) == haven::na_tag(tagged.na))] <- value # remove label x <- suppressMessages(sjlabelled::remove_labels(x, labels = tagged.na)) } else { x[is.na(x)] <- value } # add NA label if (!is.null(na.label)) x <- sjlabelled::add_labels(x, labels = na.vec) } else { message("`x` has no missings.") } x } sjmisc/R/de_mean.R0000644000176200001440000001017414046746443013475 0ustar liggesusers#' @title Compute group-meaned and de-meaned variables #' @name de_mean #' #' @description \code{de_mean()} computes group- and de-meaned versions of a #' variable that can be used in regression analysis to model the between- #' and within-subject effect. #' #' @param x A data frame. #' @param ... 
Names of variables that should be group- and de-meaned. #' @param grp Quoted or unquoted name of the variable that indicates the #' group- or cluster-ID. #' @param suffix.dm,suffix.gm String value, will be appended to the names of the #' group-meaned and de-meaned variables of \code{x}. By default, de-meaned #' variables will be suffixed with \code{"_dm"} and grouped-meaned variables #' with \code{"_gm"}. #' #' @inheritParams to_dummy #' @inheritParams rec #' #' @return For \code{append = TRUE}, \code{x} including the group-/de-meaned #' variables as new columns is returned; if \code{append = FALSE}, only the #' group-/de-meaned variables will be returned. #' #' @details \code{de_mean()} is intended to create group- and de-meaned variables #' for complex random-effect-within-between models (see \cite{Bell et al. 2018}), #' where group-effects (random effects) and fixed effects correlate (see #' \cite{Bafumi and Gelman 2006)}). This violation of one of the #' \emph{Gauss-Markov-assumptions} can happen, for instance, when analysing panel #' data. To control for correlating predictors and group effects, it is #' recommended to include the group-meaned and de-meaned version of #' \emph{time-varying covariates} in the model. By this, one can fit #' complex multilevel models for panel data, including time-varying, #' time-invariant predictors and random effects. This approach is superior to #' simple fixed-effects models, which lack information of variation in the #' group-effects or between-subject effects. #' \cr \cr #' A description of how to translate the #' formulas described in \emph{Bell et al. 2018} into R using \code{lmer()} #' from \pkg{lme4} or \code{glmmTMB()} from \pkg{glmmTMB} can be found here: #' \href{https://strengejacke.github.io/mixed-models-snippets/random-effects-within-between-effects-model.html}{for lmer()} #' and \href{https://strengejacke.github.io/mixed-models-snippets/random-effects-within-between-effects-model-glmmtmb.html}{for glmmTMB()}. 
#'
#' @references
#' Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the American Political Science Association.
#' \cr \cr
#' Bell A, Fairbrother M, Jones K. 2018. Fixed and Random Effects Models: Making an Informed Choice. Quality & Quantity. \doi{10.1007/s11135-018-0802-x}
#'
#' @examples
#' data(efc)
#' efc$ID <- sample(1:4, nrow(efc), replace = TRUE) # fake-ID
#' de_mean(efc, c12hour, barthtot, grp = ID, append = FALSE)
#' @export
de_mean <- function(x, ..., grp, append = TRUE, suffix.dm = "_dm", suffix.gm = "_gm") {
  # capture the (possibly unquoted) grouping variable name and rebuild it
  # as a quosure so it can be used in the dplyr pipelines below
  group_name <- rlang::quo_name(rlang::enquo(grp))
  group_var <- rlang::as_quosure(rlang::sym(group_name), env = rlang::global_env())
  # helper columns to drop at the end: the group id and a temporary row id
  group_ids <- c(group_name, ".dummyid")

  # evaluate arguments, generate data: columns named in `...`
  dat <- get_dot_data(x, dplyr::quos(...))

  # assemble a working frame of group id + selected columns, add a row id
  # (.dummyid) to restore the original row order later, and sort by group
  dat <- x %>%
    dplyr::ungroup() %>%
    dplyr::select(!! group_name) %>%
    add_columns(dat) %>%
    dplyr::mutate(.dummyid = 1:nrow(dat)) %>%
    dplyr::arrange(!! group_var)

  # names of the columns to be group-/de-meaned (all but the two helper cols)
  cn <- colnames(dat)[1:(ncol(dat) - 2)]

  # group-meaned version: replace each value by its within-group mean
  x_gm <- dat %>%
    dplyr::group_by(!! group_var) %>%
    dplyr::mutate_at(cn, mean, na.rm = TRUE) %>%
    dplyr::ungroup()

  # de-meaned version: original value minus its group mean
  x_dm_list <- purrr::map(cn, ~ dat[[.x]] - x_gm[[.x]])
  names(x_dm_list) <- cn

  x_dm <- as.data.frame(x_dm_list)
  colnames(x_dm) <- sprintf("%s%s", colnames(x_dm), suffix.dm)

  # suffix the group-meaned columns as well
  renamers <- which(colnames(x_gm) %in% cn)
  colnames(x_gm)[renamers] <- sprintf("%s%s", colnames(x_gm)[renamers], suffix.gm)

  # combine, restore the original row order, and drop the helper columns
  dat <- dat %>%
    dplyr::select(!! -group_ids) %>%
    dplyr::bind_cols(x_dm, x_gm) %>%
    dplyr::arrange(.data$.dummyid) %>%
    dplyr::select(!! -group_ids)

  if (append) dat <- add_columns(dat, x)
  dat
}
sjmisc/R/has_na.R0000644000176200001440000001010214046746443013325 0ustar liggesusers#' @title Check if variables or cases have missing / infinite values
#' @name has_na
#'
#' @description This function checks if variables or observations in a data
#' frame have \code{NA}, \code{NaN} or \code{Inf} values.
#'
#' @param x A data frame.
#' @param by Whether to check column- or row-wise for missing and infinite values. #' If \code{by = "col"}, \code{has_na()} checks for \code{NA}/\code{NaN}/\code{Inf} #' in \emph{columns}; If \code{by = "row"}, \code{has_na()} checks each row for #' these values. #' @param out Output (return) format of the results. May be abbreviated. #' #' @inheritParams descr #' #' @return If \code{x} is a vector, returns \code{TRUE} if \code{x} has any missing #' or infinite values. If \code{x} is a data frame, returns \code{TRUE} for #' each variable (if \code{by = "col"}) or observation (if \code{by = "row"}) #' that has any missing or infinite values. If \code{out = "table"}, results #' are returned as data frame, with column number, variable name and #' label, and a logical vector indicating if a variable has missing values or #' not. However, it's printed in colors, with green rows indicating that a #' variable has no missings, while red rows indicate the presence of missings #' or infinite values. If \code{out = "index"}, a named vector is returned. #' #' @note \code{complete_cases()} and \code{incomplete_cases()} are convenient #' shortcuts for \code{has_na(by = "row", out = "index")}, where the first #' only returns case-id's for all complete cases, and the latter only for #' non-complete cases. \cr \cr #' \code{complete_vars()} and \code{incomplete_vars()} are convenient shortcuts #' for \code{has_na(by = "col", out = "index")}, and again only return those #' column-id's for variables which are (in-)complete. 
#'
#' @examples
#' data(efc)
#' has_na(efc$e42dep)
#' has_na(efc, e42dep, tot_sc_e, c161sex)
#' has_na(efc)
#'
#' has_na(efc, e42dep, tot_sc_e, c161sex, out = "index")
#' has_na(efc, out = "df")
#'
#' has_na(efc, by = "row")
#' has_na(efc, e42dep, tot_sc_e, c161sex, by = "row", out = "index")
#' has_na(efc, by = "row", out = "df")
#'
#' complete_cases(efc, e42dep, tot_sc_e, c161sex)
#' incomplete_cases(efc, e42dep, tot_sc_e, c161sex)
#' complete_vars(efc, e42dep, tot_sc_e, c161sex)
#' incomplete_vars(efc, e42dep, tot_sc_e, c161sex)
#' @export
has_na <- function(x, ..., by = c("col", "row"), out = c("table", "df", "index")) {
  out <- match.arg(out)
  by <- match.arg(by)

  # resolve tidyselect-style `...` into the columns to check
  .dat <- get_dot_data(x, dplyr::quos(...))

  if (is.data.frame(x)) {
    # logical per row (any NA/Inf in the observation) or per column
    if (by == "row") {
      tmp <- apply(.dat, 1, function(.x) anyNA(.x) | any(is.infinite(.x)))
    } else {
      tmp <- purrr::map_lgl(.dat, ~ anyNA(.x) | any(is.infinite(.x)))
    }

    # return data frame?
    if (out == "df") {
      tmp <- as.data.frame(tmp)
    }

    # return variable labels? build the pretty per-column table with
    # column index, name, (shortened) variable label and NA flag
    if (out == "table" && by == "col") {
      tmp <- data_frame(
        col = match(names(tmp), colnames(x)),
        name = names(tmp),
        label = shorten_string(sjlabelled::get_label(.dat, def.value = names(tmp)), 35),
        has.na = tmp
      )
      # custom class triggers the colored print method
      class(tmp) <- c("sj_has_na", class(tmp))
    }

    if (out == "table" && by == "row") {
      tmp <- data_frame(
        case = 1:nrow(.dat),
        has.na = tmp
      )
    }

    # for rows, "index" converts the logical vector to case positions;
    # NOTE(review): for by = "col" with out = "index" the named logical
    # vector is returned as-is (callers below negate/which() it themselves)
    if (out == "index" && by == "row") tmp <- which(tmp)

    tmp
  } else {
    # plain vector: single TRUE/FALSE
    anyNA(x) | any(is.infinite(x))
  }
}

#' @rdname has_na
#' @export
# shortcut: row positions of observations with any NA/Inf
incomplete_cases <- function(x, ...) {
  has_na(x, ..., by = "row", out = "index")
}

#' @rdname has_na
#' @export
# shortcut: row positions of fully observed cases (all rows minus incomplete)
complete_cases <- function(x, ...) {
  all.cases <- seq_len(nrow(x))
  na.cases <- has_na(x, ..., by = "row", out = "index")
  if (sjmisc::is_empty(na.cases))
    all.cases
  else
    all.cases[-na.cases]
}

#' @rdname has_na
#' @export
# shortcut: column positions of variables without NA/Inf
complete_vars <- function(x, ...) {
  which(!has_na(x, ..., by = "col", out = "index"))
}

#' @rdname has_na
#' @export
incomplete_vars <- function(x, ...)
{ which(has_na(x, ..., by = "col", out = "index")) } sjmisc/R/to_long.R0000644000176200001440000001717314046746443013554 0ustar liggesusers#' @title Convert wide data to long format #' @name to_long #' @description This function converts wide data into long format. It allows #' to transform multiple key-value pairs to be transformed #' from wide to long format in one single step. #' #' @param data A \code{data.frame} that should be tansformed from wide to #' long format. #' @param keys Character vector with name(s) of key column(s) to create in output. #' Either one key value per column group that should be gathered, or #' a single string. In the latter case, this name will be used as #' key column, and only one key column is created. See 'Examples'. #' @param values Character vector with names of value columns (variable names) #' to create in output. Must be of same length as number of column #' groups that should be gathered. See 'Examples'. #' @param ... Specification of columns that should be gathered. Must be one #' character vector with variable names per column group, or a numeric #' vector with column indices indicating those columns that should be #' gathered. See 'Examples'. #' @param labels Character vector of same length as \code{values} with variable #' labels for the new variables created from gathered columns. #' See 'Examples' and 'Details'. #' @param recode.key Logical, if \code{TRUE}, the values of the \code{key} #' column will be recoded to numeric values, in sequential ascending #' order. #' #' @seealso \code{\link{reshape_longer}} #' #' @details This function reshapes data from wide to long format, however, #' you can gather multiple column groups at once. Value and variable labels #' for non-gathered variables are preserved. Attributes from gathered variables, #' such as information about the variable labels, are lost during reshaping. #' Hence, the new created variables from gathered columns don't have any #' variable label attributes. 
In such cases, use \code{labels} argument to set #' back variable label attributes. #' #' @examples #' # create sample #' mydat <- data.frame(age = c(20, 30, 40), #' sex = c("Female", "Male", "Male"), #' score_t1 = c(30, 35, 32), #' score_t2 = c(33, 34, 37), #' score_t3 = c(36, 35, 38), #' speed_t1 = c(2, 3, 1), #' speed_t2 = c(3, 4, 5), #' speed_t3 = c(1, 8, 6)) #' #' # gather multiple columns. both time and speed are gathered. #' to_long( #' data = mydat, #' keys = "time", #' values = c("score", "speed"), #' c("score_t1", "score_t2", "score_t3"), #' c("speed_t1", "speed_t2", "speed_t3") #' ) #' #' # alternative syntax, using "reshape_longer()" #' reshape_longer( #' mydat, #' columns = list( #' c("score_t1", "score_t2", "score_t3"), #' c("speed_t1", "speed_t2", "speed_t3") #' ), #' names.to = "time", #' values.to = c("score", "speed") #' ) #' #' # or ... #' reshape_longer( #' mydat, #' list(3:5, 6:8), #' names.to = "time", #' values.to = c("score", "speed") #' ) #' #' # gather multiple columns, use numeric key-value #' to_long( #' data = mydat, #' keys = "time", #' values = c("score", "speed"), #' c("score_t1", "score_t2", "score_t3"), #' c("speed_t1", "speed_t2", "speed_t3"), #' recode.key = TRUE #' ) #' #' # gather multiple columns by colum names and colum indices #' to_long( #' data = mydat, #' keys = "time", #' values = c("score", "speed"), #' c("score_t1", "score_t2", "score_t3"), #' 6:8, #' recode.key = TRUE #' ) #' #' # gather multiple columns, use separate key-columns #' # for each value-vector #' to_long( #' data = mydat, #' keys = c("time_score", "time_speed"), #' values = c("score", "speed"), #' c("score_t1", "score_t2", "score_t3"), #' c("speed_t1", "speed_t2", "speed_t3") #' ) #' #' # gather multiple columns, label columns #' mydat <- to_long( #' data = mydat, #' keys = "time", #' values = c("score", "speed"), #' c("score_t1", "score_t2", "score_t3"), #' c("speed_t1", "speed_t2", "speed_t3"), #' labels = c("Test Score", "Time needed to finish") #' ) 
#'
#' library(sjlabelled)
#' str(mydat$score)
#' get_label(mydat$speed)
#'
#' @export
to_long <- function(data, keys, values, ..., labels = NULL, recode.key = FALSE) {
  UseMethod("to_long")
}

#' @export
to_long.default <- function(data, keys, values, ..., labels = NULL, recode.key = FALSE) {
  to_long_helper(
    data = data,
    keys = keys,
    values = values,
    ...,
    labels = labels,
    recode.key = recode.key
  )
}

#' @export
to_long.mids <- function(data, keys, values, ..., labels = NULL, recode.key = FALSE) {
  # unpack the mids-object into a list of imputed data sets (project helper)
  ndf <- prepare_mids_recode(data)

  # reshape each imputed data set and re-add a case identifier, so the
  # imputations can be re-assembled into a mids-object afterwards
  ndf$data <- purrr::map(
    ndf$data,
    function(.x) {
      dat <- to_long_helper(
        data = .x,
        keys = keys,
        values = values,
        ...,
        labels = labels,
        recode.key = recode.key
      )
      # seq_len() is safe for zero-row results, unlike 1:nrow()
      dat$.id <- seq_len(nrow(dat))
      dat
    }
  )

  final_mids_recode(ndf)
}

# Workhorse: gathers one or more column groups from wide to long format.
# `data_cols` (from `...`) holds one vector of column names/indices per group.
to_long_helper <- function(data, keys, values, ..., labels, recode.key) {
  # get variable names (or indices) for the columns to gather
  data_cols <- list(...)

  # if we have just one key value, repeat it to required length
  if (length(keys) < length(data_cols))
    keys <- rep(keys, times = length(data_cols))

  # one value-column name is required per column group
  if (length(values) < length(data_cols)) {
    stop("`values` must be of same length as column groups to gather.", call. = FALSE)
  }

  # labels are optional, but if given must match the number of groups
  if (!is.null(labels) && length(labels) < length(data_cols)) {
    warning("`labels` must be of same length as `values`. Dropping variable labels for gathered columns.")
    labels <- NULL
  }

  # resolve numeric column indices to column names
  for (i in seq_along(data_cols)) {
    if (all(is.numeric(data_cols[[i]]))) {
      data_cols[[i]] <- colnames(data)[data_cols[[i]]]
    }
  }

  # all columns that belong to any gather-group
  all_data_cols <- unlist(data_cols)

  # iterate each column group; each step gathers one group while the
  # other groups' columns are removed
  dummy <- purrr::map(seq_along(data_cols), function(i) {
    # columns belonging to the *other* groups must be dropped for this step
    remove_cols <- all_data_cols[!all_data_cols %in% data_cols[[i]]]

    # FIX: with a single column group, `remove_cols` is empty and
    # `data[, -match(character(0), ...)]` would select ZERO columns
    # (negative indexing with a zero-length index keeps nothing);
    # only subset when there is something to remove
    if (length(remove_cols) > 0)
      tmp <- data[, -match(remove_cols, colnames(data))]
    else
      tmp <- data

    # gather this group (project helper)
    tmp <- suppressWarnings(.gather(tmp, keys[i], values[i], data_cols[[i]]))

    # recode key column to sequential numeric values, if requested
    if (recode.key)
      tmp[[keys[i]]] <- sort(sjlabelled::as_numeric(tmp[[keys[i]]], keep.labels = FALSE))

    # set variable label on the newly created value column
    if (!is.null(labels))
      sjlabelled::set_label(tmp[[values[i]]]) <- labels[i]

    tmp
  })

  # we have at least one gathered data frame
  mydat <- dummy[[1]]

  # if we have multiple column groups, bind the columns that are new
  # in each subsequent gathered data frame
  if (length(dummy) > 1) {
    for (i in 2:length(dummy)) {
      # gathered columns that do not already exist in the output
      .add_cols <- dummy[[i]][!colnames(dummy[[i]]) %in% colnames(mydat)]

      # drop row names, so bind_cols() does not complain
      rownames(mydat) <- NULL
      rownames(.add_cols) <- NULL

      mydat <- dplyr::bind_cols(mydat, .add_cols)
    }
  }

  mydat
}

# --- sjmisc/R/merge_imputations.R --------------------------------------------

#' @title Merges multiple imputed data frames into a single data frame
#' @name merge_imputations
#'
#' @description This function merges multiple imputed data frames from
#'   \code{\link[mice:mids-class]{mice::mids()}}-objects into a single data frame
#'   by computing the mean or selecting the most likely imputed value.
#'
#' @param dat The data frame that was imputed and used as argument in the
#'   \code{\link[mice]{mice}}-function call.
#' @param imp The \code{\link[mice:mids-class]{mice::mids()}}-object with the
#'   imputed data frames from \code{dat}.
#' @param ori Optional, if \code{ori} is specified, the imputed variables are
#'   appended to this data frame; else, a new data frame with the imputed
#'   variables is returned.
#' @param summary After merging multiple imputed data, \code{summary} displays
#'   a graphical summary of the "quality" of the merged values, compared
#'   to the original imputed values.
#'   \describe{
#'     \item{\code{"dens"}}{
#'       Creates a density plot, which shows the distribution of the mean
#'       of the imputed values for each variable at each observation. The
#'       larger the areas overlap, the better is the fit of the merged
#'       value compared to the imputed value.
#'     }
#'     \item{\code{"hist"}}{
#'       Similar to \code{summary = "dens"}, however, mean and merged
#'       values are shown as histogram. Bins should have almost equal
#'       height for both groups (mean and merged).
#'     }
#'     \item{\code{"sd"}}{
#'       Creates a dot plot, where data points indicate the standard
#'       deviation for all imputed values (y-axis) at each merged
#'       value (x-axis) for all imputed variables. The higher the
#'       standard deviation, the less precise is the imputation, and
#'       hence the merged value.
#'     }
#'   }
#' @param filter A character vector with variable names that should be plotted.
#'   All non-defined variables will not be shown in the plot.
#'
#' @return A data frame with (merged) imputed variables; or \code{ori} with
#'   appended imputed variables, if \code{ori} was specified.
#'   If \code{summary} is included, returns a list with the data frame
#'   \code{data} with (merged) imputed variables and some other summary
#'   information, including the \code{plot} as ggplot-object.
#'
#' @details This method merges multiple imputations of variables into a single
#'   variable by computing the (rounded) mean of all imputed values
#'   of missing values. By this, each missing value is replaced by
#'   those values that have been imputed the most times.
#'   \cr \cr
#'   \code{imp} must be a \code{mids}-object, which is returned by the
#'   \code{mice()}-function of the \pkg{mice}-package. \code{merge_imputations()}
#'   then creates a data frame for each imputed variable, by combining all
#'   imputations (as returned by the \code{\link[mice]{complete}}-function)
#'   of each variable, and computing the row means of this data frame.
#'   The mean value is then rounded for integer values (and not for numerical
#'   values with fractional part), which corresponds to the most frequent
#'   imputed value (mode) for a missing value. Missings in the original variable
#'   are replaced by the most frequent imputed value.
#'
#' @note Typically, further analyses are conducted on pooled results of multiple
#'   imputed data sets (see \code{\link[mice]{pool}}), however, sometimes
#'   (in social sciences) it is also feasible to compute the mean or mode
#'   of multiple imputed variables (see \cite{Burns et al. 2011}).
#'
#' @references Burns RA, Butterworth P, Kiely KM, Bielak AAM, Luszcz MA, Mitchell P, et al. 2011. Multiple imputation was an efficient method for harmonizing the Mini-Mental State Examination with missing item-level data. Journal of Clinical Epidemiology;64:787-93 \doi{10.1016/j.jclinepi.2010.10.011}
#'
#' @examples
#' if (require("mice")) {
#'   imp <- mice(nhanes)
#'
#'   # return data frame with imputed variables
#'   merge_imputations(nhanes, imp)
#'
#'   # append imputed variables to original data frame
#'   merge_imputations(nhanes, imp, nhanes)
#'
#'   # show summary of quality of merging imputations
#'   merge_imputations(nhanes, imp, summary = "dens", filter = c("chl", "hyp"))
#' }
#' @export
merge_imputations <- function(dat, imp, ori = NULL, summary = c("none", "dens", "hist", "sd"), filter = NULL) {
  summary <- match.arg(summary)

  # mice is an optional dependency; fail early with a clear message
  if (!requireNamespace("mice", quietly = TRUE)) {
    stop("Package `mice` needed for this function to work. Please install it.", call. = FALSE)
  }

  # validate argument classes
  if (!inherits(imp, "mids"))
    stop("`imp` must be a `mids`-object, as returned by the `mice()`-function.", call. = FALSE)
  if (!is.data.frame(dat))
    stop("`dat` must be data frame.", call. = FALSE)
  if (!is.null(ori) && !is.data.frame(ori))
    stop("`ori` must be data frame.", call. = FALSE)

  # accumulators: merged variables, and per-variable quality diagnostics
  imputed.dat <- data.frame()
  analyse <- list()

  # make sure we have a valid range; seq_along() replaces the unsafe
  # 1:length() idiom for the (unlikely) case of an empty method vector
  merge.steps <- seq_len(ncol(dat))
  if (length(merge.steps) > length(imp$method))
    merge.steps <- seq_along(imp$method)

  # iterate all variables of the data frame
  for (i in merge.steps) {
    # skip variables that were not imputed (empty method string)
    if (!sjmisc::is_empty(imp$method[i])) {
      # indices of missing values in the original variable
      miss_inc <- which(is.na(dat[[i]]))

      # one column per imputation step, holding this variable's imputed values
      miss_inc_dat <- as.data.frame(lapply(seq_len(imp$m), function(x) {
        mice::complete(imp, action = x)[[i]]
      }), stringsAsFactors = FALSE)

      # numeric version, needed for row means
      miss_inc_dat_num <- sjlabelled::as_numeric(miss_inc_dat)

      # copy original variable with missings to a new dummy vector
      x <- dat[[i]]

      # Replace each missing by the aggregate of its imputed values:
      # plain mean for floats, rounded mean for integers and numeric
      # factors (rounded mean ~ most frequent imputed value), and the
      # mode for non-numeric variables.
      if (is_float(x)) {
        x[miss_inc] <- rowMeans(miss_inc_dat_num[miss_inc, ])
      } else if (is.numeric(x)) {
        x[miss_inc] <- round(rowMeans(miss_inc_dat_num[miss_inc, ]))
      } else if (is_num_fac(x)) {
        new.vals <- round(rowMeans(miss_inc_dat_num[miss_inc, ]))
        # extend levels first, so assignment of new values cannot produce NA
        x <- factor(x, levels = unique(c(levels(x), as.character(new.vals))))
        x[miss_inc] <- new.vals
      } else {
        tmp <- miss_inc_dat[miss_inc, ]
        x[miss_inc] <- apply(tmp, MARGIN = 1, FUN = mode_value)
      }

      # quality diagnostics: mean and sd over all imputations per case
      analyse.mw <- apply(miss_inc_dat_num[miss_inc, ], 1, mean)
      analyse.sd <- apply(miss_inc_dat_num[miss_inc, ], 1, stats::sd)

      merge_result <- list(
        merged = x[miss_inc],
        mean = analyse.mw,
        sd = analyse.sd,
        grp = rep(colnames(dat[i]), length.out = length(miss_inc))
      )

      # add to final list, named after the variable
      analyse[[length(analyse) + 1]] <- merge_result
      names(analyse)[length(analyse)] <- colnames(dat[i])

      # append the merged variable to the result data frame
      if (ncol(imputed.dat) == 0)
        imputed.dat <- data.frame(x)
      else
        imputed.dat <- cbind(imputed.dat, x)

      # give meaningful column name; "_imp" suffix only when appending to `ori`
      if (is.null(ori))
        colnames(imputed.dat)[ncol(imputed.dat)] <- sprintf("%s", colnames(dat)[i])
      else
        colnames(imputed.dat)[ncol(imputed.dat)] <- sprintf("%s_imp", colnames(dat)[i])
    }
  }

  # user wants summary of quality-analysis of merged values
  if (summary != "none") {
    if (is.null(ori))
      data <- imputed.dat
    else
      data <- dplyr::bind_cols(ori, imputed.dat)

    # return merged data plus diagnostics and the summary plot
    impret <- list(
      data = data,
      summary = analyse,
      sum.type = summary,
      filter = filter,
      plot = .create_imputation_plot(summary, filter, analyse)
    )
    return(structure(class = "sj_merge.imp", impret))
  }

  if (is.null(ori))
    imputed.dat
  else
    dplyr::bind_cols(ori, imputed.dat)
}

# Builds the ggplot summary for merge_imputations(): "sd" gives a dot plot of
# per-case imputation spread; "dens"/"hist" compare mean vs. merged values.
.create_imputation_plot <- function(.sum.type, .filter, .summary) {
  # ggplot2 is an optional dependency
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    # FIX: corrected grammar of user-facing error message
    stop("Package `ggplot2` needed to plot summaries. Please install it.", call. = FALSE)
  }

  if (.sum.type == "sd") {
    # one row per merged value, with its imputation mean/sd and group
    analyse <- .summary %>% purrr::map_df(~.x)

    if (!is.null(.filter))
      analyse <- analyse %>% dplyr::filter(.data$grp %in% .filter)

    # NOTE(review): aes_string() is soft-deprecated in current ggplot2;
    # kept here to preserve exact behavior of the released code.
    p <- ggplot2::ggplot(
      data = analyse,
      mapping = ggplot2::aes_string(x = "merged", y = "sd")
    ) +
      ggplot2::geom_point() +
      ggplot2::facet_wrap(
        facets = ~grp,
        scales = "free",
        ncol = ceiling(sqrt(dplyr::n_distinct(analyse$grp)))
      ) +
      ggplot2::theme_bw() +
      ggplot2::labs(
        x = NULL,
        y = NULL,
        fill = NULL,
        title = "Standard Deviation of imputed values for each merged value"
      )
  } else {
    analyse <- purrr::map_df(.summary, ~.x)
    # reshape mean/merged into key-value pairs for the fill aesthetic
    analyse <- .gather(analyse, key = "value", value = "xpos", colnames(analyse)[1:2])

    if (!is.null(.filter))
      analyse <- analyse %>% dplyr::filter(.data$grp %in% .filter)

    p <- ggplot2::ggplot(
      data = analyse,
      mapping = ggplot2::aes_string(x = "xpos", fill = "value")
    ) +
      ggplot2::facet_wrap(
        facets = ~grp,
        scales = "free",
        ncol = ceiling(sqrt(dplyr::n_distinct(analyse$grp)))
      ) +
      ggplot2::theme_bw() +
      ggplot2::labs(
        x = NULL,
        y = NULL,
        fill = NULL,
        title = "Comparison between mean of imputed values and final merged values"
      )

    # choose geometry by summary type
    if (.sum.type == "dens")
      p <- p + ggplot2::geom_density(alpha = .2)
    else
      p <- p + ggplot2::geom_histogram(position = "dodge")
  }

  p
}

# --- sjmisc/R/is_even.R ------------------------------------------------------

#' @title Check whether value is even or odd
#' @name is_even
#'
#' @description Checks whether \code{x} is an even or odd number. Only
#'   accepts numeric vectors.
#'
#' @param x Numeric vector or single numeric value, or a data frame or list with
#'   such vectors.
#'
#' @return \code{is_even()} returns \code{TRUE} for each even value of \code{x},
#'   \code{FALSE} for odd values. \code{is_odd()} returns \code{TRUE} for each
#'   odd value of \code{x} and \code{FALSE} for even values.
#'
#' @examples
#' is_even(4)
#' is_even(5)
#' is_even(1:4)
#'
#' is_odd(4)
#' is_odd(5)
#' is_odd(1:4)
#'
#' @export
is_even <- function(x) {
  UseMethod("is_even")
}

#' @export
is_even.data.frame <- function(x) {
  # column-wise check, one logical vector per column
  lapply(x, FUN = is_even_helper)
}

#' @export
is_even.list <- function(x) {
  lapply(x, FUN = is_even_helper)
}

#' @export
is_even.default <- function(x) {
  is_even_helper(x)
}

# even := remainder 0 after integer division by 2 (R's %% is non-negative
# for positive divisors, so this also works for negative integers)
is_even_helper <- function(x) (x %% 2) == 0

#' @rdname is_even
#' @export
is_odd <- function(x) {
  UseMethod("is_odd")
}

#' @export
is_odd.data.frame <- function(x) {
  lapply(x, FUN = is_odd_helper)
}

#' @export
is_odd.list <- function(x) {
  lapply(x, FUN = is_odd_helper)
}

#' @export
is_odd.default <- function(x) {
  is_odd_helper(x)
}

is_odd_helper <- function(x) (x %% 2) == 1

# --- sjmisc/R/add_cases.R ----------------------------------------------------

#' @title Add variables or cases to data frames
#' @name add_variables
#'
#' @description \code{add_variables()} adds a new column to a data frame, while
#'   \code{add_case()} adds a new row to a data frame.
#' These are convenient
#'   functions to add columns or rows not only at the end of a data frame,
#'   but at any column or row position. Furthermore, they allow easy integration
#'   into a pipe-workflow.
#'
#' @param data A data frame.
#' @param ... One or more named vectors that indicate the variables or values,
#'   which will be added as new column or row to \code{data}. For \code{add_case()},
#'   non-matching columns in \code{data} will be filled with \code{NA}.
#' @param .after,.before Numerical index of row or column, where after or before
#'   the new variable or case should be added. If \code{.after = -1}, variables
#'   or cases are added at the beginning; if \code{.after = Inf},
#'   variables and cases are added at the end. In case of \code{add_variables()},
#'   \code{.after} and \code{.before} may also be a character name indicating
#'   the column in \code{data}, after or infront of what \code{...} should be
#'   inserted.
#'
#' @return \code{data}, including the new variables or cases from \code{...}.
#'
#' @note For \code{add_case()}, if variable does not exist, a new variable is
#'   created and existing cases for this new variable get the value \code{NA}.
#'   See 'Examples'.
#'
#' @examples
#' d <- data.frame(
#'   a = c(1, 2, 3),
#'   b = c("a", "b", "c"),
#'   c = c(10, 20, 30),
#'   stringsAsFactors = FALSE
#' )
#'
#' add_case(d, b = "d")
#' add_case(d, b = "d", a = 5, .before = 1)
#'
#' # adding a new case for a new variable
#' add_case(d, e = "new case")
#'
#' add_variables(d, new = 5)
#' add_variables(d, new = c(4, 4, 4), new2 = c(5, 5, 5), .after = "b")
#' @export
add_variables <- function(data, ..., .after = Inf, .before = NULL) {
  # remember attributes, so label attributes etc. survive cbind()
  a <- attributes(data)

  # resolve character column names to positions; FIX: an unmatched name
  # previously produced integer(0) and a cryptic "argument is of length
  # zero" error further down -- now fails with a clear message
  if (is.character(.after)) {
    pos <- which(colnames(data) == .after)
    if (length(pos) == 0)
      stop("Column '", .after, "' not found in `data`.", call. = FALSE)
    .after <- pos
  }
  if (!is.null(.before) && is.character(.before)) {
    pos <- which(colnames(data) == .before)
    if (length(pos) == 0)
      stop("Column '", .before, "' not found in `data`.", call. = FALSE)
    # "before column k" == "after column k - 1"
    .after <- pos - 1
  }
  if (!is.null(.before) && is.numeric(.before))
    .after <- .before - 1

  # the new columns, built from the named arguments
  dat <- data.frame(..., stringsAsFactors = FALSE)

  if (.after < 1) {
    # prepend
    x <- cbind(dat, data)
  } else if (is.infinite(.after) || .after >= ncol(data)) {
    # append
    x <- cbind(data, dat)
  } else {
    # insert in the middle; base subsetting is equivalent to the former
    # positional dplyr::select() calls and drops the dependency here
    x1 <- data[, seq_len(.after), drop = FALSE]
    x2 <- data[, (.after + 1):ncol(data), drop = FALSE]
    x <- cbind(x1, dat, x2)
  }

  # restore only those original attributes that cbind() dropped
  a[names(a) %in% names(attributes(x))] <- NULL
  attributes(x) <- c(attributes(x), a)

  x
}

#' @rdname add_variables
#' @export
add_case <- function(data, ..., .after = Inf, .before = NULL) {
  # "before row k" == "after row k - 1"
  if (!is.null(.before))
    .after <- .before - 1

  # the new case, built from the named arguments
  dat <- data.frame(..., stringsAsFactors = FALSE)

  # append an all-NA row, then fill in the supplied values; assigning to a
  # non-existing column creates it, with NA for all other cases
  x <- rbind(data, NA)
  last.row <- nrow(x)

  for (.x in colnames(dat)) {
    x[last.row, .x] <- dat[[.x]]
  }

  # Build the row permutation that moves the new row into place.
  # FIX: the append-branch condition is now `.after >= last.row - 1`
  # (i.e. .after >= nrow(data)); previously `.after = nrow(data)` fell
  # into the middle branch, where `(.after + 1):(last.row - 1)` counted
  # backwards and duplicated rows. seq_len() also keeps the boundary
  # cases (empty data, .before = 1) correct where 1:(n - 1) fails.
  if (.after < 1)
    o <- c(last.row, seq_len(last.row - 1))
  else if (is.infinite(.after) || .after >= last.row - 1)
    o <- seq_len(last.row)
  else
    o <- c(seq_len(.after), last.row, (.after + 1):(last.row - 1))

  x[o, , drop = FALSE]
}

# --- sjmisc/R/rec.R ----------------------------------------------------------

#' @title Recode variables
#' @name rec
#'
#' @description \code{rec()} recodes values of variables, where variable
#'   selection is based on variable names or column position, or on
#'   select helpers (see documentation on \code{...}).
\code{rec_if()} is a #' scoped variant of \code{rec()}, where recoding will be applied only #' to those variables that match the logical condition of \code{predicate}. #' #' @seealso \code{\link[sjlabelled]{set_na}} for setting \code{NA} values, \code{\link{replace_na}} #' to replace \code{NA}'s with specific value, \code{\link{recode_to}} #' for re-shifting value ranges and \code{\link{ref_lvl}} to change the #' reference level of (numeric) factors. #' #' @param predicate A predicate function to be applied to the columns. The #' variables for which \code{predicate} returns \code{TRUE} are selected. #' @param rec String with recode pairs of old and new values. See 'Details' #' for examples. \code{\link{rec_pattern}} is a convenient function to #' create recode strings for grouping variables. #' @param as.num Logical, if \code{TRUE}, return value will be numeric, not a factor. #' @param to.factor Logical, alias for \code{as.num}. If \code{TRUE}, return value #' will be a factor, not numeric. #' @param var.label Optional string, to set variable label attribute for the #' returned variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). #' If \code{NULL} (default), variable label attribute of \code{x} will #' be used (if present). If empty, variable label attributes will be removed. #' @param val.labels Optional character vector, to set value label attributes #' of recoded variable (see vignette \href{https://cran.r-project.org/package=sjlabelled/vignettes/intro_sjlabelled.html}{Labelled Data and the sjlabelled-Package}). #' If \code{NULL} (default), no value labels will be set. Value labels #' can also be directly defined in the \code{rec}-syntax, see #' 'Details'. 
#' @param append Logical, if \code{TRUE} (the default) and \code{x} is a data frame, #' \code{x} including the new variables as additional columns is returned; #' if \code{FALSE}, only the new variables are returned. #' @param suffix String value, will be appended to variable (column) names of #' \code{x}, if \code{x} is a data frame. If \code{x} is not a data #' frame, this argument will be ignored. The default value to suffix #' column names in a data frame depends on the function call: #' \itemize{ #' \item recoded variables (\code{rec()}) will be suffixed with \code{"_r"} #' \item recoded variables (\code{recode_to()}) will be suffixed with \code{"_r0"} #' \item dichotomized variables (\code{dicho()}) will be suffixed with \code{"_d"} #' \item grouped variables (\code{split_var()}) will be suffixed with \code{"_g"} #' \item grouped variables (\code{group_var()}) will be suffixed with \code{"_gr"} #' \item standardized variables (\code{std()}) will be suffixed with \code{"_z"} #' \item centered variables (\code{center()}) will be suffixed with \code{"_c"} #' } #' If \code{suffix = ""} and \code{append = TRUE}, existing variables that #' have been recoded/transformed will be overwritten. #' #' @inheritParams to_dummy #' #' @return \code{x} with recoded categories. If \code{x} is a data frame, #' for \code{append = TRUE}, \code{x} including the recoded variables #' as new columns is returned; if \code{append = FALSE}, only #' the recoded variables will be returned. If \code{append = TRUE} and #' \code{suffix = ""}, recoded variables will replace (overwrite) existing #' variables. #' #' @details The \code{rec} string has following syntax: #' \describe{ #' \item{recode pairs}{each recode pair has to be separated by a \code{;}, e.g. \code{rec = "1=1; 2=4; 3=2; 4=3"}} #' \item{multiple values}{multiple old values that should be recoded into a new single value may be separated with comma, e.g. 
\code{"1,2=1; 3,4=2"}} #' \item{value range}{a value range is indicated by a colon, e.g. \code{"1:4=1; 5:8=2"} (recodes all values from 1 to 4 into 1, and from 5 to 8 into 2)} #' \item{value range for doubles}{for double vectors (with fractional part), all values within the specified range are recoded; e.g. \code{1:2.5=1;2.6:3=2} recodes 1 to 2.5 into 1 and 2.6 to 3 into 2, but 2.55 would not be recoded (since it's not included in any of the specified ranges)} #' \item{\code{"min"} and \code{"max"}}{minimum and maximum values are indicates by \emph{min} (or \emph{lo}) and \emph{max} (or \emph{hi}), e.g. \code{"min:4=1; 5:max=2"} (recodes all values from minimum values of \code{x} to 4 into 1, and from 5 to maximum values of \code{x} into 2)} #' \item{\code{"else"}}{all other values, which have not been specified yet, are indicated by \emph{else}, e.g. \code{"3=1; 1=2; else=3"} (recodes 3 into 1, 1 into 2 and all other values into 3)} #' \item{\code{"copy"}}{the \code{"else"}-token can be combined with \emph{copy}, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. \code{"3=1; 1=2; else=copy"} (recodes 3 into 1, 1 into 2 and all other values like 2, 4 or 5 etc. will not be recoded, but copied, see 'Examples')} #' \item{\code{NA}'s}{\code{\link{NA}} values are allowed both as old and new value, e.g. \code{"NA=1; 3:5=NA"} (recodes all NA into 1, and all values from 3 to 5 into NA in the new variable)} #' \item{\code{"rev"}}{\code{"rev"} is a special token that reverses the value order (see 'Examples')} #' \item{direct value labelling}{value labels for new values can be assigned inside the recode pattern by writing the value label in square brackets after defining the new value in a recode pair, e.g. \code{"15:30=1 [young aged]; 31:55=2 [middle aged]; 56:max=3 [old aged]"}. 
See 'Examples'.} #' } #' #' @note Please note following behaviours of the function: #' \itemize{ #' \item the \code{"else"}-token should always be the last argument in the \code{rec}-string. #' \item Non-matching values will be set to \code{NA}, unless captured by the \code{"else"}-token. #' \item Tagged NA values (see \code{\link[haven]{tagged_na}}) and their value labels will be preserved when copying NA values to the recoded vector with \code{"else=copy"}. #' \item Variable label attributes (see, for instance, \code{\link[sjlabelled]{get_label}}) are preserved (unless changed via \code{var.label}-argument), however, value label attributes are removed (except for \code{"rev"}, where present value labels will be automatically reversed as well). Use \code{val.labels}-argument to add labels for recoded values. #' \item If \code{x} is a data frame, all variables should have the same categories resp. value range (else, see second bullet, \code{NA}s are produced). #' } #' #' @examples #' data(efc) #' table(efc$e42dep, useNA = "always") #' #' # replace NA with 5 #' table(rec(efc$e42dep, rec = "1=1;2=2;3=3;4=4;NA=5"), useNA = "always") #' #' # recode 1 to 2 into 1 and 3 to 4 into 2 #' table(rec(efc$e42dep, rec = "1,2=1; 3,4=2"), useNA = "always") #' #' # keep value labels. 
variable label is automatically preserved #' library(dplyr) #' efc %>% #' select(e42dep) %>% #' rec(rec = "1,2=1; 3,4=2", #' val.labels = c("low dependency", "high dependency")) %>% #' frq() #' #' # works with mutate #' efc %>% #' select(e42dep, e17age) %>% #' mutate(dependency_rev = rec(e42dep, rec = "rev")) %>% #' head() #' #' # recode 1 to 3 into 1 and 4 into 2 #' table(rec(efc$e42dep, rec = "min:3=1; 4=2"), useNA = "always") #' #' # recode 2 to 1 and all others into 2 #' table(rec(efc$e42dep, rec = "2=1; else=2"), useNA = "always") #' #' # reverse value order #' table(rec(efc$e42dep, rec = "rev"), useNA = "always") #' #' # recode only selected values, copy remaining #' table(efc$e15relat) #' table(rec(efc$e15relat, rec = "1,2,4=1; else=copy")) #' #' # recode variables with same category in a data frame #' head(efc[, 6:9]) #' head(rec(efc[, 6:9], rec = "1=10;2=20;3=30;4=40")) #' #' # recode multiple variables and set value labels via recode-syntax #' dummy <- rec( #' efc, c160age, e17age, #' rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]", #' append = FALSE #' ) #' frq(dummy) #' #' # recode variables with same value-range #' lapply( #' rec( #' efc, c82cop1, c83cop2, c84cop3, #' rec = "1,2=1; NA=9; else=copy", #' append = FALSE #' ), #' table, #' useNA = "always" #' ) #' #' # recode character vector #' dummy <- c("M", "F", "F", "X") #' rec(dummy, rec = "M=Male; F=Female; X=Refused") #' #' # recode numeric to character #' rec(efc$e42dep, rec = "1=first;2=2nd;3=third;else=hi") %>% head() #' #' # recode non-numeric factors #' data(iris) #' table(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE)) #' #' # recode floating points #' table(rec( #' iris, Sepal.Length, rec = "lo:5=1;5.01:6.5=2;6.501:max=3", append = FALSE #' )) #' #' # preserve tagged NAs #' if (require("haven")) { #' x <- labelled(c(1:3, tagged_na("a", "c", "z"), 4:1), #' c("Agreement" = 1, "Disagreement" = 4, "First" = tagged_na("c"), #' "Refused" = tagged_na("a"), "Not home" = 
#' tagged_na("z")))
#' # get current value labels
#' x
#' # recode 2 into 5; Values of tagged NAs are preserved
#' rec(x, rec = "2=5;else=copy")
#' }
#'
#' # use select-helpers from dplyr-package
#' out <- rec(
#'   efc, contains("cop"), c161sex:c175empl,
#'   rec = "0,1=0; else=1",
#'   append = FALSE
#' )
#' head(out)
#'
#' # recode only variables that have a value range from 1-4
#' p <- function(x) min(x, na.rm = TRUE) > 0 && max(x, na.rm = TRUE) < 5
#' out <- rec_if(efc, predicate = p, rec = "1:3=1;4=2;else=copy")
#' head(out)
#' @export
rec <- function(x, ..., rec, as.num = TRUE, var.label = NULL, val.labels = NULL,
                append = TRUE, suffix = "_r", to.factor = !as.num) {
  UseMethod("rec")
}

#' @export
rec.default <- function(x, ..., rec, as.num = TRUE, var.label = NULL, val.labels = NULL,
                        append = TRUE, suffix = "_r", to.factor = !as.num) {
  # evaluate arguments, generate data
  .dat <- get_dot_data(x, dplyr::quos(...))
  # `to.factor` is an alias for `!as.num`; an explicitly supplied value wins
  if (!missing(to.factor)) {
    as.num <- !to.factor
  }
  rec_core_fun(
    x = x,
    .dat = .dat,
    rec = rec,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    append = append,
    suffix = suffix
  )
}

#' @export
rec.mids <- function(x, ..., rec, as.num = TRUE, var.label = NULL, val.labels = NULL,
                     append = TRUE, suffix = "_r", to.factor = !as.num) {
  vars <- dplyr::quos(...)
  # unpack the mids-object into a list of imputed data sets (project helper)
  ndf <- prepare_mids_recode(x)
  # `to.factor` is an alias for `!as.num`; an explicitly supplied value wins
  if (!missing(to.factor)) {
    as.num <- !to.factor
  }
  # recode the selected variables in each imputed data set and append the
  # recoded columns to it
  ndf$data <- purrr::map(
    ndf$data,
    function(.x) {
      dat <- dplyr::select(.x, !!! vars)
      dplyr::bind_cols(
        .x,
        rec_core_fun(
          x = dat,
          .dat = dat,
          rec = rec,
          as.num = as.num,
          var.label = var.label,
          val.labels = val.labels,
          append = FALSE,
          suffix = suffix
        ))
    }
  )
  final_mids_recode(ndf)
}

#' @rdname rec
#' @export
rec_if <- function(x, predicate, rec, as.num = TRUE, var.label = NULL, val.labels = NULL,
                   append = TRUE, suffix = "_r", to.factor = !as.num) {
  # select variables that match logical conditions
  .dat <- dplyr::select_if(x, .predicate = predicate)
  # `to.factor` is an alias for `!as.num`; an explicitly supplied value wins
  if (!missing(to.factor)) {
    as.num <- !to.factor
  }
  # if no variable matches the condition specified
  # in predicate, return original data
  if (sjmisc::is_empty(.dat)) {
    if (append)
      return(x)
    else
      return(.dat)
  }
  rec_core_fun(
    x = x,
    .dat = .dat,
    rec = rec,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    append = append,
    suffix = suffix
  )
}

# Shared driver for rec()/rec_if(): applies rec_helper() to each selected
# column (or to a single vector) and handles suffixing/appending of results.
rec_core_fun <- function(x, .dat, rec, as.num = TRUE, var.label = NULL,
                         val.labels = NULL, append = TRUE, suffix = "_r") {
  if (is.data.frame(x)) {
    # remember original data, if user wants to bind columns
    orix <- x
    # iterate variables of data frame
    for (i in colnames(.dat)) {
      x[[i]] <- rec_helper(
        x = .dat[[i]],
        recodes = rec,
        as.num = as.num,
        var.label = var.label,
        val.labels = val.labels,
        column_name = i
      )
    }
    # select only recoded variables
    x <- x[colnames(.dat)]
    # add suffix to recoded variables and combine data
    x <- append_columns(x, orix, suffix, append)
  } else {
    x <- rec_helper(
      x = .dat,
      recodes = rec,
      as.num = as.num,
      var.label = var.label,
      val.labels = val.labels
    )
  }
  x
}

# Recodes a single vector according to the `recodes` syntax string
# (see the rec() documentation for the full grammar).
rec_helper <- function(x, recodes, as.num, var.label, val.labels, column_name = NULL) {
  # retrieve variable label
  if (is.null(var.label))
    var_lab <- sjlabelled::get_label(x)
  else
    var_lab <- var.label
  # do we have any value labels?
  val_lab <- val.labels
  # warn (with the column name, if known) when there is nothing to recode
  if (all_na(x)) {
    if (!is.null(column_name)) {
      msg <- paste0("Variable '", column_name, "' has no non-missing values.")
    } else {
      msg <- "Variable has no non-missing values."
    }
    warning(insight::format_message(msg), call.
= FALSE) } # remember if NA's have been recoded... na_recoded <- FALSE # save some sates... reversed_value_labels <- NULL recodes_reversed <- recodes == "rev" n_values <- n_unique(x) # drop labels when reversing if (recodes_reversed) { reversed_value_labels <- sjlabelled::get_labels( x, attr.only = TRUE, values = "n", non.labelled = FALSE, drop.na = TRUE ) # reverse value labels, but not values r_values <- names(reversed_value_labels) reversed_value_labels <- rev(reversed_value_labels) names(reversed_value_labels) <- r_values } # get current NA values current.na <- sjlabelled::get_na(x) # do we have a factor with "x"? if (is.factor(x)) { if (is_num_fac(x)) { # numeric factors coerced to numeric x <- as.numeric(as.character(x)) } else { # non-numeric factors coerced to character x <- as.character(x) # non-numeric factors will always be factor again as.num <- FALSE } } # is vector a double with decimals? with_dec <- is_float(x) # retrieve min and max values min_val <- suppressWarnings(min(x, na.rm = TRUE)) max_val <- suppressWarnings(max(x, na.rm = TRUE)) # do we have special recode-token? if (recodes_reversed) { # retrieve unique valus, sorted ov <- if (!is.null(reversed_value_labels) && n_values <= length(reversed_value_labels)) as.numeric(as.vector(names(reversed_value_labels))) else sort(unique(stats::na.omit(as.vector(x)))) # new values should be reversed order nv <- rev(ov) # create recodes-string recodes <- paste(sprintf("%i=%i", ov, nv), collapse = ";") } # we allow direct labelling, so extract possible direct labels here # this piece of code is definitely not the best solution, I bet... 
# but it seems to work, and I discovered the regex-pattern by myself :-) # this function extracts direct value labels from the recodes-pattern and # creates a named vector with value labels, e.g.: # "18:23=1 [18to23]; 24:65=2 [24to65]; 66:max=3 [> 65]" # will return a named vector with value 1 to 3, where the text inside [ and ] # is used as name for each value dir.label <- unlist(lapply(strsplit( unlist(regmatches( recodes, gregexpr( pattern = "=([^\\]]*)\\]", text = recodes, perl = TRUE ) )), split = "\\[", perl = TRUE ), function(x) { tmp <- as.numeric(trim(substr(x[1], 2, nchar(x[1])))) names(tmp) <- trim(substr(x[2], 1, nchar(x[2]) - 1)) tmp })) # if we found any labels, replace the value label argument if (!is.null(dir.label) && !sjmisc::is_empty(dir.label)) val_lab <- dir.label # remove possible direct labels from recode pattern recodes <- gsub( pattern = "\\[([^\\[]*)\\]", replacement = "", x = recodes, useBytes = TRUE, perl = TRUE ) # prepare and clean recode string # retrieve each single recode command rec_string <- unlist(strsplit(recodes, ";", fixed = TRUE)) # remove spaces rec_string <- gsub(" ", "", rec_string, useBytes = TRUE, fixed = TRUE) # remove line breaks rec_string <- gsub("\n", "", rec_string, useBytes = TRUE, fixed = FALSE) rec_string <- gsub("\r", "", rec_string, useBytes = TRUE, fixed = FALSE) # replace min and max placeholders rec_string <- gsub("(\\blo\\b)(.*)=(.*)", paste0(as.character(min_val), "\\2=\\3"), rec_string, useBytes = TRUE, perl = TRUE) rec_string <- gsub("(\\bmin\\b)(.*)=(.*)", paste0(as.character(min_val), "\\2=\\3"), rec_string, useBytes = TRUE, perl = TRUE) rec_string <- gsub("(\\bmax\\b)(.*)=(.*)", paste0(as.character(max_val), "\\2=\\3"), rec_string, useBytes = TRUE, perl = TRUE) rec_string <- gsub("(\\bhi\\b)(.*)=(.*)", paste0(as.character(max_val), "\\2=\\3"), rec_string, useBytes = TRUE, perl = TRUE) # rec_string <- gsub("(min)\\b", as.character(min_val), rec_string, useBytes = TRUE, perl = TRUE) # rec_string <- 
gsub("(lo)\\b", as.character(min_val), rec_string, useBytes = TRUE, perl = TRUE) # rec_string <- gsub("(max)\\b", as.character(max_val), rec_string, useBytes = TRUE, perl = TRUE) # rec_string <- gsub("(hi)\\b", as.character(max_val), rec_string, useBytes = TRUE, perl = TRUE) # retrieve all recode-pairs, i.e. all old-value = new-value assignments rec_pairs <- strsplit(rec_string, "=", fixed = TRUE) # check for correct syntax correct_syntax <- unlist(lapply(rec_pairs, function(r) if (length(r) != 2) r else NULL)) # found any errors in syntax? if (!is.null(correct_syntax)) { stop(sprintf("?Syntax error in argument \"%s\"", paste(correct_syntax, collapse = "=")), call. = FALSE) } # check for duplicated inputs if (anyDuplicated(sapply(rec_pairs, function(i) i[2])) > 0) { insight::print_color("One or more of the old values are recoded into identical new values.\nPlease check if you correctly specified the recode-pattern,\nelse separate multiple values with comma, e.g.", "red") insight::print_color(" rec=\"a,b,c=1; d,e,f=2\"", "green") insight::print_color(".\n", "red") } # the new, recoded variable new_var <- rep(-Inf, length(x)) # now iterate all recode pairs # and do each recoding step for (i in seq_len(length(rec_pairs))) { # retrieve recode pairs as string, and start with separaring old-values # at comma separator old_val_string <- unlist(strsplit(rec_pairs[[i]][1], ",", fixed = TRUE)) new_val_string <- rec_pairs[[i]][2] new_val <- c() # check if new_val_string is correct syntax if (new_val_string == "NA") { # here we have a valid NA specification new_val <- NA } else if (new_val_string == "copy") { # copy all remaining values, i.e. don't recode # remaining values that have not else been specified # or recoded. NULL indicates the "copy"-token new_val <- NULL } else { # can new value be converted to numeric? 
new_val <- suppressWarnings(as.numeric(new_val_string)) # if not, assignment is wrong if (is.na(new_val)) new_val <- new_val_string } # retrieve and check old values old_val <- c() for (j in seq_len(length(old_val_string))) { # copy to shorten code ovs <- old_val_string[j] # check if old_val_string is correct syntax if (ovs == "NA") { # here we have a valid NA specification # add value to vector of old values that # should be recoded old_val <- c(old_val, NA) } else if (ovs == "else") { # here we have a valid "else" specification # add all remaining values (in the new variable # created as "-Inf") to vector that should be recoded old_val <- -Inf break } else if (length(grep(":", ovs, fixed = TRUE)) > 0) { # this value indicates a range of values to be recoded, because # we have found a colon. now copy from and to values from range from <- suppressWarnings(as.numeric(unlist(strsplit(ovs, ":", fixed = TRUE))[1])) to <- suppressWarnings(as.numeric(unlist(strsplit(ovs, ":", fixed = TRUE))[2])) # check for valid range values if (is.na(from) || is.na(to)) { stop(sprintf("?Syntax error in argument \"%s\"", ovs), call. = FALSE) } # to lower than from? if (from > to) { stop(sprintf("?Syntax error in recode range from %g to %g.", from, to), call. = FALSE) } # for floating point range, we keep the range if (with_dec) old_val <- ovs else # add range to vector of old values old_val <- c(old_val, seq(from, to)) } else { # can new value be converted to numeric? ovn <- suppressWarnings(as.numeric(ovs)) # if not, assignment is wrong if (is.na(ovn)) ovn <- ovs # add old recode values to final vector of values old_val <- c(old_val, ovn) } } # now we have all recode values and want # to replace old with new values... for (k in seq_len(length(old_val))) { # check for "else" token if (is.infinite(old_val[k])) { # else-token found. 
we first need to preserve NA, but only, # if these haven't been copied before if (!na_recoded) new_var[which(is.na(x))] <- x[which(is.na(x))] # find replace-indices. since "else"-token has to be # the last argument in the "recodes"-string, the remaining, # non-recoded values are still "-Inf". Hence, find positions # of all not yet recoded values rep.pos <- which(new_var == -Inf) # else token found, now check whether we have a "copy" # token as well. in this case, new_val would be NULL if (is.null(new_val)) { # all not yet recodes values in new_var should get # the values at that position of "x" (the old variable), # i.e. these values remain unchanged. new_var[rep.pos] <- x[rep.pos] } else { # find all -Inf in new var and replace them with replace value new_var[rep.pos] <- new_val } # check for "NA" token } else if (is.na(old_val[k])) { # replace all NA with new value new_var[which(is.na(x))] <- new_val # remember that we have recoded NA's. Might be # important for else-token above. na_recoded <- TRUE } else if (is.character(old_val[k]) && with_dec) { # this value indicates a range of values to be recoded, because # we have found a colon. 
now copy from and to values from range from <- suppressWarnings(as.numeric(unlist(strsplit(old_val[k], ":", fixed = TRUE))[1])) to <- suppressWarnings(as.numeric(unlist(strsplit(old_val[k], ":", fixed = TRUE))[2])) # if old_val is a character, we assume we have a double with decimal # points and want to recode a range new_var[which(x >= from & x <= to)] <- new_val } else { # else we have numeric values, which should be replaced new_var[which(gsub(" ", "", x) == old_val[k])] <- new_val } } } # replace remaining -Inf with NA if (any(is.infinite(new_var))) new_var[which(new_var == -Inf)] <- NA # if we have no value labels, but we have reversed labels, add them back now if (is.null(val_lab) && !is.null(reversed_value_labels) && n_values <= length(reversed_value_labels)) { val_lab <- reversed_value_labels } # if we have no value labels and there is a 1 to 1 correspondence between old and new values, keep the old labels with the related new values if(is.null(val_lab)) { old_values <- sapply(rec_pairs, function(item) item[1]) new_values <- sapply(rec_pairs, function(item) item[2]) num_ovs <- suppressWarnings(as.numeric(old_values)) num_nvs <- suppressWarnings(as.numeric(new_values)) lab_log <- c() # all numeric or NA or else=copy lab_log <- c(lab_log, length(old_values) == sum(!is.na(num_ovs) | old_values == "NA" | (new_values == "copy" & old_values == "else"))) lab_log <- c(lab_log, length(new_values) == sum(!is.na(num_nvs) | new_values == "NA" | (new_values == "copy" & old_values == "else"))) # at most 2 distinct non-numeric elements (which only can be NA or else=copy) lab_log <- c(lab_log, sum(is.na(num_ovs)) == 0 || identical(old_values[is.na(num_ovs)], "NA") || identical(old_values[is.na(num_ovs)], "else") || (setequal(old_values[is.na(num_ovs)],c("else","NA")) & sum(is.na(num_ovs)) == 2)) lab_log <- c(lab_log, sum(is.na(num_nvs)) == 0 || identical(new_values[is.na(num_nvs)], "NA") || identical(new_values[is.na(num_nvs)], "copy") || 
(setequal(new_values[is.na(num_nvs)],c("copy","NA")) & sum(is.na(num_nvs)) == 2)) # all numeric values distinct lab_log <- c(lab_log, length(num_ovs[!is.na(num_ovs)]) == length(unique(num_ovs[!is.na(num_ovs)]))) lab_log <- c(lab_log, length(num_nvs[!is.na(num_nvs)]) == length(unique(num_nvs[!is.na(num_nvs)]))) if(all(lab_log)) { value_labels <- sjlabelled::get_labels( x, attr.only = TRUE, values = "n", non.labelled = FALSE, drop.na = TRUE ) nullw <- which(names(value_labels) %in% old_values[which(new_values=="NA")]) if(length(nullw) != 0) { value_labels <- value_labels[-nullw] # remove possible label mapped to NA } if("else=copy" %in% rec_string && length(intersect(setdiff(names(value_labels), old_values), new_values)) == 0) { new_value_labels <- value_labels } else if(!"else=copy" %in% rec_string){ new_value_labels <- value_labels <- value_labels[which(names(value_labels) %in% old_values)] } else { new_value_labels <- value_labels <- NULL # don't keep labels, since there are new values which should keep the labels from multiple old values } ivl <- intersect(old_values, names(value_labels)) for(nvl in ivl) { names(new_value_labels)[which(names(value_labels) == nvl)] <- new_values[which(old_values == nvl)] } val_lab <- new_value_labels } } # add back NA labels if (!is.null(current.na) && length(current.na) > 0) { # add named missings val_lab <- c(val_lab, current.na) } # set back variable and value labels new_var <- suppressWarnings(sjlabelled::set_label(x = new_var, label = var_lab)) new_var <- suppressWarnings(sjlabelled::set_labels(x = new_var, labels = val_lab)) # return result as factor? if (!as.num) new_var <- to_factor(new_var) new_var } sjmisc/R/frq.R0000644000176200001440000005436214620403335012670 0ustar liggesusers#' @title Frequency table of labelled variables #' @name frq #' #' @description This function returns a frequency table of labelled vectors, as data frame. 
#' #' @param sort.frq Determines whether categories should be sorted #' according to their frequencies or not. Default is \code{"none"}, so #' categories are not sorted by frequency. Use \code{"asc"} or #' \code{"desc"} for sorting categories ascending or descending order. #' @param weights Bare name, or name as string, of a variable in \code{x} #' that indicates the vector of weights, which will be applied to weight all #' observations. Default is \code{NULL}, so no weights are used. #' @param auto.grp Numeric value, indicating the minimum amount of unique #' values in a variable, at which automatic grouping into smaller units #' is done (see \code{\link{group_var}}). Default value for \code{auto.grp} #' is \code{NULL}, i.e. auto-grouping is off. #' @param show.strings Logical, if \code{FALSE}, frequency tables for character #' vectors will not be printed. This is useful when printing frequency tables #' of all variables from a data frame, and due to computational reasons #' character vectors should not be printed. #' @param show.na Logical, or \code{"auto"}. If \code{TRUE}, the output always #' contains information on missing values, even if variables have no missing #' values. If \code{FALSE}, information on missing values are removed from #' the output. If \code{show.na = "auto"}, information on missing values #' is only shown when variables actually have missing values, else it's not #' shown. #' @param grp.strings Numeric, if not \code{NULL}, groups string values in #' character vectors, based on their similarity. See \code{\link{group_str}} #' and \code{\link{str_find}} for details on grouping, and their #' \code{precision}-argument to get more details on the distance of strings #' to be treated as equal. #' @param min.frq Numeric, indicating the minimum frequency for which a #' value will be shown in the output (except for the missing values, prevailing #' \code{show.na}). Default value for \code{min.frq} is \code{0}, so all value #' frequencies are shown. 
All values or categories that have less than #' \code{min.frq} occurrences in the data will be summarized in a \code{"n < min.frq"} #' category. #' @param title String, will be used as alternative title to the variable #' label. If \code{x} is a grouped data frame, \code{title} must be a #' vector of same length as groups. #' @param file Destination file, if the output should be saved as file. #' Only used when \code{out} is not \code{"txt"}. #' @param encoding Character vector, indicating the charset encoding used #' for variable and value labels. Default is \code{"UTF-8"}. Only used #' when \code{out} is not \code{"txt"}. #' #' @inheritParams descr #' @inheritParams to_dummy #' #' @return A list of data frames with values, value labels, frequencies, raw, valid and #' cumulative percentages of \code{x}. #' #' @details The \dots-argument not only accepts variable names or expressions #' from select-helpers. You can also use logical #' conditions, math operations, or combining variables to produce "crosstables". #' See 'Examples' for more details. #' #' @note \code{x} may also be a grouped data frame (see \code{\link[dplyr]{group_by}}) #' with up to two grouping variables. Frequency tables are created for each #' subgroup then. #' \cr \cr #' The \code{print()}-method adds a table header with information on the #' variable label, variable type, total and valid N, and mean and #' standard deviations. Mean and SD are \emph{always} printed, even for #' categorical variables (factors) or character vectors. In this case, #' values are coerced into numeric vector to calculate the summary #' statistics. #' \cr \cr #' To print tables in markdown or HTML format, use \code{print_md()} or #' \code{print_html()}. #' #' @seealso \code{\link{flat_table}} for labelled (proportional) tables. 
#' #' @examples #' # simple vector #' data(efc) #' frq(efc$e42dep) #' #' # with grouped data frames, in a pipe #' library(dplyr) #' efc %>% #' group_by(e16sex, c172code) %>% #' frq(e42dep) #' #' # show only categories with a minimal amount of frequencies #' frq(mtcars$gear) #' #' frq(mtcars$gear, min.frq = 10) #' #' frq(mtcars$gear, min.frq = 15) #' #' # with select-helpers: all variables from the COPE-Index #' # (which all have a "cop" in their name) #' frq(efc, contains("cop")) #' #' # all variables from column "c161sex" to column "c175empl" #' frq(efc, c161sex:c175empl) #' #' # for non-labelled data, variable name is printed, #' # and "label" column is removed from output #' data(iris) #' frq(iris, Species) #' #' # also works on grouped data frames #' efc %>% #' group_by(c172code) %>% #' frq(is.na(nur_pst)) #' #' # group variables with large range and with weights #' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) #' frq(efc, c160age, auto.grp = 5, weights = weights) #' #' # different weight options #' frq(efc, c172code, weights = weights) #' frq(efc, c172code, weights = "weights") #' frq(efc, c172code, weights = efc$weights) #' frq(efc$c172code, weights = efc$weights) #' #' # group string values #' dummy <- efc[1:50, 3, drop = FALSE] #' dummy$words <- sample( #' c("Hello", "Helo", "Hole", "Apple", "Ape", #' "New", "Old", "System", "Systemic"), #' size = nrow(dummy), #' replace = TRUE #' ) #' #' frq(dummy) #' frq(dummy, grp.strings = 2) #' #' #### other expressions than variables #' #' # logical conditions #' frq(mtcars, cyl ==6) #' #' frq(efc, is.na(nur_pst), contains("cop")) #' #' iris %>% #' frq(starts_with("Petal"), Sepal.Length > 5) #' #' # computation of variables "on the fly" #' frq(mtcars, (gear + carb) / cyl) #' #' # crosstables #' set.seed(123) #' d <- data.frame( #' var_x = sample(letters[1:3], size = 30, replace = TRUE), #' var_y = sample(1:2, size = 30, replace = TRUE), #' var_z = sample(LETTERS[8:10], size = 30, replace = TRUE) #' ) #' 
table(d$var_x, d$var_z) #' frq(d, paste0(var_x, var_z)) #' frq(d, paste0(var_x, var_y, var_z)) #' @export frq <- function(x, ..., sort.frq = c("none", "asc", "desc"), weights = NULL, auto.grp = NULL, show.strings = TRUE, show.na = TRUE, grp.strings = NULL, min.frq = 0, out = c("txt", "viewer", "browser"), title = NULL, encoding = "UTF-8", file = NULL) { out <- match.arg(out) if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) { message("Package `sjPlot` needs to be loaded to print HTML tables.") out <- "txt" } # check min.frq value if (!is.numeric(min.frq)) { message("min.frq value is not numeric. Returned output assumes default value 0.") min.frq <- 0 } # get dot data xw <- get_dot_data(x, dplyr::enquos(...)) if (missing(weights)) { w <- NULL x <- xw } else { w <- try(rlang::quo_name(rlang::enquo(weights)), silent = TRUE) if (inherits(w, "try-error")) w <- NULL w.string <- try(eval(weights), silent = TRUE) if (!inherits(w.string, "try-error") && is.character(w.string)) w <- w.string if (!sjmisc::is_empty(w) && w != "NULL" && !obj_has_name(xw, w) && obj_has_name(x, w)) { x <- dplyr::bind_cols(xw, data.frame(x[[w]])) colnames(x)[ncol(x)] <- w } else if (!sjmisc::is_empty(string_contains("$", w)) && length(w.string) > 1 && is.numeric(w.string)) { x <- cbind(xw, data.frame(w.string)) w <- sub("(.*)\\$(.*)", "\\2", w) colnames(x)[ncol(x)] <- w } else { message(sprintf("Weights `%s` not found in data.", w)) w <- NULL x <- xw } } if (!isTRUE(show.na)) { # remove empty columns rem.col <- empty_cols(x) if (!sjmisc::is_empty(rem.col)) { rem.vars <- colnames(x)[rem.col] x <- remove_empty_cols(x) message(sprintf("Following %i variables have only missing values and are not shown:", length(rem.vars))) cat(paste(sprintf("%s [%i]", rem.vars, rem.col), collapse = ", ")) cat("\n") } } # match args sort.frq <- match.arg(sort.frq) # return values dataframes <- list() # remove strings from output, if requested # and check if there are any variables left to print if 
(!show.strings) x <- dplyr::select_if(x, no_character) if ((all(sjmisc::is_empty(stats::na.omit(x), first.only = FALSE)) && show.na == FALSE) || all(suppressMessages(replace_na(sjmisc::is_empty(x, first.only = FALSE, all.na.empty = FALSE), value = FALSE)))) return(NULL) # group strings if (!is.null(grp.strings)) { a <- attributes(x) if (!is.data.frame(x)) { was.df <- FALSE x <- data.frame(x, stringsAsFactors = FALSE) } else was.df <- TRUE x <- x %>% purrr::map_if(is.character, ~ group_str( strings = .x, precision = grp.strings, remove.empty = FALSE) ) %>% as.data.frame(stringsAsFactors = FALSE) if (was.df) attributes(x) <- a else attributes(x[[1]]) <- a } # do we have a grouped data frame? if (inherits(x, "grouped_df")) { grkey <- colnames(dplyr::group_keys(x)) for (i in grkey) { if (is.character(x[[i]])) x[[i]] <- as.factor(x[[i]]) } # get grouped data grps <- get_grouped_data(x) # we may have more than two variables... for (j in seq_len(ncol(grps$data[[1]]))) { # now plot everything for (i in seq_len(nrow(grps))) { # copy back labels to grouped data frame tmp <- sjlabelled::copy_labels(grps$data[[i]][j], x) if (!is.null(w)) wb <- grps$data[[i]][[w]] else wb <- NULL # user-defined title if (!is.null(title) && length(title) >= i) gr.title <- title[i] else gr.title <- NULL # iterate data frame, but don't select # weighting variable if (is.null(w) || colnames(tmp)[1] != w) { dummy <- frq_helper( x = tmp[[1]], sort.frq = sort.frq, weight.by = wb, cn = colnames(tmp)[1], auto.grp = auto.grp, title = gr.title, show.na = show.na, min.frq = min.frq ) attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = ", ", long = FALSE) # save data frame for return value dataframes[[length(dataframes) + 1]] <- dummy } } } } else { # if we don't have data frame, coerce if (!is.data.frame(x)) x <- data.frame(x, stringsAsFactors = FALSE) if (!is.null(w)) wb <- x[[w]] else wb <- NULL for (i in seq_len(ncol(x))) { # iterate data frame, but don't select # weighting variable if 
(is.null(w) || colnames(x)[i] != w) { dummy <- frq_helper( x = x[[i]], sort.frq = sort.frq, weight.by = wb, cn = colnames(x)[i], auto.grp = auto.grp, title = title, show.na = show.na, min.frq = min.frq ) # save data frame for return value dataframes[[length(dataframes) + 1]] <- dummy } } } # add class-attr for print-method() if (out == "txt") class(dataframes) <- c("sjmisc_frq", "list") else class(dataframes) <- c("sjt_frq", "sjmisc_frq", "list") # save how to print output attr(dataframes, "print") <- out attr(dataframes, "encoding") <- encoding attr(dataframes, "file") <- file dataframes } frq_helper <- function(x, sort.frq, weight.by, cn, auto.grp, title = NULL, show.na = TRUE, min.frq = 0) { # remember type vartype <- var_type(x) # convert NaN and Inf to missing x <- zap_inf(x) # variable with only missing? if (length(stats::na.omit(x)) == 0 && show.na == FALSE) { mydat <- data.frame( val = NA, label = NA, frq = NA, raw.prc = NA, valid.prc = NA, cum.perc = NA ) return(structure(class = "sjmisc_frq", list(mydat = mydat))) } # save descriptive statistics xnum <- sjlabelled::as_numeric(x, keep.labels = FALSE) if (!is.null(weight.by)) { # make sure, vector and weights have same length, so remove missing from weights weight.by[is.na(xnum)] <- NA xnum[is.na(weight.by)] <- NA x[is.na(weight.by)] <- NA mean.value <- stats::weighted.mean(stats::na.omit(xnum), w = stats::na.omit(weight.by)) if (requireNamespace("datawizard", quietly = TRUE)) sd.value <- datawizard::weighted_sd(stats::na.omit(xnum), weights = stats::na.omit(weight.by)) else sd.value <- NA } else { mean.value <- mean(xnum, na.rm = TRUE) sd.value <- stats::sd(xnum, na.rm = TRUE) } # get variable label (if any) varlab <- sjlabelled::get_label(x) # numeric variables with many distinct values may # be grouped for better overview if (!is.null(auto.grp) && dplyr::n_distinct(x, na.rm = TRUE) >= auto.grp) { gl <- group_labels(x, size = "auto", n = auto.grp) x <- group_var(x, size = "auto", n = auto.grp) gv <- 
sort(stats::na.omit(unique(x))) names(gv) <- gl attr(x, "labels") <- gv } # get value labels (if any) labels <- sjlabelled::get_labels( x, attr.only = TRUE, values = "n", non.labelled = TRUE ) # if we don't have variable label, use column name if (sjmisc::is_empty(varlab) && !sjmisc::is_empty(cn)) varlab <- cn else if (!sjmisc::is_empty(varlab) && !sjmisc::is_empty(cn)) varlab <- sprintf("%s (%s)", varlab, cn) # do we have a labelled vector? if (!is.null(labels)) { # add rownames and values as columns dat <- data_frame( n = names(labels), v = as.character(labels) ) colnames(dat) <- c("val", "label") # character vectors need to be converted with to_value # to avoid NAs, but only if character is non-numeric if (is.character(dat$val) && anyNA(suppressWarnings(as.numeric(dat$val)))) dat$val <- sjlabelled::as_numeric(dat$val, keep.labels = FALSE) else dat$val <- as.numeric(dat$val) # weight data? if (!is.null(weight.by)) { dat2 <- data.frame(round( stats::xtabs( weights ~ x, data = data.frame(weights = stats::na.omit(weight.by), x = stats::na.omit(x)), na.action = stats::na.pass, exclude = NULL ), 0 )) } else { # create frequency table dat2 <- data.frame(table(x, useNA = "always")) } colnames(dat2) <- c("val", "frq") dat2$val <- sjlabelled::as_numeric(dat2$val, keep.labels = FALSE) # join frq table and label columns mydat <- suppressMessages(dplyr::full_join(dat, dat2)) # replace NA with 0, for proper percentages, i.e. # missing values don't appear (zero counts) mydat$frq <- suppressMessages(sjmisc::replace_na(mydat$frq, value = 0)) } else { # weight data? 
if (!is.null(weight.by)) { mydat <- data.frame(round( stats::xtabs( weights ~ x, data = data.frame(weights = stats::na.omit(weight.by), x = stats::na.omit(x)), na.action = stats::na.pass, exclude = NULL ), 0 )) } else { # if we have no labels, do simple frq table mydat <- data.frame(table(x, useNA = "always")) } colnames(mydat) <- c("val", "frq") if (!anyNA(suppressWarnings(as.numeric(attr(mydat$val, "levels"))))) { mydat$val <- sjlabelled::as_numeric(mydat$val, keep.labels = FALSE) } # add values as label mydat$label <- as.character("") mydat <- mydat[c("val", "label", "frq")] } min.frq.string <- sprintf("n < %g", min.frq) if (any(mydat$frq[!is.na(mydat$val)] < min.frq)) { mydatS1 <- mydat[which(mydat$frq >= min.frq | is.na(mydat$val)), ] mydatS2 <- mydat[which(mydat$frq < min.frq & !is.na(mydat$val)), ] mydatS3 <- data_frame( val = min.frq.string, label = "", frq = sum(mydatS2$frq) ) if (mydatS3$frq == 0) { mydat <- mydatS1 } else { mydat <- rbind(mydatS1, mydatS3) row.names(mydat) <- c( row.names(mydat)[-length(row.names(mydat))], as.character(as.integer(row.names(mydat)[length(row.names(mydat)) - 1]) + 1) ) } } # need numeric if (is.factor(x) || is.character(x)) { x <- sjlabelled::as_numeric(x, keep.labels = FALSE) } # check if we have any NA-values - if not, add row for NA's if (!anyNA(mydat$val)) { mydat <- dplyr::bind_rows( mydat, data.frame( val = NA, label = NA, frq = 0 ) ) } # valid values are one row less, because last row is NA row valid.vals <- nrow(mydat) - 1 if (!all(is.na(mydat$val))) { extra.vals <- 1 # Momentarily, in order to sort categories, we consider lower frequencies subtotal as a non valid value if (is.na(mydat$val[valid.vals]) & mydat$val[valid.vals + 1] == min.frq.string) { valid.vals <- valid.vals - 1 extra.vals <- 2 } # sort categories ascending or descending if (!is.null(sort.frq) && (sort.frq == "asc" || sort.frq == "desc")) { ord <- order(mydat$frq[seq_len(valid.vals)], decreasing = (sort.frq == "desc")) } else { ord <- 
seq_len(valid.vals) } mydat <- mydat[c(ord, (valid.vals + extra.vals):(valid.vals + 1)), ] } valid.vals <- nrow(mydat) - 1 # raw percentages mydat$raw.prc <- mydat$frq / sum(mydat$frq) # compute valid and cumulative percentages mydat$valid.prc <- c(mydat$frq[seq_len(valid.vals)] / sum(mydat$frq[seq_len(valid.vals)]), NA) mydat$cum.prc <- c(cumsum(mydat$valid.prc[seq_len(valid.vals)]), NA) # proper rounding mydat$raw.prc <- 100 * round(mydat$raw.prc, 4) mydat$cum.prc <- 100 * round(mydat$cum.prc, 4) mydat$valid.prc <- 100 * round(mydat$valid.prc, 4) # "rename" labels for NA values if (!is.null(mydat$label)) mydat$label[is.na(mydat$val)] <- NA_character_ if (!all(is.na(mydat$val))) { if (extra.vals == 1) { # save original order reihe <- sjlabelled::as_numeric(mydat$val, start.at = 1, keep.labels = FALSE) # sort if (sort.frq == "none") mydat <- mydat[order(reihe), ] } else if (extra.vals == 2) { # save original order reihe <- suppressWarnings(sjlabelled::as_numeric(mydat$val[-c(valid.vals, valid.vals + 1)], start.at = 1, keep.labels = FALSE)) # sort if (sort.frq == "none") mydat <- mydat[c(order(reihe), valid.vals, valid.vals + 1), ] } } # remove NA, if requested has.na <- mydat$frq[nrow(mydat)] > 0 if ((!is.logical(show.na) && show.na == "auto" && !has.na) || identical(show.na, FALSE)) { mydat <- mydat[-nrow(mydat), ] } # compute relative confidence intervals total_n <- sum(mydat$frq) rel_frq <- as.numeric(mydat$frq / total_n) ci <- stats::qnorm(.975) * suppressWarnings(sqrt(rel_frq * (1 - rel_frq) / total_n)) total_ci <- data.frame(lower = total_n * (rel_frq - ci), upper = total_n * (rel_frq + ci)) relative_ci <- data.frame(lower = rel_frq - ci, upper = rel_frq + ci) # add variable label and type as attribute, for print-method if (!is.null(title)) { attr(mydat, "label") <- title attr(mydat, "vartype") <- "" } else { attr(mydat, "label") <- varlab attr(mydat, "vartype") <- vartype } attr(mydat, "mean") <- mean.value attr(mydat, "sd") <- sd.value attr(mydat, "ci") <- 
total_ci attr(mydat, "relative.ci") <- relative_ci row.names(mydat) <- NULL mydat } get_grouped_title <- function(x, grps, i, sep = ", ", long = FALSE) { # create title for first grouping level tp <- get_title_part(x, grps, 1, i) if (long) title <- sprintf("%s: %s", tp[1], tp[2]) else title <- sprintf("%s", tp[2]) # do we have another groupng variable? if (length(dplyr::group_vars(x)) > 1) { tp <- get_title_part(x, grps, 2, i) if (long) title <- sprintf("%s%s%s: %s", title, sep, tp[1], tp[2]) else title <- sprintf("%s%s%s", title, sep, tp[2]) } # return title title } get_title_part <- function(x, grps, level, i) { # prepare title for group var.name <- colnames(grps)[level] # get values from value labels vals <- sjlabelled::get_values(x[[var.name]]) t2 <- NULL # if we have no value labels, get values directly if (is.null(vals)) { vals <- grps[[var.name]] if (is.factor(grps[[var.name]])) vals <- as.character(vals) lab.pos <- i } else { # find position of value labels for current group lab.pos <- which(vals == grps[[var.name]][i]) t2 <- sjlabelled::get_labels(x[[var.name]])[lab.pos] } # get variable and value labels t1 <- sjlabelled::get_label(x[[var.name]], def.value = var.name) # if we have no value label, use value instead if (sjmisc::is_empty(t2)) t2 <- vals[lab.pos] # generate title c(t1, t2) } get_grouped_data <- function(x) { # nest data frame grps <- .nest(x) # remove NA category for grouped data cc <- grps %>% dplyr::select(-.data$data) %>% stats::complete.cases() # select only complete cases grps <- dplyr::filter(grps, !! cc) # arrange data if (length(dplyr::group_vars(x)) == 1) reihe <- order(grps[[1]]) else reihe <- order(grps[[1]], grps[[2]]) grps <- grps[reihe, , drop = FALSE] grps } no_character <- function(x) !is.character(x) #' @export as.data.frame.sjmisc_frq <- function(x, row.names = NULL, optional = FALSE, ...) 
{ x <- lapply(x, function(i) { i$variable <- attr(i, "label") i$group <- attr(i, "group") cols <- c("variable", "group", "val", "label", "frq", "raw.prc", "valid.prc", "cum.prc") i[, intersect(cols, colnames(i))] }) do.call(rbind, x) } sjmisc/R/round_num.R0000644000176200001440000000175414046746443014117 0ustar liggesusers#' @title Round numeric variables in a data frame #' @name round_num #' #' @description \code{round_num()} rounds numeric variables in a data frame #' that also contains non-numeric variables. Non-numeric variables are #' ignored. #' #' @param digits Numeric, number of decimals to round to. #' @inheritParams to_dummy #' #' @return \code{x} with all numeric variables rounded. #' #' @examples #' data(iris) #' round_num(iris) #' #' @export round_num <- function(x, digits = 0) { UseMethod("round_num") } #' @export round_num.data.frame <- function(x, digits = 0) { xa <- attributes(x) x <- x %>% purrr::map_if(is.numeric, ~ round(.x, digits = digits)) %>% as.data.frame(stringsAsFactors = FALSE) attributes(x) <- xa x } #' @export round_num.list <- function(x, digits = 0) { purrr::map_if(x, is.numeric, ~ round(.x, digits = digits)) } #' @export round_num.default <- function(x, digits = 0) { round(x, digits = digits) } sjmisc/R/is_crossed.R0000644000176200001440000000646313777355176014250 0ustar liggesusers#' @title Check whether two factors are crossed or nested #' @name is_crossed #' @description These functions check whether two factors are (fully) crossed #' or nested, i.e. if each level of one factor occurs in combination with #' each level of the other factor (\code{is_crossed()}) resp. if each #' category of the first factor co-occurs with only one category of the #' other (\code{is_nested()}). \code{is_cross_classified()} checks if one #' factor level occurs in some, but not all levels of another factor. #' #' @param f1 Numeric vector or \code{\link{factor}}. #' @param f2 Numeric vector or \code{\link{factor}}. #' #' @return Logical. 
#' For \code{is_crossed()}, \code{TRUE} if factors are (fully)
#' crossed, \code{FALSE} otherwise. For \code{is_nested()}, \code{TRUE} if
#' factors are nested, \code{FALSE} otherwise. For \code{is_cross_classified()},
#' \code{TRUE}, if one factor level occurs in some, but not all levels of
#' another factor.
#'
#' @note If factors are nested, a message is displayed to tell whether \code{f1}
#'   is nested within \code{f2} or vice versa.
#'
#' @references Grace, K. The Difference Between Crossed and Nested Factors. \href{https://www.theanalysisfactor.com/the-difference-between-crossed-and-nested-factors/}{(web)}
#'
#' @examples
#' # crossed factors, each category of
#' # x appears in each category of y
#' x <- c(1,4,3,2,3,2,1,4)
#' y <- c(1,1,1,2,2,1,2,2)
#' # show distribution
#' table(x, y)
#' # check if crossed
#' is_crossed(x, y)
#'
#' # not crossed factors
#' x <- c(1,4,3,2,3,2,1,4)
#' y <- c(1,1,1,2,1,1,2,2)
#' # show distribution
#' table(x, y)
#' # check if crossed
#' is_crossed(x, y)
#'
#'
#' # nested factors, each category of
#' # x appears in one category of y
#' x <- c(1,2,3,4,5,6,7,8,9)
#' y <- c(1,1,1,2,2,2,3,3,3)
#' # show distribution
#' table(x, y)
#' # check if nested
#' is_nested(x, y)
#' is_nested(y, x)
#'
#' # not nested factors
#' x <- c(1,2,3,4,5,6,7,8,9,1,2)
#' y <- c(1,1,1,2,2,2,3,3,3,2,3)
#' # show distribution
#' table(x, y)
#' # check if nested
#' is_nested(x, y)
#' is_nested(y, x)
#'
#' # also not fully crossed
#' is_crossed(x, y)
#'
#' # but partially crossed
#' is_cross_classified(x, y)
#'
#' @export
is_crossed <- function(f1, f2) {
  # two factors are fully crossed when every combination of levels is
  # observed at least once, i.e. the cross tabulation has no zero cells
  all(table(f1, f2) > 0)
}


#' @rdname is_crossed
#' @export
is_nested <- function(f1, f2) {
  tab <- table(f1, f2)

  # f1 is nested within f2 when each level of f1 co-occurs with exactly
  # one level of f2, i.e. each row of the cross tabulation has at most
  # one non-zero cell. If we find more, factors are not nested, or rows
  # and columns have to be swapped.
  nested <- all(rowSums(tab > 0) <= 1)
  if (nested) message("'f1' is nested within 'f2'")

  # swap rows and columns to check whether factors are nested
  # check whether f2 is nested within f1
  if (!nested) {
    nested <- all(colSums(tab > 0) <= 1)
    if (nested) message("'f2' is nested within 'f1'")
  }

  nested
}


#' @rdname is_crossed
#' @export
is_cross_classified <- function(f1, f2) {
  # partially crossed: neither nested nor fully crossed; suppress the
  # nesting messages emitted by is_nested()
  suppressMessages(!is_nested(f1, f2) && !is_crossed(f1, f2))
}
# sjmisc/R/std.R0000644000176200001440000002146614046746443012675 0ustar liggesusers
#' @title Standardize and center variables
#' @name std
#'
#' @description \code{std()} computes a z-transformation (standardized and centered)
#'   on the input. \code{center()} centers the input. \code{std_if()} and
#'   \code{center_if()} are scoped variants of \code{std()} and \code{center()},
#'   where transformation will be applied only to those variables that match the
#'   logical condition of \code{predicate}.
#'
#' @param include.fac Logical, if \code{TRUE}, factors will be converted to numeric
#'   vectors and also standardized or centered.
#' @param robust Character vector, indicating the method applied when
#'   standardizing variables with \code{std()}. By default, standardization is
#'   achieved by dividing the centered variables by their standard deviation
#'   (\code{robust = "sd"}). However, for skewed distributions, the median
#'   absolute deviation (MAD, \code{robust = "mad"}) or Gini's mean difference
#'   (\code{robust = "gmd"}) might be more robust measures of dispersion. For
#'   the latter option, \CRANpkg{sjstats} needs to be installed.
#'   \code{robust = "2sd"} divides the centered variables by two standard
#'   deviations, following a suggestion by \emph{Gelman (2008)}, so the
#'   rescaled input is comparable to binary variables.
#'
#' @inheritParams to_dummy
#' @inheritParams rec
#'
#' @return If \code{x} is a vector, returns a vector with standardized or
#'   centered variables. If \code{x} is a data frame, for \code{append = TRUE},
#'   \code{x} including the transformed variables as new columns is returned;
#'   if \code{append = FALSE}, only the transformed variables will be returned.
#'   If \code{append = TRUE} and \code{suffix = ""}, recoded variables will
#'   replace (overwrite) existing variables.
#'
#' @note \code{std()} and \code{center()} only return a vector, if \code{x} is
#'   a vector. If \code{x} is a data frame and only one variable is specified
#'   in the \code{...}-ellipses argument, both functions do return a
#'   data frame (see 'Examples').
#'
#' @details \code{std()} and \code{center()} also work on grouped data frames
#'   (see \code{\link[dplyr]{group_by}}). In this case, standardization
#'   or centering is applied to the subsets of variables in \code{x}.
#'   See 'Examples'.
#'   \cr \cr
#'   For more complicated models with many predictors, Gelman and Hill (2007)
#'   suggest leaving binary inputs as is and only standardize continuous predictors
#'   by dividing by two standard deviations. This ensures a rough comparability
#'   in the coefficients.
#'
#' @references
#' Gelman A (2008) Scaling regression inputs by dividing by two
#' standard deviations. \emph{Statistics in Medicine 27: 2865-2873.}
#' \url{http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf}
#' \cr \cr
#' Gelman A, Hill J (2007) Data Analysis Using Regression and Multilevel/Hierarchical
#' Models. Cambridge, Cambridge University Press: 55-57
#'
#' @examples
#' data(efc)
#' std(efc$c160age) %>% head()
#' std(efc, e17age, c160age, append = FALSE) %>% head()
#'
#' center(efc$c160age) %>% head()
#' center(efc, e17age, c160age, append = FALSE) %>% head()
#'
#' # NOTE!
#' std(efc$e17age) # returns a vector
#' std(efc, e17age) # returns a data frame
#'
#' # with quasi-quotation
#' x <- "e17age"
#' center(efc, !!x, append = FALSE) %>% head()
#'
#' # works with mutate()
#' library(dplyr)
#' efc %>%
#'   select(e17age, neg_c_7) %>%
#'   mutate(age_std = std(e17age), burden = center(neg_c_7)) %>%
#'   head()
#'
#' # works also with grouped data frames
#' mtcars %>% std(disp)
#'
#' # compare new column "disp_z" w/ output above
#' mtcars %>%
#'   group_by(cyl) %>%
#'   std(disp)
#'
#' data(iris)
#' # also standardize factors
#' std(iris, include.fac = TRUE, append = FALSE)
#' # don't standardize factors
#' std(iris, include.fac = FALSE, append = FALSE)
#'
#' # standardize only variables with more than 10 unique values
#' p <- function(x) dplyr::n_distinct(x) > 10
#' std_if(efc, predicate = p, append = FALSE)
#'
#' @export
std <- function(x, ..., robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z") {
  UseMethod("std")
}


#' @export
std.default <- function(x, ..., robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z") {
  # evaluate arguments, generate data
  .dat <- get_dot_data(x, dplyr::quos(...))

  # match arguments
  robust <- match.arg(robust)

  std_and_center(x, .dat, include.fac, append, standardize = TRUE, robust = robust, suffix)
}


#' @export
std.mids <- function(x, ..., robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z") {
  vars <- dplyr::quos(...)

  # FIX: resolve "robust" to a single choice. Previously, the full default
  # vector was passed through to std_helper(), where a condition like
  # "if (robust == 'mad')" then had length > 1 - an error in R >= 4.2.
  robust <- match.arg(robust)

  ndf <- prepare_mids_recode(x)

  # select variables and standardize them; add the transformed
  # variables to each imputed data set
  ndf$data <- purrr::map(
    ndf$data,
    function(.x) {
      dat <- dplyr::select(.x, !!! vars)
      dplyr::bind_cols(
        .x,
        std_and_center(
          x = dat,
          .dat = dat,
          include.fac = include.fac,
          append = FALSE,
          standardize = TRUE,
          robust = robust,
          suffix = suffix
        )
      )
    }
  )

  final_mids_recode(ndf)
}


#' @rdname std
#' @export
std_if <- function(x, predicate, robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z") {
  # select variables that match logical conditions
  .dat <- dplyr::select_if(x, .predicate = predicate)

  # if no variable matches the condition specified
  # in predicate, return original data
  if (sjmisc::is_empty(.dat)) {
    if (append)
      return(x)
    else
      return(.dat)
  }

  # match arguments
  robust <- match.arg(robust)

  std_and_center(x, .dat, include.fac, append, standardize = TRUE, robust = robust, suffix)
}


#' @rdname std
#' @export
center <- function(x, ..., include.fac = FALSE, append = TRUE, suffix = "_c") {
  UseMethod("center")
}


#' @export
center.default <- function(x, ..., include.fac = FALSE, append = TRUE, suffix = "_c") {
  # evaluate arguments, generate data
  .dat <- get_dot_data(x, dplyr::quos(...))

  std_and_center(x, .dat, include.fac, append, standardize = FALSE, robust = NULL, suffix)
}


#' @export
center.mids <- function(x, ..., robust = c("sd", "2sd", "gmd", "mad"), include.fac = FALSE, append = TRUE, suffix = "_z") {
  # NOTE(review): signature looks copy-pasted from std.mids() - "robust" is
  # never used when centering only, and the default suffix is "_z" although
  # center() uses "_c". Both kept unchanged for backward compatibility.
  vars <- dplyr::quos(...)
  ndf <- prepare_mids_recode(x)

  # select variables and center them; add the transformed
  # variables to each imputed data set
  ndf$data <- purrr::map(
    ndf$data,
    function(.x) {
      dat <- dplyr::select(.x, !!! vars)
      dplyr::bind_cols(
        .x,
        std_and_center(
          x = dat,
          .dat = dat,
          include.fac = include.fac,
          append = FALSE,
          standardize = FALSE,
          robust = robust,
          suffix = suffix
        )
      )
    }
  )

  final_mids_recode(ndf)
}


#' @rdname std
#' @export
center_if <- function(x, predicate, include.fac = FALSE, append = TRUE, suffix = "_c") {
  # select variables that match logical conditions
  .dat <- dplyr::select_if(x, .predicate = predicate)

  # if no variable matches the condition specified
  # in predicate, return original data
  if (sjmisc::is_empty(.dat)) {
    if (append)
      return(x)
    else
      return(.dat)
  }

  std_and_center(x, .dat, include.fac, append, standardize = FALSE, robust = NULL, suffix)
}


# common worker for std() and center(): delegates to the generic recode
# infrastructure, using std_helper() as transformation function
std_and_center <- function(x, .dat, include.fac, append, standardize, robust, suffix) {
  recode_fun(
    x = x,
    .dat = .dat,
    fun = get("std_helper", asNamespace("sjmisc")),
    suffix = suffix,
    append = append,
    include.fac = include.fac,
    standardize = standardize,
    robust = robust
  )
}


# centers (and optionally standardizes) a single vector. "robust" must be a
# single string ("sd", "2sd", "gmd" or "mad") when standardize = TRUE.
std_helper <- function(x, include.fac, standardize, robust) {
  # check whether factors should also be standardized
  if (is.factor(x)) {
    if (include.fac)
      x <- sjlabelled::as_numeric(x, keep.labels = FALSE)
    else
      return(x)
  }

  # non-numeric are preserved.
  if (!is.numeric(x)) return(x)

  # remove missings
  tmp <- stats::na.omit(x)

  # save value label, if any
  lab <- sjlabelled::get_label(x)

  # now center and standardize
  tmp <- tmp - mean(tmp)

  # standardization can be achieved by std. dev., MAD or Gini's MD
  if (standardize) {
    if (robust == "mad")
      tmp <- tmp / stats::mad(tmp)
    else if (robust == "gmd" && requireNamespace("sjstats", quietly = TRUE))
      tmp <- tmp / sjstats::gmd(tmp)
    else if (robust == "2sd")
      tmp <- tmp / (2 * stats::sd(tmp))
    else
      tmp <- tmp / stats::sd(tmp)
  }

  # and fill in values in original vector
  x[!is.na(x)] <- tmp

  # add back label
  sjlabelled::set_label(x, label = lab)
}
# sjmisc/R/reshape_longer.R0000644000176200001440000000755414046746443015102 0ustar liggesusers
#' @title Reshape data into long format
#' @name reshape_longer
#'
#' @description \code{reshape_longer()} reshapes one or more columns from
#'   wide into long format.
#'
#' @param x A data frame.
#' @param columns Names of variables (as character vector), or column index of
#'   variables, that should be reshaped. If multiple column groups should be
#'   reshaped, use a list of vectors (see 'Examples').
#' @param names.to Character vector with name(s) of key column(s) to create in output.
#'   Either one name per column group that should be gathered, or a single string.
#'   In the latter case, this name will be used as key column, and only one key
#'   column is created.
#' @param values.to Character vector with names of value columns (variable names)
#'   to create in output. Must be of same length as number of column
#'   groups that should be gathered. See 'Examples'.
#' @param numeric.timevar Logical, if \code{TRUE}, the values of the \code{names.to}
#'   column will be recoded to numeric values, in sequential ascending order.
#' @param id Name of ID-variable.
#' @param labels Character vector of same length as \code{values.to} with variable
#'   labels for the new variables created from gathered columns.
#'   See 'Examples'.
#'
#' @seealso \code{\link{to_long}}
#'
#' @return A reshaped data frame.
#'
#' @examples
#' # Reshape one column group into long format
#' mydat <- data.frame(
#'   age = c(20, 30, 40),
#'   sex = c("Female", "Male", "Male"),
#'   score_t1 = c(30, 35, 32),
#'   score_t2 = c(33, 34, 37),
#'   score_t3 = c(36, 35, 38)
#' )
#'
#' reshape_longer(
#'   mydat,
#'   columns = c("score_t1", "score_t2", "score_t3"),
#'   names.to = "time",
#'   values.to = "score"
#' )
#'
#'
#' # Reshape multiple column groups into long format
#' mydat <- data.frame(
#'   age = c(20, 30, 40),
#'   sex = c("Female", "Male", "Male"),
#'   score_t1 = c(30, 35, 32),
#'   score_t2 = c(33, 34, 37),
#'   score_t3 = c(36, 35, 38),
#'   speed_t1 = c(2, 3, 1),
#'   speed_t2 = c(3, 4, 5),
#'   speed_t3 = c(1, 8, 6)
#' )
#'
#' reshape_longer(
#'   mydat,
#'   columns = list(
#'     c("score_t1", "score_t2", "score_t3"),
#'     c("speed_t1", "speed_t2", "speed_t3")
#'   ),
#'   names.to = "time",
#'   values.to = c("score", "speed")
#' )
#'
#' # or ...
#' reshape_longer(
#'   mydat,
#'   list(3:5, 6:8),
#'   names.to = "time",
#'   values.to = c("score", "speed")
#' )
#'
#' # gather multiple columns, label columns
#' x <- reshape_longer(
#'   mydat,
#'   list(3:5, 6:8),
#'   names.to = "time",
#'   values.to = c("score", "speed"),
#'   labels = c("Test Score", "Time needed to finish")
#' )
#'
#' library(sjlabelled)
#' str(x$score)
#' get_label(x$speed)
#' @export
reshape_longer <- function(x, columns = colnames(x), names.to = "key", values.to = "value", labels = NULL, numeric.timevar = FALSE, id = ".id") {
  # remember the attributes of every column (e.g. variable labels), because
  # stats::reshape() drops them; they are restored after reshaping
  variable_attr <- lapply(x, attributes)
  # normalize "columns" to a list of column groups
  if (!is.list(columns)) columns <- list(columns)
  # allow column indices as well: translate them into column names
  columns <- lapply(columns, function(.x) {
    if (is.numeric(.x)) .x <- colnames(x)[.x]
    .x
  })
  # the actual wide-to-long reshaping; the first column group provides the
  # values of the key ("timevar") column
  dat <- stats::reshape(
    x,
    idvar = id,
    times = columns[[1]],
    timevar = names.to,
    v.names = values.to,
    varying = columns,
    direction = "long"
  )
  # optionally replace key values by a numeric sequence (1, 2, 3, ...)
  if (numeric.timevar) {
    f <- as.factor(dat[[names.to]])
    levels(f) <- 1:nlevels(f)
    dat[[names.to]] <- as.numeric(as.character(f))
  }
  # restore the column attributes saved above
  for (i in colnames(dat)) {
    attributes(dat[[i]]) <- variable_attr[[i]]
  }
  # optionally set variable labels for the newly created value columns;
  # requires one label per gathered column group
  if (!is.null(labels)) {
    if (length(labels) != length(values.to)) {
      insight::print_color("Could not set variable labels. 'labels' have different length than number of reshaped columns ('values.to').\n", "red")
    } else {
      for (i in 1:length(labels)) sjlabelled::set_label(dat[[values.to[i]]]) <- labels[i]
    }
  }
  row.names(dat) <- NULL
  dat
}
sjmisc/NEWS.md0000644000176200001440000004322314620403662012647 0ustar liggesusers# sjmisc 2.8.8

## Changes to functions

* `rec()` now keeps the labels of the old values with the new ones when there are no labels specified and there is a 1 to 1 correspondence between old and new values.
* `format()` for `frq()` was revised, and now allows to format the frequency table to prepare for printing in text, markdown and HTML-format. To do so, use the methods `print()`, `print_md()` or `print_html()`.
* Address changes in forthcoming update of _sjstats_.

## Bug fixes

* Fix CRAN check issues.

# sjmisc 2.8.7

## New functions

* A first draft of `format()` for `frq()` was implemented.

## Changes to functions

* `merge_df()` preserves more attributes related to labelled data.
* `to.factor` is an alias for the argument `as.num`.

# sjmisc 2.8.6

## General

* Updating imports.

## Bug fixes

* Fixed bug in `move_columns()` (using a variable as value for argument `.after` didn't work).

# sjmisc 2.8.5

## Changes to functions

* `flat_table()` gains a `weights`-argument.

## Bug fixes

* `descr()` calculated wrong percentage of missing values for weighted data.
* Fixed issue in `rec()` when `min`, `max`, `lo` or `hi` was used to recode a numeric into a character vector, and the new recode string contained one of these four strings as pattern.
* Give informative warning in `rec()` when `max` or `hi` was used to recode a value which maximum values was lower than a defined range, e.g. `4:max` when the maximum values was lower than 4.

# sjmisc 2.8.4

## Changes to functions

* `descr()` now also calculates the IQR.
* Revised `print()`-method for `frq()`.
* Minor changes to be compatible with forthcoming dplyr-release.

# sjmisc 2.8.3

## New functions

* Added `as.data.frame()` for `frq()`.

## Changes to functions

* `typical_value()` now returns the median for integer-values (instead of mean), to preserve the integer-type of a variable.
* The recode-pattern in `rec()` now also works for character variables with whitespaces.
* `rec()` now warns explicitly for possible non-intended multiple assignment of identical new recode-values.
* Improved printing for `frq()`.
* `merge_imputations()` now returns the plot-object as well.
* `to_numeric()` as alias for `to_value()`.

## Bug fixes

* Fixed warning from CRAN checks.
* Fixed errors from CRAN checks.

# sjmisc 2.8.2

## General

* Alias `find_variables()` (alias for `find_var()`) was renamed to `find_in_data()`, to avoid conflicts with package *insight*.
* `rename_variables()` and `rename_columns()` are aliases for `var_rename()`.

## Changes to functions

* `frq()` now also prints frequencies of logical conditions, e.g. how many values are lower or greater than a certain threshold.
* `frq()` gets a `min.frq`-argument, indicating the minimum frequency for which a value will be shown in the output.
* `descr()` gets a `show` argument to show selected columns only.
* `descr()` gets a `file`-argument to write the output as HTML file.
* `var_rename()` now also accepts a named vector with multiple elements as ellipses-argument.

## Bug fixes

* Fixed erroneous warning in `de_mean()`.
* `merge_df()` now removes columns with identical column names inside a data frame before merging, to avoid errors.
* Fixed issue when printing character vectors in `frq()`, where first element was empty, and vectors were not provided as data frame argument.
* Fixed issue in `word_wrap()` when processing expressions.
* Fixed issue in `rec()` with token `rec = "rev"`, when reversing labelled vectors with more value labels than values.
# sjmisc 2.8.1 ## General * `find_variables()` as alias for `find_var()`. * Revised docs. ## Bug fixes * Fixed issue with forthcoming update of the **rlang** package. # sjmisc 2.8.0 ## General * Some print-methods, especially for grouped data frames, are now more compact. ## New functions * `reshape_longer()`, as alternative to `to_long()`, probably easier to remember (function and argument-names). ## Bug fixes * `frq()` displayed labels as `NA` in some situations for grouped data frames with more than one group, when data were not labelled. # sjmisc 2.7.8 ## General * Reduce package dependencies. * `str_pos()` was renamed into `str_find()`. * New package-vignette **Recoding Variables**. ## New functions * `typical_value()`, which was formerly located in package _sjstats_. ## Changes to functions * `is_whole()` now automatically removes missing values from vectors. * `is_empty()` now also checks lists with only `NULL`-elements. ## Bug fixes * Better handling of factors in `merge_imputations()`, which previously could result in `NA`-values when merging imputed values into one variable. * Fix issue in `is_empty()` in case the vector had non-missing values, but first element of vector was `NA`. * Fixed bug in `frq()` for grouped data frame, when grouping variable was a character vector. In this case, group titles were mixed up. * Fix encoding issues in help-files. # sjmisc 2.7.7 ## New functions * `tidy_values()` to "clean" values (i.e. remove special chars) of character vectors or levels of factors. * `add_id()` to quickly add an ID variable to (grouped) data frames. ## Changes to functions * `frq()` gets a `show.na`-argument, to (automatically) show or hide the information for `NA`-values from the output. * The `weights`-argument in `frq()` now also accepts vectors, and is not limited to variable names. Note that these vectors must be part of a data frame. 
* For recode-functions (like `rec()`, `dicho()`, ...), if `suffix = ""` and `append = TRUE`, existing variables will be replaced by the new, recoded variables. * Improved performance for `group_str()`. * `var_rename()` now supports quasi-quotation (see Examples). * `row_sums()` and `row_means()` now return the input data frame when this data frame only had one column and no row means or sums were calculated. The returned data frame still gets the new variable name defined in `var`. ## Bug fixes * `complete_cases()` returned an empty vector instead of all indexes if all cases (rows) of a data frame were complete. * Fix issue with `to_dummy()` for character-vector input. * Fix issue with missing values in `group_str()`. * Fix issue with grouped data frames in `frq()` when `grp.strings = TRUE`. # sjmisc 2.7.6 ## Changes to functions * `frq()` gets a `file` and `encoding` argument, to save the HTML output as file. * `add_variables()` and `move_columns()` now preserve the attributes of a data frame. # sjmisc 2.7.5 ## General * Reduce package dependencies. ## New functions * `de_mean()` to compute group-meaned and de-meaned variables. * `add_variables()` and `add_case()` to add columns or rows in a convenient way to a data frame. * `move_columns()` to move one or more columns to another position in a data frame. * `is_num_chr()` to check whether a character vector has only numeric strings. * `seq_col()` and `seq_row()` as convenient wrapper to create a regular sequence for column or row numbers. ## Changes to functions * `descr()` gets a `weights`-argument, to print weighted descriptive statistics. * The `n`-argument in `row_means()` and `row_sums()` now also may be `Inf`, to compute means or sums only if all values in a row are valid (i.e. non-missing). * Argument `weight.by` in `frq()` was renamed into `weights`. * `frq()` gets a `title`-argument, to specify an alternative title to the variable label. ## Bug fixes * `round_num()` preserves data frame attributes. 
* `frq()` printed frequencies of grouping-variable for grouped data frames, when `weights` was not `NULL`. * Fixed issue with wrong title in `frq()` for grouped data frames, when grouping variable was an unlabelled factor. # sjmisc 2.7.4 ## New functions * `has_na()` to check if variables or observations in a data frame contain `NA`, `NaN` or `Inf` values. Convenient shortcuts for this function are `complete_cases()`, `incomplete_cases()`, `complete_vars()` and `incomplete_vars()`. * `total_mean()` to compute the overall mean of all values from all columns in a data frame. * `prcn()` to convert numeric scalars between 0 and 1 into a character-percentage value. * `numeric_to_factor()` to convert numeric variables into factors, using associated value labels as factor levels. ## Changes to functions * `set_na()` now also replaces different values per variable into `NA`. * Changed behaviour of `row_sums()` and missing values. `row_sums()` gets a `n`-argument and now computes row sums if a row has at least `n` non-missing values. # sjmisc 2.7.3 ## General * A test-suite was added to the package. * Updated reference in `CITATION` to the publication in the Journal of Open Source Software. ## New functions * `is_cross_classified()` to check whether two factors are partially crossed. ## Changes to functions * `ref_lvl()` now also accepts value labels as value for the `lvl`-argument. Additionally, `ref_lvl()` now also works for factor with non-numeric factor levels and simply returns `relevel(x, ref = lvl)` in such cases. ## Bug fixes * Fixed encoding issues in `rec()` with direct labelling for certain locales. * Fixed issue in `count_na()`, which did not print labels of tagged `NA` values since the last revision of `frq()`. * Fixed issue in `merge_imputation()` for cases where original data frame had less columns than imputed data frames. * Fixed issue in `find_var()` for fuzzy-matching in all elements (i.e. when `fuzzy = TRUE` and `search = "all"`). 
# sjmisc 2.7.2 ## New functions * `round_num()` to round only numeric values in a data frame. ## General * Improved performance for `merge_df()`. Furthermore, `add_rows()` was added as alias for `merge_df()`. * `merge_df()` resp. `add_rows()` now create a unique `id`-name instead of dropping the ID-variable, in case `id` has the same name of any existing variables in the provided data frames. * Improved performance for `descr()` and minor changes to the output. ## Support for `mids`-objects (package _mice_) Following functions now also work on `mids`-objects, as returned by the `mice()`-function: * `row_count()`, `row_sums()`, `row_means()`, `rec()`, `dicho()`, `center()`, `std()`, `recode_to()` and `to_long()`. ## Changes to functions * The `weight.by`-argument in `frq()` now should be a variable name from a variable in `x`, and no longer a separate vector. ## Bug fixes * `descr()` does not work with character vectors, so these are being removed now. # sjmisc 2.7.1 ## General * Fix typos and revise outdated paragraphs in vignettes. ## New functions The recoding and transformation functions get scoped variants, allowing to select variables based on logical conditions described in a function: * `rec_if()` as scoped variant of `rec()`. * `dicho_if()` as scoped variant of `dicho()`. * `center_if()` as scoped variant of `center()`. * `std_if()` as scoped variant of `std()`. * `split_var_if()` as scoped variant of `split_var()`. * `group_var_if()` and `group_label_if()` as scoped variant of `group_var()` and `group_label()`. * `recode_to_if()` as scoped variant of `recode_to()`. * `set_na_if()` as scoped variant of `set_na()`. ## Changes to functions * New function `remove_cols()` as alias for `remove_var()`. * `std()` gets a new robust-option, `robust = "2sd"`, which divides the centered variables by two standard deviations. * Slightly improved performance for `set_na()`. 
## Bug fixes * `frq()` now removes empty columns before computing frequencies, because applying `frq()` on empty vectors caused an error. * `empty_cols()` and `empty_rows()` (and hence, `remove_empty_cols()` and `remove_empty_rows()`) caused an error for data frames with only one column resp. row, or if `x` was a vector and no data frame. * `frq()` now removes missing values from input when weights are applied, to ensure that input and weights have same length. # sjmisc 2.7.0 ## General * *Breaking changes*: The `append`-argument in recode and transformation functions like `rec()`, `dicho()`, `split_var()`, `group_var()`, `center()`, `std()`, `recode_to()`, `row_sums()`, `row_count()`, `col_count()` and `row_means()` now defaults to `TRUE`. * The `print()`-method for `descr()` now accepts a `digits`-argument, to specify the rounding of the output. * Cross refences from `dplyr::select_helpers` were updated to `tidyselect::select_helpers`. ## New functions * `is_whole()` as counterpart to `is_float()`. ## Changes to functions * `frq()` now prints variable names for non-labelled data, adds variable names in braces for labelled data and omits the _label_ column for non-labelled data. * `frq()` now prints mean and standard deviation in the header line of the output. * `frq()` now gets a `auto.grp`-argument to automatically group variables with many unique values. * `frq()` now gets a `show.strings`-argument to omit string variables (character vectors) from being printed as frequency table. * `frq()` now gets a `grp.strings`-argument to group similar string values in the frequency table. * `frq()` gets an `out`-argument, to print output to console, or as HTML table in the viewer or web browser. * `descr()` gets an `out`-argument, to print output to console, or as HTML table in the viewer or web browser. ## Bug fixes * `is_empty()` returned `TRUE` for single vectors with `NA` being the first element. 
* Fix issue where due to a bug during code cleanup, `remove_empty_rows()` did no longer remove empty rows, but columns. # sjmisc 2.6.3 ## General * Revised examples that used removed methods from other packages. * Use select-helpers from package *tidyselect*, instead of *dplyr*. * Beautiful colored output for `frq()`, `descr()` and `flat_table()`. ## Changes to functions * `rec()` now also recodes doubles with floating points, if a range of values is specified. * `std()` and `center()` now use `include.fac = FALSE` as default option. * `std()` gets a `robust`-argument, to divide variables either by standard deviation, or - in case of asymmetrically distributed variables - median absolute deviation or Gini's mean difference. * `frq()` now shows total and valid N in output. ## Bug fixes * `center()`, `std()`, `dicho()`, `split_var()` and `group_var()` did not work correctly for grouped data frames. * `frq()` did not print multiple variables when applied on grouped data frames. # sjmisc 2.6.2 ## Changes to functions * Arguments `as.df` and `as.varlab` in function `find_var()` are now deprecated. Please use `out` instead. * `rotate_df()` preserves attributes. * `is_float()` is now exported as function. ## Bug fixes * Fixed bug for `to_label()`, when `x` was a character vector and argument `drop.levels` was `TRUE`. # sjmisc 2.6.1 ## General * Fixed issue with latest tidyr-update on CRAN. ## Bug fixes * `frq()` did not correctly calculate valid and cumulative percentages when using weights. # sjmisc 2.6.0 ## General * All labelled-data functions were removed and are now in package *sjlabelled*. ## New functions * `remove_var()` as pipe-friendly function to remove variables from data frames. * `var_type()` as pipe-friendly function to determine the type of variables. * `all_na()` to check whether a vector only consists of NA values. * `rotate_df()` to rotate data frames (switch columns and rows). 
* `shorten_string()`, to shorten strings to a certain maximum number of chars.

## Changes to functions

* Following functions now also work on grouped data frames: `dicho()`, `split_var()`, `group_var()`, `std()` and `center()`.
* Argument `groupcount` in `split_var()`, `group_var()` and `group_labels()` is now named `n`.
* Argument `groupsize` in `group_var()` and `group_labels()` is now named `size`.
* `frq()` gets a revised print-method, which does not print the result to console when captured in an object (i.e., `x <- frq(x)` no longer prints the result).
* `frq()` no longer prints (redundant) labels for factors w/o value label attributes.
* `frq()` adds information about the variable type in the table caption (only for variables with variable labels).
* `frq()` adds information about groups when printing grouped, non-labelled variables.
* `descr()` now also prints information about the variable type.
* `to_character()` now preserves variable labels.

# sjmisc 2.5.0

## General

* **sjmisc** now uses dplyr's tidyeval-approach to evaluate arguments. This means that the select-helper-functions (like `one_of()` or `contains()`) no longer need to be prefixed with a `~` when used as argument within **sjmisc**-functions.
* All labelled-data functions are now deprecated and will become defunct in future package versions. The labelled-data functions have been moved into a separate package, *sjlabelled*.

## New functions

* `row_count()` to count specific values in a data frame per observation.
* `col_count()` to count specific values in a data frame per variable.
* `str_start()` and `str_end()` to find starting and end indices of patterns inside strings.

## Changes to functions

* The output for `frq()` now always includes a `NA`-row, but no longer prints a value for the `NA`-row.
* `merge_imputations()` gets a `summary`-argument to plot a graphical summary of the quality of the merging process.
## Bug fixes * `add_columns()` and `replace_columns()` crashed R when no data frame was specified in `...`-ellipses argument. * `descr()` and `frq()` used wrong variable labels when processing grouped data frames for specific situations, where the grouping variable had no sequences values. * `descr()` did not work for large data frames, because internally, because `psych::describe()` switched to fast mode by default then (removing columns from the output). sjmisc/MD50000644000176200001440000002224614620415212012055 0ustar liggesusers0e63adc11a47bb3639f9d102121188f1 *DESCRIPTION 06ecc00132d291235365d711ad008fff *NAMESPACE 620ae321f847aff3935bb298d29cf614 *NEWS.md af1bee56db9ab9177b00f8281ea6d753 *R/S3-methods.R 08aed6edb5b3c08d77933e46bc9988af *R/add_cases.R 6c2d273c52ae116627305cafed37d310 *R/add_columns.R 3eafcc077cc6b9f47c9cb28b72a0ce72 *R/all_na.R 3034e8ce49e7288ef9eedb115e3cbd8a *R/big_mark.R f7329112aebe50cf19aa0720cdab093f *R/count_na.R 73dfbbc7cb87747401bac0b69c39a5ca *R/de_mean.R 65629d1db2c81592eafa95c6a77fa75a *R/descr.R 446acdaa43e2735fd584eb8adf6839cc *R/dicho.R db1a52d2c6c5570f1b97b5cb08dc59e5 *R/efc.R 93618ca846c2b5b094da1f313287bbfe *R/find_var.R bd3558af542ddd86a735feaa1f66cc24 *R/frq.R ffebaa32d75c3db0e21770acacf9eb9f *R/group_str.R c02c4ab6d7bcedfc6e390d0024d3c556 *R/group_var.R 4d52bade23cd95ca6e814b7eb6902308 *R/has_na.R 159f54eab88637adb853864969827936 *R/helpfunctions.R 153fc7e439777dd4dd8b4f57ad82ba4a *R/is_crossed.R a70e5c63baa4e47e357482b93fbe0749 *R/is_empty.R 46bcca79313f309a727ba2dd17dd2a5d *R/is_even.R 2804e1bccab623679dc5730240b1cb84 *R/is_float.R 756280708ea8f63b66b568c3fb081cd2 *R/is_num_fac.R eae3230d718afcd964f391b316c939bb *R/merge_df.R 269b5d87c0c500c495eebe6149306320 *R/merge_imputations.R b49e729ed3baa9fefea28ceb1f1ed0ad *R/move_column.R cae6f2ad9a5daef98f0326baebd91af9 *R/notin.R 21e76ebce64bedf7fe21404ad18beabb *R/num_to_fac.R 3ef6705f77b9f722e36fc90158dc30b8 *R/prop_table.R e59babb138c36bb2ac853469e369f16d *R/re-exports.R 
57a2a49fcefd3f54d7a4d98b303d50fa *R/rec.R beb9389132b4f9fcd74dd6c556efa156 *R/rec_pattern.R b5e3a3bd238d5808f66948cbb7055ee3 *R/recode_mids_fun.R 6a95681e21c4e8991ab98a004b04c74b *R/recode_to.R 2c2e59391d7af42a9aa611e9bdf75bbc *R/ref_lvl.R a18efbf65c554645947c6f8b9d60a48b *R/remove_vars.R 16af996ab2e91916aed145a6b52fcb19 *R/replace_na.R 1fca624043b533953263545973b420a7 *R/reshape_helpers.R f3e3a67309448ad428ccb718e0be3117 *R/reshape_longer.R c5c03012f9eea02407f7ee8c013536e2 *R/rotate_df.R d0fa5d7e5e8d7a130c0c1231ed58d270 *R/round_num.R 57fa723c7e128546e15fc70c88032a5b *R/row_count.R 377853b04f4c52fa167b750152d4ef1c *R/row_sums.R 6e6e11ba75385b85fefbc52c14f78f88 *R/select_helpers.R 7df4dee25fe4cf3a130d0bd013afda63 *R/seq_col.R a59a6909488c1aebcdac8ec2707573e9 *R/set_na.R 21a7814a5f908c806d46452a109d6fb3 *R/shorten_string.R a38b27c732e7d7226ff204212eb91c0f *R/split_var.R 1f8159082fca21808b5fa673c25cf914 *R/spread_coef.R 80809f94f19b09c93ccd27e5a3d86313 *R/std.R 76239c8b931fc2a52a957469af12d17d *R/str_contains.R d3784c1f17e2403c6a23c1ea14655c12 *R/str_pos.R dfbf2a3e8e0ac0e80987d8b0cfe67fd3 *R/str_start_end.R 88421a97ca4a20648a7680e503507abc *R/tidy_values.R fa235ac4132a2644f69036f306f698f5 *R/to_dummy.R 2682611f1a3fa744cf221720729e1af5 *R/to_long.R 19e6f795dce69f61f7ce77d8d6735c3c *R/to_value.R b835ba26d74133de6159ce62d4017cb9 *R/trim.R cfed6e2126b8ee549e22ed59c524cff6 *R/typical.R 6f3b2f346ba6b0efbe544e69181c3312 *R/var_labels.R f1019d6fb8925e3c51826c223eb95a15 *R/var_type.R 49b7e260c8616cee7f925a1042e12ca5 *R/word_wrap.R 2e176088cd4e28bbf15f7b50d1340707 *R/zap_inf.R a02c4d0752bcf419cbb78c22a9bdf8bf *R/zzz.R 203f850cfa884f3e4c9ea826f34f1188 *README.md 181b5ac5a69bd3e3e542680739e88fd5 *build/partial.rdb 104adef183be5a7f109c13a79534e435 *build/vignette.rds 3172b22b3d87d0f86d78326bc26891fc *data/efc.RData cb6c20ea2e6e3368477354b4ef7e829d *inst/CITATION d589bb03229047d17ab9fd2f469da09c *inst/doc/design_philosophy.R aaef8b4c2577b9578ddd8f0a99fba9eb 
*inst/doc/design_philosophy.Rmd d62776311a5fc245a1412f3ef700705f *inst/doc/design_philosophy.html e9e1a307ce8254d94645fc28635c138e *inst/doc/exploringdatasets.R bce92194e5e92a8394ca512d6d151549 *inst/doc/exploringdatasets.Rmd 73dae46c6a06cd23c4806eadb675c3af *inst/doc/exploringdatasets.html 6ab48e0585492f2efa7e5f4ccb0043f7 *inst/doc/recodingvariables.R 817964b5da6189084685a6077350c2a7 *inst/doc/recodingvariables.Rmd bf71a40b6acbfaedfd0a3091190f81c2 *inst/doc/recodingvariables.html 81aaf12825b9a852bd3826cad810a6e3 *man/add_columns.Rd 80d85456fd90032112b74c28d8f3ab82 *man/add_rows.Rd 9aabb784e192fbf0cdb82a2ddc9276d4 *man/add_variables.Rd b2fdc4424914a7155455e855b5737866 *man/all_na.Rd 864e92392b54500dac54e37624e6ec75 *man/big_mark.Rd 741b2012ee4d5bcb34708f18ed5b6a6d *man/count_na.Rd 20eb92c264b50d5df050b3401ae77bba *man/de_mean.Rd 1e3108fcb6901b8a23d0e6bb0d665ae5 *man/descr.Rd 2ca75857c032edd9dab35b9c021ebda3 *man/dicho.Rd 2ac3e43e7b5550c1df943e9dff3f772c *man/efc.Rd 98ee48d0344a92d56182e8823bcfa9ef *man/empty_cols.Rd 5aa312cd4fda1b5cefcc1848844851cf *man/figures/logo.png 81d6009b50795383e7e2aaaedd4ba6f9 *man/find_var.Rd 6d4fa628f274b38374d0a976616fb0cd *man/flat_table.Rd 8c07c697796f5af74bb91744e5bcb2c8 *man/frq.Rd 7948950f3b01acaf431e8929024a9d49 *man/grapes-nin-grapes.Rd 00b22dd067db513ed18ade6f9cc58c20 *man/group_str.Rd aba1fd42f7e3e8739eddf6eddb30ba14 *man/group_var.Rd d8bac51a52b951190f581098edc1bda7 *man/has_na.Rd bbbbecf00c154be287e0e406212800db *man/is_crossed.Rd ba8d0e082a47baa804ab6250968939cb *man/is_empty.Rd 73c24e95eb55267b98706b317dde4e6f *man/is_even.Rd 93ee575c5956e26c0a353e2791147991 *man/is_float.Rd bf7bf44eff4d5e3d6c5860865256bc17 *man/is_num_fac.Rd d4a0acb017997b56a09332ad238341db *man/merge_imputations.Rd c8137b8a103982b4070d04231c5cd50d *man/move_columns.Rd de1307d70c1e9b73609550ac4627c8c6 *man/numeric_to_factor.Rd b8fa6e2aa6308c4b7c73573f56d5fde1 *man/rec.Rd d4358aaf955aac74968989bb6b67eaeb *man/rec_pattern.Rd 6355a083f56290df41676f70f50bdba4 
*man/recode_to.Rd f036afc210a32a5ea18cd25d19032467 *man/reexports.Rd c2a5dbf44772a4b4938994131f23e030 *man/ref_lvl.Rd 457b0a9a1f48d5eb3eb7a1e95f124280 *man/remove_var.Rd 18e02a1164edecab8b32146a57f5d728 *man/replace_na.Rd 11f94308b61d915d62e087ebaee9908d *man/reshape_longer.Rd 394e87fa5d5629a3024f27623ae3f7f5 *man/rotate_df.Rd 42a7087f2faf6f9b1f4b10e95fdf9ad0 *man/round_num.Rd ec08fb367623aa012efe3227b4ad48a2 *man/row_count.Rd 6e100aa6e4b936831ebd26202a79c28f *man/row_sums.Rd 6cfc9a05417ff40aa92c3e55de56d7a2 *man/seq_col.Rd ec3f33d2dbff79a63cb802cd2141b924 *man/set_na_if.Rd 9932b7b3639fdeea8363465e28aaaba4 *man/shorten_string.Rd 293eb2da1088d1f1e71c5985342f54f6 *man/sjmisc-package.Rd f40ed745d4a91b2064b8a194b252248d *man/split_var.Rd 30d8ab1c4b8513814da3714e54f25f1e *man/spread_coef.Rd 78ea3a458107567b08e232f2f9b4cd1d *man/std.Rd 5828635067eae26f567b3e18e2f59673 *man/str_contains.Rd ff870d34efd7b1cf9de3c9ac11b9f52e *man/str_find.Rd ed0bad010fd24e8fd8eb7bb1b31a2eb4 *man/str_start.Rd 2837f18870c4842ff8783fad0a5ccf4f *man/tidy_values.Rd eaf12f0a55bd2fc1df64a8b7603c750b *man/to_dummy.Rd a7ad3daa6c15fcf0277c1ad1aaa8d9ae *man/to_long.Rd a33084321c41b82c0abb23d7c10a34b3 *man/to_value.Rd f2395a12e6d53c7757556c09bf3c9c98 *man/trim.Rd d006b3ddb982afa627281cb3720e9343 *man/typical_value.Rd 185348fd378d010b33f131a8045e8f30 *man/var_rename.Rd 6eaeed0f238ad743a4c43b41535c0a91 *man/var_type.Rd 0fd708154b227e1b155b3078f35f4d30 *man/word_wrap.Rd 527c2fd70cfec08ec6402d0f4d3660a5 *man/zap_inf.Rd 41d2543d8f5f8f08e15e770722f89c1a *tests/testthat.R edcabca91e726adde4a1544a975ff107 *tests/testthat/test-add_case.R fdc88aec5b0e4b83e5a087d07adbd4be *tests/testthat/test-allna.R a810bb5aae792becf895fc8beee0e61c *tests/testthat/test-colcnt.R 7def0fd6960ddbab29dfd00bcc2dfb03 *tests/testthat/test-countna.R b93cb8678aa054b1c7b287bbce07a86f *tests/testthat/test-demean.R 3f8e2c2e376c121f6ea6158e56e48164 *tests/testthat/test-dicho.R 861fb1eae1a3ec71bc115a153f49a4c5 *tests/testthat/test-empty.R 
29468198fabdc1b88b00a48065ece0d1 *tests/testthat/test-findvar.R 3edeb67b7b9bf638850287b299810a36 *tests/testthat/test-frq-weights.R 9d8cf91aedf56ad0c83934aa731adf5e *tests/testthat/test-frq.R 6a3a236081fe280511353291931cffa2 *tests/testthat/test-frq_whitespace.R 1d4eb5c4571621a6250a25cdd4fa939f *tests/testthat/test-group_str.R 8585dbdd9496c135302203d07c15213b *tests/testthat/test-is_num_fac.R 9afffdf47049e79a3c931c9a06430115 *tests/testthat/test-isempty.R 402fcd0ff5e1bacc5a704ad5c6cc810c *tests/testthat/test-merge_imputations.R 06b1c2fd8a678aec33d72b8c4a4d1515 *tests/testthat/test-move_columns.R 19f3fbfa557548030ecdf05216549806 *tests/testthat/test-prcn.R 337fa4902fdd5d43ecb77eae1c0cdfbe *tests/testthat/test-rec.R 9fafed6b50a5e21143f719f9f917e347 *tests/testthat/test-reflvl.R 5793d1496099a084c30c76ba08116a94 *tests/testthat/test-removevar.R ce5fd8548779f4d56af992e02957cbc5 *tests/testthat/test-replacena.R 2dd2fe31c94c9447cfa65a53f286cbf5 *tests/testthat/test-rowcnt.R 67234fb8f6389205f5d53180c9dc97b8 *tests/testthat/test-rowmeans.R 227fbd56576cd858e7eb50462883606f *tests/testthat/test-rowsums.R 181703604a03498253a2d94393aaaf53 *tests/testthat/test-setna.R 73ab8611ff043233c2b6f86a3b74d191 *tests/testthat/test-splitvar.R 5b9738a704aaf84d15cb2fe22eb3c958 *tests/testthat/test-std.R c8deadc45d6622e103219d01d1078ae2 *tests/testthat/test-strstartend.R 41bd5f8245059e7162b918611b70b909 *tests/testthat/test-tidy_values.R f182e43e4e422397efeae57f3db36e77 *tests/testthat/test-to_dummy.R 5afa74a6682cf3bdb07a0cfb88beda42 *tests/testthat/test-to_value.R 12d1c6cfb38d3f634cd49392c6e7bc57 *tests/testthat/test-vartype.R 9528b1d52c2569953ac2579a38cd3122 *tests/testthat/test-word_wrap.R b401603734454b3110ed946fcc391c9e *tests/testthat/test-zapinf.R aaef8b4c2577b9578ddd8f0a99fba9eb *vignettes/design_philosophy.Rmd bce92194e5e92a8394ca512d6d151549 *vignettes/exploringdatasets.Rmd 817964b5da6189084685a6077350c2a7 *vignettes/recodingvariables.Rmd 
sjmisc/inst/0000755000176200001440000000000014620407644012526 5ustar liggesuserssjmisc/inst/doc/0000755000176200001440000000000014620407644013273 5ustar liggesuserssjmisc/inst/doc/design_philosophy.R0000644000176200001440000000456614620407642017156 0ustar liggesusers## ----echo = FALSE------------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } options(max.print = 1000) suppressPackageStartupMessages(library(sjmisc)) ## ----------------------------------------------------------------------------- library(sjmisc) data(efc) # returns a vector x <- rec(efc$e42dep, rec = "1,2=1; 3,4=2") str(x) # returns a data frame rec(efc, e42dep, rec = "1,2=1; 3,4=2", append = FALSE) %>% head() ## ----echo=FALSE, message=FALSE------------------------------------------------ library(dplyr) ## ----collapse=TRUE------------------------------------------------------------ # select all variables with "cop" in their names, and also # the range from c161sex to c175empl rec( efc, contains("cop"), c161sex:c175empl, rec = "0,1=0; else=1", append = FALSE ) %>% head() # center all variables with "age" in name, variable c12hour # and all variables from column 19 to 21 center(efc, c12hour, contains("age"), 19:21, append = FALSE) %>% head() ## ----------------------------------------------------------------------------- x <- efc[, 3:5] x %>% str() to_factor(x, e42dep, e16sex) %>% str() ## ----------------------------------------------------------------------------- # complete data, including new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE) %>% head() # only new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% head() ## ----------------------------------------------------------------------------- efc %>% rec(c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% add_columns(efc) %>% head() ## 
----------------------------------------------------------------------------- # complete data, existing columns c82cop1 and c83cop2 are replaced rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE, suffix = "") %>% head() ## ----------------------------------------------------------------------------- efc %>% select(c82cop1, c83cop2) %>% rec(rec = "1,2=0; 3:4=2") %>% head() efc %>% select(c82cop1, c83cop2) %>% mutate( c82cop1_dicho = rec(c82cop1, rec = "1,2=0; 3:4=2"), c83cop2_dicho = rec(c83cop2, rec = "1,2=0; 3:4=2") ) %>% head() sjmisc/inst/doc/recodingvariables.Rmd0000644000176200001440000002530714620405065017424 0ustar liggesusers--- title: "Recoding Variables" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Recoding Variables} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ``` Data preparation is a common task in research, which usually takes the most amount of time in the analytical process. **sjmisc** is a package with special focus on transformation of _variables_ that fits into the workflow and design-philosophy of the so-called "tidyverse". Basically, this package complements the **dplyr** package in that **sjmisc** takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. A distinctive feature of **sjmisc** is the support for labelled data, which is especially useful for users who often work with data sets from other statistical software packages like _SPSS_ or _Stata_. This vignette demonstrate some of the important recoding-functions in **sjmisc**. 
The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set `efc` is part of this package. ```{r message=FALSE} library(sjmisc) data(efc) ``` To show the results after recoding variables, the `frq()` function is used to print frequency tables. ## Dichotomization: dividing variables into two groups `dicho()` dichotomizes variables into "dummy" variables (with 0/1 coding). Dichotomization is either done by median, mean or a specific value (see argument `dich.by`). Like all recoding-functions in **sjmisc**, `dicho()` returns the complete data frame _including_ the recoded variables, if the first argument is a `data.frame`. If the first argument is a vector, only the recoded variable is returned. See [this vignette](design_philosophy.html) for details about the function-design. If `dicho()` returns a data frame, the recoded variables have the same name as the original variable, including a suffix `_d`. ```{r} # age, ranged from 65 to 104, in this output # grouped to get a shorter table frq(efc, e17age, auto.grp = 5) # splitting is done at the median by default: median(efc$e17age, na.rm = TRUE) # the recoded variable is now named "e17age_d" efc <- dicho(efc, e17age) frq(efc, e17age_d) ``` As `dicho()`, like all recoding-functions, supports [labelled data](https://cran.r-project.org/package=sjlabelled), the variable preserves it variable label (but not the value labels). You can directly define value labels inside the function: ```{r} x <- dicho(efc$e17age, val.labels = c("young age", "old age")) frq(x) ``` To split a variable at a different value, use the `dich.by`-argument. The value specified in `dich.by` is _inclusive_, i.e. all values from lowest to and including `dich.by` are recoded into the lower category, while all values _above_ `dich.by` are recoded into the higher category. 
```{r} # split at upper quartile x <- dicho( efc$e17age, dich.by = quantile(efc$e17age, probs = .75, na.rm = TRUE), val.labels = c("younger three quarters", "oldest quarter") ) frq(x) ``` Since the distribution of values in a dataset may differ for different subgroups, all recoding-functions also work on grouped data frames. In the following example, first, the age-variable `e17age` is dichotomized at the median. Then, the data is grouped by gender (`c161sex`) and the dichotomization is done for each subgroup, i.e. it once relates to the median age in the subgroup of female, and once to the median age in the subgroup of male family carers. ```{r} data(efc) x1 <- dicho(efc$e17age) x2 <- efc %>% dplyr::group_by(c161sex) %>% dicho(e17age) %>% dplyr::pull(e17age_d) # median age of total sample frq(x1) # median age of total sample, with median-split applied # to distribution of age by subgroups of gender frq(x2) ``` ## Splitting variables into several groups `split_var()` recodes numeric variables into equal sized groups, i.e. a variable is cut into a smaller number of groups at specific cut points. The amount of groups depends on the `n`-argument and cuts a variable into `n` quantiles. Similar to `dicho()`, if the first argument in `split_var()` is a data frame, the complete data frame including the new recoded variable(s), with suffix `_g`, is returned. ```{r} x <- split_var(efc$e17age, n = 3) frq(x) ``` Unlike dplyr's `ntile()`, `split_var()` never splits a value into two different categories, i.e. you always get a "clean" separation of original categories. In other words: cases that have identical values in a variable will always be recoded into the same group. The following example demonstrates the differences: ```{r} x <- dplyr::ntile(efc$neg_c_7, n = 3) # for some cases, value "10" is recoded into category "1", # for other cases into category "2". 
Same is true for value "13" table(efc$neg_c_7, x) x <- split_var(efc$neg_c_7, n = 3) # no separation of cases with identical values. table(efc$neg_c_7, x) ``` `split_var()`, unlike `ntile()`, does therefor not always return exactly equal-sized groups: ```{r} x <- dplyr::ntile(efc$neg_c_7, n = 3) frq(x) x <- split_var(efc$neg_c_7, n = 3) frq(x) ``` ## Recode variables into equal-ranged groups With `group_var()`, variables can be grouped into equal ranged categories, i.e. a variable is cut into a smaller number of groups, where each group has the same value range. `group_labels()` creates the related value labels. The range of the groups is defined in the `size`-argument. At the same time, the `size`-argument also defines the _lower bound_ of one of the groups. For instance, if the lowest value of a variable is 1 and the maximum is 10, and `size = 5`, then a) each group will have a range of 5, and b) one of the groups will start with the value 5. This means, that an equal-ranged grouping will define groups from _0 to 4_, _5 to 9_ and _10-14_. Each of these groups has a range of 5, and one of the groups starts with the value 5. The group assignment becomes clearer, when `group_labels()` is used in parallel: ```{r} set.seed(123) x <- round(runif(n = 150, 1, 10)) frq(x) frq(group_var(x, size = 5)) group_labels(x, size = 5) dummy <- group_var(x, size = 5, as.num = FALSE) levels(dummy) <- group_labels(x, size = 5) frq(dummy) dummy <- group_var(x, size = 3, as.num = FALSE) levels(dummy) <- group_labels(x, size = 3) frq(dummy) ``` The argument `right.interval` can be used when `size` should indicate the _upper bound_ of a group-range. 
```{r} dummy <- group_var(x, size = 4, as.num = FALSE) levels(dummy) <- group_labels(x, size = 4) frq(dummy) dummy <- group_var(x, size = 4, as.num = FALSE, right.interval = TRUE) levels(dummy) <- group_labels(x, size = 4, right.interval = TRUE) frq(dummy) ``` ## Flexible recoding of variables `rec()` recodes old values of variables into new values, and can be considered as a "classical" recode-function. The recode-pattern, i.e. which new values should replace the old values, is defined in the `rec`-argument. This argument has a specific "syntax": * **recode pairs**: Each recode pair has to be separated by a ;, e.g. `rec = "1=1; 2=4; 3=2; 4=3"` * **multiple values**: Multiple old values that should be recoded into a new single value may be separated with comma, e.g. `rec = "1,2=1; 3,4=2"` * **value range**: A value range is indicated by a colon, e.g. `rec = "1:4=1; 5:8=2"` (recodes all values from 1 to 4 into 1, and from 5 to 8 into 2) * **value range for doubles**: For double vectors (with fractional part), all values within the specified range are recoded; e.g. `rec = "1:2.5=1;2.6:3=2"` recodes 1 to 2.5 into 1 and 2.6 to 3 into 2, but 2.55 would not be recoded (since it's not included in any of the specified ranges) * **"min" and "max"**: Minimum and maximum values are indicates by `min` (or `lo`) and `max` (or `hi`), e.g. `rec = "min:4=1; 5:max=2"` (recodes all values from minimum values of x to 4 into 1, and from 5 to maximum values of x into 2) You can also use `min` or `max` to recode a value into the minimum or maximum value of a variable, e.g. `rec = "min:4=1; 5:7=max"` (recodes all values from minimum values of x to 4 into 1, and from 5 to 7 into the maximum value of x). * **"else"**: All other values, which have not been specified yet, are indicated by else, e.g. 
`rec = "3=1; 1=2; else=3"` (recodes 3 into 1, 1 into 2 and all other values into 3) * **"copy"**: The `"else"`-token can be combined with `"copy"`, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. `rec = "3=1; 1=2; else=copy"` (recodes 3 into 1, 1 into 2 and all other values like 2, 4 or 5 etc. will not be recoded, but copied. * **NA's**: `NA` values are allowed both as old and new value, e.g. `rec = "NA=1; 3:5=NA"` (recodes all `NA` into 1, and all values from 3 to 5 into NA in the new variable) * **"rev"**: `"rev"` is a special token that reverses the value order. * **direct value labelling**: Value labels for new values can be assigned inside the recode pattern by writing the value label in square brackets after defining the new value in a recode pair, e.g. `rec = "15:30=1 [young aged]; 31:55=2 [middle aged]; 56:max=3 [old aged]"` * **non-captured values**: Non-matching values will be set to `NA`, unless captured by the `"else"`- or `"copy"`-token. Here are some examples: ```{r} frq(efc$e42dep) # replace NA with 5 frq(rec(efc$e42dep, rec = "NA=5;else=copy")) # recode 1 to 2 into 1 and 3 to 4 into 2 frq(rec(efc$e42dep, rec = "1,2=1; 3,4=2")) # recode 1 to 3 into 4 into 2 frq(rec(efc$e42dep, rec = "min:3=1; 4=2")) # recode numeric to character, and remaining values # into the highest value (="hi") of e42dep frq(rec(efc$e42dep, rec = "1=first;2=2nd;else=hi")) data(iris) frq(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE)) # works with mutate efc %>% dplyr::select(e42dep, e17age) %>% dplyr::mutate(dependency_rev = rec(e42dep, rec = "rev")) %>% head() # recode multiple variables and set value labels via recode-syntax dummy <- rec( efc, c160age, e17age, rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]", append = FALSE ) frq(dummy) ``` ## Scoped variants Where applicable, the recoding-functions in **sjmisc** have "scoped" versions as well, e.g. 
`dicho_if()` or `split_var_if()`, where transformation will be applied only to those variables that match the logical condition of `predicate`. sjmisc/inst/doc/recodingvariables.R0000644000176200001440000000757514620407644017117 0ustar liggesusers## ----echo = FALSE------------------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ## ----message=FALSE------------------------------------------------------------ library(sjmisc) data(efc) ## ----------------------------------------------------------------------------- # age, ranged from 65 to 104, in this output # grouped to get a shorter table frq(efc, e17age, auto.grp = 5) # splitting is done at the median by default: median(efc$e17age, na.rm = TRUE) # the recoded variable is now named "e17age_d" efc <- dicho(efc, e17age) frq(efc, e17age_d) ## ----------------------------------------------------------------------------- x <- dicho(efc$e17age, val.labels = c("young age", "old age")) frq(x) ## ----------------------------------------------------------------------------- # split at upper quartile x <- dicho( efc$e17age, dich.by = quantile(efc$e17age, probs = .75, na.rm = TRUE), val.labels = c("younger three quarters", "oldest quarter") ) frq(x) ## ----------------------------------------------------------------------------- data(efc) x1 <- dicho(efc$e17age) x2 <- efc %>% dplyr::group_by(c161sex) %>% dicho(e17age) %>% dplyr::pull(e17age_d) # median age of total sample frq(x1) # median age of total sample, with median-split applied # to distribution of age by subgroups of gender frq(x2) ## ----------------------------------------------------------------------------- x <- split_var(efc$e17age, n = 3) frq(x) ## ----------------------------------------------------------------------------- x <- dplyr::ntile(efc$neg_c_7, n = 
3) # for some cases, value "10" is recoded into category "1", # for other cases into category "2". Same is true for value "13" table(efc$neg_c_7, x) x <- split_var(efc$neg_c_7, n = 3) # no separation of cases with identical values. table(efc$neg_c_7, x) ## ----------------------------------------------------------------------------- x <- dplyr::ntile(efc$neg_c_7, n = 3) frq(x) x <- split_var(efc$neg_c_7, n = 3) frq(x) ## ----------------------------------------------------------------------------- set.seed(123) x <- round(runif(n = 150, 1, 10)) frq(x) frq(group_var(x, size = 5)) group_labels(x, size = 5) dummy <- group_var(x, size = 5, as.num = FALSE) levels(dummy) <- group_labels(x, size = 5) frq(dummy) dummy <- group_var(x, size = 3, as.num = FALSE) levels(dummy) <- group_labels(x, size = 3) frq(dummy) ## ----------------------------------------------------------------------------- dummy <- group_var(x, size = 4, as.num = FALSE) levels(dummy) <- group_labels(x, size = 4) frq(dummy) dummy <- group_var(x, size = 4, as.num = FALSE, right.interval = TRUE) levels(dummy) <- group_labels(x, size = 4, right.interval = TRUE) frq(dummy) ## ----------------------------------------------------------------------------- frq(efc$e42dep) # replace NA with 5 frq(rec(efc$e42dep, rec = "NA=5;else=copy")) # recode 1 to 2 into 1 and 3 to 4 into 2 frq(rec(efc$e42dep, rec = "1,2=1; 3,4=2")) # recode 1 to 3 into 4 into 2 frq(rec(efc$e42dep, rec = "min:3=1; 4=2")) # recode numeric to character, and remaining values # into the highest value (="hi") of e42dep frq(rec(efc$e42dep, rec = "1=first;2=2nd;else=hi")) data(iris) frq(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE)) # works with mutate efc %>% dplyr::select(e42dep, e17age) %>% dplyr::mutate(dependency_rev = rec(e42dep, rec = "rev")) %>% head() # recode multiple variables and set value labels via recode-syntax dummy <- rec( efc, c160age, e17age, rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]", append = 
FALSE ) frq(dummy) sjmisc/inst/doc/exploringdatasets.Rmd0000644000176200001440000001657714046746443017524 0ustar liggesusers--- title: "Exploring Data Sets" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Exploring Data Sets} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE) || !requireNamespace("tidyr", quietly = TRUE) || !requireNamespace("purrr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ``` Tidying up, transforming and exploring data is an important part of data analysis, and you can manage many common tasks in this process with the *tidyverse* or related packages. The **sjmisc**-package fits into this workflow, especially when you work with [labelled data](https://cran.r-project.org/package=sjlabelled), because it offers functions for data transformation and labelled data utility functions. This vignette describes typical steps when beginning with data exploration. The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set `efc` is part of this package. Let us see how the family carer's gender and subjective perception of negative impact of care as well as the cared-for person's dependency are associated with the family carer's quality of life. ```{r message=FALSE} library(sjmisc) library(dplyr) data(efc) ``` ## Print frequencies with labels The first thing that may be of interest is probably the distribution of gender. You can plot frequencies for labelled data with `frq()`. This function requires either a vector or data frame as input and prints the variable label as first line, followed by a frequency-table with values, labels, counts and percentages of the vector. 
```{r} frq(efc$c161sex) ``` ## Find variables in a data frame Next, let’s look at the distribution of gender by the cared-for person's dependency. To compute cross tables, you can use `flat_table()`. It requires the data as first argument, followed by any number of variable names. But first, we need to know the name of the dependency-variable. This is where `find_var()` comes into play. It searches for variables in a data frame by 1. variable names, 2. variable labels, 3. value labels 4. or any combination of these. By default, it looks for variable name and labels. The function also supports regex-patterns. By default, `find_var()` returns the column-indices, but you can also print a small "summary" with the `out`-argument. ```{r} # find all variables with "dependency" in name or label find_var(efc, "dependency", out = "table") ``` Variable in column 5, named _e42dep_, is what we are looking for. ## Print crosstables with labels Now we can look at the distribution of gender by dependency: ```{r} flat_table(efc, e42dep, c161sex) ``` Since the distribution of male and female carers is skewed, let's see the proportions. To compute crosstables with row or column percentages, use the `margin`-argument: ```{r} flat_table(efc, e42dep, c161sex, margin = "col") ``` ## Recoding variables Next, we need the negative impact of care (*neg_c_7*) and want to create three groups: low, middle and high negative impact. We can easily recode and label vectors with `rec()`. This function does not only recode vectors, it also allows direct labelling of categories inside the recode-syntax (this is optional, you can also use the `val.labels`-argument). We now recode *neg_c_7* into a new variable _burden_. The cut-points are a bit arbitrary, for the sake of demonstration. 
```{r} efc$burden <- rec( efc$neg_c_7, rec = c("min:9=1 [low]; 10:12=2 [moderate]; 13:max=3 [high]; else=NA"), var.label = "Subjective burden", as.num = FALSE # we want a factor ) # print frequencies frq(efc$burden) ``` You can see the variable _burden_ has a variable label ("Subjective burden"), which was set inside `rec()`, as well as three values with labels ("low", "moderate" and "high"). From the lowest value in *neg_c_7* to 9 were recoded into 1, values 10 to 12 into 2 and values 13 to the highest value in *neg_c_7* into 3. All remaining values are set to missing (`else=NA` – for details on the recode-syntax, see `?rec`). ## Grouped data frames How is burden distributed by gender? We can group the data and print frequencies using `frq()` for this as well, as this function also accepts grouped data frames. Frequencies for grouped data frames first print the group-details (variable name and category), followed by the frequency table. Thanks to labelled data, the output is easy to understand. ```{r} efc %>% select(burden, c161sex) %>% group_by(c161sex) %>% frq() ``` ## Nested data frames Let's investigate the association between quality of life and burden across the different dependency categories, by fitting linear models for each category of _e42dep_. We can do this using _nested data frames_. `nest()` from the **tidyr**-package can create subsets of a data frame, based on grouping criteria, and create a new _list-variable_, where each element itself is a data frame (so it’s nested, because we have data frames inside a data frame). In the following example, we group the data by _e42dep_, and "nest" the groups. Now we get a data frame with two columns: First, the grouping variable (_e42dep_) and second, the datasets (subsets) for each country as data frame, stored in the list-variable _data_. The data frames in the subsets (in _data_) all contain the selected variables _burden_, _c161sex_ and *quol_5* (quality of life). 
```{r} # convert variable to labelled factor, because we then # have the labels as factor levels in the output efc$e42dep <- to_label(efc$e42dep, drop.levels = TRUE) efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() ``` ## Get coefficients of nested models Using `map()` from the **purrr**-package, we can iterate this list and apply any function on each data frame in the list-variable "data". We want to apply the `lm()`-function to the list-variable, to run linear models for all "dependency-datasets". The results of these linear regressions are stored in another list-variable, _models_ (created with `mutate()`). To quickly access and look at the coefficients, we can use `spread_coef()`. ```{r} efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models) ``` We see that higher burden is associated with lower quality of life, for all dependency-groups. The `se` and `p.val`-arguments add standard errors and p-values to the output. `model.term` returns the statistics only for a specific term. If you specify a `model.term`, arguments `se` and `p.val` automatically default to `TRUE`. ```{r} efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models, burden3) ``` sjmisc/inst/doc/recodingvariables.html0000644000176200001440000020622614620407644017654 0ustar liggesusers Recoding Variables

Recoding Variables

Daniel Lüdecke

2024-05-13

Data preparation is a common task in research, which usually takes the most amount of time in the analytical process. sjmisc is a package with special focus on transformation of variables that fits into the workflow and design-philosophy of the so-called “tidyverse”.

Basically, this package complements the dplyr package in that sjmisc takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. A distinctive feature of sjmisc is the support for labelled data, which is especially useful for users who often work with data sets from other statistical software packages like SPSS or Stata.

This vignette demonstrates some of the important recoding-functions in sjmisc. The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set efc is part of this package.

library(sjmisc)
data(efc)

To show the results after recoding variables, the frq() function is used to print frequency tables.

Dichotomization: dividing variables into two groups

dicho() dichotomizes variables into “dummy” variables (with 0/1 coding). Dichotomization is either done by median, mean or a specific value (see argument dich.by).

Like all recoding-functions in sjmisc, dicho() returns the complete data frame including the recoded variables, if the first argument is a data.frame. If the first argument is a vector, only the recoded variable is returned. See this vignette for details about the function-design.

If dicho() returns a data frame, the recoded variables have the same name as the original variable, including a suffix _d.

# age, ranged from 65 to 104, in this output
# grouped to get a shorter table
frq(efc, e17age, auto.grp = 5)
#> elder' age (e17age) <numeric> 
#> # total N=908 valid N=891 mean=79.12 sd=8.09
#> 
#> Value |  Label |   N | Raw % | Valid % | Cum. %
#> -----------------------------------------------
#>     1 |  65-72 | 212 | 23.35 |   23.79 |  23.79
#>     2 |  73-80 | 277 | 30.51 |   31.09 |  54.88
#>     3 |  81-88 | 270 | 29.74 |   30.30 |  85.19
#>     4 |  89-96 | 124 | 13.66 |   13.92 |  99.10
#>     5 | 97-104 |   8 |  0.88 |    0.90 | 100.00
#>  <NA> |   <NA> |  17 |  1.87 |    <NA> |   <NA>

# splitting is done at the median by default:
median(efc$e17age, na.rm = TRUE)
#> [1] 79

# the recoded variable is now named "e17age_d"
efc <- dicho(efc, e17age)
frq(efc, e17age_d)
#> elder' age (e17age_d) <categorical> 
#> # total N=908 valid N=891 mean=0.49 sd=0.50
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     0 | 455 | 50.11 |   51.07 |  51.07
#>     1 | 436 | 48.02 |   48.93 | 100.00
#>  <NA> |  17 |  1.87 |    <NA> |   <NA>

As dicho(), like all recoding-functions, supports labelled data, the variable preserves its variable label (but not the value labels). You can directly define value labels inside the function:

x <- dicho(efc$e17age, val.labels = c("young age", "old age"))
frq(x)
#> elder' age (x) <categorical> 
#> # total N=908 valid N=891 mean=0.49 sd=0.50
#> 
#> Value |     Label |   N | Raw % | Valid % | Cum. %
#> --------------------------------------------------
#>     0 | young age | 455 | 50.11 |   51.07 |  51.07
#>     1 |   old age | 436 | 48.02 |   48.93 | 100.00
#>  <NA> |      <NA> |  17 |  1.87 |    <NA> |   <NA>

To split a variable at a different value, use the dich.by-argument. The value specified in dich.by is inclusive, i.e. all values from lowest to and including dich.by are recoded into the lower category, while all values above dich.by are recoded into the higher category.

# split at upper quartile
x <- dicho(
  efc$e17age, 
  dich.by = quantile(efc$e17age, probs = .75, na.rm = TRUE), 
  val.labels = c("younger three quarters", "oldest quarter")
)
frq(x)
#> elder' age (x) <categorical> 
#> # total N=908 valid N=891 mean=0.24 sd=0.43
#> 
#> Value |                  Label |   N | Raw % | Valid % | Cum. %
#> ---------------------------------------------------------------
#>     0 | younger three quarters | 678 | 74.67 |   76.09 |  76.09
#>     1 |         oldest quarter | 213 | 23.46 |   23.91 | 100.00
#>  <NA> |                   <NA> |  17 |  1.87 |    <NA> |   <NA>

Since the distribution of values in a dataset may differ for different subgroups, all recoding-functions also work on grouped data frames. In the following example, first, the age-variable e17age is dichotomized at the median. Then, the data is grouped by gender (c161sex) and the dichotomization is done for each subgroup, i.e. it once relates to the median age in the subgroup of female, and once to the median age in the subgroup of male family carers.

data(efc)
x1 <- dicho(efc$e17age)

x2 <- efc %>% 
  dplyr::group_by(c161sex) %>% 
  dicho(e17age) %>% 
  dplyr::pull(e17age_d)

# median age of total sample
frq(x1)
#> elder' age (x) <categorical> 
#> # total N=908 valid N=891 mean=0.49 sd=0.50
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     0 | 455 | 50.11 |   51.07 |  51.07
#>     1 | 436 | 48.02 |   48.93 | 100.00
#>  <NA> |  17 |  1.87 |    <NA> |   <NA>

# median age of total sample, with median-split applied
# to distribution of age by subgroups of gender
frq(x2)
#> elder' age (x) <numeric> 
#> # total N=908 valid N=891 mean=1.50 sd=0.50
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 449 | 49.45 |   50.39 |  50.39
#>     2 | 442 | 48.68 |   49.61 | 100.00
#>  <NA> |  17 |  1.87 |    <NA> |   <NA>

Splitting variables into several groups

split_var() recodes numeric variables into equal sized groups, i.e. a variable is cut into a smaller number of groups at specific cut points. The amount of groups depends on the n-argument and cuts a variable into n quantiles.

Similar to dicho(), if the first argument in split_var() is a data frame, the complete data frame including the new recoded variable(s), with suffix _g, is returned.

x <- split_var(efc$e17age, n = 3)
frq(x)
#> elder' age (x) <categorical> 
#> # total N=908 valid N=891 mean=2.05 sd=0.82
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 274 | 30.18 |   30.75 |  30.75
#>     2 | 294 | 32.38 |   33.00 |  63.75
#>     3 | 323 | 35.57 |   36.25 | 100.00
#>  <NA> |  17 |  1.87 |    <NA> |   <NA>

Unlike dplyr’s ntile(), split_var() never splits a value into two different categories, i.e. you always get a “clean” separation of original categories. In other words: cases that have identical values in a variable will always be recoded into the same group. The following example demonstrates the differences:

x <- dplyr::ntile(efc$neg_c_7, n = 3)
# for some cases, value "10" is recoded into category "1",
# for other cases into category "2". Same is true for value "13"
table(efc$neg_c_7, x)
#>     x
#>        1   2   3
#>   7   75   0   0
#>   8   99   0   0
#>   9  106   0   0
#>   10  18 102   0
#>   11   0  96   0
#>   12   0  85   0
#>   13   0  14  50
#>   14   0   0  54
#>   15   0   0  45
#>   16   0   0  30
#>   17   0   0  35
#>   18   0   0  26
#>   19   0   0  16
#>   20   0   0  16
#>   21   0   0   2
#>   22   0   0   7
#>   23   0   0   4
#>   24   0   0   3
#>   25   0   0   6
#>   27   0   0   1
#>   28   0   0   2

x <- split_var(efc$neg_c_7, n = 3)
# no separation of cases with identical values.
table(efc$neg_c_7, x)
#>     x
#>        1   2   3
#>   7   75   0   0
#>   8   99   0   0
#>   9  106   0   0
#>   10   0 120   0
#>   11   0  96   0
#>   12   0  85   0
#>   13   0   0  64
#>   14   0   0  54
#>   15   0   0  45
#>   16   0   0  30
#>   17   0   0  35
#>   18   0   0  26
#>   19   0   0  16
#>   20   0   0  16
#>   21   0   0   2
#>   22   0   0   7
#>   23   0   0   4
#>   24   0   0   3
#>   25   0   0   6
#>   27   0   0   1
#>   28   0   0   2

split_var(), unlike ntile(), therefore does not always return exactly equal-sized groups:

x <- dplyr::ntile(efc$neg_c_7, n = 3)
frq(x)
#> x <integer> 
#> # total N=908 valid N=892 mean=2.00 sd=0.82
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 298 | 32.82 |   33.41 |  33.41
#>     2 | 297 | 32.71 |   33.30 |  66.70
#>     3 | 297 | 32.71 |   33.30 | 100.00
#>  <NA> |  16 |  1.76 |    <NA> |   <NA>

x <- split_var(efc$neg_c_7, n = 3)
frq(x)
#> Negative impact with 7 items (x) <categorical> 
#> # total N=908 valid N=892 mean=2.03 sd=0.81
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 280 | 30.84 |   31.39 |  31.39
#>     2 | 301 | 33.15 |   33.74 |  65.13
#>     3 | 311 | 34.25 |   34.87 | 100.00
#>  <NA> |  16 |  1.76 |    <NA> |   <NA>

Recode variables into equal-ranged groups

With group_var(), variables can be grouped into equal ranged categories, i.e. a variable is cut into a smaller number of groups, where each group has the same value range. group_labels() creates the related value labels.

The range of the groups is defined in the size-argument. At the same time, the size-argument also defines the lower bound of one of the groups.

For instance, if the lowest value of a variable is 1 and the maximum is 10, and size = 5, then

  1. each group will have a range of 5, and
  2. one of the groups will start with the value 5.

This means, that an equal-ranged grouping will define groups from 0 to 4, 5 to 9 and 10-14. Each of these groups has a range of 5, and one of the groups starts with the value 5.

The group assignment becomes clearer, when group_labels() is used in parallel:

set.seed(123)
x <- round(runif(n = 150, 1, 10))

frq(x)
#> x <numeric> 
#> # total N=150 valid N=150 mean=5.52 sd=2.63
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#>     1 |  6 |  4.00 |    4.00 |   4.00
#>     2 | 19 | 12.67 |   12.67 |  16.67
#>     3 | 16 | 10.67 |   10.67 |  27.33
#>     4 | 17 | 11.33 |   11.33 |  38.67
#>     5 | 20 | 13.33 |   13.33 |  52.00
#>     6 | 12 |  8.00 |    8.00 |  60.00
#>     7 | 19 | 12.67 |   12.67 |  72.67
#>     8 | 16 | 10.67 |   10.67 |  83.33
#>     9 | 15 | 10.00 |   10.00 |  93.33
#>    10 | 10 |  6.67 |    6.67 | 100.00
#>  <NA> |  0 |  0.00 |    <NA> |   <NA>

frq(group_var(x, size = 5))
#> x <numeric> 
#> # total N=150 valid N=150 mean=1.68 sd=0.59
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#>     1 | 58 | 38.67 |   38.67 |  38.67
#>     2 | 82 | 54.67 |   54.67 |  93.33
#>     3 | 10 |  6.67 |    6.67 | 100.00
#>  <NA> |  0 |  0.00 |    <NA> |   <NA>

group_labels(x, size = 5)
#> [1] "0-4"   "5-9"   "10-14"

dummy <- group_var(x, size = 5, as.num = FALSE)
levels(dummy) <- group_labels(x, size = 5)
frq(dummy)
#> x <categorical> 
#> # total N=150 valid N=150 mean=1.68 sd=0.59
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#> 0-4   | 58 | 38.67 |   38.67 |  38.67
#> 5-9   | 82 | 54.67 |   54.67 |  93.33
#> 10-14 | 10 |  6.67 |    6.67 | 100.00
#> <NA>  |  0 |  0.00 |    <NA> |   <NA>

dummy <- group_var(x, size = 3, as.num = FALSE)
levels(dummy) <- group_labels(x, size = 3)
frq(dummy)
#> x <categorical> 
#> # total N=150 valid N=150 mean=2.48 sd=0.96
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#> 0-2   | 25 | 16.67 |   16.67 |  16.67
#> 3-5   | 53 | 35.33 |   35.33 |  52.00
#> 6-8   | 47 | 31.33 |   31.33 |  83.33
#> 9-11  | 25 | 16.67 |   16.67 | 100.00
#> <NA>  |  0 |  0.00 |    <NA> |   <NA>

The argument right.interval can be used when size should indicate the upper bound of a group-range.

dummy <- group_var(x, size = 4, as.num = FALSE)
levels(dummy) <- group_labels(x, size = 4)
frq(dummy)
#> x <categorical> 
#> # total N=150 valid N=150 mean=2.00 sd=0.74
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#> 0-3   | 41 | 27.33 |   27.33 |  27.33
#> 4-7   | 68 | 45.33 |   45.33 |  72.67
#> 8-11  | 41 | 27.33 |   27.33 | 100.00
#> <NA>  |  0 |  0.00 |    <NA> |   <NA>

dummy <- group_var(x, size = 4, as.num = FALSE, right.interval = TRUE)
levels(dummy) <- group_labels(x, size = 4, right.interval = TRUE)
frq(dummy)
#> x <categorical> 
#> # total N=150 valid N=150 mean=1.78 sd=0.71
#> 
#> Value |  N | Raw % | Valid % | Cum. %
#> -------------------------------------
#> 1-4   | 58 | 38.67 |   38.67 |  38.67
#> 5-8   | 67 | 44.67 |   44.67 |  83.33
#> 9-12  | 25 | 16.67 |   16.67 | 100.00
#> <NA>  |  0 |  0.00 |    <NA> |   <NA>

Flexible recoding of variables

rec() recodes old values of variables into new values, and can be considered as a “classical” recode-function. The recode-pattern, i.e. which new values should replace the old values, is defined in the rec-argument. This argument has a specific “syntax”:

  • recode pairs: Each recode pair has to be separated by a ;, e.g. rec = "1=1; 2=4; 3=2; 4=3"

  • multiple values: Multiple old values that should be recoded into a new single value may be separated with comma, e.g. rec = "1,2=1; 3,4=2"

  • value range: A value range is indicated by a colon, e.g. rec = "1:4=1; 5:8=2" (recodes all values from 1 to 4 into 1, and from 5 to 8 into 2)

  • value range for doubles: For double vectors (with fractional part), all values within the specified range are recoded; e.g. rec = "1:2.5=1;2.6:3=2" recodes 1 to 2.5 into 1 and 2.6 to 3 into 2, but 2.55 would not be recoded (since it’s not included in any of the specified ranges)

  • “min” and “max”: Minimum and maximum values are indicated by min (or lo) and max (or hi), e.g. rec = "min:4=1; 5:max=2" (recodes all values from minimum values of x to 4 into 1, and from 5 to maximum values of x into 2). You can also use min or max to recode a value into the minimum or maximum value of a variable, e.g. rec = "min:4=1; 5:7=max" (recodes all values from minimum values of x to 4 into 1, and from 5 to 7 into the maximum value of x).

  • “else”: All other values, which have not been specified yet, are indicated by else, e.g. rec = "3=1; 1=2; else=3" (recodes 3 into 1, 1 into 2 and all other values into 3)

  • “copy”: The "else"-token can be combined with "copy", indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. rec = "3=1; 1=2; else=copy" (recodes 3 into 1, 1 into 2 and all other values like 2, 4 or 5 etc. will not be recoded, but copied).

  • NA’s: NA values are allowed both as old and new value, e.g. rec = "NA=1; 3:5=NA" (recodes all NA into 1, and all values from 3 to 5 into NA in the new variable)

  • “rev”: "rev" is a special token that reverses the value order.

  • direct value labelling: Value labels for new values can be assigned inside the recode pattern by writing the value label in square brackets after defining the new value in a recode pair, e.g. rec = "15:30=1 [young aged]; 31:55=2 [middle aged]; 56:max=3 [old aged]"

  • non-captured values: Non-matching values will be set to NA, unless captured by the "else"- or "copy"-token.

Here are some examples:

frq(efc$e42dep)
#> elder's dependency (x) <numeric> 
#> # total N=908 valid N=901 mean=2.94 sd=0.94
#> 
#> Value |                Label |   N | Raw % | Valid % | Cum. %
#> -------------------------------------------------------------
#>     1 |          independent |  66 |  7.27 |    7.33 |   7.33
#>     2 |   slightly dependent | 225 | 24.78 |   24.97 |  32.30
#>     3 | moderately dependent | 306 | 33.70 |   33.96 |  66.26
#>     4 |   severely dependent | 304 | 33.48 |   33.74 | 100.00
#>  <NA> |                 <NA> |   7 |  0.77 |    <NA> |   <NA>

# replace NA with 5
frq(rec(efc$e42dep, rec = "NA=5;else=copy"))
#> elder's dependency (x) <numeric> 
#> # total N=908 valid N=908 mean=2.96 sd=0.95
#> 
#> Value |                Label |   N | Raw % | Valid % | Cum. %
#> -------------------------------------------------------------
#>     1 |          independent |  66 |  7.27 |    7.27 |   7.27
#>     2 |   slightly dependent | 225 | 24.78 |   24.78 |  32.05
#>     3 | moderately dependent | 306 | 33.70 |   33.70 |  65.75
#>     4 |   severely dependent | 304 | 33.48 |   33.48 |  99.23
#>     5 |                    5 |   7 |  0.77 |    0.77 | 100.00
#>  <NA> |                 <NA> |   0 |  0.00 |    <NA> |   <NA>

# recode 1 to 2 into 1 and 3 to 4 into 2
frq(rec(efc$e42dep, rec = "1,2=1; 3,4=2"))
#> elder's dependency (x) <numeric> 
#> # total N=908 valid N=901 mean=1.68 sd=0.47
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 291 | 32.05 |   32.30 |  32.30
#>     2 | 610 | 67.18 |   67.70 | 100.00
#>  <NA> |   7 |  0.77 |    <NA> |   <NA>

# recode 1 to 3 into 1, and 4 into 2
frq(rec(efc$e42dep, rec = "min:3=1; 4=2"))
#> elder's dependency (x) <numeric> 
#> # total N=908 valid N=901 mean=1.34 sd=0.47
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#>     1 | 597 | 65.75 |   66.26 |  66.26
#>     2 | 304 | 33.48 |   33.74 | 100.00
#>  <NA> |   7 |  0.77 |    <NA> |   <NA>

# recode numeric to character, and remaining values
# into the highest value (="hi") of e42dep
frq(rec(efc$e42dep, rec = "1=first;2=2nd;else=hi"))
#> elder's dependency (x) <character> 
#> # total N=908 valid N=901 mean=2.43 sd=0.86
#> 
#> Value |   N | Raw % | Valid % | Cum. %
#> --------------------------------------
#> 2nd   | 225 | 24.78 |   24.97 |  24.97
#> first |  66 |  7.27 |    7.33 |  32.30
#> hi    | 610 | 67.18 |   67.70 | 100.00
#> <NA>  |   7 |  0.77 |    <NA> |   <NA>

data(iris)
frq(rec(iris, Species, rec = "setosa=huhu; else=copy", append = FALSE))
#> Species_r <categorical> 
#> # total N=150 valid N=150 mean=2.00 sd=0.82
#> 
#> Value      |  N | Raw % | Valid % | Cum. %
#> ------------------------------------------
#> huhu       | 50 | 33.33 |   33.33 |  33.33
#> versicolor | 50 | 33.33 |   33.33 |  66.67
#> virginica  | 50 | 33.33 |   33.33 | 100.00
#> <NA>       |  0 |  0.00 |    <NA> |   <NA>

# works with mutate
efc %>%
  dplyr::select(e42dep, e17age) %>%
  dplyr::mutate(dependency_rev = rec(e42dep, rec = "rev")) %>%
  head()
#>   e42dep e17age dependency_rev
#> 1      3     83              2
#> 2      3     88              2
#> 3      3     82              2
#> 4      4     67              1
#> 5      4     84              1
#> 6      4     85              1

# recode multiple variables and set value labels via recode-syntax
dummy <- rec(
  efc, c160age, e17age,
  rec = "15:30=1 [young]; 31:55=2 [middle]; 56:max=3 [old]",
  append = FALSE
)
frq(dummy)
#> carer' age (c160age_r) <numeric> 
#> # total N=908 valid N=901 mean=2.40 sd=0.59
#> 
#> Value |  Label |   N | Raw % | Valid % | Cum. %
#> -----------------------------------------------
#>     1 |  young |  48 |  5.29 |    5.33 |   5.33
#>     2 | middle | 442 | 48.68 |   49.06 |  54.38
#>     3 |    old | 411 | 45.26 |   45.62 | 100.00
#>  <NA> |   <NA> |   7 |  0.77 |    <NA> |   <NA>
#> 
#> elder' age (e17age_r) <numeric> 
#> # total N=908 valid N=891 mean=3.00 sd=0.00
#> 
#> Value |  Label |   N | Raw % | Valid % | Cum. %
#> -----------------------------------------------
#>     1 |  young |   0 |  0.00 |       0 |      0
#>     2 | middle |   0 |  0.00 |       0 |      0
#>     3 |    old | 891 | 98.13 |     100 |    100
#>  <NA> |   <NA> |  17 |  1.87 |    <NA> |   <NA>

Scoped variants

Where applicable, the recoding-functions in sjmisc have “scoped” versions as well, e.g. dicho_if() or split_var_if(), where transformation will be applied only to those variables that match the logical condition of predicate.

sjmisc/inst/doc/exploringdatasets.R0000644000176200001440000000523714620407643017164 0ustar liggesusers## ----echo = FALSE------------------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, comment = "#>") if (!requireNamespace("dplyr", quietly = TRUE) || !requireNamespace("tidyr", quietly = TRUE) || !requireNamespace("purrr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } suppressPackageStartupMessages(library(sjmisc)) ## ----message=FALSE------------------------------------------------------------ library(sjmisc) library(dplyr) data(efc) ## ----------------------------------------------------------------------------- frq(efc$c161sex) ## ----------------------------------------------------------------------------- # find all variables with "dependency" in name or label find_var(efc, "dependency", out = "table") ## ----------------------------------------------------------------------------- flat_table(efc, e42dep, c161sex) ## ----------------------------------------------------------------------------- flat_table(efc, e42dep, c161sex, margin = "col") ## ----------------------------------------------------------------------------- efc$burden <- rec( efc$neg_c_7, rec = c("min:9=1 [low]; 10:12=2 [moderate]; 13:max=3 [high]; else=NA"), var.label = "Subjective burden", as.num = FALSE # we want a factor ) # print frequencies frq(efc$burden) ## ----------------------------------------------------------------------------- efc %>% select(burden, c161sex) %>% group_by(c161sex) %>% frq() ## ----------------------------------------------------------------------------- # convert variable to labelled factor, because we then # have the labels as factor levels in the output efc$e42dep <- to_label(efc$e42dep, drop.levels = TRUE) efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() ## ----------------------------------------------------------------------------- efc %>% select(e42dep, 
burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models) ## ----------------------------------------------------------------------------- efc %>% select(e42dep, burden, c161sex, quol_5) %>% group_by(e42dep) %>% tidyr::nest() %>% na.omit() %>% # remove nested group for NA arrange(e42dep) %>% # arrange by order of levels mutate(models = purrr::map( data, ~ lm(quol_5 ~ burden + c161sex, data = .)) ) %>% spread_coef(models, burden3) sjmisc/inst/doc/exploringdatasets.html0000644000176200001440000010264214620407643017725 0ustar liggesusers Exploring Data Sets

Exploring Data Sets

Daniel Lüdecke

2024-05-13

Tidying up, transforming and exploring data is an important part of data analysis, and you can manage many common tasks in this process with the tidyverse or related packages. The sjmisc-package fits into this workflow, especially when you work with labelled data, because it offers functions for data transformation and labelled data utility functions. This vignette describes typical steps when beginning with data exploration.

The examples are based on data from the EUROFAMCARE project, a survey on the situation of family carers of older people in Europe. The sample data set efc is part of this package. Let us see how the family carer’s gender and subjective perception of negative impact of care as well as the cared-for person’s dependency are associated with the family carer’s quality of life.

library(sjmisc)
library(dplyr)
data(efc)

Find variables in a data frame

Next, let’s look at the distribution of gender by the cared-for person’s dependency. To compute cross tables, you can use flat_table(). It requires the data as first argument, followed by any number of variable names.

But first, we need to know the name of the dependency-variable. This is where find_var() comes into play. It searches for variables in a data frame by

  1. variable names,
  2. variable labels,
  3. value labels
  4. or any combination of these.

By default, it looks for variable name and labels. The function also supports regex-patterns. By default, find_var() returns the column-indices, but you can also print a small “summary”” with the out-argument.

# find all variables with "dependency" in name or label
find_var(efc, "dependency", out = "table")
#>   col.nr var.name          var.label
#> 1      5   e42dep elder's dependency

Variable in column 5, named e42dep, is what we are looking for.

Recoding variables

Next, we need the negatice impact of care (neg_c_7) and want to create three groups: low, middle and high negative impact. We can easily recode and label vectors with rec(). This function does not only recode vectors, it also allows direct labelling of categories inside the recode-syntax (this is optional, you can also use the val.labels-argument). We now recode neg_c_7 into a new variable burden. The cut-points are a bit arbitrary, for the sake of demonstration.

efc$burden <- rec(
  efc$neg_c_7,
  rec = c("min:9=1 [low]; 10:12=2 [moderate]; 13:max=3 [high]; else=NA"),
  var.label = "Subjective burden",
  as.num = FALSE # we want a factor
)
# print frequencies
frq(efc$burden)
#> Subjective burden (x) <categorical> 
#> # total N=908 valid N=892 mean=2.03 sd=0.81
#> 
#> Value |    Label |   N | Raw % | Valid % | Cum. %
#> -------------------------------------------------
#>     1 |      low | 280 | 30.84 |   31.39 |  31.39
#>     2 | moderate | 301 | 33.15 |   33.74 |  65.13
#>     3 |     high | 311 | 34.25 |   34.87 | 100.00
#>  <NA> |     <NA> |  16 |  1.76 |    <NA> |   <NA>

You can see the variable burden has a variable label (“Subjective burden”), which was set inside rec(), as well as three values with labels (“low”, “moderate” and “high”). From the lowest value in neg_c_7 to 9 were recoded into 1, values 10 to 12 into 2 and values 13 to the highest value in neg_c_7 into 3. All remaining values are set to missing (else=NA – for details on the recode-syntax, see ?rec).

Grouped data frames

How is burden distributed by gender? We can group the data and print frequencies using frq() for this as well, as this function also accepts grouped data frames. Frequencies for grouped data frames first print the group-details (variable name and category), followed by the frequency table. Thanks to labelled data, the output is easy to understand.

efc %>% 
  select(burden, c161sex) %>% 
  group_by(c161sex) %>% 
  frq()
#> Subjective burden (burden) <categorical> 
#> # grouped by: Male
#> # total N=215 valid N=212 mean=1.91 sd=0.81
#> 
#> Value |    Label |  N | Raw % | Valid % | Cum. %
#> ------------------------------------------------
#>     1 |      low | 80 | 37.21 |   37.74 |  37.74
#>     2 | moderate | 72 | 33.49 |   33.96 |  71.70
#>     3 |     high | 60 | 27.91 |   28.30 | 100.00
#>  <NA> |     <NA> |  3 |  1.40 |    <NA> |   <NA>
#> 
#> Subjective burden (burden) <categorical> 
#> # grouped by: Female
#> # total N=686 valid N=679 mean=2.08 sd=0.81
#> 
#> Value |    Label |   N | Raw % | Valid % | Cum. %
#> -------------------------------------------------
#>     1 |      low | 199 | 29.01 |   29.31 |  29.31
#>     2 | moderate | 229 | 33.38 |   33.73 |  63.03
#>     3 |     high | 251 | 36.59 |   36.97 | 100.00
#>  <NA> |     <NA> |   7 |  1.02 |    <NA> |   <NA>

Nested data frames

Let’s investigate the association between quality of life and burden across the different dependency categories, by fitting linear models for each category of e42dep. We can do this using nested data frames. nest() from the tidyr-package can create subsets of a data frame, based on grouping criteria, and create a new list-variable, where each element itself is a data frame (so it’s nested, because we have data frames inside a data frame).

In the following example, we group the data by e42dep, and “nest” the groups. Now we get a data frame with two columns: First, the grouping variable (e42dep) and second, the datasets (subsets) for each group as data frame, stored in the list-variable data. The data frames in the subsets (in data) all contain the selected variables burden, c161sex and quol_5 (quality of life).

# convert variable to labelled factor, because we then 
# have the labels as factor levels in the output
efc$e42dep <- to_label(efc$e42dep, drop.levels = TRUE)
efc %>%
  select(e42dep, burden, c161sex, quol_5) %>%
  group_by(e42dep) %>%
  tidyr::nest()
#> # A tibble: 5 × 2
#> # Groups:   e42dep [5]
#>   e42dep               data              
#>   <fct>                <list>            
#> 1 moderately dependent <tibble [306 × 3]>
#> 2 severely dependent   <tibble [304 × 3]>
#> 3 independent          <tibble [66 × 3]> 
#> 4 slightly dependent   <tibble [225 × 3]>
#> 5 <NA>                 <tibble [7 × 3]>

Get coefficients of nested models

Using map() from the purrr-package, we can iterate this list and apply any function on each data frame in the list-variable “data”. We want to apply the lm()-function to the list-variable, to run linear models for all “dependency-datasets”. The results of these linear regressions are stored in another list-variable, models (created with mutate()). To quickly access and look at the coefficients, we can use spread_coef().

efc %>%
  select(e42dep, burden, c161sex, quol_5) %>%
  group_by(e42dep) %>%
  tidyr::nest() %>% 
  na.omit() %>%       # remove nested group for NA
  arrange(e42dep) %>% # arrange by order of levels
  mutate(models = purrr::map(
    data, ~ 
    lm(quol_5 ~ burden + c161sex, data = .))
  ) %>%
  spread_coef(models)
#> # A tibble: 4 × 7
#> # Groups:   e42dep [4]
#>   e42dep               data     models `(Intercept)` burden2 burden3 c161sex
#>   <fct>                <list>   <list>         <dbl>   <dbl>   <dbl>   <dbl>
#> 1 independent          <tibble> <lm>            18.8   -3.16   -4.94  -0.709
#> 2 slightly dependent   <tibble> <lm>            19.8   -2.20   -2.48  -1.14 
#> 3 moderately dependent <tibble> <lm>            17.9   -1.82   -5.29  -0.637
#> 4 severely dependent   <tibble> <lm>            19.1   -3.66   -7.92  -0.746

We see that higher burden is associated with lower quality of life, for all dependency-groups. The se and p.val-arguments add standard errors and p-values to the output. model.term returns the statistics only for a specific term. If you specify a model.term, arguments se and p.val automatically default to TRUE.

efc %>%
  select(e42dep, burden, c161sex, quol_5) %>%
  group_by(e42dep) %>%
  tidyr::nest() %>% 
  na.omit() %>%       # remove nested group for NA
  arrange(e42dep) %>% # arrange by order of levels
  mutate(models = purrr::map(
    data, ~ 
    lm(quol_5 ~ burden + c161sex, data = .))
  ) %>%
  spread_coef(models, burden3)
#> # A tibble: 4 × 6
#> # Groups:   e42dep [4]
#>   e42dep               data               models burden3 std.error  p.value
#>   <fct>                <list>             <list>   <dbl>     <dbl>    <dbl>
#> 1 independent          <tibble [66 × 3]>  <lm>     -4.94     2.20  2.84e- 2
#> 2 slightly dependent   <tibble [225 × 3]> <lm>     -2.48     0.694 4.25e- 4
#> 3 moderately dependent <tibble [306 × 3]> <lm>     -5.29     0.669 5.22e-14
#> 4 severely dependent   <tibble [304 × 3]> <lm>     -7.92     0.875 2.10e-17
sjmisc/inst/doc/design_philosophy.html0000644000176200001440000012234014620407642017710 0ustar liggesusers The Design Philosophy of Functions in sjmisc

The Design Philosophy of Functions in sjmisc

Daniel Lüdecke

2024-05-13

Basically, this package complements the dplyr package in that sjmisc takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. The data transformation functions also support labelled data.

The design of data transformation functions

The design of data transformation functions in this package follows, where appropriate, the tidyverse-approach, with the first argument of a function always being the data (either a data frame or vector), followed by variable names that should be processed by the function. If no variables are specified as argument, the function applies to the complete data that was indicated as first function argument.

The data-argument

A major difference to dplyr-functions like select() or filter() is that the data-argument (the first argument of each function), may either be a data frame or a vector. The returned object for each function equals the type of the data-argument:

  • If the data-argument is a vector, the function returns a vector.
  • If the data-argument is a data frame, the function returns a data frame.
library(sjmisc)
data(efc)

# returns a vector
x <- rec(efc$e42dep, rec = "1,2=1; 3,4=2")
str(x)
#>  num [1:908] 2 2 2 2 2 2 2 2 2 2 ...
#>  - attr(*, "label")= chr "elder's dependency"

# returns a data frame
rec(efc, e42dep, rec = "1,2=1; 3,4=2", append = FALSE) %>% head()
#>   e42dep_r
#> 1        2
#> 2        2
#> 3        2
#> 4        2
#> 5        2
#> 6        2

This design-choice is mainly due to compatibility- and convenience-reasons. It does not affect the usual “tidyverse-workflow” or the use of pipe-chains.

The …-ellipses-argument

The selection of variables specified in the ...-ellipses-argument is powered by dplyr’s select() and tidyselect’s select_helpers(). This means, you can use existing functions like : to select a range of variables, or also use tidyselect’s select_helpers, like contains() or one_of().

# select all variables with "cop" in their names, and also
# the range from c161sex to c175empl
rec(
  efc, contains("cop"), c161sex:c175empl, 
  rec = "0,1=0; else=1", 
  append = FALSE
) %>% head()
#>   c82cop1_r c83cop2_r c84cop3_r c85cop4_r c86cop5_r c87cop6_r c88cop7_r
#> 1         1         1         1         1         0         0         1
#> 2         1         1         1         1         1         0         1
#> 3         1         1         0         1         0         0         0
#> 4         1         0         1         0         0         0         0
#> 5         1         1         0         1         1         1         0
#> 6         1         1         1         1         1         1         1
#>   c89cop8_r c90cop9_r c161sex_r c172code_r c175empl_r
#> 1         1         1         1          1          0
#> 2         1         1         1          1          0
#> 3         1         1         0          0          0
#> 4         1         1         0          1          0
#> 5         1         1         1          1          0
#> 6         0         0         0          1          0

# center all variables with "age" in name, variable c12hour
# and all variables from column 19 to 21
center(efc, c12hour, contains("age"), 19:21, append = FALSE) %>% head()
#>   c12hour_c   e17age_c  c160age_c barthtot_c  neg_c_7_c pos_v_4_c
#> 1 -26.39911   3.878788  2.5371809  10.453001  0.1502242 -0.476731
#> 2 105.60089   8.878788  0.5371809  10.453001  8.1502242 -1.476731
#> 3  27.60089   2.878788 26.5371809 -29.546999 -0.8497758  0.523269
#> 4 125.60089 -12.121212 15.5371809 -64.546999 -1.8497758  2.523269
#> 5 125.60089   4.878788 -6.4628191 -39.546999  0.1502242  2.523269
#> 6 -26.39911   5.878788  2.5371809  -4.546999  7.1502242 -3.476731

The function-types

There are two types of function designs:

coercing/converting functions

Functions like to_factor() or to_label(), which convert variables into other types or add additional information like variable or value labels as attribute, typically return the complete data frame that was given as first argument without any new variables. The variables specified in the ...-ellipses argument are converted (overwritten), all other variables remain unchanged.

x <- efc[, 3:5]

x %>% str()
#> 'data.frame':    908 obs. of  3 variables:
#>  $ e16sex: num  2 2 2 2 2 2 1 2 2 2 ...
#>   ..- attr(*, "label")= chr "elder's gender"
#>   ..- attr(*, "labels")= Named num [1:2] 1 2
#>   .. ..- attr(*, "names")= chr [1:2] "male" "female"
#>  $ e17age: num  83 88 82 67 84 85 74 87 79 83 ...
#>   ..- attr(*, "label")= chr "elder' age"
#>  $ e42dep: num  3 3 3 4 4 4 4 4 4 4 ...
#>   ..- attr(*, "label")= chr "elder's dependency"
#>   ..- attr(*, "labels")= Named num [1:4] 1 2 3 4
#>   .. ..- attr(*, "names")= chr [1:4] "independent" "slightly dependent" "moderately dependent" "severely dependent"

to_factor(x, e42dep, e16sex) %>% str()
#> 'data.frame':    908 obs. of  3 variables:
#>  $ e16sex: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 2 2 2 ...
#>   ..- attr(*, "labels")= Named num [1:2] 1 2
#>   .. ..- attr(*, "names")= chr [1:2] "male" "female"
#>   ..- attr(*, "label")= chr "elder's gender"
#>  $ e17age: num  83 88 82 67 84 85 74 87 79 83 ...
#>   ..- attr(*, "label")= chr "elder' age"
#>  $ e42dep: Factor w/ 4 levels "1","2","3","4": 3 3 3 4 4 4 4 4 4 4 ...
#>   ..- attr(*, "labels")= Named num [1:4] 1 2 3 4
#>   .. ..- attr(*, "names")= chr [1:4] "independent" "slightly dependent" "moderately dependent" "severely dependent"
#>   ..- attr(*, "label")= chr "elder's dependency"

transformation/recoding functions

Functions like rec() or dicho(), which transform or recode variables, by default add the transformed or recoded variables to the data frame, so they return the new variables and the original data as combined data frame. To return only the transformed and recoded variables specified in the ...-ellipses argument, use argument append = FALSE.

# complete data, including new columns
rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE) %>% head()
#>   c12hour e15relat e16sex e17age e42dep c82cop1 c83cop2 c84cop3 c85cop4 c86cop5
#> 1      16        2      2     83      3       3       2       2       2       1
#> 2     148        2      2     88      3       3       3       3       3       4
#> 3      70        1      2     82      3       2       2       1       4       1
#> 4     168        1      2     67      4       4       1       3       1       1
#> 5     168        2      2     84      4       3       2       1       2       2
#> 6      16        2      2     85      4       2       2       3       3       3
#>   c87cop6 c88cop7 c89cop8 c90cop9 c160age c161sex c172code c175empl barthtot
#> 1       1       2       3       3      56       2        2        1       75
#> 2       1       3       2       2      54       2        2        1       75
#> 3       1       1       4       3      80       1        1        0       35
#> 4       1       1       2       4      69       1        2        0        0
#> 5       2       1       4       4      47       2        2        0       25
#> 6       2       2       1       1      56       1        2        1       60
#>   neg_c_7 pos_v_4 quol_5 resttotn tot_sc_e n4pstu nur_pst c82cop1_r c83cop2_r
#> 1      12      12     14        0        4      0      NA         2         0
#> 2      20      11     10        4        0      0      NA         2         2
#> 3      11      13      7        0        1      2       2         0         0
#> 4      10      15     12        2        0      3       3         2         0
#> 5      12      15     19        2        1      2       2         2         0
#> 6      19       9      8        1        3      2       2         0         0

# only new columns
rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% head()
#>   c82cop1_r c83cop2_r
#> 1         2         0
#> 2         2         2
#> 3         0         0
#> 4         2         0
#> 5         2         0
#> 6         0         0

These variables usually get a suffix, so you can bind these variables as new columns to a data frame, for instance with add_columns(). The function add_columns() is useful if you want to bind/add columns within a pipe-chain to the end of a data frame.

efc %>% 
  rec(c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% 
  add_columns(efc) %>% 
  head()
#>   c12hour e15relat e16sex e17age e42dep c82cop1 c83cop2 c84cop3 c85cop4 c86cop5
#> 1      16        2      2     83      3       3       2       2       2       1
#> 2     148        2      2     88      3       3       3       3       3       4
#> 3      70        1      2     82      3       2       2       1       4       1
#> 4     168        1      2     67      4       4       1       3       1       1
#> 5     168        2      2     84      4       3       2       1       2       2
#> 6      16        2      2     85      4       2       2       3       3       3
#>   c87cop6 c88cop7 c89cop8 c90cop9 c160age c161sex c172code c175empl barthtot
#> 1       1       2       3       3      56       2        2        1       75
#> 2       1       3       2       2      54       2        2        1       75
#> 3       1       1       4       3      80       1        1        0       35
#> 4       1       1       2       4      69       1        2        0        0
#> 5       2       1       4       4      47       2        2        0       25
#> 6       2       2       1       1      56       1        2        1       60
#>   neg_c_7 pos_v_4 quol_5 resttotn tot_sc_e n4pstu nur_pst c82cop1_r c83cop2_r
#> 1      12      12     14        0        4      0      NA         2         0
#> 2      20      11     10        4        0      0      NA         2         2
#> 3      11      13      7        0        1      2       2         0         0
#> 4      10      15     12        2        0      3       3         2         0
#> 5      12      15     19        2        1      2       2         2         0
#> 6      19       9      8        1        3      2       2         0         0

If append = TRUE and suffix = "", recoded variables will replace (overwrite) existing variables.

# complete data, existing columns c82cop1 and c83cop2 are replaced
rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE, suffix = "") %>% head()
#>   c12hour e15relat e16sex e17age e42dep c82cop1 c83cop2 c84cop3 c85cop4 c86cop5
#> 1      16        2      2     83      3       2       0       2       2       1
#> 2     148        2      2     88      3       2       2       3       3       4
#> 3      70        1      2     82      3       0       0       1       4       1
#> 4     168        1      2     67      4       2       0       3       1       1
#> 5     168        2      2     84      4       2       0       1       2       2
#> 6      16        2      2     85      4       0       0       3       3       3
#>   c87cop6 c88cop7 c89cop8 c90cop9 c160age c161sex c172code c175empl barthtot
#> 1       1       2       3       3      56       2        2        1       75
#> 2       1       3       2       2      54       2        2        1       75
#> 3       1       1       4       3      80       1        1        0       35
#> 4       1       1       2       4      69       1        2        0        0
#> 5       2       1       4       4      47       2        2        0       25
#> 6       2       2       1       1      56       1        2        1       60
#>   neg_c_7 pos_v_4 quol_5 resttotn tot_sc_e n4pstu nur_pst
#> 1      12      12     14        0        4      0      NA
#> 2      20      11     10        4        0      0      NA
#> 3      11      13      7        0        1      2       2
#> 4      10      15     12        2        0      3       3
#> 5      12      15     19        2        1      2       2
#> 6      19       9      8        1        3      2       2

sjmisc and dplyr

The functions of sjmisc are designed to work together seamlessly with other packages from the tidyverse, like dplyr. For instance, you can use the functions from sjmisc both within a pipe-workflow to manipulate data frames and to create new variables with mutate():

efc %>% 
  select(c82cop1, c83cop2) %>% 
  rec(rec = "1,2=0; 3:4=2") %>% 
  head()
#>   c82cop1 c83cop2 c82cop1_r c83cop2_r
#> 1       3       2         2         0
#> 2       3       3         2         2
#> 3       2       2         0         0
#> 4       4       1         2         0
#> 5       3       2         2         0
#> 6       2       2         0         0

efc %>% 
  select(c82cop1, c83cop2) %>% 
  mutate(
    c82cop1_dicho = rec(c82cop1, rec = "1,2=0; 3:4=2"),
    c83cop2_dicho = rec(c83cop2, rec = "1,2=0; 3:4=2")
  ) %>% 
  head()
#>   c82cop1 c83cop2 c82cop1_dicho c83cop2_dicho
#> 1       3       2             2             0
#> 2       3       3             2             2
#> 3       2       2             0             0
#> 4       4       1             2             0
#> 5       3       2             2             0
#> 6       2       2             0             0

This makes it easy to adapt the sjmisc functions to your own workflow.

sjmisc/inst/doc/design_philosophy.Rmd0000644000176200001440000001253014046746443017474 0ustar liggesusers--- title: "The Design Philosophy of Functions in sjmisc" author: "Daniel Lüdecke" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{The Design Philosophy of Functions in sjmisc} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r echo = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) if (!requireNamespace("dplyr", quietly = TRUE)) { knitr::opts_chunk$set(eval = FALSE) } options(max.print = 1000) suppressPackageStartupMessages(library(sjmisc)) ``` Basically, this package complements the _dplyr_ package in that _sjmisc_ takes over data transformation tasks on variables, like recoding, dichotomizing or grouping variables, setting and replacing missing values, etc. The data transformation functions also support labelled data. # The design of data transformation functions The design of data transformation functions in this package follows, where appropriate, the _tidyverse-approach_, with the first argument of a function always being the data (either a data frame or vector), followed by variable names that should be processed by the function. If no variables are specified as argument, the function applies to the complete data that was indicated as first function argument. ## The data-argument A major difference to dplyr-functions like `select()` or `filter()` is that the data-argument (the first argument of each function), may either be a _data frame_ or a _vector_. The returned object for each function _equals the type of the data-argument_: * If the data-argument is a vector, the function returns a vector. * If the data-argument is a data frame, the function returns a data frame. 
```{r} library(sjmisc) data(efc) # returns a vector x <- rec(efc$e42dep, rec = "1,2=1; 3,4=2") str(x) # returns a data frame rec(efc, e42dep, rec = "1,2=1; 3,4=2", append = FALSE) %>% head() ``` This design-choice is mainly due to compatibility- and convenience-reasons. It does not affect the usual "tidyverse-workflow" or when using pipe-chains. ## The ...-ellipses-argument The selection of variables specified in the `...`-ellipses-argument is powered by dplyr's `select()` and tidyselect's `select_helpers()`. This means, you can use existing functions like `:` to select a range of variables, or also use tidyselect's `select_helpers`, like `contains()` or `one_of()`. ```{r echo=FALSE, message=FALSE} library(dplyr) ``` ```{r collapse=TRUE} # select all variables with "cop" in their names, and also # the range from c161sex to c175empl rec( efc, contains("cop"), c161sex:c175empl, rec = "0,1=0; else=1", append = FALSE ) %>% head() # center all variables with "age" in name, variable c12hour # and all variables from column 19 to 21 center(efc, c12hour, contains("age"), 19:21, append = FALSE) %>% head() ``` ## The function-types There are two types of function designs: ### coercing/converting functions Functions like `to_factor()` or `to_label()`, which convert variables into other types or add additional information like variable or value labels as attribute, typically _return the complete data frame_ that was given as first argument _without any new variables_. The variables specified in the `...`-ellipses argument are converted (overwritten), all other variables remain unchanged. ```{r} x <- efc[, 3:5] x %>% str() to_factor(x, e42dep, e16sex) %>% str() ``` ### transformation/recoding functions Functions like `rec()` or `dicho()`, which transform or recode variables, by default add _the transformed or recoded variables_ to the data frame, so they return the new variables _and_ the original data as combined data frame. 
To return _only the transformed and recoded variables_ specified in the `...`-ellipses argument, use argument `append = FALSE`. ```{r} # complete data, including new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE) %>% head() # only new columns rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% head() ``` These variables usually get a suffix, so you can bind these variables as new columns to a data frame, for instance with `add_columns()`. The function `add_columns()` is useful if you want to bind/add columns within a pipe-chain _to the end_ of a data frame. ```{r} efc %>% rec(c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = FALSE) %>% add_columns(efc) %>% head() ``` If `append = TRUE` and `suffix = ""`, recoded variables will replace (overwrite) existing variables. ```{r} # complete data, existing columns c82cop1 and c83cop2 are replaced rec(efc, c82cop1, c83cop2, rec = "1,2=0; 3:4=2", append = TRUE, suffix = "") %>% head() ``` ## sjmisc and dplyr The functions of **sjmisc** are designed to work together seamlessly with other packages from the tidyverse, like **dplyr**. For instance, you can use the functions from **sjmisc** both within a pipe-worklflow to manipulate data frames, or to create new variables with `mutate()`: ```{r} efc %>% select(c82cop1, c83cop2) %>% rec(rec = "1,2=0; 3:4=2") %>% head() efc %>% select(c82cop1, c83cop2) %>% mutate( c82cop1_dicho = rec(c82cop1, rec = "1,2=0; 3:4=2"), c83cop2_dicho = rec(c83cop2, rec = "1,2=0; 3:4=2") ) %>% head() ``` This makes it easy to adapt the **sjmisc** functions to your own workflow. sjmisc/inst/CITATION0000644000176200001440000000044013451124270013651 0ustar liggesusersbibentry( bibtype = "article", title = "sjmisc: Data and Variable Transformation Functions.", volume = "3", doi = "10.21105/joss.00754", number = "26", journal = "Journal of Open Source Software", author = person("Daniel", "Lüdecke"), year = "2018", pages = "754" )