themis/0000755000176200001440000000000014061222402011534 5ustar liggesusersthemis/NAMESPACE0000644000176200001440000000447314061171747013002 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(bake,step_adasyn) S3method(bake,step_bsmote) S3method(bake,step_downsample) S3method(bake,step_nearmiss) S3method(bake,step_rose) S3method(bake,step_smote) S3method(bake,step_tomek) S3method(bake,step_upsample) S3method(prep,step_adasyn) S3method(prep,step_bsmote) S3method(prep,step_downsample) S3method(prep,step_nearmiss) S3method(prep,step_rose) S3method(prep,step_smote) S3method(prep,step_tomek) S3method(prep,step_upsample) S3method(print,step_adasyn) S3method(print,step_bsmote) S3method(print,step_nearmiss) S3method(print,step_rose) S3method(print,step_smote) S3method(print,step_tomek) S3method(required_pkgs,step_adasyn) S3method(required_pkgs,step_bsmote) S3method(required_pkgs,step_downsample) S3method(required_pkgs,step_nearmiss) S3method(required_pkgs,step_rose) S3method(required_pkgs,step_smote) S3method(required_pkgs,step_tomek) S3method(required_pkgs,step_upsample) S3method(tidy,step_adasyn) S3method(tidy,step_bsmote) S3method(tidy,step_downsample) S3method(tidy,step_nearmiss) S3method(tidy,step_rose) S3method(tidy,step_smote) S3method(tidy,step_tomek) S3method(tidy,step_upsample) S3method(tunable,step_adasyn) S3method(tunable,step_bsmote) S3method(tunable,step_downsample) S3method(tunable,step_nearmiss) S3method(tunable,step_rose) S3method(tunable,step_smote) S3method(tunable,step_upsample) export(adasyn) export(bsmote) export(required_pkgs) export(smote) export(step_adasyn) export(step_bsmote) export(step_downsample) export(step_nearmiss) export(step_rose) export(step_smote) export(step_tomek) export(step_upsample) export(tidy) export(tunable) importFrom(ROSE,ROSE) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(generics,required_pkgs) importFrom(generics,tidy) importFrom(generics,tunable) importFrom(purrr,map_dfr) importFrom(purrr,map_lgl) importFrom(recipes,add_step) importFrom(recipes,bake) importFrom(recipes,check_type) importFrom(recipes,ellipse_check) importFrom(recipes,is_trained) importFrom(recipes,prep) importFrom(recipes,printer) importFrom(recipes,rand_id) importFrom(recipes,sel2char) importFrom(recipes,step) importFrom(recipes,terms_select) importFrom(rlang,":=") importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(unbalanced,ubTomek) importFrom(withr,with_seed) themis/LICENSE0000644000176200001440000000005413603701230012542 0ustar liggesusersYEAR: 2020 COPYRIGHT HOLDER: Emil Hvitfeldt themis/README.md0000644000176200001440000001527214061033307013026 0ustar liggesusers # themis [![R build status](https://github.com/tidymodels/themis/workflows/R-CMD-check/badge.svg)](https://github.com/tidymodels/themis/actions) [![Codecov test coverage](https://codecov.io/gh/tidymodels/themis/branch/master/graph/badge.svg)](https://codecov.io/gh/tidymodels/themis?branch=master) [![CRAN status](https://www.r-pkg.org/badges/version/themis)](https://CRAN.R-project.org/package=themis) [![Downloads](http://cranlogs.r-pkg.org/badges/themis)](https://CRAN.R-project.org/package=themis) [![Lifecycle: maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) [![R-CMD-check](https://github.com/tidymodels/themis/workflows/R-CMD-check/badge.svg)](https://github.com/tidymodels/themis/actions) **themis** contains extra steps for the [`recipes`](https://CRAN.R-project.org/package=recipes) package for dealing with unbalanced data. The name **themis** is that of the [ancient Greek god](https://thishollowearth.wordpress.com/2012/07/02/god-of-the-week-themis/) who is typically depicted with a balance. ![](https://thishollowearth.files.wordpress.com/2012/07/themis.jpg) ## Installation You can install the released version of themis from [CRAN](https://CRAN.R-project.org) with: ``` r install.packages("themis") ``` Install the development version from GitHub with: ``` r require("devtools") install_github("tidymodels/themis") ``` ## Example Following is a example of using the [SMOTE](https://jair.org/index.php/jair/article/view/10302/24590) algorithm to deal with unbalanced data ``` r library(recipes) library(modeldata) library(themis) data(okc) sort(table(okc$Class, useNA = "always")) #> #> stem other #> 0 9539 50316 ds_rec <- recipe(Class ~ age + height, data = okc) %>% step_meanimpute(all_predictors()) %>% step_smote(Class) %>% prep() #> Warning: `step_meanimpute()` was deprecated in recipes 0.1.16. #> Please use `step_impute_mean()` instead. sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) #> #> stem other #> 0 50316 50316 ``` ## Methods Below is some unbalanced data. Used for examples latter. ``` r example_data <- data.frame(class = letters[rep(1:5, 1:5 * 10)], x = rnorm(150)) library(ggplot2) example_data %>% ggplot(aes(class)) + geom_bar() ``` ### Upsample / Over-sampling The following methods all share the tuning parameter `over_ratio`, which is the ratio of the majority-to-minority frequencies. | name | function | Multi-class | |-----------------------------------------------------------------|---------------------------|----------------------| | Random minority over-sampling with replacement | `step_upsample()` | :heavy\_check\_mark: | | Synthetic Minority Over-sampling Technique | `step_smote()` | :heavy\_check\_mark: | | Borderline SMOTE-1 | `step_bsmote(method = 1)` | :heavy\_check\_mark: | | Borderline SMOTE-2 | `step_bsmote(method = 2)` | :heavy\_check\_mark: | | Adaptive synthetic sampling approach for imbalanced learning | `step_adasyn()` | :heavy\_check\_mark: | | Generation of synthetic data by Randomly Over Sampling Examples | `step_rose()` | | By setting `over_ratio = 1` you bring the number of samples of all minority classes equal to 100% of the majority class. ``` r recipe(~., example_data) %>% step_upsample(class, over_ratio = 1) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` and by setting `over_ratio = 0.5` we upsample any minority class with less samples then 50% of the majority up to have 50% of the majority. ``` r recipe(~., example_data) %>% step_upsample(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` ### Downsample / Under-sampling Most of the the following methods all share the tuning parameter `under_ratio`, which is the ratio of the minority-to-majority frequencies. | name | function | Multi-class | under\_ratio | |-------------------------------------------------|---------------------|----------------------|----------------------| | Random majority under-sampling with replacement | `step_downsample()` | :heavy\_check\_mark: | :heavy\_check\_mark: | | NearMiss-1 | `step_nearmiss()` | :heavy\_check\_mark: | :heavy\_check\_mark: | | Extraction of majority-minority Tomek links | `step_tomek()` | | | By setting `under_ratio = 1` you bring the number of samples of all majority classes equal to 100% of the minority class. ``` r recipe(~., example_data) %>% step_downsample(class, under_ratio = 1) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` and by setting `under_ratio = 2` we downsample any majority class with more then 200% samples of the minority class down to have to 200% samples of the minority. ``` r recipe(~., example_data) %>% step_downsample(class, under_ratio = 2) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` ## Contributing This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. - For questions and discussions about tidymodels packages, modeling, and machine learning, [join us on RStudio Community](https://community.rstudio.com/new-topic?category_id=15&tags=tidymodels,question). - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/themis/issues). - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). themis/data/0000755000176200001440000000000013603701230012447 5ustar liggesusersthemis/data/circle_example.rda0000644000176200001440000001146013603701230016115 0ustar liggesusersBZh91AY&SYA  &^ڝ{wm02Bzj{)Ɋm6ODFSja3T$))=&ѓh4##&S=zڙO&O4h4M1S4dɦE< S66(dS2z&hS4E<=OGeOQ4 '4jfTQ=F4d2қQP<4z4ɩěHiLi(H|("KNT A=96xѢZ\C 6\7f"( LнdBcfB"QhL-hv!B S7o!k"ڄڂpYExO r0^=1i&n("Br3XDd0fBŒ 1 -nd@[F%%EV ^";0 Br LbѤ˷9!Bɦ],QˈiFs.o lDG@B:3v.D9pMۑ%,Uq;M$WO's U {$MgqIsDА:j; d! Ά l#|bMƋJ7G3 TkF!6 d˜%EC<9Ur$BjMS GBƑ|@u8HrkPh*oemՅJQ#x 8 ۊCd3'0bD e)Ӗ2",m_2i#J2rXM#QZeew__M9J !v`q C{wʡ-=E?tdMjZt֚]_E%(k$?ӳiNHxxg?J+Ĵqu\lmgp% <.Ms!aapvjz(~fEg.)X<ͮ^Z?9Qqms ?谿+6K>{c5)O f]Lg8랩_ku,k:|]0E31yAfu^kTU4x\ܞw)| %i.=v%mE y3Ao|FJE;Uf=Tw|df w ytMw^>^D>a ;7 `» 5-]k"5.H. |A|3>ɇ| e$W6m}qƛ.%`R6е0ıl^b`f%Kŧd02^9˾CkV`oES@e/10c |$.^OƯѭּc2# _J&?J. JlAr+l5XH^CQ_q&57?ޟb n՝[E2-U3 H0=(p~Vr,ơNe95i:T󣳚w_ݟ&sg]%FnBsXF3+G2Zܳã)1s;cuf¥Ҫ-eT鎾\` 88m>+XdqO"י9`bH" 9ب: qMQʖb5+Ɏ/{ Y.9즎id|ZI5ϢȏWRƚNoUQ5YJSgr<^bG3F S7Nja3F*#/b7#|JD(.~G1T@Ist86ПvJrx+ph[mU@{j*@=? Wb ^'ndx>z80_ӵuʣ[MF7v1mf/aPE =]ڹ9^z{c0'5?Rڐ=xwZbI0x+u!D-7M-m%*>*OCN^R:{wѲ"eo7#2Ir,p-+OomVu|a/aƇ+{.Ӳ~ɴ >D[=BDgmRAMu4LHHC_ (᜶du<]IBd0ِ4p_῝fqָG'sF; 1K w?˖ +$z%68dqf"L䊲N!sn$#CWηH\&l4' VvqhBF^j\&=APw-b:~Wr"a~ո)SyS!ޯym0ђ^`b=^;#3w)5ЙqC{sjgDKz=r(h!xG JY {5wI׈kv*vٵdU{.;4D3RXбjqWyNsb<gzq.ߌwW.atl]MrRso}I2uI&Mm3;>B&?s) ͚Ԝ6$@lJ0'tō&g+4]E"e'@ cM!𸉦շS@]clzW&2虒%6ԛO@B; ᛨU֤t#I #bח\1U:gWx82C,uQnOx ):ceM#JiD;QRKk~6P”^MJe^p2ݣjЧ\I:i$1#'u4vS(mpʯ24.]utuSy_ S>*b ,-[7{X2O(g7{p9 JYUEYx}ž 8)Laݙ44o#xKQ<ۖk/#L0OA_9:YgʂR\ǯ~U'>W5;%N5k}tY7du\zEgR7q[a|F4`IW&lfj|zp;ԥ!qF,8t-b,#>cхwC㾚/,ze/ukȅ^Pf;X$(YWo:{룒U ko~Sk){XR1NleFGc/ h.άO@@8D$]LlMO]?Oȸ1%MjM~:@.O˲~j: @P)W'|ZCW%C\DF :tz.l#DJ: D#u@i~BO/c6cD@x:zH=jFoCxC]") ͭm?5%BHqi++R.ܽ^@"(H yŀthemis/man/0000755000176200001440000000000014061032365012316 5ustar liggesusersthemis/man/step_adasyn.Rd0000644000176200001440000000767014061010144015120 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_adasyn.R \name{step_adasyn} \alias{step_adasyn} \alias{tidy.step_adasyn} \title{Adaptive Synthetic Sampling Approach} \usage{ step_adasyn( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("adasyn") ) \method{tidy}{step_adasyn}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_adasyn} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_adasyn} creates a \emph{specification} of a recipe step that generates synthetic positive instances using ADASYN algorithm. } \details{ All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(okc) sort(table(okc$Class, useNA = "always")) ds_rec <- recipe(Class ~ age + height, data = okc) \%>\% step_meanimpute(all_predictors()) \%>\% step_adasyn(Class) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = okc) table(baked_okc$Class, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without ADASYN") recipe(class ~ ., data = circle_example) \%>\% step_adasyn(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With ADASYN") } \references{ He, H., Bai, Y., Garcia, E. and Li, S. 2008. ADASYN: Adaptive synthetic sampling approach for imbalanced learning. Proceedings of IJCNN 2008. (IEEE World Congress on Computational Intelligence). IEEE International Joint Conference. pp.1322-1328. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/adasyn.Rd0000644000176200001440000000264014061030663014065 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/adasyn_impl.R \name{adasyn} \alias{adasyn} \title{Adaptive Synthetic Sampling Approach algorithm} \usage{ adasyn(df, var, k = 5, over_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ Generates synthetic positive instances using ADASYN algorithm. } \details{ All columns used in this function must be numeric with no missing data. } \examples{ adasyn(circle_example, var = "class") adasyn(circle_example, var = "class", k = 10) adasyn(circle_example, var = "class", over_ratio = 0.8) } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } themis/man/step_smote.Rd0000644000176200001440000001112414061010144014755 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_smote.R \name{step_smote} \alias{step_smote} \alias{tidy.step_smote} \title{Apply SMOTE algorithm} \usage{ step_smote( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smote") ) \method{tidy}{step_smote}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when smote-ing.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_smote} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_smote} creates a \emph{specification} of a recipe step that generate new examples of the minority class using nearest neighbors of these cases. } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(credit_data) sort(table(credit_data$Status, useNA = "always")) ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) \%>\% step_meanimpute(all_predictors()) \%>\% step_smote(Status) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Status, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = credit_data) table(baked_okc$Status, useNA = "always") ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) \%>\% step_meanimpute(all_predictors()) \%>\% step_smote(Status, over_ratio = 0.2) \%>\% prep() table(bake(ds_rec2, new_data = NULL)$Status, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without SMOTE") recipe(class ~ ., data = circle_example) \%>\% step_smote(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With SMOTE") } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/required_pkgs.step.Rd0000644000176200001440000000220714061010144016413 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_adasyn.R, R/step_bsmote.R, % R/step_downsample.R, R/step_nearmiss.R, R/step_rose.R, R/step_smote.R, % R/step_tomek.R, R/step_upsample.R \name{required_pkgs.step_adasyn} \alias{required_pkgs.step_adasyn} \alias{required_pkgs.step_bsmote} \alias{required_pkgs.step_downsample} \alias{required_pkgs.step_nearmiss} \alias{required_pkgs.step_rose} \alias{required_pkgs.step_smote} \alias{required_pkgs.step_tomek} \alias{required_pkgs.step_upsample} \title{S3 methods for tracking which additional packages are needed for steps.} \usage{ \method{required_pkgs}{step_adasyn}(x, ...) \method{required_pkgs}{step_bsmote}(x, ...) \method{required_pkgs}{step_downsample}(x, ...) \method{required_pkgs}{step_nearmiss}(x, ...) \method{required_pkgs}{step_rose}(x, ...) \method{required_pkgs}{step_smote}(x, ...) \method{required_pkgs}{step_tomek}(x, ...) \method{required_pkgs}{step_upsample}(x, ...) } \arguments{ \item{x}{A recipe step} } \value{ A character vector } \description{ S3 methods for tracking which additional packages are needed for steps. } \keyword{internal} themis/man/smote.Rd0000644000176200001440000000353314061032365013740 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/smote_impl.R \name{smote} \alias{smote} \title{SMOTE algorithm} \usage{ smote(df, var, k = 5, over_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ SMOTE generates new examples of the minority class using nearest neighbors of these cases. } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns used in this function must be numeric with no missing data. } \examples{ smote(circle_example, var = "class") smote(circle_example, var = "class", k = 10) smote(circle_example, var = "class", over_ratio = 0.8) } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } themis/man/bsmote.Rd0000644000176200001440000000543414061032365014104 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bsmote_impl.R \name{bsmote} \alias{bsmote} \title{borderline-SMOTE algorithm} \usage{ bsmote(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{all_neighbors}{Type of two borderline-SMOTE method. Defaults to FALSE. See details.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ BSMOTE generates generate new examples of the minority class using nearest neighbors of these cases in the border region between classes. } \details{ This methods works the same way as \code{\link[=smote]{smote()}}, expect that instead of generating points around every point of of the minority class each point is first being classified into the boxes "danger" and "not". For each point the k nearest neighbors is calculated. If all the neighbors comes from a different class it is labeled noise and put in to the "not" box. If more then half of the neighbors comes from a different class it is labeled "danger. If \code{all_neighbors = FALSE} then points will be generated between nearest neighbors in its own class. If \code{all_neighbors = TRUE} then points will be generated between any nearest neighbors. See examples for visualization. The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns used in this step must be numeric with no missing data. } \examples{ bsmote(circle_example, var = "class") bsmote(circle_example, var = "class", k = 10) bsmote(circle_example, var = "class", over_ratio = 0.8) bsmote(circle_example, var = "class", all_neighbors = TRUE) } \references{ Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: a new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing, pages 878–887. Springer, 2005. } themis/man/step_bsmote.Rd0000644000176200001440000001341614061010144015125 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_bsmote.R \name{step_bsmote} \alias{step_bsmote} \alias{tidy.step_bsmote} \title{Apply borderline-SMOTE algorithm} \usage{ step_bsmote( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, all_neighbors = FALSE, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("bsmote") ) \method{tidy}{step_bsmote}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{all_neighbors}{Type of two borderline-SMOTE method. Defaults to FALSE. See details.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when smote-ing.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_bsmote} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_bsmote} creates a \emph{specification} of a recipe step that generate new examples of the minority class using nearest neighbors of these cases in the border region between classes. } \details{ This methods works the same way as \code{\link[=step_smote]{step_smote()}}, expect that instead of generating points around every point of of the minority class each point is first being classified into the boxes "danger" and "not". For each point the k nearest neighbors is calculated. If all the neighbors comes from a different class it is labeled noise and put in to the "not" box. If more then half of the neighbors comes from a different class it is labeled "danger. If all_neighbors = FALSE then points will be generated between nearest neighbors in its own class. If all_neighbors = TRUE then points will be generated between any nearest neighbors. See examples for visualization. The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(credit_data) sort(table(credit_data$Status, useNA = "always")) ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) \%>\% step_meanimpute(all_predictors()) \%>\% step_bsmote(Status) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Status, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = credit_data) table(baked_okc$Status, useNA = "always") ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) \%>\% step_meanimpute(all_predictors()) \%>\% step_bsmote(Status, over_ratio = 0.2) \%>\% prep() table(bake(ds_rec2, new_data = NULL)$Status, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without SMOTE") recipe(class ~ ., data = circle_example) \%>\% step_bsmote(class, all_neighbors = FALSE) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With borderline-SMOTE, all_neighbors = FALSE") recipe(class ~ ., data = circle_example) \%>\% step_bsmote(class, all_neighbors = TRUE) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With borderline-SMOTE, all_neighbors = TRUE") } \references{ Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: a new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing, pages 878–887. Springer, 2005. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/step_tomek.Rd0000644000176200001440000000707314061010144014755 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_tomek.R \name{step_tomek} \alias{step_tomek} \alias{tidy.step_tomek} \title{Under-sampling by removing Tomek’s links.} \usage{ step_tomek( recipe, ..., role = NA, trained = FALSE, column = NULL, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("tomek") ) \method{tidy}{step_tomek}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_tomek} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_tomek} creates a \emph{specification} of a recipe step that removes majority class instances of tomek links. Using \code{\link[unbalanced:ubTomek]{unbalanced::ubTomek()}}. } \details{ The factor variable used to balance around must only have 2 levels. All other variables must be numerics with no missing data. A tomek link is defined as a pair of points from different classes and are each others nearest neighbors. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(okc) sort(table(okc$Class, useNA = "always")) ds_rec <- recipe(Class ~ age + height, data = okc) \%>\% step_meanimpute(all_predictors()) \%>\% step_tomek(Class) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = okc) table(baked_okc$Class, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without Tomek") + xlim(c(1, 15)) + ylim(c(1, 15)) recipe(class ~ ., data = circle_example) \%>\% step_tomek(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With Tomek") + xlim(c(1, 15)) + ylim(c(1, 15)) } \references{ Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., 6:769-772, 1976. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/themis-package.Rd0000644000176200001440000000227513714626477015516 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/themis-package.R \docType{package} \name{themis-package} \alias{themis} \alias{themis-package} \title{themis: Extra Recipes Steps for Dealing with Unbalanced Data} \description{ A dataset with an uneven number of cases in each class is said to be unbalanced. Many models produce a subpar performance on unbalanced datasets. A dataset can be balanced by increasing the number of minority cases using SMOTE 2011 , BorderlineSMOTE 2005 and ADASYN 2008 . Or by decreasing the number of majority cases using NearMiss 2003 or Tomek link removal 1976 . } \seealso{ Useful links: \itemize{ \item \url{https://github.com/tidymodels/themis} \item \url{https://themis.tidymodels.org} \item Report bugs at \url{https://github.com/tidymodels/themis/issues} } } \author{ \strong{Maintainer}: Emil Hvitfeldt \email{emilhhvitfeldt@gmail.com} (\href{https://orcid.org/0000-0002-0679-1945}{ORCID}) } \keyword{internal} themis/man/figures/0000755000176200001440000000000014061033307013757 5ustar liggesusersthemis/man/figures/README-unnamed-chunk-3-1.png0000644000176200001440000003650514061033307020464 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_8IDATx ]U}?_7  E( F $gac}w=~w  @nMa @d @U@VnF  @[n` @ @U``>Zv(u1dȐhooϾ7=ׯ_lݺoCpAŖ-[»╯~/3{Sb#Fu4BeG=3^}ӛ?b{4}'abÆ މOoߕ~/ڦ?0,{zz 1?g @) 9 @@chc~&@S@Ls @M PZ' @ 9 @NN0  @@s6 @@h` @4& 6l @::4'@hL@m @u uiN ИN 6ĢE4>c*ۋ/g}6&N{we @ tK}ꫯ:{x'Ck&oPig @\@~8SOtX`Azѿ;wn̙3'..l @Ghy7gW::ׯ_,Y$&LD?~[n%nʾqW!L C ycR 0 qJ)e6-GF9R}C-DŽ̢"rb+[nΝ@׾XbE5s,1rXzue;Y09hРxIuO|"c-$pm5oiPۦxj%hlwe_n!sJG>G?=\:::*ioo?>WrXn]}@@]d-o}Vm+P;x\ SZ^LKj 1vXfMei}ܸqm+ @hzmkko|#dzO=T44iRL˗/ts޼y1y) @@EO?u]}?8p`v̙3cƌ1f̘?~|L628+ @hzMdGyd}]6 LǦNSL-[Ĉ#.  @@%vm}{z>}Y @/{@Oh @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zӸ80  C1n>/  @@t: @@}h}^Z @4( 6t @&@hP@m @ yiM Р  @@Қ @AA@ @' 5 @@hN'@O@Kk @N PZ @  : @>>/  @@t: @@}h}^Z @4( 6t @&@hP@m @ yiM Р@~.^xq{アjժ.m @O[>_c͚5ٳgǬYb…1}XlY @ )7pC瞕\ti,X n߿̝;7̙\pA @r tKݶm[\veWWqW,Y&Lg9qĘ?~xZyGk*V/0bĈOP}[|U {fE엷tKMW8ַQGe,+VQFU92V^]N+O=Tvs'Æ w孯ڪmy="~v7oޜ3ַ@ 1tvZ9ӳΝ+W^zs> .j(̊|PM>l|#}C /Č3ǎIIƍ` @Mf\N=׾ǏuŕW^˗/ςybɝM}'@(@hYs̙Y@3fLJMwc @\ۿu!:ujL2%lEK6 @ZN"}Y @/!  @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а{  Gp=)0z|xdm2pv=V&pEԷ=w.nܸ1lR3v ˽n\nZmoXUڭzc";xܩ{ >A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @-Kqw3</^{oZjcv @K[8瞋1ٳc֬Yp˜>}z,[r  @@6{J۶mˮn^tEqƻ7;K. ĭsƜ9s .hO CM*k?pDӎ%KĄ 'Nhڗj1hРM@[[[eߝj(̊M/oiz|ui6m믿>۽bŊ5jTg9rd^Vn+*n8+V/谔W@}ն坙6ͬnܸ1h wuWşǘ1ci.6 @rj ֭^z).g}rs @@@M:r8蠃?i~o @x>.8o~oP @x#EFtڵqgҋv  @`5_|1m۶ns6 @9< @j+VСCѣ @/Ps=ò{@wFНG 3C=>֯_?x\s5VX!@ #Ps=#v}olܸ1կ_q; @T @O7n\qM>lذ3Q`ԷpҖPm[Dm 'mo{{{j>hlYZK/N8!A @@5g/@JAzyW-R5m @] @>?c=͛7ꪫO8@ @`{h:k^4yw}c̘18:8s: @] *C9$~~XxqC=4>s @T |4ҟٟŪU+gqFL6-fϞjU @9}ٱt8ΆW\qEv]wݵp @5=uָ{'/;?2ӟt_>tk @Nj^|uŚ5kw&@ Sh[[[|??$Ξқӧ{@qZ @"PSM}ߎޟp@5*=&M4} @@-5:J`-~G|goF-  @LZyae_ @@ON!@ P- V&@hT^ @T "  @* 6W @hm @ M9 @@Z-b @hSyuN P- V&@hT^ @T "  @* 6W @hm @ M9 @@Z-b @@_~bŊ;LhqƪUv8f @@%yqYg3<]tQ\qٳgǬYb…1}XlY @ l:::⦛nBAvZ|򓟌O?=֮] ,[o5s΍9s\a @@ wb7o6lL,Y&Lg:8qĘ?~o1ʾ~!T_`})$ շW[-@gVn{yKhzǕW^'tRFoȑzvZy;* 6ml[)z[ږW3+g7e[h-[+_Jl۶-.lLhڹСC;7G}teʕ~Mz[ږW3+gwH"7__{_|q 4(رcc͚5qU @O[藾??UeҤIhѢX|y͋ɓ'w @M Gy$[*W_}u3gƌ3b̘11~6mZ @ 4=va[-nԩ1eʔ#F쪙 @%hzũ-җ @rh͐ @VV) @ @ a  @@hR @" ¨ @ZZ#@(D@-Q' @ JiG PZN @j@kҎ @F @* * @@!h!:!@U@UJ; @BBuB PZv @0 @VV) @ @ a  @@hR @" ¨ @ZZ#@(D@-Q' @ JiG PZN @j@kҎ @F @* * @@!h!:!@U@UJ; @BBuB PZJH_#0tо3>8S-oVm+Pۯ_\^@mۖ;!'3R5ʵQ5ʷu=җl޼LT}[tU {fE<8=< @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @-ЭtÆ 0ŋǽV @ywygٳgǬYb…1}XlY6 @%-tɒ%qg-]4,X]w]|O1gΜ.ml @K``wLgƍqի+ 'NW|0xʾO?=gʶ 93T_mնY?z.R#8"ď~.YbE5/M8G6mZ 4m]oy뫶j[^rϬݭ["uK 1tvZԧ>}u\r+ub^VBoy뫶j[^rϬw -ǎk֬Nƍl[!@(@I&ŢEb呮~Λ7/&O\>e3"@Sϙ3gƌ3b̘11~HxZ @+ЭN2uԘ2eJlٲ%F! @%+H_ @ =5C @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @Vh~E.wշU[-@g?.2$wwȑ#U*-U9LFmpjCmKU&SD};::vw.nڴ)۷ ]3S_mնY?Er)1A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @Z".^8XjU @@ x={v̚5+.\ӧOe˖ @)0kҥ`[1wܘ3gN\pv= @@/dɒ0aB>ĉc]/_>leAC l[)A?>/]|饗*V{#͊z{_~ 8kHWsxf-.rÆ e cǎ'x25kĸq*i%]!*ʕ+#Um%5M 5-Joܛ{Eoxն7Ʊ.-OYI&ŢE"Byb  @VȏMus̙1cƌ3fL?>{ʽ{ @=uԘ2eJlٲ%Fу @;zFpn ѣGZXm! @T/U9M hE◿e0u_ 쮻B_J3(P@-SW'?InH5kڍ֯QM#ܴiSWUV^%/?߻m5 qYl٪Iw5ںukV[oջVhSM_|Dwvvn앖yۦa=}7`k+)C Ek{O<C!;묳cMS0].R?7pC˿KD\ϫZo&?#nx駳k[nT{8~T=#9&۸qcy;,8~Kf=;NѲY\ztԏ~H j]@ݪ%[n޼9OG}tL6->OX j㉊-[{9ϧn3<xZR-{=={nI_ߣ3ZM`w2=أn67eV-&rXvm_ec .Fi[P ?q+^hCe6НS[D <و~e E 084hP<#Y/Y r|Ȟz;~x|SOcr8 W@ַ5?*#'_[D ,Kw Hb?4,Xn]9|3xW{3, @pn9 @`ss @n  4 @@wY @) & P@.(/_^Ksm ' O$ )KHƞ%@\m @> 6=G`ժUٕ|zjy;΋1c8v @\mB }"'7tStIBfgyfO2%ΝsxHYçM&Me]s7g[ԧ"#w ^ h/(! /b򗿌AeMW('?Yիc}o}[qaeñr3g?C#[OuQq9D"wޙ^DzI ?)9s0ȝ?yŋǃ>)tx`v󪫮c=6 G#=te4]i_ԩS+[<G@oHW?CݼysiaÆiٳggW5ӕD886Ce}}ё @7 ZJ@K |o|#.ww="^$/~G%3]1M˳>^xaСC+M{CSC _OǶm^xޖ?-hoq вy{"B=]kSN:+{|Ap ''}ƍ^D/~1{tE3ꫯ_W} nɓCc- ,M^ԛe@*g};w6 O͆ +cǎpz>=A@[@G(VQ!@2< @@вU| @-. x  P6l5 @@ -^ #@M@-[E͇ h @efZ~'v IENDB`themis/man/figures/README-unnamed-chunk-6-1.png0000644000176200001440000003524714061033307020471 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_6IDATx u}_M6ټ"ixE AVc)P9Q+6`pliJ9ъ"=ڨJS^ƼCym?l6{ν{gܙ?w7ߝ;wȮD @AC!@  @* 6 @P @ @mg @ @ t4tolcǎZj&F===W݁}DÇ!CĶml6*^Nc0˵Kٚ @JJ0  @j@k5 @@h` @& gk @**T'@M@ @U UN PZ  @@S @66?[ @T) V : @@mhm~&@R@Lu @@׮]=P{xٲeo;VZW @%Аo~3.xⳟl,ZWq…q?]tQ,_  @@ tvrK28?GϏc=wqG :4nXxq\~  @Itذa}-F-[bƍ?3gg*<؃o>ǑGٻlf!C+F{H?Gi|:蠶8[ k&N{H5jT?mۖ[4>w7pCuY~vpܸqz4sgrĈ*=^;::`}8}C{,[hn;SߪT֕wv|p|ySCh֭[b׮]qWf}J=3gO=H_iʕaÆҢ2vrNgMZEsT4՛ƶUFj`,b|;;;swސ7!uwwǟرccbiʔ)f͚S.!@h?|3q1e]gi5kV;<1c쫚 @m"PZ#} @_!?#$@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @JJ#@(D@-Q# @ JG PZF @*@+R @F @T* V* @@!h!!@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @JJ#@(D@-Q# @ JG PZF @*@+R @F @T* V* @@!h!!@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @J:*,:;;cԨQҝǐ!CbĈ1vئeƶ/6Oˊߝ;wB\ݾ}{r '6oGMk1-mضۈy}z @hXݲeK\uU1t=M7+V &d3g@{ @hH}+g_=xg뮋iӦ6O Ц ;>cZf͚Xre|߈W^yer  @'А3|&#!s֭[cҥ1jԨ'?^xa|w7Ǎ7ػ1ӻlf!Cb4Ի 5ƶ釨1˼!t_1cF}1qĬG)pзm1w&bӦMfv 3&o۶m۽&TCm xMm{E]r{1tݺuaÆMoHڹsgfO<1*E\Mo[ L64Z{flklmM=+b|;;;sҐ{@Ճc͑zjov @=ꨣsύyEOOO7.,XкzN _N;-Wty睗]6ND z DN/|4 @[)h{;: @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]@; @   @. ֝ @r  @u@Nl @hy @ u' @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]@; @   @. ֝ @r  @u@Nl @hy @ u' @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]{(xFCX 1lذ#ДǏo~Tƶvfm6ӯ"wǎiyAq92n6mjth&aÆv;$ƶ}mm:"wĈH-@ڵ+B>w@kjzkljƶƫ6b|]ˮvT'@I@ @ ՊO PZ  @@S @&&> @T+ V+> @@MhM|6&@V@VL} @КlL PZ @5 5٘ @ZZ1  @j@k1 @@hb @$ gc @jj'@I@ @ ՊO PZ  @@S @&&> @T+ V+> @@MhM|6&@V@VL} @КlL PZ @5 5٘ @ZZ1  @j8qqƽvկ~5̙z+ @'Һ|'[\dI\{1rRqر#瞘>}z:3 @rG\rIlݺ5o<@ 6ÇW\qE:3 @rɓ{^9zk3&=e @rrh{ov&tkyY SN-_e @@;3Ν֭۫q  @}*[G&Mڣ@ #PQݰaC뮻.74 @@@E7n\q(5 @#PW_ϏW^y% 7MgE;J1 @)r?'>jz^$V @CbŊصkW͔? V @hgg'4 @5 T@_}5jTL8qٸqc<q)QwٲeK/ 'te @K:cƌ{@lW]uU :tpxcq7ƴiۍu @m Pq]dIرxO~|м矏+"Ə}cwܑn-/^_~yW @68{}s{V:W^ye^:_:R09sf>t -O3۶m˾J+wC )-zGO?(V5צ;flk&lmM!kV@uDSN{g_Rx}G+MgEKSz} WXhQ請3G{5;vl/Vx*]*m`-Tض` Eo:7U@?#zzzzJ?kN;w}53MS M駟)䖦X׭[WZG?ok [3L+Ps83c֭1f̘ @@k L'].Ofꫯf2zWog0 @PJ({1w7K\rIoFH{>w @@I3陟۷o>_ڰaÆXfMi+ @\hO?=.쓐J-ӧ{@q @T"PQM }_iӦ#:k֬ӐL @*ФIӟ4=o}[W~W*ُ: @2hB3/~ @"P%4n @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}:h宮6lXwS 8qbioHq1Y6[KƶF1===j;#Vrk׮m>pƷrVil[m*ﯱܪk1|.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Zmokn6y>}z @zM7Ŋ+b„ ̙3s4 @==>quŴi @k֬+Wƣ>vZz{HUFRW6Hx7zvc| 5 @]Aon!CvPs=[nKƨQⓟd\x~-]뮻]6SLi<۾}dE즓yӠ3fw'NxG7߼G=sWW{!+ze3/`{o5+GVaâkP@׭[6l >􆤝;wС-JS\m۶Ң@x _cklW\A}cyصkWqꩧܞ+$@hIA=zQGŹ͋7n\,X%!u @2A ~wyq,Vm @ZU`P/짉 @"?#$@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ !vB P@K^  @" 6N @JhI+ @@CІ0  @@I@-Ix%@hf;!@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ !vB P@K^  @" 6N @JhI+ @@CІ0  @@I@-Ix%@hf;!@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ tuuŰa lQS.0qf `|kkMmP 35EoOOO\T+l9k׶\ur[U46bVnՊ5Cw >G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @-tٲeo;VZUi @=.\0x㢋.˗7 @)Qdcն/c=wqG :4nXxq\~6> @@ j}c̙YL^'pB<{нK/;#bȑf_`Ĉ|cklWtb1o{7nܸXzurcѢE8z:o}kڮmP356ƶƮ[p^j6lXر===1jԨ43gΜx߻./~ѻlfAi7軷0*cǎXvmtY?+Hg&L]Ukr|jxÇ}V:eʔx'{;f͚:ujrIgHϒ\2RP5/k׮=B}m54iPZmҘRvil[i*PIޔ>of͚O=TO>/"?3}QКK,??|0M֚8餓b޼y>+)hI{g;S%?؝vtGر#vY`jwb_S M?ئ#29{y+Ζ @66iR38#:&n"pki¶M*0eʔ8묳I{[oFy䑵4a&x['Nl5c @p  @_@m1X੧gC' f{7P:15Eh~fG@;᯿zvhQE=ܼys}Ѹk#>Zuo^ziƍ|s!C! Jv [tip qu}zKŋFv /=>OG4 F衇Hꫯq79s?7|s?SDΫj,=nxg}lPz׻~TO>#9o&.uwwlj'umƌYm~O/bc6~;Y}#hZtU*M\sMnzG"=TZUSܲeK\tEq);3gN|Cjʾ}b֭q衇![5@z|_x8]k3Ç.֙Us\}h6ٳgn;wLegZM&rX~}ɟIc?xCOdUCwǺu"lɒ%=54g&Ha3&?tY P ݘ6mZ1"~d3f)COϮDs1qqŭ_ײKMHHD˳il/얨#H4ۿ1a„f#WM (꼚v,]qgQMKsI=vlܜ*FC~6lqƵ߁9شiSvw V @plF 00t`n"@:@8 @ L@ @( fD }xg?x+? A 0XKY>qF~ h6g@mD >  UVM7ݔL8N<Ľv.p tH)Ow饗aMzg'N38#GtOM#}8A,O}S1iҤcYf}#@%'[n%:H!3}l /W≮}o?J>;|Μ91k֬,`^{Y;_c;;{[A@ 84WXbEg?#FdMg(zG>իWǛ1cF>>+W>3~_u\pY;v}'tR\r%Έ o~3 >0#-"MH-2PI@ \x%ŋބӟ4ײeG:?<;/|!N>,ofKiJgFәַ-YgIp.a/x+hjzxtVHg?G߮nٲ%Dw; n…Ytf4?>.첬{,~%Kdmtߨ$ h+M)pGfg3;77qW{'O?ַӟtr!Ythwww?cP.kqUWEj+|0֬Y\sM<$|l;@&@|t袋3ezCR`~޿oLgLK/ŕW^ͧ9jԨҗLg57oޜ.92x8g]v鉣:*k?hUFJ? hZޡxN<99眸⋳K>6N//է3E_LWJod;fRg;vlNH$ZIZiHg%_},<_;sCKq )S*N~=,U 4{ @vp F @\@m= @@ 6 h @M@mu< @&@|t nh!@4{ @v[k8k݌IENDB`themis/man/figures/README-unnamed-chunk-4-1.png0000644000176200001440000003731214061033307020462 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_:3IDATx \e}?_.; P*H j1l#P*7 Xx* M) S-%JS4 ȅ\wI2dΜ{۝|33}& @$з0 @2 @@ ` @ @n@ @P? @*п[֬Y9T& WF׻G}-[no߾1`ؼysxWrj9"˝׆ h8,{wkd;obnݺu< }߅@z2dH_׻iUy96a*|}~M PZ#  @@h @N PZG @( fw @< @FF0 @'  @5 5ٝ @>>?&@Q@ @ y4 @@h`v'@O}ׯ [Z^hQ|xZfƌΕ 7T羽{#nۆ^t~JKv`/?Zk~{޶m[6%x㍱{g~|#GzMsjooN8!WbŊXvm]_؛7o/Sjmmm~Ul4鏋t"v}>,P{M^7#g 86LMz{:{dj1zXzui~رe3 @hxMgmoģ>=ew1q홖-[ܹscҤIS6" @@/?_}K\pAvSsƌ1}5jT7.NZ @ 4<&:*1֬YШUi۔)SbٽÆ KL @t>} @/{@Oh @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-kٹ߿ׯt}ӧOf3p'~S}ն>f|t[[[֭ĶmۚT@MoݭC] ׾}+l~ӗi捊GZ j Ugj[[+yI ݉\ݺuknKOj[6,6m`/Qz^No?-6<~^,j- @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/?q[nE}+Wa @ tk}衇K_R^$9k֬9sf,X MK.-m3C P<5tvo޻t%Kn}Ɯ9sbq1C P,n ۷o/<:⊒ŋcYL+'L+mO3=X<#uOi}İa^_i~OԶa˖ T,/_#F(>|xZf~ iSN9%immmŠX?gXu:w]Їz[q'Zq {<ٴi[T}ַvH˿=ʟ8H_ӊ+_\1c^wYkDž;!]HWu]wps#QtСZ_s=@1}ѣwxARzqرcwY @Z_g@SL_i_bܸqvڸꪫbٲeY;wnL4sW  @ (Z,9cƌ,5* SN @Z\ۿ@6eʔ}bРA˪}#GV;@kF۷ooNe\T<^ Uq8f̘شiS_CK$a+Nz>Ě5k#5" x.*R5wK8p wY|  @@o @].f  @/ 6 @]0 @4^@m# @t@`%@hxcG @" v0K xƎ@ E@a @h @, @@; @@ Y @ 7v @.h  @@o @].f  @/ 6 @]0 @4^@m# @t@`%@hxcG @" v0K xƎ@ E@a @h @, @@; @@ Y @ 7v @.h  @/=>NZhQw}rʝYA P,n ?Ϗ>/ļyJf͊3gƂ bڴit63 @!m߾=;yAx;f|C%Kn}Ɯ9sbq6['@!>}W_ o֭#dA4Xxq?> iy„ ;Mҥ_|1fӾ \L [] 5.mWrO{@+x.j*yo?yk駟7nn![|1bD.1|XjUi9~qWqqG(0tH_b ?L5jT1fT\TȲG}7lPjoW3@S㡇:+ׯ_ttt.-}cqǗ֍9ҋJ;ΤRO_b 6,W_}Xۃ|ӟރGyHO|{:f7xp7 !oxC (@lOgf̘ӧOt8ҩS> @ |g@ׯ_^Ӕ)Sbɱy杶u< @@1%vR2}/lw @h=;t=#@ ' Ћ^\|C'@Ih/. @zB@ u$@b  @=! c @zڋo @@{B1  @X@7t @@O= @^, : @'ОPwL @@/@{q  UГN:)֯_SԩSwZo @] u?G?Q_ qwơZZg @@%o}k{yغuk̛7/Wj-:?_Zg @@%t}Gy${rK 6R{ @(P1v}wߝ ]tiJǎuy @:~1}xWwjN[ouV @(:_e|QFЎ;pX @ PU]vmqW~W9 @T}@|p?ܚ @@3K/49x0ڿMgE878 @"^Oo]fMuY;EH;XA K/۷oe3]ߜ~;XI @ T@ @@U˗ cȑuwF @:~="qT#@ؕ@R֭'x"AK @ Pu=#wj~wlذ!կ 7ܰv+ @ T@ع=}u횎5qεַ;U +UkP֪WͣoIU||R %:6 @RҥKwzCF׏쥆M P@tꫯ-ozӛ]zWpH @fO>2|HNJ+"9'fHc'@NouE̘1#;XlYW^x!{fG  ?ϳWpYz˟t sύ|3> @@U۷ot^{-E*o2 @@1 {}~s< 6mqWI'TL"@]zue/@4iR3&F裏;/i @U C_(-ZŔ1* @T}4ҟʕ+3gyfL:5f͚ІF @b T@9XdI+cT @ Tu ~˖-qSO=։O}Snݺӑ>9  @O3=?nݚ|9ڵkc- @إ@U-\}RgK=8M @*Dz<0FЉ'f> D @M ?~L/#=~w~؇ @@&PuM{yg_ @@՗q!@ P. X&@hP^ @ "  @* 6W @he @  8 @@Z.b @hCy5N P. X&@hP^ @ "  @* 6W @he @  8 @@Z.b @@_y啸c; hѢEq}ʕ+wf @@%u]qgdz>_|q\y%Yf̙3c1mڴXtii @ o:::oB~z|38#֬YϏn-s̉ٳgDž^ni @@wcС7mׯL/^Ǐg8a„7o7tSp uiC--)~WAoqj[\b,ݔ*M s۶mqUW'ov?#J>|xZfG?ҺAƍKf/Ůj(M-4u`ww}袋>Lh\̾s1:+Vĺu:}]"oq뫶j[\b,݁VD!mذ!o&kKbYFW.u0͏;l @x@/g/Hg=;'… cٲe~Ν;7&MԹw @ 4O?>hu뭷H3f̈ӧǨQbܸq1u>f @'zgo;)Sɓ{D ݬ'@(@h5NmmmL @{@h @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@ݱYkkke==#U]mնY}rtdcԼx5:"Qy-V=Fmjo;r=җlڴ T}[tU {dy8"{@+H  @* yl$@[@[T{ @Њ<6 @- -= @hE  @@ @@E" @y yj @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T{ @Њ<6 @- -= @hE  @@ @@E" @y yj @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H @#<-Zw_\rmV @KM+_Ju];Κ5+fΜ ,iӦҥKwn @X@/^gyf[n%K믏}sO}*fϞ> @%п;aÆ袋bժUC`:~r b޼yi桇|3ΈۯlÇ/ {շW[-@Gm*"uK=#N'?١3˗/#F֥uJOf_+WXJ[W#ww1tBu=;ѣcեi~رe3 @:qXpa,[,ϹsƤIlD @%1cFL>=Fƍt @@O4hPiL PAoqj[\b,tbԣt1bĈRV*-qWqqGt{Ӈz\ moMpm5aÆگ_(u=\ZN3SNSN9.|Kf^w}##̵^{W^yUU 3 {ww*n&{+㵵Ő!CvSѣGǓO>Yիcر4ΐv=KbŊHAմk۷w&&*i9ƏG+ l۶Mm[`UretҤT}GmĉpHyP9wܘ4iR-k @Y*:9cƌ>}z5*ƍ]roa5O Ѓ=@ӸL'O͛7ǰaz¡  @ChdQ5} @/У  @r @ @ʫq @r\2 @@CІj @\@-L P'@(@E, @4T@m(  @r @ @ʫq @r\2 @@CІj @\WZnM'㬳jn/wܱ}lhM~8,8[sz[>:f̘}v'ZR =/?qd{΀tr<~GGGl۶-5,T_SRmsūm@#  @@ٙAfo[I'q衇f}qǶu7=P\~qb7FzݺuqyE{{{_>{~ѧOVRa鯰<ꪸ+믏szC{tn}/k*H2k֬H1Gɥ^_kSO=k>`G7/t6g@kj_\zk<3YHٺuk 8C W>V<ҥ 6QGuhSjXdI{ae>ko#V ]Je]}OH'?ꭚrM6ŴiwwcS?N@߾_ؼysp{֐G5@zTۮ>qAT3mmmet":ޢG4IxG[{WiLuVj2+VĚ5k"[`ACOd;?OW_B~8ǷΘ0j*J8#[n~Q8ڍ؍8ԛ8Æ Ξoo7fgyfX:ӝ^b*ɟėw]M/B2@#"]Mb̘1njFhc^h>rTͫiNcԨQ; 43\}#@%'r)q7')dM}gwɓcΜ9?{̃>:?+ӟt:ujL81 _~yM7sN^|򓟌X3-P$]$@9^z :PO|UV~ַ֧3cŊg?3<3~aEGGG6#=sύtF4ѻ+ >0#-"EH-R($@y>dgϞS'w"_򗑾-Z=PЙΖtAΫ:;,~#tI?Mh:[[ٶ)SdgRw.a'x+hj<:G@+C}înڴ)D' ?dȐ8wxܬYh Gyd\p>}{GGgms15 @V ДrHv6koĥ^uUySO=sO|_?'aÆ)T?_W"W.,,XOzj}%>OO6ǎK_RFfza5\o<+2~Y3I&e^+[:& J^JWZ |pz|zgѣG9]O9кV @&/ @&|*j< @&@@G(Z @&/ @& Chr { @ E @\@m @hhIENDB`themis/man/figures/README-unnamed-chunk-2-1.png0000644000176200001440000003752014061033306020460 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_:IDATx \e}?&y'@P$R`@KЪh & };Rw X8QBK Ar4ECH!!e7w&L=g}3;wfڶf  @>Gq @@ Чhr; @9@ Чhr; @9@ Ч}zf͚ʡU1lذQGGGŦM6D{?hР2dHlܸ1|*^rƴG^%35jh8,So]w5^}?[ieXz[vm+wC۷#^G֭v|Z}Uz]Nc0Kr}|}~&@Q@Lq @M PZ# @ ٛ @FF0  @@7 @@h` @' go @'@O@ @5 5)N PZ  @j@kS @>vnu?M#dv|gL<9xe˖E:9o޼2eJ @$K|3qWgt syJ|Κ5+fΜcǎ &K3C P<Dva??ƚ5k7u3m6mZL:5wqԨQi @}@z{].ӧ @ zH PZ @u u @-e  @@&T @@-h-Z @- M @ZZ%@[@P @h)K PZ7  @j@kR @nnB @" ֢, @@h݄* @E@EKY @к U@ PZ @u u @-e  @@&T @@-h-Z @- M @ZZ%@[@P @h)K PZ7  @j@kR @nnB @" ֢, @@h݄* @EP= Mi6e6CmiP}9،{wttd2dHlٲMu:y<_ |heN m+Rmm~=3Ri˭4Zښx=~mۑh͛7Ggg3׏=:yC;Ύc[Mc;jԨذaCtuuGu9ww?Zy<_,j- @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/G?Q]vEw+Wf @ iK_R^$9gΜ={v,\0f̘K.-m3C P!^y 'Dz6m_#RX1K מ>|x9}h@;]~; 6O~ַի!/(>S#=+Vċ/ؽg+^}Xn]f PK\x^Fs>Nᶥ,h.7xݚ<79r7!{3ĉ'矏3gfǍқƏ@  ?fztO'|r|k_ &+]vY,[, ͋)St @ =fرcP:}JF }@}iӦԩScƍ޽ @@1ۧii\Ԟ-=rc[ܱM=c|薢p#ꪫF+}R>a„D  @hxM||;nݺHMSN7ֳy @bI*e<] @+{@ZF h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu U;.֭[׫c[A @`{[ٽ?i?xcذaݛ+n8K @$P1o:+6n7oKutt~J @$P1ftIq 7ĨQ*g @h=LҥK=P:~ @خ@[n3g/ܫO>9n^ @ P.Pu˿G>cǎݦ@ PA+ċ/\rI @@e>tѱ~ʵJ @ : 3όg}6 Ίr!op(  @D"@#f͚8{&^$V @@ /֭[[Mn+  @СC @[|>|xnu7F @:|=#qT#@؞@Rk׮G}4AK @ Pu=C{Uw;֯__Wk  @UǏv[b1"~mmm1lذ~k[V }Akݲ#80wc[ܱM=c|;;;+"U@~YY㢋.c=AܘθlGuz]{Wlذ!֭[]21cdXKe[,h^z[fll7> )] [R @Ft^4rM+ !@hZѣ_n!'7Żx{C ij @@s T@{HWrN81VX?5 @(0F͚5+:XlY򗿌{..j#@Ug;g,}OYg}7u @Zh[f:_^񫯾cQ, @P><}g?xG0>o?A\~qq6 @)PUM;\uULǎ>g  @Pwx<~ƢEwtA@ @\3#zjY&,X7|s 4(nƘ;wnwyn  @It|'FuqÆ nݺHt1q,|&MoCqu5\SZ<Ҳ2#F\!ئǞ{Y|-8mY^4;|nٲ%.8c=3fL}GUVG>RZ7lذxJf^HΛ7oxs^[ ^9WƶcWiqغuk|ڔΎ3Sggg >{1yGDztO+Vkv/C ܴiSvjZsSO]3u5ƶ1C؎>yoob]v . 5jܸqzRKf @''_rv)sύtֳ{T  @@@>죖vD7mڴ:ujvQvTz @4 @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o> ֭|W-Zw}w\6+ @%gtÆ |%nm̙gώ ƌ3bҥl@ P,> /N;-֮]ޒ%KbqWg?ԧ>sݦ @b Ew֯_cժUC`:q4rIbiSO=ܳlu:th-4ѣqZVw6M=nܸXzuis?~|i  @@5N<9XlYR)SG @%~5kV̜93Ǝ&Lt @}@=HӴibԩq5jTM  @ (ЧtG~& @ =C @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @W4r[[[ib-[c[\b/~w[.6, rOm{7zmWX*-pnc GmWg߮^\r^Ξ}0_p5k{Ʒolmqݳ<~wZɩĊ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @-tѢEqwʕ+ @L9sbٱp˜1cF,]Ɉ4 @ O<+%KĂ oAō7s΍;֪'@h~ /'f3yM4)ϟ ݲeg)cذae32dH;9{h|;W=w7X4k]|y3ԾѣGǪUJi~\z饥uzkr!坝;wvW5myƷN&64c['` ~:x*53^ZN3ӧON8.|Kf^c="  {k]v%㥗^j&kgLîkrM>@u4re1b k7n\P ~h;T ?FU^_ױrʪ+Z/R888#[ ں?.qV]v8쳣3֭[}Whkkk.K {G.K.$7oϏkJΝwKxi|MT̙i{)Fx/R0 _z\yqI' pbuhk__ 7& hm^MW:qM7SO=͛7СCTho_գ:*JkG Kwׯ;,kfk.dɒv>kM+J颋.~[~G\PShVMYrÆ 1cƌ8#b})۪Q;'0h*6nUdH<ƶO?t~.5Hޘ{znIǵhf2eJ(5k]v)͛N٪+T X"֬YW… ~&:$^~Ho4{{|ήM"JgL|E?ϳ{G$TG3w2dH3}݈Coc9&; :jԨ{>lov#>H_yie3 +bq|_|;%&$S ?"4W]uU^nja- t?u]7\5&8kj^t}M;Du5,}KwȑucHW1 +1zuLW_=Нs @N t'F sιً @`'Н @ ;f/ @@wnF }y˖-2? Y' /KJ%@m @@|RX+W??eg:|qa:x$e#<C}1a„lٳggwm8o:I"}9A.s9'Ǝ8V @m6ZB }# '_}Bfԧ~WN7xc"}_wӟӧOɓ'g/3όߋO~ @mAD[^_W1dȐ ='J_jUo}+>l}:~0VX}gO~8ӲtPtuue>?<:HgDS۳3" "ބ"4ɟdΝ۫{/~HE)t_v/:* #]OS:3δoVmڴiٙvz[A@S ãq@:9r7l Ky{#FSN9e̙LgFS=Csʼ͂:;;-oyKV *hv дz׻"C=}㤓N38#D޳{liv7}K_>>Lo,+׿uvgxF[n%&9eʔ]v%{'|H$ZIZiHg%{,7ʬ[.;:nܸ^t}' XA&@|4 P46ChrH @E@6ChrH @E@6ChrH @E@6ChrH @E4A͠IENDB`themis/man/figures/README-unnamed-chunk-5-1.png0000644000176200001440000003630014061033307020457 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_8)IDATx }?% 7hcvs^v 3{wϜs;+;;3e篗`!@ P'uz/C @ @  @U@+#@@  @U@+#@@  @U{]_/z:Z1^k׮gϞa˖-ݷQZF٫Wcǎm۶ZvOuz6o\b{---Wϱ_c t=/F~7fK,17_>l߾ZanݚaQIx@>}յkVlRl?lذjX1bjKM+ނN @Y YI @&&&; @d% f% @КD ~ @j@kb @@VhV!@I@N @Y YI @&&&; @d% f% @КD ~ @jk]~}7o^-Z(<aժUJlْ<駟۶m+m @`u 7o_ [n%|aᢋ. ˖-h+6m \pA={v뮻5\vn? @_.^J;S!t0wpwiӦ>;}ĕ{7;6\n 7n o  @/нC oVxG/v}75*<҃%KN;~?|7n\y 7~'{m+ХKA4@ݓߡ={6Ѭ~W oC4c]Q%jznݺV[@=%zV\X6`$7kb[i_yj}ȑ%y!+rgjj￿CPquvZ:iX];i;wjie w:ޱcGd{]ht-l߾d>}w9vOpk7֭[oLt6_մjgW,jګWTZmC W.7x`y nATj @@@G}ٰ|a̘1 _%SLǏ>hWmVL'xb  @$Щos9/pC=4L2%ی3B=W\&N4^%OO~!h_~妩Sǫ|/xxh\ @S!\<vwh'@hpN=m  @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 9]߿!a uƝ YkgYtǎliӦԉklnZhjul; um+R,jڽ{zLo-aKKKGoPhg_u-Nj*UԵXeYԴ[n/T @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U{jk΍+W K.x.]qUl+ ,[l)o?# @CS%K~ԫ֮]z%L:5tI{Z PN Ǐ+.E]MN/%>|x馛ڵ@ P,N o;.|Cj9yx$>cIP4iR۷ol @h|mx }ݻ{âE '^~pw3g!C+Ü9sN|r-0lذ>:;um«i4H]Ytͩ0 @1cT!KB*o@KiIqѣGذaC&P,6_]մjgW,jsT=ģݺu ;[z @'#$s@; @4@XFC  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @*W5h6&i:mS֮մLk뷫gQ۷ ֭+O҃P欯6_]մjgW,jڳgT ;wLPe뙨kkxQǶP׶"_GMZ3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J:iҤ~v9sf2eJ6 @ؕ@]m,m?~>So {.5۷|0~m @HH vXꪫ–-[¶m?uVG}{_>W @4'̛7/y{'/?m @RRhg>Cɑe˖%[Pz1 @] @p5kִ3 w_6 @h+Psç>ax[  @@@Mtݺu77tS8RD @ ]0 1"_zoZ  @F#|++ JFwSQc9f7/ @@Mш瀮]6|3iڑ@ PE믇;w76 @@W^ @@tʕU' 4holꫯ?}iW-?4^եK0z?B P<QG)9o.nڴ)|ӟN.ta??׾$j< @4@ ۷o/w ?¿˿$-7tŋr!Uycdž+"'^50nܸ@ И5c= ~~/lܸ1|_ wygm]zux7Ü9s„ =mɒ%N+o5jTx+͛CKKKO|>~Y n:kUMqF|uGMkx<VkN_2s;򓟄xi3ɕ-xxz|;wݻMw^X~}ŅFSN-4)- @-x@X|yVXE[j|WWg5  @)K`+KW7ŷ௺0k֬L0  @:[PƋvuO 6Tܞ'  @[0aB6mZrFaҤI=K#@hhߞܣs̘1m>񏇓O>9\} 3!!@hl/'?O0,Z(# G{FG P5\˒#/Ln6TI  5EK. q?crC=س4: @-[<G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d-=%K˗qƅ>}첋 -[ێ80x @zUWnݺ#Go=\}a̘1z---aԩᤓN*o?sв @tj}gÛoo%bwvtҥaᦛn* @RSG֮]6o\^/=Xxq@{mI&}+W 6lHw% 0J tީ? 3SN.@/96@ExY4fSڵk7x#9joaѢEN/r;̙3Ð!Cʻ~K_ gNy .,yCm S+6_MԵEM7nܘ ө4*M.BK.$įQx1R<--W\qE8xN[oUjԺ9 WW5mk5,Vm /ۿ W^ye8Sv5ưbŊhgi"rHo^<֫+4֭[j>&󮀺6O6_MԵEM{ ө]jUkv3ů̙3'B>>tg͚N=Ըj!@(@š5k__=|0cƌУGZ?3+[xҨQ @8@/v)^>}g<`wuw՟m @!Щs@;< @4@8 FB P/^^ @ @  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@ެ j֩Wn$$l5 5mDյdQӖT ov56Z7O-[D][k4c5m:)z5իW*Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@ @-Zx≰jժ۲eK7o^xöm۪ @:=r-o ,]tQXlY;M6 . ̞=;u]k ;wl  @/Щtҥaܹ;ӦM g}v۩{aرv[ظqc?~l @h|9ė^z)kwsQ?nHK, vZy{ƍ+o{[ow-q6[gϞ=}tv|Wi6_]i)Ur0p P卿~kVZW^y|[J>}… +tӧz^ if/{OU.MMOV-@۷Ғ>%7xm7zfOB~k}[n ׯ/L $лwn+4!C$N1}v!~=zHlڷSСC36oޜlb: @H9?y饗/8ķ=0eʔUg̘+"L81hJ>>y0bĈG{ @<qP3X>h_~s|wsDwwX7v-zJ~<}%]n]Q`m⅔|mkSaÆ 68_ul=[o믷JF- *_[] @4@6G .  @ m @ 怪K @hu- @99 @ZF  @@h$@. VB .  @ m @ 怪K @hu- @9t%~uـs gO<>!'?0zww{ti@x \wua…W^ 8BCcdž /0\ve{ti@o=̘1#̟?GCr;vȔ;۾}{RNd&Of0]mRd6{j鸀q3 @ {\O-N?зo߂pƏF ><]1.u'N vX>%j鸀s@;n @{!T @ 7  !fqƆA P@r1hm\jbkj-pm5k4$͎@ īofj:@;nVg^:,YQVp_|yXvm-@˲eޥnݚmvSĭ+Vzcv| / /CŽB_җBR_w^8sr}=/3τ}sO~g?φq^!7! 7xc1b?*rSoNCuɝHկ.]w ~5ūuկ~o&O{?ZwL'ᬳJn2lذV x'3x Ukb@WnpQGGy$;o} #2w/^}fS0aB2ɡC&/Z(ŭ{|+/^XvI'%%QKq.]|GL`q'ceٳg'O|Ï~#, ~p 7ɓ'38#p a޼y Ö-[BC")W?W̺#9B/>֭[Y˶3fLii{K}AEH5 }'W$}^S)?lv|wwX +ӟ49wP,nQ9гg$⑳;k)?χc ԶxʌvTUaoŋzmNO%/֭[MN92!} _^zxRR\xgi>}Sd.\}| -nMGrY<]&& {[ 0[yMQN"^a5U)CyذaCǢchǼM K@O'@蘀1/{ @쥀N 1c^&@Kt/=}O?F@mRE7@R-$@ 4U} @Tp4 w?O·dɒ0iҤpYg%Ӷ-[[o5]}CMv]fM曓A%}]tE~7lݺ5lS}hG@B@3 ̟??|# /R?38cSG?|4ncX5kV5]s=72eJ=ztoLpWLJ??K;q]hG@A@S M?ɼ>O&G@_T p@pQG%m?o&GL~ .Hڏ<Ȱ}y'h B<"!@,! Pl;w BKː!Cq}ܹ=yOgΜ-Z̙oڴ)~/<|[Jc9&iOJ>" @`1/aÆ~~v27o~zm}s9yrKrT3׿=p&zIxAJ;vlZ ШhVƸ(@~?O6%| =X?:蠤->gƍaƌ!o|W_ _8㏇իWӧ xgy&^BV @hPhK.$?Cx')9B/"jM|דͿկu]<O>nKxƣmxn޽{ /^{-#G~  Pmr "Cd b=zQF%G2[쭗 &O-ote%G4>BiӦ%3#F?泟l>k׮][: ,hT.ygθ Ptx5z|xv " 6H!  8t_y @D@mB @`_@J' @A)a @t_y @D@mB @`_@J' @A)a @%:IENDB`themis/man/reexports.Rd0000644000176200001440000000077213753073767014670 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/0_imports.R \docType{import} \name{reexports} \alias{reexports} \alias{tidy} \alias{required_pkgs} \alias{tunable} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{generics}{\code{\link[generics]{required_pkgs}}, \code{\link[generics]{tidy}}, \code{\link[generics]{tunable}}} }} themis/man/step_upsample.Rd0000644000176200001440000001132114061010144015453 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_upsample.R \name{step_upsample} \alias{step_upsample} \alias{tidy.step_upsample} \title{Up-Sample a Data Set Based on a Factor Variable} \usage{ step_upsample( recipe, ..., over_ratio = 1, ratio = NA, role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("upsample") ) \method{tidy}{step_upsample}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{ratio}{Deprecated argument; same as \code{over_ratio}.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{target}{An integer that will be used to subsample. This should not be set by the user and will be populated by \code{prep}.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when upsampling.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_upsample} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_upsample} creates a \emph{specification} of a recipe step that will replicate rows of a data set to make the occurrence of levels in a specific factor level equal. } \details{ Up-sampling is intended to be performed on the \emph{training} set alone. For this reason, the default is \code{skip = TRUE}. It is advisable to use \code{prep(recipe, retain = TRUE)} when preparing the recipe; in this way \code{\link[=juice]{juice()}} can be used to obtain the up-sampled version of the data. If there are missing values in the factor variable that is used to define the sampling, missing data are selected at random in the same way that the other factor levels are sampled. Missing values are not used to determine the amount of data in the majority level (see example below). For any data with factor levels occurring with the same frequency as the majority level, all data will be retained. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. } \examples{ library(recipes) library(modeldata) data(okc) orig <- table(okc$diet, useNA = "always") sort(orig, decreasing = TRUE) up_rec <- recipe(~., data = okc) \%>\% # Bring the minority levels up to about 200 each # 200/16562 is approx 0.0121 step_upsample(diet, over_ratio = 0.0121) \%>\% prep(training = okc, retain = TRUE) training <- table(bake(up_rec, new_data = NULL)$diet, useNA = "always") # Since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(up_rec, new_data = okc) baked <- table(baked_okc$diet, useNA = "always") # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: data.frame( level = names(orig), orig_freq = as.vector(orig), train_freq = as.vector(training), baked_freq = as.vector(baked) ) library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without upsample") recipe(class ~ ., data = circle_example) \%>\% step_upsample(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_jitter(width = 0.1, height = 0.1) + labs(title = "With upsample (with jittering)") } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/step_rose.Rd0000644000176200001440000001204714061010144014603 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_rose.R \name{step_rose} \alias{step_rose} \alias{tidy.step_rose} \title{Apply ROSE algorithm} \usage{ step_rose( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, minority_prop = 0.5, minority_smoothness = 1, majority_smoothness = 1, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("rose") ) \method{tidy}{step_rose}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{minority_prop}{A numeric. Determines the of over-sampling of the minority class. Defaults to 0.5.} \item{minority_smoothness}{A numeric. Shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the minority class. Defaults to 1.} \item{majority_smoothness}{A numeric. Shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the majority class. Defaults to 1.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when rose-ing.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_rose} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_rose} creates a \emph{specification} of a recipe step that generates sample of synthetic data by enlarging the features space of minority and majority class example. Using \code{\link[ROSE:ROSE]{ROSE::ROSE()}}. } \details{ The factor variable used to balance around must only have 2 levels. The ROSE algorithm works by selecting an observation belonging to class k and generates new examples in its neighborhood is determined by some matrix H_k. Smaller values of these arguments have the effect of shrinking the entries of the corresponding smoothing matrix H_k, Shrinking would be a cautious choice if there is a concern that excessively large neighborhoods could lead to blur the boundaries between the regions of the feature space associated with each class. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. All columns used in this step must be numeric. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(okc) sort(table(okc$Class, useNA = "always")) ds_rec <- recipe(Class ~ age + height, data = okc) \%>\% step_rose(Class) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = okc) table(baked_okc$Class, useNA = "always") ds_rec2 <- recipe(Class ~ age + height, data = okc) \%>\% step_rose(Class, minority_prop = 0.3) \%>\% prep() table(bake(ds_rec2, new_data = NULL)$Class, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without ROSE") recipe(class ~ ., data = circle_example) \%>\% step_rose(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With ROSE") } \references{ Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. R Jorunal, 6:82–92. Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. Data Mining and Knowledge Discovery, 28:92–122. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/step_nearmiss.Rd0000644000176200001440000001033014061010144015445 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_nearmiss.R \name{step_nearmiss} \alias{step_nearmiss} \alias{tidy.step_nearmiss} \title{Under-sampling by removing points near other classes.} \usage{ step_nearmiss( recipe, ..., role = NA, trained = FALSE, column = NULL, under_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("nearmiss") ) \method{tidy}{step_nearmiss}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{under_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled down to have the same frequency as the least occurring level. A value of 2 would mean that the majority levels will have (at most) (approximately) twice as many rows than the minority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_nearmiss} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_nearmiss} creates a \emph{specification} of a recipe step that removes majority class instances by undersampling points in the majority class based on their distance to other points in the same class. } \details{ This methods retained the points form the majority classes which has the smallest mean distance to the k nearest points in the other classes. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \examples{ library(recipes) library(modeldata) data(okc) sort(table(okc$Class, useNA = "always")) ds_rec <- recipe(Class ~ age + height, data = okc) \%>\% step_meanimpute(all_predictors()) \%>\% step_nearmiss(Class) \%>\% prep() sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = okc) table(baked_okc$Class, useNA = "always") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without NEARMISS") + xlim(c(1, 15)) + ylim(c(1, 15)) recipe(class ~ ., data = circle_example) \%>\% step_nearmiss(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With NEARMISS") + xlim(c(1, 15)) + ylim(c(1, 15)) } \references{ Inderjeet Mani and I Zhang. knn approach to unbalanced data distributions: a case study involving information extraction. In Proceedings of workshop on learning from imbalanced datasets, 2003. } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/step_downsample.Rd0000644000176200001440000001025514061010144016003 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step_downsample.R \name{step_downsample} \alias{step_downsample} \alias{tidy.step_downsample} \title{Down-Sample a Data Set Based on a Factor Variable} \usage{ step_downsample( recipe, ..., under_ratio = 1, ratio = NA, role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("downsample") ) \method{tidy}{step_downsample}(x, ...) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \code{\link[=selections]{selections()}} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{under_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled down to have the same frequency as the least occurring level. A value of 2 would mean that the majority levels will have (at most) (approximately) twice as many rows than the minority level.} \item{ratio}{Deprecated argument; same as \code{under_ratio}} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{target}{An integer that will be used to subsample. This should not be set by the user and will be populated by \code{prep}.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake.recipe()}}? While all operations are baked when \code{\link[recipes:prep]{prep.recipe()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations} \item{seed}{An integer that will be used as the seed when downsampling.} \item{id}{A character string that is unique to this step to identify it.} \item{x}{A \code{step_downsample} object.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_downsample} creates a \emph{specification} of a recipe step that will remove rows of a data set to make the occurrence of levels in a specific factor level equal. } \details{ Down-sampling is intended to be performed on the \emph{training} set alone. For this reason, the default is \code{skip = TRUE}. It is advisable to use \code{prep(recipe, retain = TRUE)} when preparing the recipe; in this way \code{\link[=juice]{juice()}} can be used to obtain the down-sampled version of the data. If there are missing values in the factor variable that is used to define the sampling, missing data are selected at random in the same way that the other factor levels are sampled. Missing values are not used to determine the amount of data in the minority level For any data with factor levels occurring with the same frequency as the minority level, all data will be retained. All columns in the data are sampled and returned by \code{\link[=juice]{juice()}} and \code{\link[=bake]{bake()}}. Keep in mind that the location of down-sampling in the step may have effects. For example, if centering and scaling, it is not clear whether those operations should be conducted \emph{before} or \emph{after} rows are removed. } \examples{ library(recipes) library(modeldata) data(okc) sort(table(okc$diet, useNA = "always")) ds_rec <- recipe(~., data = okc) \%>\% step_downsample(diet) \%>\% prep(training = okc, retain = TRUE) sort(table(bake(ds_rec, new_data = NULL)$diet, useNA = "always")) # since `skip` defaults to TRUE, baking the step has no effect baked_okc <- bake(ds_rec, new_data = okc) table(baked_okc$diet, useNA = "always") } \concept{preprocessing} \concept{subsampling} \keyword{datagen} themis/man/circle_example.Rd0000644000176200001440000000103113646106031015554 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{circle_example} \alias{circle_example} \title{Synthetic Dataset with a circle} \format{ A data frame with 200 rows and 3 variables: \describe{ \item{x}{Numeric.} \item{y}{Numeric.} \item{class}{Factor, values "Circle" and "Rest".} } } \usage{ circle_example } \description{ A random dataset with two classes one of which is inside a circle. Used for examples to show how the different methods handles borders. } \keyword{datasets} themis/man/tunable.step_adasyn.Rd0000644000176200001440000000174413753073767016600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tuneable.R \name{tunable.step_adasyn} \alias{tunable.step_adasyn} \alias{tunable.step_bsmote} \alias{tunable.step_downsample} \alias{tunable.step_nearmiss} \alias{tunable.step_rose} \alias{tunable.step_smote} \alias{tunable.step_upsample} \title{tunable methods for themis} \usage{ \method{tunable}{step_adasyn}(x, ...) \method{tunable}{step_bsmote}(x, ...) \method{tunable}{step_downsample}(x, ...) \method{tunable}{step_nearmiss}(x, ...) \method{tunable}{step_rose}(x, ...) \method{tunable}{step_smote}(x, ...) \method{tunable}{step_upsample}(x, ...) } \arguments{ \item{x}{A recipe step object} \item{...}{Not used.} } \value{ A tibble object. } \description{ These functions define what parameters \emph{can} be tuned for specific steps. They also define the recommended objects from the \code{dials} package that can be used to generate new parameter values and other characteristics. } \keyword{internal} themis/DESCRIPTION0000644000176200001440000000306314061222402013244 0ustar liggesusersPackage: themis Title: Extra Recipes Steps for Dealing with Unbalanced Data Version: 0.1.4 Authors@R: person(given = "Emil", family = "Hvitfeldt", role = c("aut", "cre"), email = "emilhhvitfeldt@gmail.com", comment = c(ORCID = "0000-0002-0679-1945")) Description: A dataset with an uneven number of cases in each class is said to be unbalanced. Many models produce a subpar performance on unbalanced datasets. A dataset can be balanced by increasing the number of minority cases using SMOTE 2011 , BorderlineSMOTE 2005 and ADASYN 2008 . Or by decreasing the number of majority cases using NearMiss 2003 or Tomek link removal 1976 . License: MIT + file LICENSE URL: https://github.com/tidymodels/themis, https://themis.tidymodels.org BugReports: https://github.com/tidymodels/themis/issues Depends: R (>= 2.10), recipes (>= 0.1.15) Imports: dplyr, generics (>= 0.1.0), purrr, RANN, rlang, ROSE, tibble, unbalanced, withr Suggests: covr, ggplot2, modeldata, testthat (>= 2.1.0) Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.1.9001 NeedsCompilation: no Packaged: 2021-06-12 20:59:07 UTC; emilhvitfeldthansen Author: Emil Hvitfeldt [aut, cre] () Maintainer: Emil Hvitfeldt Repository: CRAN Date/Publication: 2021-06-12 21:20:02 UTC themis/tests/0000755000176200001440000000000013603701230012700 5ustar liggesusersthemis/tests/testthat/0000755000176200001440000000000014061222402014536 5ustar liggesusersthemis/tests/testthat/test-smote_impl.R0000644000176200001440000000222514061031213020005 0ustar liggesusersset.seed(1234) test_that("samples stay inside convex hull of data.", { rdata <- matrix(c(0, 1, 1, 0, 0, 0, 1, 1), ncol = 2) expect_true(all(dplyr::between(smote_data(rdata, 3, 100), 0, 1))) }) test_that("order doesn't matter", { df <- data.frame( target = rep(c("Yes", "No"), c(10, 50)), x = rep(1:2, c(10, 50)) ) expect_equal(100, nrow(themis:::smote_impl(df, "target", 5, 1))) df <- data.frame( target = rep(c("Yes", "No"), c(50, 10)), x = rep(1:2, c(50, 10)) ) expect_equal(100, nrow(themis:::smote_impl(df, "target", 5, 1))) }) test_that("smote() interfaces correctly", { expect_error(smote(circle_example, var = "class"), NA) expect_error(smote(circle_example, var = "Class")) expect_error(smote(circle_example, var = c("class", "x"))) expect_error(smote(circle_example, var = "x")) circle_example0 <- circle_example circle_example0[1, 1] <- NA expect_error(smote(circle_example0, var = "class"), "missing values") expect_error(smote(circle_example, var = "class", k = 0)) expect_error(smote(circle_example, var = "class", k = -1)) expect_error(smote(circle_example, var = "class", k = c(5, 10))) }) themis/tests/testthat/test-smote.R0000644000176200001440000000306613753113253017004 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) context("SMOTE") iris2 <- iris[-c(1:30), ] rec <- recipe(~., data = iris2) test_that("all minority classes are upsampled", { iris3 <- iris[-c(1:25, 51:75), ] out <- recipe(~., data = iris3) %>% step_smote(Species) %>% prep() %>% bake(new_data = NULL) expect_equal(as.numeric(table(out$Species)), c(50, 50, 50)) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_smote(all_predictors(), under_ratio = 1) rec_param <- tunable.step_smote(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("errors if there isn't enough data", { iris4 <- iris iris4$Species <- as.character(iris4$Species) iris4$Species[1] <- "dummy" iris4$Species <- as.factor(iris4$Species) expect_error( recipe(~., data = iris4) %>% step_smote(Species) %>% prep(), "Not enough observations" ) }) test_basic_usage(step_smote) test_printing(step_smote) test_bad_data(step_smote) test_no_skipping(step_smote) test_character_error(step_smote) test_na_response(step_smote) test_seed(step_smote) test_tidy(step_smote) test_over_ratio(step_smote) test_multiclass(step_smote) test_multi_majority(step_smote) test_factor_level_memory(step_smote) test_result_ordering(step_smote) test_id_variables_are_ignores(step_smote) themis/tests/testthat/test-rose.R0000644000176200001440000000350013776000556016624 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) set.seed(1234) context("ROSE") test_that("minority_prop value", { rec <- recipe(~., data = circle_example) rec21 <- rec %>% step_rose(class, minority_prop = 0.1) rec22 <- rec %>% step_rose(class, minority_prop = 0.2) rec21_p <- prep(rec21, training = circle_example) rec22_p <- prep(rec22, training = circle_example) tr_xtab1 <- table(bake(rec21_p, new_data = NULL)$class, useNA = "no") tr_xtab2 <- table(bake(rec22_p, new_data = NULL)$class, useNA = "no") expect_equal(sum(tr_xtab1), sum(tr_xtab2)) expect_lt(tr_xtab1[["Circle"]], tr_xtab2[["Circle"]]) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_rose(all_predictors(), under_ratio = 1) rec_param <- tunable.step_rose(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("row matching works correctly #36", { expect_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 1.2) %>% prep(), NA ) expect_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 0.8) %>% prep(), NA ) expect_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 1.7) %>% prep(), NA ) }) test_basic_usage(step_rose) test_printing(step_rose) test_bad_data(step_rose) test_no_skipping(step_rose) test_character_error(step_rose) test_na_response(step_rose) test_seed(step_rose) test_tidy(step_rose) test_2_class_only(step_rose) test_factor_level_memory(step_rose) test_id_variables_are_ignores(step_rose) themis/tests/testthat/test-bsmote_impl.R0000644000176200001440000000114314061032353020153 0ustar liggesuserstest_that("bsmote() interfaces correctly", { expect_error(bsmote(circle_example, var = "class"), NA) expect_error(bsmote(circle_example, var = "Class")) expect_error(bsmote(circle_example, var = c("class", "x"))) expect_error(bsmote(circle_example, var = "x")) circle_example0 <- circle_example circle_example0[1, 1] <- NA expect_error(bsmote(circle_example0, var = "class"), "missing values") expect_error(bsmote(circle_example, var = "class", k = 0)) expect_error(bsmote(circle_example, var = "class", k = -1)) expect_error(bsmote(circle_example, var = "class", k = c(5, 10))) }) themis/tests/testthat/test-adasyn.R0000644000176200001440000000240314022525425017124 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) context("adasyn") test_that("tunable", { rec <- recipe(~., data = iris) %>% step_adasyn(all_predictors(), under_ratio = 1) rec_param <- tunable.step_adasyn(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("errors if there isn't enough data", { iris4 <- iris iris4$Species <- as.character(iris4$Species) iris4$Species[1] <- "dummy" iris4$Species <- as.factor(iris4$Species) expect_error( recipe(~., data = iris4) %>% step_adasyn(Species) %>% prep(), "Not enough observations" ) }) test_basic_usage(step_adasyn) test_printing(step_adasyn) test_bad_data(step_adasyn) test_no_skipping(step_adasyn) test_character_error(step_adasyn) test_na_response(step_adasyn) test_seed(step_adasyn) test_tidy(step_adasyn) test_over_ratio(step_adasyn) test_multiclass(step_adasyn) test_multi_majority(step_adasyn) test_factor_level_memory(step_adasyn) test_result_ordering(step_adasyn) test_id_variables_are_ignores(step_adasyn) themis/tests/testthat/test-bsmote.R0000644000176200001440000000334714022525434017146 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) set.seed(1234) context("bsmote") test_that("all minority classes are upsampled", { iris3 <- iris[-c(51:75, 101:110), ] out <- recipe(~., data = iris3) %>% step_bsmote(Species) %>% prep() %>% bake(new_data = NULL) expect_equal(as.numeric(table(out$Species)), c(50, 50, 50)) }) test_that("errors if there isn't enough danger data", { iris4 <- iris[-c(1:10), ] expect_error( recipe(~., data = iris4) %>% step_bsmote(Species) %>% prep(), "Not enough danger observations" ) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_bsmote(all_predictors(), under_ratio = 1) rec_param <- tunable.step_bsmote(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors", "all_neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 3) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_basic_usage(step_bsmote) test_basic_usage(step_bsmote, all_neighbors = TRUE) test_printing(step_bsmote) test_bad_data(step_bsmote) test_no_skipping(step_bsmote) test_character_error(step_bsmote) test_na_response(step_bsmote) test_seed(step_bsmote) test_tidy(step_bsmote) test_over_ratio(step_bsmote) test_over_ratio(step_bsmote, all_neighbors = TRUE) test_multiclass(step_bsmote) test_multi_majority(step_bsmote) test_multi_majority(step_bsmote, all_neighbors = TRUE) test_factor_level_memory(step_bsmote) test_result_ordering(step_bsmote) test_result_ordering(step_bsmote, all_neighbors = TRUE) test_id_variables_are_ignores(step_bsmote) test_id_variables_are_ignores(step_bsmote, all_neighbors = TRUE) themis/tests/testthat/test-adasyn_impl.R0000644000176200001440000000114314061031202020131 0ustar liggesuserstest_that("adasyn() interfaces correctly", { expect_error(adasyn(circle_example, var = "class"), NA) expect_error(adasyn(circle_example, var = "Class")) expect_error(adasyn(circle_example, var = c("class", "x"))) expect_error(adasyn(circle_example, var = "x")) circle_example0 <- circle_example circle_example0[1, 1] <- NA expect_error(adasyn(circle_example0, var = "class"), "missing values") expect_error(adasyn(circle_example, var = "class", k = 0)) expect_error(adasyn(circle_example, var = "class", k = -1)) expect_error(adasyn(circle_example, var = "class", k = c(5, 10))) }) themis/tests/testthat/helper-test-functions.R0000644000176200001440000001677713753113241021154 0ustar liggesuserslibrary(rlang) test_printing <- function(step, data = NULL) { data <- data %||% circle_example rec <- recipe(~., data = data) %>% step(class) test_that("printing", { expect_output(print(rec)) expect_output(prep(rec, training = data, retain = TRUE, verbose = TRUE )) }) } test_bad_data <- function(step) { iris2 <- iris[-c(1:45), ] iris2$Species2 <- sample(iris2$Species) iris2$Species3 <- as.character(sample(iris2$Species)) rec <- recipe(~., data = iris2) test_that("bad data", { # numeric check expect_error( rec %>% step(Sepal.Width) %>% prep(retain = TRUE) ) # Multiple variable check expect_error( rec %>% step(Species, Species2) %>% prep(strings_as_factors = FALSE, retain = TRUE) ) # character check expect_error( rec %>% step(Species3) %>% prep(strings_as_factors = FALSE, retain = TRUE) ) }) } test_na_response <- function(step) { iris2 <- iris[-c(1:45), ] iris2$Species[seq(6, 96, by = 5)] <- NA test_that("NA in response", { # NA check expect_error( recipe(~., data = iris2) %>% step(Species) %>% prep(strings_as_factors = FALSE, retain = TRUE) ) }) } test_no_skipping <- function(step, data = NULL) { data <- data %||% circle_example rec <- recipe(~., data = data) %>% step(class, skip = FALSE) rec_p <- prep(rec, training = data, retain = TRUE) tr_xtab <- table(bake(rec_p, new_data = NULL)$class, useNA = "always") te_xtab <- table(bake(rec_p, new_data = data)$class, useNA = "always") test_that("no skipping", { expect_equal(te_xtab, tr_xtab) }) } test_character_error <- function(step) { test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_error( recipe(~., data = df_char) %>% step(x) %>% prep(), "should be numeric" ) }) } test_seed <- function(step, data = NULL) { data <- data %||% circle_example step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(~., data = data) %>% step(class, seed = seed) %>% prep(training = data, retain = TRUE) %>% bake(new_data = NULL) %>% pull(x) } test_that("`seed` produces identical sampling", { run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) } test_tidy <- function(step) { rec <- recipe(~., data = circle_example) %>% step(class, id = "") rec_p <- prep(rec, training = circle_example, retain = TRUE) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) test_that("basic usage", { expect_equivalent(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) } test_under_ratio <- function(step) { res1 <- recipe(~., data = circle_example) %>% step(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(~., data = circle_example) %>% step(class, under_ratio = 1.5) %>% prep() %>% bake(new_data = NULL) test_that("ratio value", { expect_true(all(table(res1$class) == min(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), min(table(circle_example$class)) * c(1, 1.5) ) }) } test_over_ratio <- function(step, ...) { res1 <- recipe(~., data = circle_example) %>% step(class, ...) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(~., data = circle_example) %>% step(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) test_that("ratio value", { expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) } test_basic_usage <- function(step, ...) { rec1 <- recipe(~., data = circle_example) %>% step(class, ...) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") test_that("basic usage", { expect_equal(sort(te_xtab), sort(og_xtab)) expect_warning(prep(rec1, training = circle_example), NA) }) } test_2_class_only <- function(step) { test_that("only except 2 classes", { df_char <- data.frame( x = factor(1:3), stringsAsFactors = FALSE ) expect_error( recipe(~., data = df_char) %>% step(x) %>% prep(), "only have 2 levels." ) }) } test_multiclass <- function(step, data = NULL) { data <- data %||% rename(iris, class = Species) test_that("allows multi-class", { expect_error( recipe(~., data = data) %>% step(class) %>% prep(), NA ) }) } test_multi_majority <- function(step, ...) { rec1_p2 <- recipe(~., data = iris[-c(51:75), ]) %>% step(Species, ...) %>% prep() %>% bake(new_data = NULL) test_that("majority classes are ignored if there is more than 1", { expect_true(all(max(table(rec1_p2$Species)) <= 50)) }) } test_multi_minority <- function(step, ...) { rec1_p2 <- recipe(~., data = iris[-c(1:25, 51:75), ]) %>% step(Species, ...) %>% prep() %>% bake(new_data = NULL) test_that("minority classes are ignored if there is more than 1", { expect_true(all(max(table(rec1_p2$Species)) == 25)) }) } test_factor_level_memory <- function(step, ...) { # Only checks for two level case circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor(circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class))) } test_that("factor levels are not affected by alphabet ordering or class sizes", { for (i in 1:4) { rec_p <- recipe(~., data = circle_example_alt_levels[[i]]) %>% step(class) %>% prep(training = circle_example_alt_levels[[i]]) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) } test_result_ordering <- function(step, ...) { res <- recipe(~., data = circle_example) %>% step(class, ...) %>% prep() %>% bake(new_data = NULL) test_that("ordering of newly generated points are right", { expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example) ) }) } test_id_variables_are_ignores <- function(step, ...) { circle_example2 <- circle_example %>% mutate(id = as.character(row_number())) %>% as_tibble() res <- recipe(class ~ ., data = circle_example2) %>% update_role(id, new_role = "id") %>% step(class, ...) %>% prep() %>% bake(new_data = NULL) test_that("non-predictor variables are ignored", { expect_equal( c(circle_example2$id, rep(NA, nrow(res) - nrow(circle_example2))), as.character(res$id) ) }) } themis/tests/testthat/test-downsample.R0000644000176200001440000000211614022525454020021 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) context("Down-sampling") test_that("ratio deprecation", { expect_message( new_rec <- recipe(~., data = circle_example) %>% step_downsample(class, ratio = 2), "argument is now deprecated" ) expect_equal(new_rec$steps[[1]]$under_ratio, 2) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_downsample(all_predictors(), under_ratio = 1) rec_param <- tunable.step_downsample(rec$steps[[1]]) expect_equal(rec_param$name, c("under_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_basic_usage(step_downsample) test_printing(step_downsample) test_bad_data(step_downsample) test_no_skipping(step_downsample) test_seed(step_downsample) test_tidy(step_downsample) test_under_ratio(step_downsample) test_multiclass(step_downsample) test_multi_minority(step_downsample) test_factor_level_memory(step_downsample) themis/tests/testthat/test-S3-methods.R0000644000176200001440000000302713753113251017576 0ustar liggesuserscontext("S3 methods") data(two_class_dat, package = "modeldata") # ------------------------------------------------------------------------------ r1 <- recipe(Class ~ ., data = two_class_dat) r2 <- r1 %>% step_adasyn(Class) r3 <- r1 %>% step_bsmote(Class) r4 <- r1 %>% step_downsample(Class) r5 <- r1 %>% step_nearmiss(Class) r6 <- r1 %>% step_rose(Class) r7 <- r1 %>% step_smote(Class) r8 <- r1 %>% step_tomek(Class) r9 <- r1 %>% step_upsample(Class) # ------------------------------------------------------------------------------ test_that("required packages", { expect_equal(required_pkgs(r2), c("recipes", "themis")) expect_equal(required_pkgs(r3), c("recipes", "themis")) expect_equal(required_pkgs(r4), c("recipes", "themis")) expect_equal(required_pkgs(r5), c("recipes", "themis")) expect_equal(required_pkgs(r6), c("recipes", "themis", "ROSE")) expect_equal(required_pkgs(r7), c("recipes", "themis")) expect_equal(required_pkgs(r8), c("recipes", "themis", "unbalanced")) expect_equal(required_pkgs(r9), c("recipes", "themis")) }) test_that("tunable arguments", { expect_equal(tunable(r2)$name, c("over_ratio", "neighbors")) expect_equal( tunable(r3)$name, c("over_ratio", "neighbors", "all_neighbors") ) expect_equal(tunable(r4)$name, "under_ratio") expect_equal(tunable(r5)$name, c("under_ratio", "neighbors")) expect_equal(tunable(r6)$name, "over_ratio") expect_equal(tunable(r7)$name, c("over_ratio", "neighbors")) expect_true(nrow(tunable(r8)) == 0) expect_equal(tunable(r9)$name, "over_ratio") }) themis/tests/testthat/test-nearmiss.R0000644000176200001440000000230214022524710017460 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) set.seed(1234) context("nearmiss") test_that("tunable", { rec <- recipe(~., data = iris) %>% step_nearmiss(all_predictors(), under_ratio = 1) rec_param <- tunable.step_nearmiss(rec$steps[[1]]) expect_equal(rec_param$name, c("under_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_basic_usage(step_nearmiss) test_printing(step_nearmiss) test_bad_data(step_nearmiss) test_no_skipping(step_nearmiss) test_character_error(step_nearmiss) test_na_response(step_nearmiss) test_tidy(step_nearmiss) test_under_ratio(step_nearmiss) test_multiclass(step_nearmiss, rename(iris[-c(1:25, 51:75), ], class = Species)) test_multi_minority(step_nearmiss) test_factor_level_memory(step_nearmiss) test_that("id variables are ignored", { rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(x, new_role = "ID") %>% step_nearmiss(class, under_ratio = 1) %>% prep() expect_equal(ncol(bake(rec_id, new_data = NULL)), 3) }) themis/tests/testthat/test-upsample.R0000644000176200001440000000220213753113254017473 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) context("Upsampling") test_that("ratio deprecation", { expect_message( new_rec <- recipe(~., data = circle_example) %>% step_upsample(class, ratio = 2), "argument is now deprecated" ) expect_equal(new_rec$steps[[1]]$over_ratio, 2) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_upsample(all_predictors()) rec_param <- tunable.step_upsample(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_basic_usage(step_upsample) test_printing(step_upsample) test_bad_data(step_upsample) test_no_skipping(step_upsample) test_seed(step_upsample) test_tidy(step_upsample) test_over_ratio(step_upsample) test_multiclass(step_upsample) test_multi_majority(step_upsample) test_factor_level_memory(step_upsample) # add back in once step_upsample is completely deprecated # test_result_ordering(step_upsample) themis/tests/testthat/test-tomek.R0000644000176200001440000000051213714636642016776 0ustar liggesuserslibrary(testthat) library(recipes) library(dplyr) context("tomek") test_basic_usage(step_tomek) test_printing(step_tomek) test_bad_data(step_tomek) test_no_skipping(step_tomek) test_character_error(step_tomek) test_na_response(step_tomek) test_tidy(step_tomek) test_2_class_only(step_tomek) test_factor_level_memory(step_tomek) themis/tests/testthat.R0000644000176200001440000000007013603701230014660 0ustar liggesuserslibrary(testthat) library(themis) test_check("themis") themis/R/0000755000176200001440000000000014061010121011726 5ustar liggesusersthemis/R/step_rose.R0000644000176200001440000001671614016046730014106 0ustar liggesusers#' Apply ROSE algorithm #' #' `step_rose` creates a *specification* of a recipe #' step that generates sample of synthetic data by enlarging the features #' space of minority and majority class example. Using [ROSE::ROSE()]. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param minority_prop A numeric. Determines the of over-sampling of the #' minority class. Defaults to 0.5. #' @param minority_smoothness A numeric. Shrink factor to be multiplied by the #' smoothing parameters to estimate the conditional kernel density of the #' minority class. Defaults to 1. #' @param majority_smoothness A numeric. Shrink factor to be multiplied by the #' smoothing parameters to estimate the conditional kernel density of the #' majority class. Defaults to 1. #' @param seed An integer that will be used as the seed when #' rose-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The factor variable used to balance around must only have 2 levels. #' #' The ROSE algorithm works by selecting an observation belonging to class k #' and generates new examples in its neighborhood is determined by some matrix #' H_k. Smaller values of these arguments have the effect of shrinking the #' entries of the corresponding smoothing matrix H_k, Shrinking would be a #' cautious choice if there is a concern that excessively large neighborhoods #' could lead to blur the boundaries between the regions of the feature space #' associated with each class. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' All columns used in this step must be numeric. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a #' Package for Binary Imbalanced Learning. R Jorunal, 6:82–92. #' @references Menardi, G. and Torelli, N. (2014). Training and assessing #' classification rules with imbalanced data. Data Mining and Knowledge #' Discovery, 28:92–122. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' sort(table(okc$Class, useNA = "always")) #' #' ds_rec <- recipe(Class ~ age + height, data = okc) %>% #' step_rose(Class) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = okc) #' table(baked_okc$Class, useNA = "always") #' #' ds_rec2 <- recipe(Class ~ age + height, data = okc) %>% #' step_rose(Class, minority_prop = 0.3) %>% #' prep() #' #' table(bake(ds_rec2, new_data = NULL)$Class, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without ROSE") #' #' recipe(class ~ ., data = circle_example) %>% #' step_rose(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With ROSE") step_rose <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, minority_prop = 0.5, minority_smoothness = 1, majority_smoothness = 1, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("rose")) { add_step( recipe, step_rose_new( terms = ellipse_check(...), role = role, trained = trained, column = column, over_ratio = over_ratio, minority_prop = minority_prop, minority_smoothness = minority_smoothness, majority_smoothness = majority_smoothness, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_rose_new <- function(terms, role, trained, column, over_ratio, minority_prop, minority_smoothness, majority_smoothness, predictors, skip, seed, id) { step( subclass = "rose", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, minority_prop = minority_prop, minority_smoothness = minority_smoothness, majority_smoothness = majority_smoothness, predictors = predictors, skip = skip, seed = seed, id = id ) } #' @export prep.step_rose <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } check_2_levels_only(training, col_name) predictors <- setdiff(info$variable[info$role == "predictor"], col_name) check_type(training[, predictors], TRUE) step_rose_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, minority_prop = x$minority_prop, minority_smoothness = x$minority_smoothness, majority_smoothness = x$majority_smoothness, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_rose <- function(object, new_data, ...) { if (any(is.na(new_data[[object$column]]))) { missing <- new_data[is.na(new_data[[object$column]]), ] } else { missing <- NULL } new_data <- as.data.frame(new_data) predictor_data <- new_data[, unique(c(object$predictors, object$column))] # rose with seed for reproducibility majority_size <- max(table(predictor_data[[object$column]])) * 2 with_seed( seed = object$seed, code = { original_levels <- levels(predictor_data[[object$column]]) synthetic_data <- ROSE( string2formula(object$column), predictor_data, N = floor(majority_size * object$over_ratio), p = object$minority_prop, hmult.majo = object$majority_smoothness, hmult.mino = object$minority_smoothness ) synthetic_data <- synthetic_data$data synthetic_data[[object$column]] <- factor( synthetic_data[[object$column]], levels = original_levels ) } ) new_data <- na_splice(new_data, synthetic_data, object) as_tibble(new_data) } #' @export print.step_rose <- function(x, width = max(20, options()$width - 26), ...) { cat("ROSE based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_rose #' @param x A `step_rose` object. #' @export tidy.step_rose <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_rose <- function(x, ...) { c("themis", "ROSE") } themis/R/bsmote_impl.R0000644000176200001440000001107514061032263014402 0ustar liggesusers#' borderline-SMOTE algorithm #' #' BSMOTE generates generate new examples of the minority class using nearest #' neighbors of these cases in the border region between classes. #' #' @inheritParams step_smote #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param all_neighbors Type of two borderline-SMOTE method. Defaults to FALSE. #' See details. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' This methods works the same way as [smote()], expect that instead of #' generating points around every point of of the minority class each point is #' first being classified into the boxes "danger" and "not". For each point the #' k nearest neighbors is calculated. If all the neighbors comes from a #' different class it is labeled noise and put in to the "not" box. If more then #' half of the neighbors comes from a different class it is labeled "danger. # Points will be generated around points labeled "danger". #' #' If `all_neighbors = FALSE` then points will be generated between nearest #' neighbors in its own class. If `all_neighbors = TRUE` then points will be #' generated between any nearest neighbors. See examples for visualization. #' #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns used in this step must be numeric with no missing data. #' #' @references Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: #' a new over-sampling method in imbalanced data sets learning. In #' International Conference on Intelligent Computing, pages 878–887. Springer, #' 2005. #' #' @examples #' bsmote(circle_example, var = "class") #' #' bsmote(circle_example, var = "class", k = 10) #' #' bsmote(circle_example, var = "class", over_ratio = 0.8) #' #' bsmote(circle_example, var = "class", all_neighbors = TRUE) bsmote <- function(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) { if (length(var) != 1) { rlang::abort("Please select a single factor variable for `var`.") } var <- rlang::arg_match(var, colnames(df)) if (!(is.factor(df[[var]]) | is.character(df[[var]]))) { rlang::abort(paste0(var, " should be a factor or character variable.")) } if (length(k) != 1) { rlang::abort("`k` must be length 1.") } if (k < 1) { rlang::abort("`k` must be non-negative.") } predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -var), "smote") bsmote_impl(df, var, k, over_ratio) } bsmote_impl <- function(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) { majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() for (i in seq_along(min_names)) { data_mat <- as.matrix(df[names(df) != var]) ids <- RANN::nn2(data_mat, k = k + 1, searchtype = "priority")$nn.idx min_class_in <- df[[var]] == min_names[i] danger_ids <- danger( x = rowSums(matrix((min_class_in)[ids], ncol = ncol(ids))) - 1, k = k ) if (sum(danger_ids) <= k) { rlang::abort(paste0( "Not enough danger observations of '", min_names[i], "' to perform BSMOTE." )) } if (all_neighbors == FALSE) { tmp_df <- as.data.frame( smote_data( data = data_mat[min_class_in, ], k = k, n_samples = samples_needed[i], smote_ids = which(danger_ids[min_class_in]) ) ) } if (all_neighbors == TRUE) { tmp_df <- as.data.frame( smote_data(data_mat, k, samples_needed[i], which(danger_ids)) ) } colnames(tmp_df) <- colnames(data_mat) tmp_df[[var]] <- min_names[i] out_dfs[[i]] <- tmp_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } danger <- function(x, k) { (x != k) & (k / 2 <= x) } themis/R/step_bsmote.R0000644000176200001440000001677014061005747014432 0ustar liggesusers#' Apply borderline-SMOTE algorithm #' #' `step_bsmote` creates a *specification* of a recipe #' step that generate new examples of the minority class using nearest #' neighbors of these cases in the border region between classes. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param all_neighbors Type of two borderline-SMOTE method. Defaults to FALSE. #' See details. #' @param seed An integer that will be used as the seed when #' smote-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' This methods works the same way as [step_smote()], expect that instead of #' generating points around every point of of the minority class each point is #' first being classified into the boxes "danger" and "not". For each point the #' k nearest neighbors is calculated. If all the neighbors comes from a #' different class it is labeled noise and put in to the "not" box. If more then #' half of the neighbors comes from a different class it is labeled "danger. # Points will be generated around points labeled "danger". #' #' If all_neighbors = FALSE then points will be generated between nearest #' neighbors in its own class. If all_neighbors = TRUE then points will be #' generated between any nearest neighbors. See examples for visualization. #' #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: #' a new over-sampling method in imbalanced data sets learning. In #' International Conference on Intelligent Computing, pages 878–887. Springer, #' 2005. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(credit_data) #' #' sort(table(credit_data$Status, useNA = "always")) #' #' ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>% #' step_meanimpute(all_predictors()) %>% #' step_bsmote(Status) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Status, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = credit_data) #' table(baked_okc$Status, useNA = "always") #' #' ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>% #' step_meanimpute(all_predictors()) %>% #' step_bsmote(Status, over_ratio = 0.2) %>% #' prep() #' #' table(bake(ds_rec2, new_data = NULL)$Status, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without SMOTE") #' #' recipe(class ~ ., data = circle_example) %>% #' step_bsmote(class, all_neighbors = FALSE) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With borderline-SMOTE, all_neighbors = FALSE") #' #' recipe(class ~ ., data = circle_example) %>% #' step_bsmote(class, all_neighbors = TRUE) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With borderline-SMOTE, all_neighbors = TRUE") step_bsmote <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, all_neighbors = FALSE, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("bsmote")) { add_step( recipe, step_bsmote_new( terms = ellipse_check(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, all_neighbors = all_neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_bsmote_new <- function(terms, role, trained, column, over_ratio, neighbors, all_neighbors, predictors, skip, seed, id) { step( subclass = "bsmote", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, all_neighbors = all_neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_bsmote <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } predictors <- setdiff(info$variable[info$role == "predictor"], col_name) check_type(training[, predictors], TRUE) check_na(select(training, -col_name), "step_bsmote") step_bsmote_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, all_neighbors = x$all_neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_bsmote <- function(object, new_data, ...) { new_data <- as.data.frame(new_data) predictor_data <- new_data[, unique(c(object$predictors, object$column))] # bsmote with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- bsmote_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio, all_neighbors = object$all_neighbors ) } ) new_data <- na_splice(new_data, synthetic_data, object) as_tibble(new_data) } #' @export print.step_bsmote <- function(x, width = max(20, options()$width - 26), ...) { cat("BorderlineSMOTE based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_bsmote #' @param x A `step_bsmote` object. #' @export tidy.step_bsmote <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_bsmote <- function(x, ...) { c("themis") } themis/R/smote_impl.R0000644000176200001440000000761514061032122014237 0ustar liggesusers#' SMOTE algorithm #' #' SMOTE generates new examples of the minority class using nearest neighbors #' of these cases. #' #' @inheritParams step_smote #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. # #' All columns used in this function must be numeric with no missing data. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @examples #' smote(circle_example, var = "class") #' #' smote(circle_example, var = "class", k = 10) #' #' smote(circle_example, var = "class", over_ratio = 0.8) smote <- function(df, var, k = 5, over_ratio = 1) { if (length(var) != 1) { rlang::abort("Please select a single factor variable for `var`.") } var <- rlang::arg_match(var, colnames(df)) if (!(is.factor(df[[var]]) | is.character(df[[var]]))) { rlang::abort(paste0(var, " should be a factor or character variable.")) } if (length(k) != 1) { rlang::abort("`k` must be length 1.") } if (k < 1) { rlang::abort("`k` must be non-negative.") } predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -var), "smote") smote_impl(df, var, k, over_ratio) } smote_impl <- function(df, var, k, over_ratio) { data <- split(df, df[[var]]) majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() for (i in seq_along(samples_needed)) { minority_df <- data[[min_names[i]]] minority <- as.matrix(minority_df[names(minority_df) != var]) if (nrow(minority) <= k) { rlang::abort(paste0( "Not enough observations of '", min_names[i], "' to perform SMOTE." )) } synthetic <- smote_data(minority, k = k, n_samples = samples_needed[i]) out_df <- as.data.frame(synthetic) out_df[var] <- data[[names(samples_needed)[i]]][[var]][1] names(out_df) <- names(df) out_dfs[[i]] <- out_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } smote_data <- function(data, k, n_samples, smote_ids = seq_len(nrow(data))) { ids <- RANN::nn2(data, k = k + 1, searchtype = "priority")$nn.idx indexes <- rep(sample(smote_ids), length.out = n_samples) index_len <- tabulate(indexes, NROW(data)) out <- matrix(0, nrow = n_samples, ncol = ncol(data)) sampleids <- sample.int(k, n_samples, TRUE) runif_ids <- stats::runif(n_samples) iii <- 0 for (row_num in smote_ids) { index_selection <- iii + seq_len(index_len[row_num]) # removes itself as nearest neighbour id_knn <- ids[row_num, ids[row_num, ] != row_num] dif <- data[id_knn[sampleids[index_selection]], ] - data[rep(row_num, index_len[row_num]), ] gap <- dif * runif_ids[index_selection] out[index_selection, ] <- data[rep(row_num, index_len[row_num]), ] + gap iii <- iii + index_len[row_num] } out } themis/R/tuneable.R0000644000176200001440000000525614016046730013677 0ustar liggesusers#' tunable methods for themis #' #' These functions define what parameters _can_ be tuned for specific steps. #' They also define the recommended objects from the `dials` package that can #' be used to generate new parameter values and other characteristics. #' @param x A recipe step object #' @param ... Not used. #' @return A tibble object. #' @keywords internal #' @export tunable.step_adasyn <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_adasyn", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_bsmote <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors", "all_neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors"), list(pkg = "dials", fun = "all_neighbors") ), source = "recipe", component = "step_bsmote", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_downsample <- function(x, ...) { tibble::tibble( name = "under_ratio", call_info = list( list(pkg = "dials", fun = "under_ratio") ), source = "recipe", component = "step_downsample", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_nearmiss <- function(x, ...) { tibble::tibble( name = c("under_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "under_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_nearmiss", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_rose <- function(x, ...) { tibble::tibble( name = c("over_ratio"), call_info = list( list(pkg = "dials", fun = "over_ratio") ), source = "recipe", component = "step_rose", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_smote <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_smote", component_id = x$id ) } #' @export #' @rdname tunable.step_adasyn tunable.step_upsample <- function(x, ...) { tibble::tibble( name = c("over_ratio"), call_info = list( list(pkg = "dials", fun = "over_ratio") ), source = "recipe", component = "step_upsample", component_id = x$id ) } themis/R/step_tomek.R0000644000176200001440000001302314016046730014241 0ustar liggesusers#' Under-sampling by removing Tomek’s links. #' #' `step_tomek` creates a *specification* of a recipe #' step that removes majority class instances of tomek links. Using #' [unbalanced::ubTomek()]. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The factor variable used to balance around must only have 2 levels. All #' other variables must be numerics with no missing data. #' #' A tomek link is defined as a pair of points from different classes and are #' each others nearest neighbors. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., #' 6:769-772, 1976. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' sort(table(okc$Class, useNA = "always")) #' #' ds_rec <- recipe(Class ~ age + height, data = okc) %>% #' step_meanimpute(all_predictors()) %>% #' step_tomek(Class) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = okc) #' table(baked_okc$Class, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without Tomek") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) #' #' recipe(class ~ ., data = circle_example) %>% #' step_tomek(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With Tomek") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) step_tomek <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("tomek")) { add_step( recipe, step_tomek_new( terms = ellipse_check(...), role = role, trained = trained, column = column, skip = skip, seed = seed, id = id ) ) } step_tomek_new <- function(terms, role, trained, column, skip, seed, id) { step( subclass = "tomek", terms = terms, role = role, trained = trained, column = column, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_tomek <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } check_2_levels_only(training, col_name) check_type(select(training, -col_name), TRUE) if (any(map_lgl(training, ~ any(is.na(.x))))) { rlang::abort("`NA` values are not allowed when using `step_tomek`") } step_tomek_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, skip = x$skip, seed = x$seed, id = x$id ) } # Turns a binary factor a variable where the majority class is coded as 0 and # the minority as 1. response_0_1 <- function(x) { ifelse(x == names(sort(table(x)))[1], 1, 0) } # Turns 0-1 coded variable back into factor variable response_0_1_to_org <- function(old, new, levels) { ref <- names(sort(table(old))) names(ref) <- c("1", "0") factor(unname(ref[as.character(new)]), levels = levels) } #' @export bake.step_tomek <- function(object, new_data, ...) { # tomek with seed for reproducibility with_seed( seed = object$seed, code = { original_levels <- levels(new_data[[object$column]]) tomek_data <- ubTomek( X = select(new_data, -!!object$column), Y = response_0_1(new_data[[object$column]]), verbose = FALSE ) } ) new_data0 <- mutate( tomek_data$X, !!object$column := response_0_1_to_org( new_data[[object$column]], tomek_data$Y, levels = original_levels ) ) as_tibble(new_data0[names(new_data)]) } #' @export print.step_tomek <- function(x, width = max(20, options()$width - 26), ...) { cat("Tomek based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_tomek #' @param x A `step_tomek` object. #' @export tidy.step_tomek <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_tomek <- function(x, ...) { c("themis", "unbalanced") } themis/R/data.R0000644000176200001440000000057114016046730013004 0ustar liggesusers#' Synthetic Dataset with a circle #' #' A random dataset with two classes one of which is inside a circle. Used for #' examples to show how the different methods handles borders. #' #' @format A data frame with 200 rows and 3 variables: #' \describe{ #' \item{x}{Numeric.} #' \item{y}{Numeric.} #' \item{class}{Factor, values "Circle" and "Rest".} #' } "circle_example" themis/R/adasyn_impl.R0000644000176200001440000000733414061030654014375 0ustar liggesusers#' Adaptive Synthetic Sampling Approach algorithm #' #' Generates synthetic positive instances using ADASYN algorithm. #' #' @inheritParams step_adasyn #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' All columns used in this function must be numeric with no missing data. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @examples #' adasyn(circle_example, var = "class") #' #' adasyn(circle_example, var = "class", k = 10) #' #' adasyn(circle_example, var = "class", over_ratio = 0.8) adasyn <- function(df, var, k = 5, over_ratio = 1) { if (length(var) != 1) { rlang::abort("Please select a single factor variable for `var`.") } var <- rlang::arg_match(var, colnames(df)) if (!(is.factor(df[[var]]) | is.character(df[[var]]))) { rlang::abort(paste0(var, " should be a factor or character variable.")) } if (length(k) != 1) { rlang::abort("`k` must be length 1.") } if (k < 1) { rlang::abort("`k` must be non-negative.") } predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -var), "smote") adasyn_impl(df, var, k, over_ratio) } adasyn_impl <- function(df, var, k = 5, over_ratio = 1) { majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() data_mat <- as.matrix(df[names(df) != var]) ids_full <- RANN::nn2(data_mat, k = k + 1, searchtype = "priority")$nn.idx for (i in seq_along(min_names)) { min_class_in <- df[[var]] != min_names[i] r_value <- pmax( 0, rowSums(matrix((min_class_in)[ids_full], ncol = ncol(ids_full))) - 1 ) r_value <- r_value[!min_class_in] danger_ids <- sample(seq_along(r_value), samples_needed[i], TRUE, prob = r_value ) minority <- data_mat[!min_class_in, , drop = FALSE] if (nrow(minority) <= k) { rlang::abort(paste0( "Not enough observations of '", min_names[i], "' to perform ADASYN." )) } tmp_df <- as.data.frame( adasyn_sampler( minority, k, samples_needed[i], danger_ids ) ) colnames(tmp_df) <- colnames(data_mat) tmp_df[[var]] <- min_names[i] out_dfs[[i]] <- tmp_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } adasyn_sampler <- function(data, k, n_samples, smote_ids) { ids <- RANN::nn2(data, k = k + 1, searchtype = "priority")$nn.idx index_len <- tabulate(smote_ids, NROW(data)) out <- matrix(0, nrow = n_samples, ncol = ncol(data)) sampleids <- sample.int(k, n_samples, TRUE) runif_ids <- stats::runif(n_samples) iii <- 0 for (row_num in which(index_len != 0)) { index_selection <- iii + seq_len(index_len[row_num]) # removes itself as nearest neighbour id_knn <- ids[row_num, ids[row_num, ] != row_num] dif <- data[id_knn[sampleids[index_selection]], ] - data[rep(row_num, index_len[row_num]), ] gap <- dif * runif_ids[index_selection] out[index_selection, ] <- data[rep(row_num, index_len[row_num]), ] + gap iii <- iii + index_len[row_num] } out } themis/R/themis-package.R0000644000176200001440000000031214016046730014746 0ustar liggesusers#' @keywords internal "_PACKAGE" # The following block is used by usethis to automatically manage # roxygen namespace tags. Modify with care! ## usethis namespace: start ## usethis namespace: end NULL themis/R/step_downsample.R0000644000176200001440000001453714016046730015306 0ustar liggesusers#' Down-Sample a Data Set Based on a Factor Variable #' #' `step_downsample` creates a *specification* of a recipe #' step that will remove rows of a data set to make the occurrence #' of levels in a specific factor level equal. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param under_ratio A numeric value for the ratio of the #' minority-to-majority frequencies. The default value (1) means #' that all other levels are sampled down to have the same #' frequency as the least occurring level. A value of 2 would mean #' that the majority levels will have (at most) (approximately) #' twice as many rows than the minority level. #' @param ratio Deprecated argument; same as `under_ratio` #' @param target An integer that will be used to subsample. This #' should not be set by the user and will be populated by `prep`. #' @param seed An integer that will be used as the seed when downsampling. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' @details #' Down-sampling is intended to be performed on the _training_ set #' alone. For this reason, the default is `skip = TRUE`. It is #' advisable to use `prep(recipe, retain = TRUE)` when preparing #' the recipe; in this way [juice()] can be used to obtain the #' down-sampled version of the data. #' #' If there are missing values in the factor variable that is used #' to define the sampling, missing data are selected at random in #' the same way that the other factor levels are sampled. Missing #' values are not used to determine the amount of data in the #' minority level #' #' For any data with factor levels occurring with the same #' frequency as the minority level, all data will be retained. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' Keep in mind that the location of down-sampling in the step #' may have effects. For example, if centering and scaling, #' it is not clear whether those operations should be conducted #' _before_ or _after_ rows are removed. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' sort(table(okc$diet, useNA = "always")) #' #' ds_rec <- recipe(~., data = okc) %>% #' step_downsample(diet) %>% #' prep(training = okc, retain = TRUE) #' #' sort(table(bake(ds_rec, new_data = NULL)$diet, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = okc) #' table(baked_okc$diet, useNA = "always") step_downsample <- function(recipe, ..., under_ratio = 1, ratio = NA, role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("downsample")) { if (!is.na(ratio) & all(under_ratio != ratio)) { message( paste( "The `ratio` argument is now deprecated in favor of `under_ratio`.", "`ratio` will be removed in a subsequent version." ) ) if (!is.na(ratio)) { under_ratio <- ratio } } add_step( recipe, step_downsample_new( terms = ellipse_check(...), under_ratio = under_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, seed = seed, id = id ) ) } step_downsample_new <- function(terms, under_ratio, ratio, role, trained, column, target, skip, seed, id) { step( subclass = "downsample", terms = terms, under_ratio = under_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_downsample <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } obs_freq <- table(training[[col_name]]) minority <- min(obs_freq) step_downsample_new( terms = x$terms, under_ratio = x$under_ratio, ratio = x$ratio, role = x$role, trained = TRUE, column = col_name, target = floor(minority * x$under_ratio), skip = x$skip, seed = x$seed, id = x$id ) } subsamp <- function(x, num) { n <- nrow(x) if (nrow(x) == num) { out <- x } else { # downsampling is done without replacement out <- x[sample(1:n, min(num, n)), ] } out } #' @export bake.step_downsample <- function(object, new_data, ...) { if (any(is.na(new_data[[object$column]]))) { missing <- new_data[is.na(new_data[[object$column]]), ] } else { missing <- NULL } split_up <- split(new_data, new_data[[object$column]]) # Downsample with seed for reproducibility with_seed( seed = object$seed, code = { new_data <- map_dfr(split_up, subsamp, num = object$target) if (!is.null(missing)) { new_data <- bind_rows(new_data, subsamp(missing, object$target)) } } ) as_tibble(new_data) } print.step_downsample <- function(x, width = max(20, options()$width - 26), ...) { cat("Down-sampling based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_downsample #' @param x A `step_downsample` object. #' @export tidy.step_downsample <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_downsample <- function(x, ...) { c("themis") } themis/R/misc.R0000644000176200001440000000256714061027623013035 0ustar liggesusersstring2formula <- function(x) { out <- a ~ . out[[2]] <- rlang::sym(x) out } check_na <- function(data, step) { na_cols <- vapply(data, function(x) any(is.na(x)), FUN.VALUE = logical(1)) if (any(na_cols)) { rlang::abort(paste0( "`", step, "` cannot have any missing values. NAs found ind: ", paste(names(na_cols)[na_cols], collapse = ", "), "." )) } } check_2_levels_only <- function(data, col_name) { if (length(levels(data[[col_name]])) != 2) { rlang::abort(paste0("`", col_name, "`` must only have 2 levels.")) } } check_numeric <- function(dat) { all_good <- vapply(dat, is.numeric, logical(1)) label <- "numeric" if (!all(all_good)) rlang::abort( paste0( "All columns for this function should be numeric." ) ) invisible(all_good) } na_splice <- function(new_data, synthetic_data, object) { non_predictor <- setdiff(names(new_data), c(object$column, object$predictors)) if (length(non_predictor) == 0) { return(synthetic_data) } new_data[, non_predictor, drop = FALSE] na_data <- matrix( nrow = nrow(synthetic_data) - nrow(new_data), ncol = length(non_predictor) ) colnames(na_data) <- non_predictor na_data <- as.data.frame(na_data) res <- bind_cols( synthetic_data, bind_rows(new_data[, non_predictor, drop = FALSE], na_data) ) res[, names(new_data)] } themis/R/step_upsample.R0000644000176200001440000001565614016046730014766 0ustar liggesusers#' Up-Sample a Data Set Based on a Factor Variable #' #' `step_upsample` creates a *specification* of a recipe step that #' will replicate rows of a data set to make the occurrence of #' levels in a specific factor level equal. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param over_ratio A numeric value for the ratio of the #' majority-to-minority frequencies. The default value (1) means #' that all other levels are sampled up to have the same #' frequency as the most occurring level. A value of 0.5 would mean #' that the minority levels will have (at most) (approximately) #' half as many rows than the majority level. #' @param ratio Deprecated argument; same as `over_ratio`. #' @param target An integer that will be used to subsample. This #' should not be set by the user and will be populated by `prep`. #' @param seed An integer that will be used as the seed when upsampling. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' @details #' Up-sampling is intended to be performed on the _training_ set #' alone. For this reason, the default is `skip = TRUE`. It is #' advisable to use `prep(recipe, retain = TRUE)` when preparing #' the recipe; in this way [juice()] can be used to obtain the #' up-sampled version of the data. #' #' If there are missing values in the factor variable that is used #' to define the sampling, missing data are selected at random in #' the same way that the other factor levels are sampled. Missing #' values are not used to determine the amount of data in the #' majority level (see example below). #' #' For any data with factor levels occurring with the same #' frequency as the majority level, all data will be retained. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' orig <- table(okc$diet, useNA = "always") #' #' sort(orig, decreasing = TRUE) #' #' up_rec <- recipe(~., data = okc) %>% #' # Bring the minority levels up to about 200 each #' # 200/16562 is approx 0.0121 #' step_upsample(diet, over_ratio = 0.0121) %>% #' prep(training = okc, retain = TRUE) #' #' training <- table(bake(up_rec, new_data = NULL)$diet, useNA = "always") #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(up_rec, new_data = okc) #' baked <- table(baked_okc$diet, useNA = "always") #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' data.frame( #' level = names(orig), #' orig_freq = as.vector(orig), #' train_freq = as.vector(training), #' baked_freq = as.vector(baked) #' ) #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without upsample") #' #' recipe(class ~ ., data = circle_example) %>% #' step_upsample(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_jitter(width = 0.1, height = 0.1) + #' labs(title = "With upsample (with jittering)") step_upsample <- function(recipe, ..., over_ratio = 1, ratio = NA, role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("upsample")) { if (!is.na(ratio) & all(over_ratio != ratio)) { message( paste( "The `ratio` argument is now deprecated in favor of `over_ratio`.", "`ratio` will be removed in a subsequent version." ) ) if (!is.na(ratio)) { over_ratio <- ratio } } add_step( recipe, step_upsample_new( terms = ellipse_check(...), over_ratio = over_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, seed = seed, id = id ) ) } step_upsample_new <- function(terms, over_ratio, ratio, role, trained, column, target, skip, seed, id) { step( subclass = "upsample", terms = terms, over_ratio = over_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, id = id, seed = seed ) } #' @export prep.step_upsample <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } obs_freq <- table(training[[col_name]]) majority <- max(obs_freq) step_upsample_new( terms = x$terms, ratio = x$ratio, over_ratio = x$over_ratio, role = x$role, trained = TRUE, column = col_name, target = floor(majority * x$over_ratio), skip = x$skip, id = x$id, seed = x$seed ) } supsamp <- function(x, num) { n <- nrow(x) if (nrow(x) == num) { out <- x } else { # upsampling is done with replacement out <- x[sample(1:n, max(num, n), replace = TRUE), ] } out } #' @export bake.step_upsample <- function(object, new_data, ...) { if (any(is.na(new_data[[object$column]]))) { missing <- new_data[is.na(new_data[[object$column]]), ] } else { missing <- NULL } split_up <- split(new_data, new_data[[object$column]]) # Upsample with seed for reproducibility with_seed( seed = object$seed, code = { new_data <- map_dfr(split_up, supsamp, num = object$target) if (!is.null(missing)) { new_data <- bind_rows(new_data, supsamp(missing, object$target)) } } ) as_tibble(new_data) } print.step_upsample <- function(x, width = max(20, options()$width - 26), ...) { cat("Up-sampling based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_upsample #' @param x A `step_upsample` object. #' @export tidy.step_upsample <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_upsample <- function(x, ...) { c("themis") } themis/R/nearmiss_impl.R0000644000176200001440000000273314061006236014734 0ustar liggesusersnearmiss_impl <- function(df, var, ignore_vars, k = 5, under_ratio = 1) { classes <- downsample_count(df, var, under_ratio) out_dfs <- list() for (i in seq_along(classes)) { class <- subset_to_matrix(df, var, names(classes)[i]) not_class <- subset_to_matrix(df, var, names(classes)[i], FALSE) if (nrow(not_class) <= k) { rlang::abort(paste0( "Not enough danger observations of '", names(classes)[i], "' to perform NEARMISS." )) } dists <- RANN::nn2( not_class[, !(colnames(not_class) %in% ignore_vars)], class[, !(colnames(class) %in% ignore_vars)], k = k )$nn.dists selected_ind <- order(rowMeans(dists)) <= (nrow(class) - classes[i]) selected_rows <- class[selected_ind, ] out_df <- as.data.frame(selected_rows) out_df[var] <- names(classes)[i] out_dfs[[i]] <- out_df[, names(df)] } out_dfs[[i + 1]] <- df[!(df[[var]] %in% names(classes)), ] final <- do.call(rbind, out_dfs) rownames(final) <- NULL final } downsample_count <- function(data, var, ratio) { min_count <- min(table(data[[var]])) ratio_target <- min_count * ratio which_class <- which(table(data[[var]]) > ratio_target) table(data[[var]])[which_class] - ratio_target } subset_to_matrix <- function(data, var, class, equal = TRUE) { if (equal) { return(as.matrix(data[data[[var]] == class, names(data) != var])) } else { return(as.matrix(data[data[[var]] != class, names(data) != var])) } } themis/R/step_smote.R0000644000176200001440000001405314061027711014253 0ustar liggesusers#' Apply SMOTE algorithm #' #' `step_smote` creates a *specification* of a recipe #' step that generate new examples of the minority class using nearest #' neighbors of these cases. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param seed An integer that will be used as the seed when #' smote-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(credit_data) #' #' sort(table(credit_data$Status, useNA = "always")) #' #' ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>% #' step_meanimpute(all_predictors()) %>% #' step_smote(Status) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Status, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = credit_data) #' table(baked_okc$Status, useNA = "always") #' #' ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>% #' step_meanimpute(all_predictors()) %>% #' step_smote(Status, over_ratio = 0.2) %>% #' prep() #' #' table(bake(ds_rec2, new_data = NULL)$Status, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without SMOTE") #' #' recipe(class ~ ., data = circle_example) %>% #' step_smote(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With SMOTE") step_smote <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smote")) { add_step( recipe, step_smote_new( terms = ellipse_check(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_smote_new <- function(terms, role, trained, column, over_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "smote", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_smote <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } predictors <- setdiff(info$variable[info$role == "predictor"], col_name) check_type(training[, predictors], TRUE) check_na(select(training, -col_name), "step_smote") step_smote_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_smote <- function(object, new_data, ...) { new_data <- as.data.frame(new_data) predictor_data <- new_data[, unique(c(object$predictors, object$column))] # smote with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- smote_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio ) } ) new_data <- na_splice(new_data, synthetic_data, object) as_tibble(new_data) } #' @export print.step_smote <- function(x, width = max(20, options()$width - 26), ...) { cat("SMOTE based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_smote #' @param x A `step_smote` object. #' @export tidy.step_smote <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_smote <- function(x, ...) { c("themis") } themis/R/0_imports.R0000644000176200001440000000117414022524713014006 0ustar liggesusers#' @importFrom dplyr bind_cols bind_rows mutate select #' @importFrom generics tidy #' @importFrom purrr map_dfr map_lgl #' @importFrom recipes add_step bake check_type ellipse_check is_trained prep #' @importFrom recipes printer rand_id sel2char step terms_select #' @importFrom rlang := #' @importFrom ROSE ROSE #' @importFrom tibble as_tibble tibble #' @importFrom unbalanced ubTomek #' @importFrom withr with_seed utils::globalVariables() #' @importFrom generics tidy #' @export generics::tidy #' @importFrom generics required_pkgs #' @export generics::required_pkgs #' @importFrom generics tunable #' @export generics::tunable themis/R/step_nearmiss.R0000644000176200001440000001352714061006221014743 0ustar liggesusers#' Under-sampling by removing points near other classes. #' #' `step_nearmiss` creates a *specification* of a recipe #' step that removes majority class instances by undersampling points #' in the majority class based on their distance to other points in the #' same class. #' #' @inheritParams recipes::step_center #' @inheritParams step_downsample #' @inheritParams step_smote #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' This methods retained the points form the majority classes which has the #' smallest mean distance to the k nearest points in the other classes. #' #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references Inderjeet Mani and I Zhang. knn approach to unbalanced data #' distributions: a case study involving information extraction. In Proceedings #' of workshop on learning from imbalanced datasets, 2003. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' sort(table(okc$Class, useNA = "always")) #' #' ds_rec <- recipe(Class ~ age + height, data = okc) %>% #' step_meanimpute(all_predictors()) %>% #' step_nearmiss(Class) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = okc) #' table(baked_okc$Class, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without NEARMISS") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) #' #' recipe(class ~ ., data = circle_example) %>% #' step_nearmiss(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With NEARMISS") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) step_nearmiss <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, under_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("nearmiss")) { add_step( recipe, step_nearmiss_new( terms = ellipse_check(...), role = role, trained = trained, column = column, under_ratio = under_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_nearmiss_new <- function(terms, role, trained, column, under_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "nearmiss", terms = terms, role = role, trained = trained, column = column, under_ratio = under_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_nearmiss <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } predictors <- setdiff(info$variable[info$role == "predictor"], col_name) check_type(training[, predictors], TRUE) if (any(map_lgl(training, ~ any(is.na(.x))))) { rlang::abort("`NA` values are not allowed when using `step_nearmiss`") } step_nearmiss_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, under_ratio = x$under_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_nearmiss <- function(object, new_data, ...) { ignore_vars <- setdiff(names(new_data), c(object$predictors, object$column)) # nearmiss with seed for reproducibility with_seed( seed = object$seed, code = { original_levels <- levels(new_data[[object$column]]) new_data <- nearmiss_impl( df = new_data, var = object$column, ignore_vars = ignore_vars, k = object$neighbors, under_ratio = object$under_ratio ) new_data[[object$column]] <- factor(new_data[[object$column]], levels = original_levels) } ) as_tibble(new_data) } #' @export print.step_nearmiss <- function(x, width = max(20, options()$width - 26), ...) { cat("NEARMISS-1 based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_nearmiss #' @param x A `step_nearmiss` object. #' @export tidy.step_nearmiss <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_nearmiss <- function(x, ...) { c("themis") } themis/R/step_adasyn.R0000644000176200001440000001316014061004077014401 0ustar liggesusers#' Adaptive Synthetic Sampling Approach #' #' `step_adasyn` creates a *specification* of a recipe #' step that generates synthetic positive instances using ADASYN algorithm. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [selections()] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' All columns in the data are sampled and returned by [juice()] #' and [bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' @references He, H., Bai, Y., Garcia, E. and Li, S. 2008. ADASYN: Adaptive #' synthetic sampling approach for imbalanced learning. Proceedings of #' IJCNN 2008. (IEEE World Congress on Computational Intelligence). IEEE #' International Joint Conference. pp.1322-1328. #' #' @keywords datagen #' @concept preprocessing #' @concept subsampling #' @export #' @examples #' library(recipes) #' library(modeldata) #' data(okc) #' #' sort(table(okc$Class, useNA = "always")) #' #' ds_rec <- recipe(Class ~ age + height, data = okc) %>% #' step_meanimpute(all_predictors()) %>% #' step_adasyn(Class) %>% #' prep() #' #' sort(table(bake(ds_rec, new_data = NULL)$Class, useNA = "always")) #' #' # since `skip` defaults to TRUE, baking the step has no effect #' baked_okc <- bake(ds_rec, new_data = okc) #' table(baked_okc$Class, useNA = "always") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without ADASYN") #' #' recipe(class ~ ., data = circle_example) %>% #' step_adasyn(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With ADASYN") step_adasyn <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("adasyn")) { add_step( recipe, step_adasyn_new( terms = ellipse_check(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_adasyn_new <- function(terms, role, trained, column, over_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "adasyn", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_adasyn <- function(x, training, info = NULL, ...) { col_name <- terms_select(x$terms, info = info) if (length(col_name) != 1) { rlang::abort("Please select a single factor variable.") } if (!is.factor(training[[col_name]])) { rlang::abort(paste0(col_name, " should be a factor variable.")) } predictors <- setdiff(info$variable[info$role == "predictor"], col_name) check_type(training[, predictors], TRUE) if (any(map_lgl(training, ~ any(is.na(.x))))) { rlang::abort("`NA` values are not allowed when using `step_adasyn`") } step_adasyn_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_adasyn <- function(object, new_data, ...) { new_data <- as.data.frame(new_data) predictor_data <- new_data[, unique(c(object$predictors, object$column))] # adasyn with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- adasyn_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio ) } ) new_data <- na_splice(new_data, synthetic_data, object) as_tibble(new_data) } #' @export print.step_adasyn <- function(x, width = max(20, options()$width - 26), ...) { cat("adasyn based on ", sep = "") printer(x$column, x$terms, x$trained, width = width) invisible(x) } #' @rdname step_adasyn #' @param x A `step_adasyn` object. #' @export tidy.step_adasyn <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = x$column) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' S3 methods for tracking which additional packages are needed for steps. #' #' @param x A recipe step #' @return A character vector #' @rdname required_pkgs.step #' @keywords internal #' @export required_pkgs.step_adasyn <- function(x, ...) { c("themis") } themis/NEWS.md0000644000176200001440000000106514061220015012632 0ustar liggesusers# themis 0.1.4 * export `smote()`, `adasyn()`, and `bsmote()` functions to users. # themis 0.1.3 * Steps that use nearest neighbors gives cleaner errors. # themis 0.1.2 * tuneable steps now properly work with tune package. * Steps now Retain original factor level ordering. (#22) * Oversampling steps now ignore non-predictor variables when appropriate. (#20) # themis 0.1.1 * `step_smote()` now work regardless of order of classes. Thanks to @sebastien-foulle for point it out #14. # themis 0.1.0 * Added a `NEWS.md` file to track changes to the package. themis/MD50000644000176200001440000000625714061222402012056 0ustar liggesusersda251990153559bb0338c6c9f4614197 *DESCRIPTION 8fc0d8e1cf835e1670787e63b4801f78 *LICENSE 9dc510f46abd364d17ef058017b1dacd *NAMESPACE d477f28510f84c9def5643938e5e3507 *NEWS.md 8d5fb174317c07443d65311340e189fd *R/0_imports.R 33d6b5afd77bfd79163d3e5d64823508 *R/adasyn_impl.R ba64221dd1dada5c1d35b99769091cf8 *R/bsmote_impl.R c6124afeab286b6ebb8aef3eea84ede2 *R/data.R 3d69f09811c38a4a1c379c6072f7bccd *R/misc.R 694c40bee7b9d35f87520006793dc8fc *R/nearmiss_impl.R 83167cac452dd7dd9ac3475153f86d89 *R/smote_impl.R b8804737b5bae013d39dcec6e49fd9b1 *R/step_adasyn.R caf103649c1e3b69c72fc225368f58b1 *R/step_bsmote.R 727872f8d2740eff20d44303e2983119 *R/step_downsample.R 49a4c050d8bfe1cdbe54a270fd0dda5f *R/step_nearmiss.R 753f789ca41c374a53c3834c5cb0c6a1 *R/step_rose.R 00c81580bcd6a87dd8d9ffe0b8081f93 *R/step_smote.R 2958374bfd22272314a92969a6b2f740 *R/step_tomek.R 419786c9c535533ee73449883a995be9 *R/step_upsample.R e0ef681901465d24f47f9139cb2c086a *R/themis-package.R af6562064d0afcb1f33df80f53c4d3f0 *R/tuneable.R c0e98e88d3d8966eca2a6b0ab5bdbf79 *README.md 663b0bb0e50d3834860a30bd2bdeec53 *data/circle_example.rda 1b2d94643dc59a69192dda65cec41225 *man/adasyn.Rd 25fa053b4e02b6d8755b57541969d384 *man/bsmote.Rd c502dc5cf95e92d5125035c50aa1c408 *man/circle_example.Rd f65b62c96471d99466ac2466cee1f9fc *man/figures/README-unnamed-chunk-2-1.png fe044891a7e6f1558dc962c324ade7b3 *man/figures/README-unnamed-chunk-3-1.png 356800bd8a0510b9635825e8e8d128ca *man/figures/README-unnamed-chunk-4-1.png 93c77dcb3f5e594f3c7c33376eea1db3 *man/figures/README-unnamed-chunk-5-1.png 66c265f16657aff1fd48fb0480cd1007 *man/figures/README-unnamed-chunk-6-1.png 4ac57e0badc3ebdddcfbc5964ea909fe *man/reexports.Rd 73611f9919f11194baf96f0bf98d2356 *man/required_pkgs.step.Rd e8ed345c05dc2cea324c18fe76544223 *man/smote.Rd 1517d291da6dee1915034d6ff9ace8c8 *man/step_adasyn.Rd 31eac0d95ee55d759eae499c8edd9ff1 *man/step_bsmote.Rd 37972d8ca72b9c7163c2955778cade7a *man/step_downsample.Rd edd9f81d56b2fe897f11e522de5c290f *man/step_nearmiss.Rd 2c4115efe01a9c906129157a427fece9 *man/step_rose.Rd 74a460eefaacc1eb1a3013d503791d28 *man/step_smote.Rd 8eb0f6e6a0da9963fe834d4329a7593d *man/step_tomek.Rd e7869651420c20d5434e2aad51135416 *man/step_upsample.Rd 9e68b087a8dc7ecb5ec4da613f43257e *man/themis-package.Rd ecd73b1b70bfd138b161ab982ce3a5e2 *man/tunable.step_adasyn.Rd 19c296e16132ecd6d694bde113e15f41 *tests/testthat.R 43ada26b8ba7b9b1efb5cc4e7525a993 *tests/testthat/helper-test-functions.R e32c44bd281ff59ced7bfac1f3a7011b *tests/testthat/test-S3-methods.R aeecf2ba7eedeb8951271161da2d5206 *tests/testthat/test-adasyn.R 794d15a13b7e8fa220fa3b2ac9ec1154 *tests/testthat/test-adasyn_impl.R 82b4371b3cc5f61b764bc5f1721e59ca *tests/testthat/test-bsmote.R c88b99dfbfe19bac1d1cde215d778211 *tests/testthat/test-bsmote_impl.R 074148af070453aff7be3ee7e84627ab *tests/testthat/test-downsample.R c87f82e19fc79ecadd8f9787f6b8c35c *tests/testthat/test-nearmiss.R c93ca6f34cef4489b46f7e8e7b7aaee0 *tests/testthat/test-rose.R 7b13c6a4ffb2e2346dc73bd5359f377c *tests/testthat/test-smote.R 9de7a3d2bb2cd1caeebeceb06a21b249 *tests/testthat/test-smote_impl.R 10c64adbdb2d4d2d04361f1308bd589f *tests/testthat/test-tomek.R 46882884f435598bd3887a7cffe54523 *tests/testthat/test-upsample.R