themis/0000755000176200001440000000000014744304332011547 5ustar liggesusersthemis/tests/0000755000176200001440000000000014744045253012715 5ustar liggesusersthemis/tests/testthat/0000755000176200001440000000000014744304332014551 5ustar liggesusersthemis/tests/testthat/test-smotenc_impl.R0000644000176200001440000000057114744045253020351 0ustar liggesuserstest_that("bad args", { expect_snapshot( error = TRUE, smotenc(matrix()) ) expect_snapshot( error = TRUE, smotenc(circle_example, var = "class", k = 0) ) expect_snapshot( error = TRUE, smotenc(circle_example, var = "class", k = 5.5) ) expect_snapshot( error = TRUE, smotenc(circle_example, var = "class", over_ratio = TRUE) ) }) themis/tests/testthat/test-downsample.R0000644000176200001440000001724214744045253020034 0ustar liggesuserstest_that("ratio deprecation", { expect_snapshot( error = TRUE, new_rec <- recipe(~., data = circle_example) %>% step_downsample(class, ratio = 2) ) }) test_that("basic usage", { rec1 <- recipe(~., data = circle_example) %>% step_downsample(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_downsample(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_downsample(class, id) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(~., data = circle_example) %>% step_downsample(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(~., data = circle_example) %>% step_downsample(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when undersampling", { res1 <- recipe(~., data = circle_example) %>% step_downsample(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(~., data = circle_example) %>% step_downsample(class, under_ratio = 1.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == min(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), min(table(circle_example$class)) * c(1, 1.5) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_downsample(Home) ) }) test_that("minority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:84), ] ) %>% step_impute_mean(all_predictors()) %>% step_downsample(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 68)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(~., data = circle_example_alt_levels[[i]]) %>% step_downsample(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_downsample(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("case_weights", { circle_example_cw <- circle_example %>% mutate(weights = frequency_weights(rep(0:1, c(200, 200)))) rec1_p <- recipe(~., data = circle_example_cw) %>% step_downsample(class) %>% prep() exp_count <- circle_example_cw %>% filter(as.integer(weights) == 1) %>% count(class) %>% pull(n) %>% min() rec_count <- bake(rec1_p, new_data = NULL) %>% count(class) %>% pull(n) expect_true(all(exp_count == rec_count)) expect_snapshot(rec1_p) # ignore importance weights circle_example_cw <- circle_example %>% mutate(weights = importance_weights(rep(0:1, c(200, 200)))) rec1_p <- recipe(~., data = circle_example_cw) %>% step_downsample(class) %>% prep() exp_count <- circle_example_cw %>% count(class) %>% pull(n) %>% min() rec_count <- bake(rec1_p, new_data = NULL) %>% count(class) %>% pull(n) expect_true(all(exp_count == rec_count)) expect_snapshot(rec1_p) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_downsample(all_predictors()) rec_param <- tunable.step_downsample(rec$steps[[1]]) expect_equal(rec_param$name, c("under_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_downsample(under_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_downsample(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_downsample(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_downsample(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_downsample(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_downsample(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(~., data = circle_example) %>% step_downsample(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_downsample( all_predictors(), under_ratio = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 1L) }) themis/tests/testthat/test-extension_check.R0000644000176200001440000000020214744045253021020 0ustar liggesuserstest_that("recipes_extension_check", { expect_snapshot( recipes::recipes_extension_check( pkg = "themis" ) ) }) themis/tests/testthat/test-smote_impl.R0000644000176200001440000000442014744045253020025 0ustar liggesuserstest_that("samples stay inside convex hull of data.", { rdata <- matrix(c(0, 1, 1, 0, 0, 0, 1, 1), ncol = 2) expect_true(all(dplyr::between(smote_data(rdata, 3, 100), 0, 1))) }) test_that("order doesn't matter", { df <- data.frame( target = rep(c("Yes", "No"), c(10, 50)), x = rep(1:2, c(10, 50)) ) expect_equal(100, nrow(themis:::smote_impl(df, "target", 5, 1))) df <- data.frame( target = rep(c("Yes", "No"), c(50, 10)), x = rep(1:2, c(50, 10)) ) expect_equal(100, nrow(themis:::smote_impl(df, "target", 5, 1))) }) test_that("smote() interfaces correctly", { circle_example_num <- circle_example[, 1:3] expect_no_error(smote(circle_example_num, var = "class")) expect_snapshot( error = TRUE, smote(circle_example_num, var = "Class") ) expect_snapshot( error = TRUE, smote(circle_example_num, var = c("class", "x")) ) expect_snapshot( error = TRUE, smote(circle_example_num, var = "x") ) circle_example0 <- circle_example_num circle_example0[1, 1] <- NA expect_snapshot( error = TRUE, smote(circle_example0, var = "class") ) expect_snapshot( error = TRUE, smote(circle_example_num, var = "class", k = 0) ) expect_snapshot( error = TRUE, smote(circle_example_num, var = "class", k = -1) ) expect_snapshot( error = TRUE, smote(circle_example_num, var = "class", k = c(5, 10)) ) }) test_that("ordering of columns shouldn't matter", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data %>% filter(!is.na(Job)) %>% select(Job, Time, Age, Expenses) expect_no_error( smote(credit_data0, "Job", over_ratio = 1) ) }) test_that("Doesn't error if no upsampling is done (#119)", { dat <- data.frame( outcome = c(rep("X", 101), rep("Z", 50)), X1 = 1) expect_no_error( smote_impl(dat, "outcome", 5, over_ratio = 0.5) ) }) test_that("bad args", { expect_snapshot( error = TRUE, smote(matrix()) ) expect_snapshot( error = TRUE, smote(circle_example, var = "class", k = 0) ) expect_snapshot( error = TRUE, smote(circle_example, var = "class", k = 5.5) ) expect_snapshot( error = TRUE, smote(circle_example, var = "class", over_ratio = TRUE) ) }) themis/tests/testthat/test-tomek.R0000644000176200001440000001140614744045253016776 0ustar liggesuserstest_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_tomek(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_smote(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_smote(class, id) %>% prep() ) }) test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_tomek(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data credit_data0[1, 1] <- NA expect_snapshot( error = TRUE, recipe(Status ~ Age, data = credit_data0) %>% step_tomek(Status) %>% prep() ) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_tomek(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_tomek(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("id variables are ignored", { rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_tomek(class) %>% prep() expect_equal(ncol(bake(rec_id, new_data = NULL)), 4) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_tomek(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_tomek(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_tomek(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_tomek(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_tomek(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_tomek(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_tomek(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) themis/tests/testthat/test-tomek_impl.R0000644000176200001440000000247114744045253020021 0ustar liggesusers test_that("order doesn't matter", { df <- data.frame( target = rep(c("Yes", "No"), c(10, 50)), x = rep(c(1, 2, 3), c(9, 2, 49)) ) expect_equal(c(10, 11), themis:::tomek_impl(df, "target")) df <- data.frame( target = rep(c("Yes", "No"), c(50, 10)), x = rep(c(1, 2, 3), c(49, 2, 9)) ) expect_equal(c(50, 51), themis:::tomek_impl(df, "target")) }) test_that("tomek() interfaces correctly", { circle_example_num <- circle_example[, 1:3] expect_no_error(tomek(circle_example_num, var = "class")) expect_snapshot( error = TRUE, tomek(circle_example_num, var = "Class") ) expect_snapshot( error = TRUE, tomek(circle_example_num, var = c("class", "x")) ) expect_snapshot( error = TRUE, tomek(circle_example_num, var = "x") ) circle_example0 <- circle_example_num circle_example0[1, 1] <- NA expect_snapshot( error = TRUE, tomek(circle_example0, var = "class") ) }) test_that("ordering of columns shouldn't matter", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data %>% filter(!is.na(Job)) %>% select(Job, Time, Age, Expenses) expect_no_error( tomek(credit_data0, "Job") ) }) test_that("bad args", { expect_snapshot( error = TRUE, bsmote(matrix()) ) }) themis/tests/testthat/test-upsample.R0000644000176200001440000001720114744045253017504 0ustar liggesuserstest_that("ratio deprecation", { expect_snapshot( error = TRUE, new_rec <- recipe(~., data = circle_example) %>% step_upsample(class, ratio = 2) ) }) test_that("basic usage", { rec1 <- recipe(~., data = circle_example) %>% step_upsample(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_upsample(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_upsample(class, id) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(~., data = circle_example) %>% step_upsample(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(~., data = circle_example) %>% step_upsample(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when oversampling", { res1 <- recipe(~., data = circle_example) %>% step_upsample(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(~., data = circle_example) %>% step_upsample(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_upsample(Home) ) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ] ) %>% step_impute_mean(all_predictors()) %>% step_upsample(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(~., data = circle_example_alt_levels[[i]]) %>% step_upsample(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_upsample(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("case_weights", { circle_example_cw <- circle_example %>% mutate(weights = frequency_weights(rep(0:1, c(200, 200)))) rec1_p <- recipe(~., data = circle_example_cw) %>% step_upsample(class, over_ratio = 2) %>% prep() exp_count <- circle_example_cw %>% filter(as.integer(weights) == 1) %>% count(class) %>% pull(n) %>% max() rec_count <- bake(rec1_p, new_data = NULL) %>% count(class) %>% pull(n) expect_true(all(exp_count * 2 == rec_count)) expect_snapshot(rec1_p) # ignore importance weights circle_example_cw <- circle_example %>% mutate(weights = importance_weights(rep(0:1, c(200, 200)))) rec1_p <- recipe(~., data = circle_example_cw) %>% step_upsample(class) %>% prep() exp_count <- circle_example_cw %>% count(class) %>% pull(n) %>% max() rec_count <- bake(rec1_p, new_data = NULL) %>% count(class) %>% pull(n) expect_true(all(exp_count == rec_count)) expect_snapshot(rec1_p) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_upsample(all_predictors()) rec_param <- tunable.step_upsample(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_upsample(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_upsample(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_upsample(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_upsample(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_upsample(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_upsample(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(~., data = circle_example) %>% step_upsample(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_upsample( all_predictors(), over_ratio = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 1L) }) themis/tests/testthat/test-rose.R0000644000176200001440000001704414744045253016633 0ustar liggesuserstest_that("minority_prop value", { rec <- recipe(class ~ x + y, data = circle_example) rec21 <- rec %>% step_rose(class, minority_prop = 0.1) rec22 <- rec %>% step_rose(class, minority_prop = 0.2) rec21_p <- prep(rec21) rec22_p <- prep(rec22) tr_xtab1 <- table(bake(rec21_p, new_data = NULL)$class, useNA = "no") tr_xtab2 <- table(bake(rec22_p, new_data = NULL)$class, useNA = "no") expect_equal(sum(tr_xtab1), sum(tr_xtab2)) expect_lt(tr_xtab1[["Circle"]], tr_xtab2[["Circle"]]) }) test_that("row matching works correctly #36", { expect_no_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 1.2) %>% prep() ) expect_no_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 0.8) %>% prep() ) expect_no_error( recipe(class ~ ., data = circle_example) %>% step_rose(class, over_ratio = 1.7) %>% prep() ) }) test_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_rose(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_rose(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_rose(class, id) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data credit_data0[1, 1] <- NA expect_snapshot( error = TRUE, recipe(Status ~ Age, data = credit_data0) %>% step_rose(Status) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(class ~ x + y, data = circle_example) %>% step_rose(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_rose(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("only except 2 classes", { df_char <- data.frame( x = factor(1:3), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_rose(x) %>% prep() ) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_rose(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("non-predictor variables are ignored", { circle_example2 <- circle_example %>% mutate(id = as.character(row_number())) %>% as_tibble() res <- recipe(class ~ ., data = circle_example2) %>% update_role(id, new_role = "id") %>% step_rose(class) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example2$id, rep(NA, nrow(res) - nrow(circle_example2))), as.character(res$id) ) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_rose(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_rose(all_predictors()) rec_param <- tunable.step_rose(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 1) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_rose(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_rose(minority_prop = TRUE) ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_rose(minority_smoothness = TRUE) ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_rose(majority_smoothness = TRUE) ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_rose(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_rose(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_rose(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_rose(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_rose(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_rose(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_rose( all_predictors(), over_ratio = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 1L) }) themis/tests/testthat/test-adasyn.R0000644000176200001440000002056114744045253017140 0ustar liggesuserstest_that("errors if there isn't enough data", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data credit_data0$Status <- as.character(credit_data0$Status) credit_data0$Status[1] <- "dummy" credit_data0$Status <- as.factor(credit_data0$Status) expect_snapshot( error = TRUE, recipe(Status ~ Age, data = credit_data0) %>% step_adasyn(Status) %>% prep() ) }) test_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_adasyn(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_adasyn(class, id) %>% prep() ) }) test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_adasyn(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_snapshot( error = TRUE, recipe(Job ~ Age, data = credit_data) %>% step_adasyn(Job) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when oversampling", { res1 <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_adasyn(Home) ) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ] ) %>% step_impute_mean(all_predictors()) %>% step_adasyn(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_adasyn(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("ordering of newly generated points are right", { res <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class) %>% prep() %>% bake(new_data = NULL) expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example[, c("x", "y", "class")]) ) }) test_that("non-predictor variables are ignored", { circle_example2 <- circle_example %>% mutate(id = as.character(row_number())) %>% as_tibble() res <- recipe(class ~ ., data = circle_example2) %>% update_role(id, new_role = "id") %>% step_adasyn(class) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example2$id, rep(NA, nrow(res) - nrow(circle_example2))), as.character(res$id) ) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_adasyn(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_adasyn(all_predictors()) rec_param <- tunable.step_adasyn(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_adasyn(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_adasyn(neighbors = TRUE) %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_adasyn(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]), ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_adasyn(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_adasyn(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_adasyn(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_adasyn(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_adasyn( all_predictors(), over_ratio = hardhat::tune(), neighbors = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 2L) }) themis/tests/testthat/test-nearmiss.R0000644000176200001440000001606014744045253017501 0ustar liggesuserstest_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_nearmiss(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_nearmiss(class, id) %>% prep() ) }) test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_nearmiss(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_snapshot( error = TRUE, recipe(Job ~ Age, data = credit_data) %>% step_nearmiss(Job) %>% prep() ) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when undersampling", { res1 <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class, under_ratio = 1.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == min(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), min(table(circle_example$class)) * c(1, 1.5) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_nearmiss(Home) ) }) test_that("minority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:84), ] ) %>% step_impute_mean(all_predictors()) %>% step_nearmiss(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 68)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_nearmiss(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("id variables are ignored", { rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_nearmiss(class, under_ratio = 1) %>% prep() expect_equal(ncol(bake(rec_id, new_data = NULL)), 4) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_nearmiss(class, under_ratio = 1) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_nearmiss(all_predictors()) rec_param <- tunable.step_nearmiss(rec$steps[[1]]) expect_equal(rec_param$name, c("under_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_nearmiss(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_nearmiss(neighbors = TRUE) %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_nearmiss(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_nearmiss(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_nearmiss(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_nearmiss(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_nearmiss(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_nearmiss( all_predictors(), under_ratio = hardhat::tune(), neighbors = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 2L) }) themis/tests/testthat/test-nearmiss_impl.R0000644000176200001440000000057614744045253020527 0ustar liggesuserstest_that("bad args", { expect_snapshot( error = TRUE, nearmiss(matrix()) ) expect_snapshot( error = TRUE, nearmiss(circle_example, var = "class", k = 0) ) expect_snapshot( error = TRUE, nearmiss(circle_example, var = "class", k = 5.5) ) expect_snapshot( error = TRUE, nearmiss(circle_example, var = "class", under_ratio = TRUE) ) }) themis/tests/testthat/test-bsmote_impl.R0000644000176200001440000000254414744045253020174 0ustar liggesuserscircle_example_num <- circle_example[, c("x", "y", "class")] test_that("bsmote() interfaces correctly", { expect_no_error(bsmote(circle_example_num, var = "class")) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "Class") ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = c("class", "x")) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "x") ) circle_example0 <- circle_example_num circle_example0[1, 1] <- NA expect_snapshot( error = TRUE, bsmote(circle_example0, var = "class") ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", k = 0) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", k = -1) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", k = c(5, 10)) ) }) test_that("bad args", { expect_snapshot( error = TRUE, bsmote(matrix()) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", k = 0) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", k = 5.5) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", over_ratio = TRUE) ) expect_snapshot( error = TRUE, bsmote(circle_example_num, var = "class", all_neighbors = 1) ) }) themis/tests/testthat/test-adasyn_impl.R0000644000176200001440000000237214744045253020161 0ustar liggesuserscircle_example_num <- circle_example[, c("x", "y", "class")] test_that("adasyn() interfaces correctly", { expect_no_error(adasyn(circle_example_num, var = "class")) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "Class") ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = c("class", "x")) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "x") ) circle_example0 <- circle_example_num circle_example0[1, 1] <- NA expect_snapshot( error = TRUE, adasyn(circle_example0, var = "class") ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", k = 0) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", k = -1) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", k = c(5, 10)) ) }) test_that("bad args", { expect_snapshot( error = TRUE, adasyn(matrix()) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", k = 0) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", k = 5.5) ) expect_snapshot( error = TRUE, adasyn(circle_example_num, var = "class", over_ratio = TRUE) ) }) themis/tests/testthat/test-bsmote.R0000644000176200001440000002570314744045253017155 0ustar liggesuserstest_that("all minority classes are upsampled", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins ) %>% step_impute_mean(all_predictors()) %>% step_bsmote(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 152)) }) test_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = FALSE) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = TRUE) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_bsmote(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_bsmote(class, id) %>% prep() ) }) test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_bsmote(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_snapshot( error = TRUE, recipe(Job ~ Age, data = credit_data) %>% step_bsmote(Job) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when oversampling", { res1 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, over_ratio = 0.5, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) test_that("ratio value works when oversampling", { res1 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, over_ratio = 0.5, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_bsmote(Home) ) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ] ) %>% step_impute_mean(all_predictors()) %>% step_bsmote(species, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ] ) %>% step_impute_mean(all_predictors()) %>% step_bsmote(species, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_bsmote(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("ordering of newly generated points are right", { res <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example[, c("x", "y", "class")]) ) }) test_that("ordering of newly generated points are right", { res <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example[, c("x", "y", "class")]) ) }) test_that("non-predictor variables are ignored", { res <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_bsmote(class, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example$id, rep(NA, nrow(res) - nrow(circle_example))), as.character(res$id) ) }) test_that("non-predictor variables are ignored", { res <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_bsmote(class, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example$id, rep(NA, nrow(res) - nrow(circle_example))), as.character(res$id) ) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_bsmote(class, all_neighbors = FALSE) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_bsmote(class, all_neighbors = TRUE) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_bsmote(all_predictors()) rec_param <- tunable.step_bsmote(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors", "all_neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 3) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_bsmote(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_bsmote(neighbors = TRUE) %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_bsmote(all_neighbors = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_bsmote(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_bsmote(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_bsmote(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_bsmote(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_bsmote(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_bsmote( all_predictors(), over_ratio = hardhat::tune(), neighbors = hardhat::tune(), all_neighbors = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 3L) }) themis/tests/testthat/test-S3-methods.R0000644000176200001440000000256514744045253017613 0ustar liggesusersr1 <- recipe(class ~ ., data = circle_example) r2 <- r1 %>% step_adasyn(class) r3 <- r1 %>% step_bsmote(class) r4 <- r1 %>% step_downsample(class) r5 <- r1 %>% step_nearmiss(class) r6 <- r1 %>% step_rose(class) r7 <- r1 %>% step_smote(class) r8 <- r1 %>% step_tomek(class) r9 <- r1 %>% step_upsample(class) # ------------------------------------------------------------------------------ test_that("required packages", { expect_equal(required_pkgs(r2), c("recipes", "themis")) expect_equal(required_pkgs(r3), c("recipes", "themis")) expect_equal(required_pkgs(r4), c("recipes", "themis")) expect_equal(required_pkgs(r5), c("recipes", "themis")) expect_equal(required_pkgs(r6), c("recipes", "themis", "ROSE")) expect_equal(required_pkgs(r7), c("recipes", "themis")) expect_equal(required_pkgs(r8), c("recipes", "themis")) expect_equal(required_pkgs(r9), c("recipes", "themis")) }) test_that("tunable arguments", { expect_equal(tunable(r2)$name, c("over_ratio", "neighbors")) expect_equal( tunable(r3)$name, c("over_ratio", "neighbors", "all_neighbors") ) expect_equal(tunable(r4)$name, "under_ratio") expect_equal(tunable(r5)$name, c("under_ratio", "neighbors")) expect_equal(tunable(r6)$name, "over_ratio") expect_equal(tunable(r7)$name, c("over_ratio", "neighbors")) expect_true(nrow(tunable(r8)) == 0) expect_equal(tunable(r9)$name, "over_ratio") }) themis/tests/testthat/_snaps/0000755000176200001440000000000014744273647016052 5ustar liggesusersthemis/tests/testthat/_snaps/tomek_impl.md0000644000176200001440000000154214744271025020522 0ustar liggesusers# tomek() interfaces correctly Code tomek(circle_example_num, var = "Class") Condition Error in `tomek()`: ! `var` must be one of "x", "y", or "class", not "Class". i Did you mean "class"? --- Code tomek(circle_example_num, var = c("class", "x")) Condition Error in `tomek()`: ! Please select a single factor variable for `var`. --- Code tomek(circle_example_num, var = "x") Condition Error in `tomek()`: ! `x` should refer to a factor or character column, not a double vector. --- Code tomek(circle_example0, var = "class") Condition Error in `tomek()`: ! Cannot have any missing values. NAs found in x. # bad args Code bsmote(matrix()) Condition Error in `bsmote()`: ! `df` must be a data frame, not a logical matrix. themis/tests/testthat/_snaps/tomek.md0000644000176200001440000000533114744271044017502 0ustar liggesusers# bad data Code rec %>% step_smote(x) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_smote(class, id) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # errors if character are present Code recipe(~., data = df_char) %>% step_tomek(x) %>% prep() Condition Error in `step_tomek()`: Caused by error in `prep()`: x All columns selected for the step should be double or integer. * 1 factor variable found: `y` # NA in response Code recipe(Status ~ Age, data = credit_data0) %>% step_tomek(Status) %>% prep() Condition Error in `step_tomek()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Status. # bad args Code recipe(~., data = mtcars) %>% step_tomek(seed = TRUE) Condition Error in `step_tomek()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_tomek()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * Tomek based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * Tomek based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * Tomek based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Tomek based on: class | Trained themis/tests/testthat/_snaps/adasyn_impl.md0000644000176200001440000000366014744273640020672 0ustar liggesusers# adasyn() interfaces correctly Code adasyn(circle_example_num, var = "Class") Condition Error in `adasyn()`: ! `var` must be one of "x", "y", or "class", not "Class". i Did you mean "class"? --- Code adasyn(circle_example_num, var = c("class", "x")) Condition Error in `adasyn()`: ! Please select a single factor variable for `var`. --- Code adasyn(circle_example_num, var = "x") Condition Error in `adasyn()`: ! `x` should refer to a factor or character column, not a double vector. --- Code adasyn(circle_example0, var = "class") Condition Error in `adasyn()`: ! Cannot have any missing values. NAs found in x. --- Code adasyn(circle_example_num, var = "class", k = 0) Condition Error in `adasyn()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code adasyn(circle_example_num, var = "class", k = -1) Condition Error in `adasyn()`: ! `k` must be a whole number larger than or equal to 1, not the number -1. --- Code adasyn(circle_example_num, var = "class", k = c(5, 10)) Condition Error in `adasyn()`: ! `k` must be a whole number, not a double vector. # bad args Code adasyn(matrix()) Condition Error in `adasyn()`: ! `df` must be a data frame, not a logical matrix. --- Code adasyn(circle_example_num, var = "class", k = 0) Condition Error in `adasyn()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code adasyn(circle_example_num, var = "class", k = 5.5) Condition Error in `adasyn()`: ! `k` must be a whole number, not the number 5.5. --- Code adasyn(circle_example_num, var = "class", over_ratio = TRUE) Condition Error in `adasyn()`: ! `over_ratio` must be a number, not `TRUE`. themis/tests/testthat/_snaps/rose.md0000644000176200001440000000663414744271044017342 0ustar liggesusers# bad data Code rec %>% step_rose(x) %>% prep() Condition Error in `step_rose()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_rose(class, id) %>% prep() Condition Error in `step_rose()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # NA in response Code recipe(Status ~ Age, data = credit_data0) %>% step_rose(Status) %>% prep() Condition Error in `step_rose()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Status. # only except 2 classes Code recipe(~., data = df_char) %>% step_rose(x) %>% prep() Condition Error in `step_rose()`: Caused by error in `prep()`: ! The `x` must only have 2 levels. # bad args Code recipe(~., data = mtcars) %>% step_rose(over_ratio = "yes") %>% prep() Condition Error in `step_rose()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_rose(minority_prop = TRUE) Condition Error in `step_rose()`: ! `minority_prop` must be a number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_rose(minority_smoothness = TRUE) Condition Error in `step_rose()`: ! `minority_smoothness` must be a number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_rose(majority_smoothness = TRUE) Condition Error in `step_rose()`: ! `majority_smoothness` must be a number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_rose(seed = TRUE) Condition Error in `step_rose()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_rose()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * ROSE based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * ROSE based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * ROSE based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * ROSE based on: class | Trained themis/tests/testthat/_snaps/downsample.md0000644000176200001440000000715114744273642020544 0ustar liggesusers# ratio deprecation Code new_rec <- recipe(~., data = circle_example) %>% step_downsample(class, ratio = 2) Condition Error: ! The `ratio` argument of `step_downsample()` was deprecated in themis 0.2.0 and is now defunct. i Please use the `under_ratio` argument instead. # bad data Code rec %>% step_downsample(x) %>% prep() Condition Error in `step_downsample()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_downsample(class, id) %>% prep() Condition Error in `step_downsample()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # case_weights Code rec1_p Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 case_weights: 1 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Down-sampling based on: class | Trained, weighted --- Code rec1_p Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 case_weights: 1 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Down-sampling based on: class | Trained, ignored weights # bad args Code recipe(~., data = mtcars) %>% step_downsample(under_ratio = "yes") %>% prep() Condition Error in `step_downsample()`: Caused by error in `prep()`: ! `under_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_downsample(seed = TRUE) Condition Error in `step_downsample()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_downsample()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * Down-sampling based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * Down-sampling based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 -- Operations * Down-sampling based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Down-sampling based on: class | Trained themis/tests/testthat/_snaps/smotenc.md0000644000176200001440000000626114744271044020036 0ustar liggesusers# errors if there isn't enough data Code recipe(Status ~ Age, data = credit_data0) %>% step_smotenc(Status) %>% prep() Condition Error in `step_smotenc()`: Caused by error in `smotenc_impl()`: ! Not enough observations of `dummy` to perform SMOTE. # bad data Code rec %>% step_smotenc(x) %>% prep() Condition Error in `step_smotenc()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_smotenc(class, id) %>% prep() Condition Error in `step_smotenc()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # NA in response Code recipe(Job ~ Age, data = credit_data) %>% step_smotenc(Job) %>% prep() Condition Error in `step_smotenc()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Job. # bad args Code recipe(~., data = mtcars) %>% step_smotenc(over_ratio = "yes") %>% prep() Condition Error in `step_smotenc()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_smotenc(neighbors = TRUE) %>% prep() Condition Error in `step_smotenc()`: Caused by error in `prep()`: ! `neighbors` must be a whole number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_smotenc(seed = TRUE) Condition Error in `step_smotenc()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_smotenc()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * SMOTENC based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * SMOTENC based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * SMOTENC based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * SMOTENC based on: class | Trained themis/tests/testthat/_snaps/smote.md0000644000176200001440000000665214744271044017521 0ustar liggesusers# errors if there isn't enough data Code recipe(Status ~ Age, data = credit_data0) %>% step_smote(Status) %>% prep() Condition Error in `step_smote()`: Caused by error in `bake()`: ! Not enough observations of "dummy" to perform SMOTE. # bad data Code rec %>% step_smote(x) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_smote(class, id) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # errors if character are present Code recipe(~., data = df_char) %>% step_smote(x) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: x All columns selected for the step should be double or integer. * 1 factor variable found: `y` # NA in response Code recipe(Job ~ Age, data = credit_data) %>% step_smote(Job) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Job. # bad args Code recipe(~., data = mtcars) %>% step_smote(over_ratio = "yes") %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_smote(neighbors = TRUE) %>% prep() Condition Error in `step_smote()`: Caused by error in `prep()`: ! `neighbors` must be a whole number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_smote(seed = TRUE) Condition Error in `step_smote()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_smote()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * SMOTE based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * SMOTE based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * SMOTE based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * SMOTE based on: class | Trained themis/tests/testthat/_snaps/smote_impl.md0000644000176200001440000000361514744271023020533 0ustar liggesusers# smote() interfaces correctly Code smote(circle_example_num, var = "Class") Condition Error in `smote()`: ! `var` must be one of "x", "y", or "class", not "Class". i Did you mean "class"? --- Code smote(circle_example_num, var = c("class", "x")) Condition Error in `smote()`: ! Please select a single factor variable for `var`. --- Code smote(circle_example_num, var = "x") Condition Error in `smote()`: ! `x` should refer to a factor or character column, not a double vector. --- Code smote(circle_example0, var = "class") Condition Error in `smote()`: ! Cannot have any missing values. NAs found in x. --- Code smote(circle_example_num, var = "class", k = 0) Condition Error in `smote()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code smote(circle_example_num, var = "class", k = -1) Condition Error in `smote()`: ! `k` must be a whole number larger than or equal to 1, not the number -1. --- Code smote(circle_example_num, var = "class", k = c(5, 10)) Condition Error in `smote()`: ! `k` must be a whole number, not a double vector. # bad args Code smote(matrix()) Condition Error in `smote()`: ! `df` must be a data frame, not a logical matrix. --- Code smote(circle_example, var = "class", k = 0) Condition Error in `smote()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code smote(circle_example, var = "class", k = 5.5) Condition Error in `smote()`: ! `k` must be a whole number, not the number 5.5. --- Code smote(circle_example, var = "class", over_ratio = TRUE) Condition Error in `smote()`: ! `over_ratio` must be a number, not `TRUE`. themis/tests/testthat/_snaps/extension_check.md0000644000176200001440000000021314744273642021534 0ustar liggesusers# recipes_extension_check Code recipes::recipes_extension_check(pkg = "themis") Message v All steps have all method! themis/tests/testthat/_snaps/adasyn.md0000644000176200001440000000670014744273641017650 0ustar liggesusers# errors if there isn't enough data Code recipe(Status ~ Age, data = credit_data0) %>% step_adasyn(Status) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `bake()`: ! Not enough observations of "dummy" to perform ADASYN. # bad data Code rec %>% step_adasyn(x) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_adasyn(class, id) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # errors if character are present Code recipe(~., data = df_char) %>% step_adasyn(x) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: x All columns selected for the step should be double or integer. * 1 factor variable found: `y` # NA in response Code recipe(Job ~ Age, data = credit_data) %>% step_adasyn(Job) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Job. # bad args Code recipe(~., data = mtcars) %>% step_adasyn(over_ratio = "yes") %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_adasyn(neighbors = TRUE) %>% prep() Condition Error in `step_adasyn()`: Caused by error in `prep()`: ! `neighbors` must be a whole number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_adasyn(seed = TRUE) Condition Error in `step_adasyn()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_adasyn()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * adasyn based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * adasyn based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * adasyn based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * adasyn based on: class | Trained themis/tests/testthat/_snaps/bsmote.md0000644000176200001440000000671614744273642017672 0ustar liggesusers# bad data Code rec %>% step_bsmote(x) %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_bsmote(class, id) %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # errors if character are present Code recipe(~., data = df_char) %>% step_bsmote(x) %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: x All columns selected for the step should be double or integer. * 1 factor variable found: `y` # NA in response Code recipe(Job ~ Age, data = credit_data) %>% step_bsmote(Job) %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Job. # bad args Code recipe(~., data = mtcars) %>% step_bsmote(over_ratio = "yes") %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_bsmote(neighbors = TRUE) %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! `neighbors` must be a whole number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_bsmote(all_neighbors = "yes") %>% prep() Condition Error in `step_bsmote()`: Caused by error in `prep()`: ! `all_neighbors` must be `TRUE` or `FALSE`, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_bsmote(seed = TRUE) Condition Error in `step_bsmote()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_bsmote()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * BorderlineSMOTE based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * BorderlineSMOTE based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * BorderlineSMOTE based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * BorderlineSMOTE based on: class | Trained themis/tests/testthat/_snaps/smotenc_impl.md0000644000176200001440000000124114744271024021046 0ustar liggesusers# bad args Code smotenc(matrix()) Condition Error in `smotenc()`: ! `df` must be a data frame, not a logical matrix. --- Code smotenc(circle_example, var = "class", k = 0) Condition Error in `smotenc()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code smotenc(circle_example, var = "class", k = 5.5) Condition Error in `smotenc()`: ! `k` must be a whole number, not the number 5.5. --- Code smotenc(circle_example, var = "class", over_ratio = TRUE) Condition Error in `smotenc()`: ! `over_ratio` must be a number, not `TRUE`. themis/tests/testthat/_snaps/nearmiss.md0000644000176200001440000000635514744271044020213 0ustar liggesusers# bad data Code rec %>% step_nearmiss(x) %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_nearmiss(class, id) %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # errors if character are present Code recipe(~., data = df_char) %>% step_nearmiss(x) %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: x All columns selected for the step should be double or integer. * 1 factor variable found: `y` # NA in response Code recipe(Job ~ Age, data = credit_data) %>% step_nearmiss(Job) %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: ! Cannot have any missing values. NAs found in Job. # bad args Code recipe(~., data = mtcars) %>% step_nearmiss(over_ratio = "yes") %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: ! The following argument was specified but do not exist: `over_ratio`. --- Code recipe(~., data = mtcars) %>% step_nearmiss(neighbors = TRUE) %>% prep() Condition Error in `step_nearmiss()`: Caused by error in `prep()`: ! `neighbors` must be a whole number, not `TRUE`. --- Code recipe(~., data = mtcars) %>% step_nearmiss(seed = TRUE) Condition Error in `step_nearmiss()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_nearmiss()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * NEARMISS-1 based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * NEARMISS-1 based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Operations * NEARMISS-1 based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 2 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * NEARMISS-1 based on: class | Trained themis/tests/testthat/_snaps/nearmiss_impl.md0000644000176200001440000000125314744273642021232 0ustar liggesusers# bad args Code nearmiss(matrix()) Condition Error in `nearmiss()`: ! `df` must be a data frame, not a logical matrix. --- Code nearmiss(circle_example, var = "class", k = 0) Condition Error in `nearmiss()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code nearmiss(circle_example, var = "class", k = 5.5) Condition Error in `nearmiss()`: ! `k` must be a whole number, not the number 5.5. --- Code nearmiss(circle_example, var = "class", under_ratio = TRUE) Condition Error in `nearmiss()`: ! `under_ratio` must be a number, not `TRUE`. themis/tests/testthat/_snaps/bsmote_impl.md0000644000176200001440000000416014744273641020701 0ustar liggesusers# bsmote() interfaces correctly Code bsmote(circle_example_num, var = "Class") Condition Error in `bsmote()`: ! `var` must be one of "x", "y", or "class", not "Class". i Did you mean "class"? --- Code bsmote(circle_example_num, var = c("class", "x")) Condition Error in `bsmote()`: ! Please select a single factor variable for `var`. --- Code bsmote(circle_example_num, var = "x") Condition Error in `bsmote()`: ! `x` should refer to a factor or character column, not a double vector. --- Code bsmote(circle_example0, var = "class") Condition Error in `bsmote()`: ! Cannot have any missing values. NAs found in x. --- Code bsmote(circle_example_num, var = "class", k = 0) Condition Error in `bsmote()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code bsmote(circle_example_num, var = "class", k = -1) Condition Error in `bsmote()`: ! `k` must be a whole number larger than or equal to 1, not the number -1. --- Code bsmote(circle_example_num, var = "class", k = c(5, 10)) Condition Error in `bsmote()`: ! `k` must be a whole number, not a double vector. # bad args Code bsmote(matrix()) Condition Error in `bsmote()`: ! `df` must be a data frame, not a logical matrix. --- Code bsmote(circle_example_num, var = "class", k = 0) Condition Error in `bsmote()`: ! `k` must be a whole number larger than or equal to 1, not the number 0. --- Code bsmote(circle_example_num, var = "class", k = 5.5) Condition Error in `bsmote()`: ! `k` must be a whole number, not the number 5.5. --- Code bsmote(circle_example_num, var = "class", over_ratio = TRUE) Condition Error in `bsmote()`: ! `over_ratio` must be a number, not `TRUE`. --- Code bsmote(circle_example_num, var = "class", all_neighbors = 1) Condition Error in `bsmote()`: ! `all_neighbors` must be `TRUE` or `FALSE`, not the number 1. themis/tests/testthat/_snaps/upsample.md0000644000176200001440000000710614744271044020213 0ustar liggesusers# ratio deprecation Code new_rec <- recipe(~., data = circle_example) %>% step_upsample(class, ratio = 2) Condition Error: ! The `ratio` argument of `step_downsample()` was deprecated in themis 0.2.0 and is now defunct. i Please use the `over_ratio` argument instead. # bad data Code rec %>% step_upsample(x) %>% prep() Condition Error in `step_upsample()`: Caused by error in `prep()`: ! `x` should be a factor variable. --- Code rec %>% step_upsample(class, id) %>% prep() Condition Error in `step_upsample()`: Caused by error in `prep()`: ! The selector should select at most a single variable. # case_weights Code rec1_p Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 case_weights: 1 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Up-sampling based on: class | Trained, weighted --- Code rec1_p Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 case_weights: 1 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Up-sampling based on: class | Trained, ignored weights # bad args Code recipe(~., data = mtcars) %>% step_upsample(over_ratio = "yes") %>% prep() Condition Error in `step_upsample()`: Caused by error in `prep()`: ! `over_ratio` must be a number, not the string "yes". --- Code recipe(~., data = mtcars) %>% step_upsample(seed = TRUE) Condition Error in `step_upsample()`: ! `seed` must be a whole number, not `TRUE`. # bake method errors when needed non-standard role columns are missing Code bake(trained, new_data = circle_example[, -3]) Condition Error in `step_upsample()`: ! The following required column is missing from `new_data`: class. # empty printing Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Operations * Up-sampling based on: --- Code rec Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role outcome: 1 predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations * Up-sampling based on: | Trained # printing Code print(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 -- Operations * Up-sampling based on: class --- Code prep(rec) Message -- Recipe ---------------------------------------------------------------------- -- Inputs Number of variables by role predictor: 4 -- Training information Training data contained 400 data points and no incomplete rows. -- Operations * Up-sampling based on: class | Trained themis/tests/testthat/test-smotenc.R0000644000176200001440000002143614744045253017333 0ustar liggesuserstest_that("errors if there isn't enough data", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data credit_data0$Status <- as.character(credit_data0$Status) credit_data0$Status[1] <- "dummy" credit_data0$Status <- as.factor(credit_data0$Status) expect_snapshot( error = TRUE, recipe(Status ~ Age, data = credit_data0) %>% step_smotenc(Status) %>% prep() ) }) test_that("basic usage", { skip_if_not_installed("modeldata") data("ames", package = "modeldata") rec1 <- recipe(Alley ~ MS_SubClass + MS_Zoning + Lot_Frontage + Lot_Area + Street, data = ames) %>% step_smotenc(Alley) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = ames)$Alley, useNA = "no") og_xtab <- table(ames$Alley, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_smotenc(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_smotenc(class, id) %>% prep() ) }) test_that("allows for character variables", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_no_error( recipe(~., data = df_char) %>% step_smotenc(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_snapshot( error = TRUE, recipe(Job ~ Age, data = credit_data) %>% step_smotenc(Job) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(class ~ x + y, data = circle_example) %>% step_smotenc(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smotenc(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when oversampling", { skip_if_not_installed("modeldata") data("ames", package = "modeldata") res1 <- recipe(Alley ~ MS_SubClass + MS_Zoning + Lot_Frontage + Lot_Area + Street, data = ames) %>% step_smotenc(Alley) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(Alley ~ MS_SubClass + MS_Zoning + Lot_Frontage + Lot_Area + Street, data = ames) %>% step_smotenc(Alley, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$Alley) == max(table(ames$Alley)))) expect_equal( sort(as.numeric(table(res1.5$Alley))), max(table(ames$Alley)) * c(0.5, 0.5, 1) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_smotenc(Home) ) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ]) %>% step_impute_mean(all_predictors()) %>% step_smotenc(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor(x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class))) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_smotenc(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("ordering of newly generated points are right", { res <- recipe(class ~ x + y, data = circle_example) %>% step_smotenc(class) %>% prep() %>% bake(new_data = NULL) expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example[, c("x", "y", "class")]) ) }) test_that("non-predictor variables are ignored", { res <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_smotenc(class) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example$id, rep(NA, nrow(res) - nrow(circle_example))), as.character(res$id) ) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_smotenc(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("Doesn't error if no upsampling is done (#119)", { dat <- data.frame( outcome = c(rep("X", 101), rep("Z", 50)), X1 = 1 ) expect_no_error( smotenc_impl(dat, "outcome", 5, over_ratio = 0.5) ) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_smotenc(all_predictors()) rec_param <- tunable.step_smotenc(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smotenc(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smotenc(neighbors = TRUE) %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smotenc(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smotenc(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_smotenc(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_smotenc(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_smotenc(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smotenc(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_smotenc( all_predictors(), over_ratio = hardhat::tune(), neighbors = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 2L) }) themis/tests/testthat/test-smote.R0000644000176200001440000002035314744045253017007 0ustar liggesuserstest_that("errors if there isn't enough data", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") credit_data0 <- credit_data credit_data0$Status <- as.character(credit_data0$Status) credit_data0$Status[1] <- "dummy" credit_data0$Status <- as.factor(credit_data0$Status) expect_snapshot( error = TRUE, recipe(Status ~ Age, data = credit_data0) %>% step_smote(Status) %>% prep() ) }) test_that("basic usage", { rec1 <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class) rec1_p <- prep(rec1) te_xtab <- table(bake(rec1_p, new_data = circle_example)$class, useNA = "no") og_xtab <- table(circle_example$class, useNA = "no") expect_equal(sort(te_xtab), sort(og_xtab)) expect_no_warning(prep(rec1)) }) test_that("bad data", { rec <- recipe(~., data = circle_example) # numeric check expect_snapshot( error = TRUE, rec %>% step_smote(x) %>% prep() ) # Multiple variable check expect_snapshot( error = TRUE, rec %>% step_smote(class, id) %>% prep() ) }) test_that("errors if character are present", { df_char <- data.frame( x = factor(1:2), y = c("A", "A"), stringsAsFactors = FALSE ) expect_snapshot( error = TRUE, recipe(~., data = df_char) %>% step_smote(x) %>% prep() ) }) test_that("NA in response", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_snapshot( error = TRUE, recipe(Job ~ Age, data = credit_data) %>% step_smote(Job) %>% prep() ) }) test_that("`seed` produces identical sampling", { step_with_seed <- function(seed = sample.int(10^5, 1)) { recipe(class ~ x + y, data = circle_example) %>% step_smote(class, seed = seed) %>% prep() %>% bake(new_data = NULL) %>% pull(x) } run_1 <- step_with_seed(seed = 1234) run_2 <- step_with_seed(seed = 1234) run_3 <- step_with_seed(seed = 12345) expect_equal(run_1, run_2) expect_false(identical(run_1, run_3)) }) test_that("test tidy()", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class, id = "") rec_p <- prep(rec) untrained <- tibble( terms = "class", id = "" ) trained <- tibble( terms = "class", id = "" ) expect_equal(untrained, tidy(rec, number = 1)) expect_equal(trained, tidy(rec_p, number = 1)) }) test_that("ratio value works when oversampling", { res1 <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class) %>% prep() %>% bake(new_data = NULL) res1.5 <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) expect_true(all(table(res1$class) == max(table(circle_example$class)))) expect_equal( sort(as.numeric(table(res1.5$class))), max(table(circle_example$class)) * c(0.5, 1) ) }) test_that("allows multi-class", { skip_if_not_installed("modeldata") data("credit_data", package = "modeldata") expect_no_error( recipe(Home ~ Age + Income + Assets, data = credit_data) %>% step_impute_mean(Income, Assets) %>% step_smote(Home) ) }) test_that("majority classes are ignored if there is more than 1", { skip_if_not_installed("modeldata") data("penguins", package = "modeldata") rec1_p2 <- recipe(species ~ bill_length_mm + bill_depth_mm, data = penguins[-(1:28), ] ) %>% step_impute_mean(all_predictors()) %>% step_smote(species) %>% prep() %>% bake(new_data = NULL) expect_true(all(max(table(rec1_p2$species)) == 124)) }) test_that("factor levels are not affected by alphabet ordering or class sizes", { circle_example_alt_levels <- list() for (i in 1:4) circle_example_alt_levels[[i]] <- circle_example # Checking for forgetting levels by majority/minor switching for (i in c(2, 4)) { levels(circle_example_alt_levels[[i]]$class) <- rev(levels(circle_example_alt_levels[[i]]$class)) } # Checking for forgetting levels by alphabetical switching for (i in c(3, 4)) { circle_example_alt_levels[[i]]$class <- factor( x = circle_example_alt_levels[[i]]$class, levels = rev(levels(circle_example_alt_levels[[i]]$class)) ) } for (i in 1:4) { rec_p <- recipe(class ~ x + y, data = circle_example_alt_levels[[i]]) %>% step_smote(class) %>% prep() expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels rec_p$levels$class$values # New levels ) expect_equal( levels(circle_example_alt_levels[[i]]$class), # Original levels levels(bake(rec_p, new_data = NULL)$class) # New levels ) } }) test_that("ordering of newly generated points are right", { res <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class) %>% prep() %>% bake(new_data = NULL) expect_equal( res[seq_len(nrow(circle_example)), ], as_tibble(circle_example[, c("x", "y", "class")]) ) }) test_that("non-predictor variables are ignored", { res <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_smote(class) %>% prep() %>% bake(new_data = NULL) expect_equal( c(circle_example$id, rep(NA, nrow(res) - nrow(circle_example))), as.character(res$id) ) }) test_that("id variables don't turn predictors to factors", { # https://github.com/tidymodels/themis/issues/56 rec_id <- recipe(class ~ ., data = circle_example) %>% update_role(id, new_role = "id") %>% step_smote(class) %>% prep() %>% bake(new_data = NULL) expect_equal(is.double(rec_id$x), TRUE) expect_equal(is.double(rec_id$y), TRUE) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_smote(all_predictors()) rec_param <- tunable.step_smote(rec$steps[[1]]) expect_equal(rec_param$name, c("over_ratio", "neighbors")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("bad args", { expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smote(over_ratio = "yes") %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smote(neighbors = TRUE) %>% prep() ) expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% step_smote(seed = TRUE) ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class, skip = FALSE) %>% add_role(class, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) trained <- prep(rec, training = circle_example, verbose = FALSE) expect_snapshot( error = TRUE, bake(trained, new_data = circle_example[, -3]) ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_smote(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_smote(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked1) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_smote(rec) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { rec <- recipe(class ~ x + y, data = circle_example) %>% step_smote(class) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_smote( all_predictors(), over_ratio = hardhat::tune(), neighbors = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 2L) }) themis/tests/testthat/testthat-problems.rds0000644000176200001440000006401614744045253020757 0ustar liggesusers] xU՞LimnI$mi "PZ-$6I&d&]@vY\AMpAePdGePpEsg;_mZ99s]/ϗefG_|2+R* }lo-?qf=7uzS=-hff%zC]RrPk՘f>[~Gp&RKPgerw+<3V_݄uƎVM]|4;>}aSir1HH;#8ٶG7D:Wue4 $>s7U]eblc3%RZlxV|C! u'iCY >fΞfʞ؍#̐ #i`;ڷ#(WRuk3oIp>N*R7]5Xkx_؄bf> [od eԵuumŨ\0PaB&*׿?MS .|Jpư35`%ӷJ3SVEʴɌXPRR5b<,Lj 33ܚj^3 sի5/\,h`<ܲ%jM.fno Uvf;^ɉbe抅W-;d`06| kTnkǢzL mбtoO`fpb>>tŘ:p/;8{:;m+ɦ ʔɴ\{غ Ckek='RE:WX}]^p2蝑tQ 1u[P097z#2ǘjH[ong&Zi!K";`,N4A[91`L=ɪѺ^9D0$,\ܚ0иA0QijE*Ks%&uEvqÍMFL}clqlY8&_]x嚥3Qgߛ|-1)4qw"Rј5bNT覀!6}5hɍc\]8}aPQrLN{Cز+O6)o#]TbC,~6EJLZLTᣫ׬8-"?LoU噷 (6}xP$ -vٵ}##]>OxFĮן\XTza7R[K24\Kesr;l+)%>S ՗}_a)_GY%vѾ"-?|"jx Ѿ _Ld|}F>3um+QfR}9!7gY/'~~9yh|<|7q"@B,Gڶ=GDj̑[9EP=D/ôsHbP>Oy$Ld ?S#-BBc CT_D$RńQJ_h!RE4C"4TÈVK_#Nk=Qu#p1OP囱q>ߗZ?L%rѿG`G_8T$cE-"~"!jHT:BOɊTGo&DGBÛx/iQNKuY? :|Dn٪"EeE{J|""P}!́XORB?YTO%4M9t=J44 78KM< sY"&B>3v&yO"ϧ&D~~?p`"Ϳ "L_*rKfˉ/UK"S} !"WZs*#}%o_'">WjG_!Ni7׈|UߵUDnRބȯ=&K&V> _هo,T '&ܩbB,'%e.U_Nybw_8Ɨ{D~{EXGh#{=Ɋx1?_A7=WX,m"fB_o&4|S"Ϩ{*Eωb牯VE^P} ,7KK+^_=oyCu%n׷7/U oI?-'y}3%WՏ'D|a?R&by?Ǘs.YO[Ʉ?g?UZg0zFoU܏Yńȿ[쩲k|b:f4"gό^OwH?!{oO'%WAf=NSeK?f/cN,.}5lV} 26lfK=WT_߉*aɪJ9{2q?hiK*rO#~Ž BBpN,y׺S"Q} !i -K9?.s~YՏ"4囹" uq%/!фƿ.%>x Cp~79)y,<_EY c/|E%迫EW/Ub'\mVU;O{D5oYQOVl r En8lO_- Ҿd-_M?wܥBNY.͙0GOf-r/#%P뫉DS}oF;B76"%,B-{k~=5\3 QggM 2嬓oIo,G|"/o|7ф{/ PVdʗsg)g.smf~S},(RQ~ɜeQė+K,2qMK|Μhk _>gU=?C8Y3{KxBq9A~ ZDY2Ϣ[TQ'Ya5Xg[{*ʛ[b/"sT_F4>أ9ė+r~Y4s =<|{Q ,ޮAyyT"KTP-1?/bKdz%"T_F/9bˈ/WO"+T?_a+,sbi?537Eܟ9b_HLs?x+#DNW*Vф^x=E|"ϏrlQB̏x }鄘ډHCa_Hamć ٭ I>Q$"Q(H*_j'Y MfK&$ad3743T_tY y|)r+1~bg_8?"窾s{?Iyid>կI+6Rǘȯ ɯD.Q|B% K39ͯF}i;_K_b_&T_ȼZWWDn5kESXBHG t?y7U?{M'+b}0mAo?۩}3 K~׬|/}#T_F/s=?@|/ yX!G-0Պ|>#1G,s~>JK{zS"O[x샄3ڿ'/){.q`(׾ ~37|{?>@4h=Ez["Dao_M /{ij,E?c"T_'?MأP,-K }!LG~]W?bkwgC'&$Y9{ol~ϒ\VxB,j?|s,g>P/Y%i{P nOV{ԯؒ1\@_H&GgBV5?doGė+I%jU_Mx|:i]Oaifɚ;RBGr!k_HkɱՏ"#{o ZoU%DgxLKiBmφK \\[ė)V9sg*B_$r%~"Ddk1aɝO")?1s"-<_N 4!鄰_e?_.{ PGY+u*B_տAz"+"תFk{BB{z?pN1wW~j_H{+Ŋz߿!m_n>=m=;@qg]/U~/}|#՗yb22_O~+ȳ{_Uo"Hi=?L|"~,k_a?Fq,TqՏ#D~X}?{[|[[?A?eC 0WC[տuE_.VLB,{&Pٲt򻉸=̖d@?f.{_X_rvG"~bb%ʛg9e 5ˉW#[*՗~2'f-_k-<2B=e&e.Ϯ_K:>Xc-ws[IG߀迩 ӈ'D!ϴqN|ο)c-{ Ѿ?Rr/kX\Wx\KP* Q=?TEď"DKhEL~ʾ5{? %5?2Vk}_ժ/!DX4~i-^Bf[ ^R#T_D]~ G#/U&"s4Պ%cU?akxU}P +oޡ+={HD=ʏ_I'ҩ*Bx{ﴔOc9Vu "'QLlR}!'b/$D7[il8~E]mG(/Ds w=_`>>?pg~_b|yU D~N=E/^)Տ!D]a_ ~!Wݗ[kYԾkDz?!w{U~7:U_A=_o)}-["[{Mc}>֗>+r {"Xv Q;DR})!;E]ė+}{T_E|%jEY?DXB?^LyH ȯ=Q,C~^3gRO&K_?&+/b/_HdWT"B]mmʷ_#:Ŋ؟"%l!{{ 3>X<?FIõ>;yiR}P#G|kW S1&B{%~69Rsסq7XsoBNq09YQ~1U?e=͑.\Տ#DJ8Q' WF$BO)/mAQI9S-h_͊_sΑ,^ChʗWh¿y9k~BK~, Q~PdD_@/d?oOBO#~!!{y~mΈ944/kiR=H|b/5~$rfPG9,?JB\CH_i=MD3R>{Z^?+gAB]r%NŪo&D~a-G3WEivqYrT_A\!rD|""!~"ϡ ?_8Fdzأ~,Jz?riuۯWxo9洨gV=_Hk+Rd?ەTmP}!',A|"_D"DG|"of#("]s3Nb+rmSKw'Y&S,BB-"'a'Zʧiå X}#P9g*u=AE;"Nq*E(nFRid%~߯W ?X՗"~+!T'?=j1 T?{.>\{A%՗"~/b/_8Rת!_a__6{YUO Da?Q﯋NS ?Y 4Bݤ6@?<^g]Eyt-1YտuEz~+}фyO|-?I89ȕ[zbYrsأ2#rέT})!'g{$_>Vo( {&/rkU?zws[ῖq/rOKg{g)3?q+4wSu͝L4EܯwsG3w=7͒7}X 9ܙʛ3Vb/"ͪ/#D~6|b/WosU_MasQ磹 Kٓ8Gbꃄ蟅ޖO/v/n~>A(BԿՇk:= P篃DVq=$L+V}%!W[ʧuS7X@Q/RwH"T"-E#w/~"㣕"Em"aWb{أ~aO̥G|!w<EԯS$K;Yȟn{//\4Tse<~B2c(Y+L~1TW"=Y?gt ?g_#垭JBYʟ;ە?G}"ϧD.GD|_؃տn2͟sU/|yT_BR=_E|"=ȗT_C=18ߙ]#UO Db_%>?L^QyOQ_7?u %[-<ʿ߰ćΉ3!cɏfC< r-rK`{/UDS}5!5=(K7ET},7-&xo֟DfBϟbK~%bYď''jsl[|8?B/"^qOxأ~ S U)/`O5՗j~%A=ʯ!\/}jB=Osڇv~S ۾N ԿAM%SUOȚoTh4=?`8~؟Z_HsPB}DTOZbM5㧙 BW)bnƶ%BwK~pis ][חؗ"_'-?E~RW9bKTr?P{_NXB_!rb}9Hd&>H+EV~"!֗Cl,3~?J')b0c-눟wߓ1{K~Ip^/TǗV(R"R}!X=_KH ѿmN|"7:L?Ug.ljoQd@ħ{(3'DURoׇ&DZBoŞׯTBE^N9?Ie >3ǝkS(wȟӉR 3-'rfy/XJ+*=ݿΩ$B-DTEof 1V: D|*r_GLf5T?o ~G(7ܬ7b_bo-#Dji26@Bfd;rBa_le|.'+^g7HD~Bb_c+/81>"J|@&+ Q-U5q'U_KOh')gT?i, uS{zpN"oٿH'KeBؿb޿F7וSQMnI|"oYń'[ė(~!"߈-~7OKx7o2!c䔽T?0#yjɜ ?y"{CtE<1a~EG|!˙ 7j|-k&r٣e¾2ؾʵ~|OWO9S5~>0LB/z>ȥ/*oPקD~!CjyWZ&zcI_5~!Hh7WD/i <}Eأ){Ӿ=E=/O?.Ѿ/+o7yBoȅ{?@}: [B^^_ {U_Fh< Y`+JU&4Mn"rW_89TAB{?R>}4*:b|=#pK)^z3#yG?[qoj"-߇{ԯŒ/\f﫸/ڕ7gd"'MoP}5!F>= ~"?9^ ѿy Jx pNq?`I,fQHVT_A(rO"R ST?9OO{sWC=5?RZ b3Dη?_;lYhߧ\x_ujRn庾}&!ʿ—+-o5(KϼTU{%(oиK,rhEcGXFU䛪/'D|i)w}7*/;DR}"_w|bw_H+yG"=W)O+CՏ#"?8'r 0c[~!ORS _Oiy£|~zBߜ-%Aq:Ny ϓ B[ ~*ك3%lK ;bF G![R+o!(h"oaeGo?JqKϪLLB/{7+V`)]].yOx^[/W$|,j3=?R>ݿiE$>&S|ߢ |YsTOɗ8bEb=Ω_"?L'RߪouOR(}2'XEd>̯&PקW)"d/}_kQ~-"c,_:ݿ}Sy| ĠAń{o D\?S/kEA{I|b5!)$1?!rG9Cń~r?B?U4 })!l~7+blw9w ,(g"竾дOgy73p~_j]H\b/R,:>nWZ[]EO%TG}"/"cOU}1!7ߴأ_BcO𰿎K 1rBzo& Bϯ[GsWIo'whsw!s*gpߝwҞO$bG PǛ1WE='szO4B[^ba@Y@???@ E<4B!o3!aK4po~"S__<!1?XQ('DR}!?)"\ׯEU E^%~!?'s7"D,.e7"~ElyT__NK u /'ik!Wj~,pNa_NxߡIQD?м0HoEjT?^qϙE&ZQ⃄_+2V u~ >hkdO")PGl_b|iأ {:񥄆}P`労~iT EZ[QFg¿%٪o$D6{la?R?Z K̈́ȏģ~$xߋ;B~3w!Պ5_E1{ ceHKE2R|Y[.rWY} ?{J?BZ7sZG;ŊKk#R|b}Yged$%$c(?¾5}ͯ9>뛜c]h̲]z@W^P9Ey8E__D&. g %UD '})!$Kwhs[Nͩ›qAM|O9S~V {,ߙ+YDGE_T?b?'foI -)'E.P}!ֿP/3^F|!ˉwT"g>}Vj=9ERX__N/[_ .pV;~xدO+o_9\Bu2\-8{3~LHEu=Iބ$-k3OgHH~EoXXQ~"L|ZEU_Jdvqn٨jB=mZ#5!r2 ""{?qH;l]ﶴ(*b5sg)OS܃'Y1>7yأ~-Ag Wp(1O(o(9T~9MeOs?yF|(j"g?ϴ9Ծ1sE.S9oߴ"V}>!) -ihg,ta8~ԾD/Vy}\|B2/Yr؟s?%J;-<<"WZӗ\sW7]_~/L&rKE޶ ʿ7M"_W}^E׉WoU^=2JMo%rM(E&>)'"?aߵOWiB)d/*!ֿ_O? /Az'"?S}1!"أ_8GΓ~!17,8 Ѿ'}OΉTE/Q__UοKET}!o*=H|" jBg=d)ίsWH"GR|U7  Y|Bg,!ڿbOW9+ToT'{£UO%bGPIc j!JοNJ,<} !j?\sGt}~]}in՗"Q{,ߣo@>c"Y?N_S3~D6ypV=o&~"=_*b7*b~77{T?9Uj=?X%aa Q3S9"~*!ȅ{q~2?_SNI{|~A3 FE.UB+-(R*GL /&D/+,/&Jj ܟ)W,_43@S{?7ܢ2BEnأ[/WVo7Dmo?JoYy6o4!{}>ݮ/9ak{t}>qn{K$DϿacy\Bc>_=?N|^ T=?E>ޜ^Hߛ7}?W- bdTPe7[Q/R"V}!+ZW*be/WU_KhS[U¿G _S8BQ_d?AΗɄs/o~B7=Q,Cw+BWf7K/U_JYQė)_"V}!{7Ut6!q?E]$sUQ'jeykmpN*H沢j_}@??"9PH|UYῘrE:I_o5!Saei}4ӾE: m__ԿA:Ϣ1'D|d*o1{:?Ω_"d)jU>&>\-e5k _SRwQoQ~=" pisM7wK;YBWܢ{_@|h/suB¿EBk뢽E>H"}bHZ'G~u,#(~u(ZJE̯/E+D7~[ίK?/=ֈS} !wH_s!Zk9Y !"G[= (=T:B1qR&"ڈQ7~KrtiSmz>+:Ay;^3 Q~ܗ~~- 43T D~.:bg_K9[ ?gy3<ͽʿ5~3~5{i K)~Pe"W[B~"KT?P_E\aKM\BeT?-3ُQ RW|A Wy,jK2/=^y?_b_ɏDnR}5!FoY&G+bm-K~oOkk?i|S=W|Ky;E~j/P"[=C|"ǦP???JT?=?di#2d%,EDP}""Z Ӫ/Ie=?M|"3"ϩ2N=a?JT?6Na/Y%̿|/~"W}@'~'="R?:Gw,~Pћ"R:=ъȿ_ǧ鿿{?,O;2,Ϥzqłŕ{olHX~.Q}Y Ų+.@b T?*b=?ZPXS+.bK /Wj(K/VWЉ-?JjU_B'<:.?9xwAO3u}cgЩ_SawsTު鿅\UZo gwؒ3aoL<_SS›u5_o->@dh_eGˈ(EV4N|b+/S J!O|b+NժBVQW[gd_H7kEW}0N~- qK2O@+ZDB+r+eE&?ZEU?*Novy lT}m nk?~4W$:평bEɸ'?o٢:7c- )[ENR}Q s=D|"O w*R wSt3,?'D^yyJ鿧=Ky9R ~t -<kRKp~LMWR*_T_B~+Y{; K)~'H%KD~T 9@d_He"U_B'zrK<>WZ*8>`u/O"GYt_rB#,o%7A iiW_Mdۉ($,Q)tA$nė(C8՗Й"񕊜"=M?Q{ﱴ/n/&*?ѕf9Y)t ,2f?MRT{?R>Y>ls k)tI_e(_ Avȝ~a41,)aKrQah5=ah9z˓< ]j;<<{%޽x4kݾ1{A1m.Y}!KW?'ԁtg>Xc }7wq{Z9 pGvyqOZwك]Z5w_~ gBq{`C]W]}!WA%f!3}:ܱNH/=;;gBA`պ+5LgG߻\+~v#mԻr/{lGߗ;>!wf {^}#e;v9ƎaI^ۦ9;k2Bc/0FO߁ !0nG{>~Z>MK# <|D:({gִ}^kLWo߸Yh$>}ݯ.) >0laW,sŪ ޶hw=RlGa* ,͑{sŰ7=9|6zẋ-*#O'Lma=<}$v?eN:_{S/T!|ɽMAҗ#5rݳ#QZz e/`._[BF퐞|f d8lǣgmᄇޱ Wv\C vh=YgaarG5el2{o\j?8҈hH?S5Tu-݂mI{=2sEz'y4x=weۢwe^9w`7~MqD̹;mΔ[₥2OV3ݪ?fb^<;Bq|{mplGtnQ/]pg{(S @ Gx_w3\ot}#$tA?f3d8fs}W-;h;E?ពXW4nVq7iGS]SpoKo(1{07K uyƹ\1s,.95vFb3Ƞo3,ҽ!:X})DݽCg;Y5gXc2sNٞzyf{n)kG?/ćA/u_jt=.ngHX\űe Cβ+ >%NͮL1yV, .u#A}=P]MZwj euN v- Et~,!.9о0(|E\wl,n?Vh{_[Xv#m`lt)ND|s$b\MfNT gnv|7Xg \Q8-}WCkߥ)w9+`ws$12>,H=g5=sFH=zz;v冴7~b:hAB1w&S!WHl=Iӱɬu*?̻ qؠ?u{CH9rw%^;%C{9fm՚zam߫8iGZĖ t:.[pS/=>ᆾϖgCucFvUR80; /644' ޖ mپj+OCo+PQo7/N 3FlMFw@l`8w u2$c.do͝N0\=g/0 t^mrWtv8 =!Ly]X"]=}pKW8ԣ)M؆Mːƴ3#wuF[9#'rxK4S;8{GA'ܽ/27C/ILFP$*ntvtJu-]/(pOڷ+v1nc9{yFW]!؆4=!=r !oꀡ8mBgeN7:۝9;6lغ5ho{ؼ h091vap1'ܛs ]95^4ۃrۧ;W = 1.1.0) # --- # # ## Changelog # # 2024-08-15: # - `check_character()` gains an `allow_na` argument (@martaalcalde, #1724) # # 2023-03-13: # - Improved error messages of number checkers (@teunbrand) # - Added `allow_infinite` argument to `check_number_whole()` (@mgirlich). # - Added `check_data_frame()` (@mgirlich). # # 2023-03-07: # - Added dependency on rlang (>= 1.1.0). # # 2023-02-15: # - Added `check_logical()`. # # - `check_bool()`, `check_number_whole()`, and # `check_number_decimal()` are now implemented in C. # # - For efficiency, `check_number_whole()` and # `check_number_decimal()` now take a `NULL` default for `min` and # `max`. This makes it possible to bypass unnecessary type-checking # and comparisons in the default case of no bounds checks. # # 2022-10-07: # - `check_number_whole()` and `_decimal()` no longer treat # non-numeric types such as factors or dates as numbers. Numeric # types are detected with `is.numeric()`. # # 2022-10-04: # - Added `check_name()` that forbids the empty string. # `check_string()` allows the empty string by default. # # 2022-09-28: # - Removed `what` arguments. # - Added `allow_na` and `allow_null` arguments. # - Added `allow_decimal` and `allow_infinite` arguments. # - Improved errors with absent arguments. # # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Added changelog. # # nocov start # Scalars ----------------------------------------------------------------- .standalone_types_check_dot_call <- .Call check_bool <- function(x, ..., allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x) && .standalone_types_check_dot_call(ffi_standalone_is_bool_1.0.7, x, allow_na, allow_null)) { return(invisible(NULL)) } stop_input_type( x, c("`TRUE`", "`FALSE`"), ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_string <- function(x, ..., allow_empty = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = allow_empty, allow_na = allow_na, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a single string", ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .rlang_check_is_string <- function(x, allow_empty, allow_na, allow_null) { if (is_string(x)) { if (allow_empty || !is_string(x, "")) { return(TRUE) } } if (allow_null && is_null(x)) { return(TRUE) } if (allow_na && (identical(x, NA) || identical(x, na_chr))) { return(TRUE) } FALSE } check_name <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = FALSE, allow_na = FALSE, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a valid name", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } IS_NUMBER_true <- 0 IS_NUMBER_false <- 1 IS_NUMBER_oob <- 2 check_number_decimal <- function(x, ..., min = NULL, max = NULL, allow_infinite = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = TRUE, min, max, allow_infinite, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = TRUE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_number_whole <- function(x, ..., min = NULL, max = NULL, allow_infinite = FALSE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = FALSE, min, max, allow_infinite, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = FALSE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .stop_not_number <- function(x, ..., exit_code, allow_decimal, min, max, allow_na, allow_null, arg, call) { if (allow_decimal) { what <- "a number" } else { what <- "a whole number" } if (exit_code == IS_NUMBER_oob) { min <- min %||% -Inf max <- max %||% Inf if (min > -Inf && max < Inf) { what <- sprintf("%s between %s and %s", what, min, max) } else if (x < min) { what <- sprintf("%s larger than or equal to %s", what, min) } else if (x > max) { what <- sprintf("%s smaller than or equal to %s", what, max) } else { abort("Unexpected state in OOB check", .internal = TRUE) } } stop_input_type( x, what, ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_symbol <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a symbol", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_arg <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an argument name", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_call <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_call(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a defused call", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_environment <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_environment(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an environment", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_function <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_function(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a function", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_closure <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_closure(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an R function", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_formula <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_formula(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a formula", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } # Vectors ----------------------------------------------------------------- # TODO: Figure out what to do with logical `NA` and `allow_na = TRUE` check_character <- function(x, ..., allow_na = TRUE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_character(x)) { if (!allow_na && any(is.na(x))) { abort( sprintf("`%s` can't contain NA values.", arg), arg = arg, call = call ) } return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a character vector", ..., allow_null = allow_null, arg = arg, call = call ) } check_logical <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_logical(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a logical vector", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_data_frame <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is.data.frame(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a data frame", ..., allow_null = allow_null, arg = arg, call = call ) } # nocov end themis/R/adasyn_impl.R0000644000176200001440000000711414744045253014402 0ustar liggesusers#' Adaptive Synthetic Algorithm #' #' Generates synthetic positive instances using ADASYN algorithm. #' #' @inheritParams step_adasyn #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' All columns used in this function must be numeric with no missing data. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @seealso [step_adasyn()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- adasyn(circle_numeric, var = "class") #' #' res <- adasyn(circle_numeric, var = "class", k = 10) #' #' res <- adasyn(circle_numeric, var = "class", over_ratio = 0.8) adasyn <- function(df, var, k = 5, over_ratio = 1) { check_data_frame(df) check_var(var, df) check_number_whole(k, min = 1) check_number_decimal(over_ratio) predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -all_of(var))) adasyn_impl(df, var, k, over_ratio) } adasyn_impl <- function(df, var, k = 5, over_ratio = 1, call = caller_env()) { majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() data_mat <- as.matrix(df[names(df) != var]) ids_full <- RANN::nn2(data_mat, k = k + 1, searchtype = "priority")$nn.idx for (i in seq_along(min_names)) { min_class_in <- df[[var]] != min_names[i] r_value <- pmax( 0, rowSums(matrix((min_class_in)[ids_full], ncol = ncol(ids_full))) - 1 ) r_value <- r_value[!min_class_in] danger_ids <- sample(seq_along(r_value), samples_needed[i], TRUE, prob = r_value ) minority <- data_mat[!min_class_in, , drop = FALSE] if (nrow(minority) <= k) { cli::cli_abort("Not enough observations of {.val {min_names[i]}} to perform ADASYN.", call = call) } tmp_df <- as.data.frame( adasyn_sampler( minority, k, samples_needed[i], danger_ids ) ) colnames(tmp_df) <- colnames(data_mat) tmp_df[[var]] <- min_names[i] out_dfs[[i]] <- tmp_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } adasyn_sampler <- function(data, k, n_samples, smote_ids) { ids <- RANN::nn2(data, k = k + 1, searchtype = "priority")$nn.idx index_len <- tabulate(smote_ids, NROW(data)) out <- matrix(0, nrow = n_samples, ncol = ncol(data)) sampleids <- sample.int(k, n_samples, TRUE) runif_ids <- stats::runif(n_samples) iii <- 0 for (row_num in which(index_len != 0)) { index_selection <- iii + seq_len(index_len[row_num]) # removes itself as nearest neighbour id_knn <- ids[row_num, ids[row_num, ] != row_num] dif <- data[id_knn[sampleids[index_selection]], ] - data[rep(row_num, index_len[row_num]), ] gap <- dif * runif_ids[index_selection] out[index_selection, ] <- data[rep(row_num, index_len[row_num]), ] + gap iii <- iii + index_len[row_num] } out } themis/R/smotenc.R0000644000176200001440000001576514744276166013576 0ustar liggesusers#' Apply SMOTENC algorithm #' #' `step_smotenc()` creates a *specification* of a recipe step that generate new #' examples of the minority class using nearest neighbors of these cases. #' Gower's distance is used to handle mixed data types. For categorical #' variables, the most common category along neighbors is chosen. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param seed An integer that will be used as the seed when #' smote-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' Columns can be numeric and categorical with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_smotenc" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @seealso [smotenc()] for direct implementation #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' orig <- count(hpc_data, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data) %>% #' step_impute_knn(all_predictors()) %>% #' # Bring the minority levels up to about 1000 each #' # 1000/2211 is approx 0.4523 #' step_smotenc(class, over_ratio = 0.4523) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") step_smotenc <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smotenc")) { check_number_whole(seed) add_step( recipe, step_smotenc_new( terms = enquos(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_smotenc_new <- function(terms, role, trained, column, over_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "smotenc", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_smotenc <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) check_number_whole(x$neighbors, arg = "neighbors", min = 1) check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_na(select(training, all_of(c(col_name, predictors)))) step_smotenc_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_smotenc <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } new_data <- as.data.frame(new_data) predictor_data <- new_data[, col_names] # smotenc with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- smotenc_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio ) synthetic_data <- as_tibble(synthetic_data) } ) new_data <- na_splice(new_data, synthetic_data, object) new_data } #' @export print.step_smotenc <- function(x, width = max(20, options()$width - 26), ...) { title <- "SMOTENC based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_smotenc #' @usage NULL #' @export tidy.step_smotenc <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_smotenc <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_smotenc", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_smotenc <- function(x, ...) { c("themis") } themis/R/upsample.R0000644000176200001440000002055014744276166013740 0ustar liggesusers#' Up-Sample a Data Set Based on a Factor Variable #' #' `step_upsample()` creates a *specification* of a recipe step that will #' replicate rows of a data set to make the occurrence of levels in a specific #' factor level equal. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param over_ratio A numeric value for the ratio of the #' minority-to-majority frequencies. The default value (1) means #' that all other levels are sampled up to have the same #' frequency as the most occurring level. A value of 0.5 would mean #' that the minority levels will have (at most) (approximately) #' half as many rows than the majority level. #' @param ratio Deprecated argument; same as `over_ratio`. #' @param target An integer that will be used to subsample. This #' should not be set by the user and will be populated by `prep`. #' @param seed An integer that will be used as the seed when upsampling. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' @details #' Up-sampling is intended to be performed on the _training_ set #' alone. For this reason, the default is `skip = TRUE`. #' #' If there are missing values in the factor variable that is used #' to define the sampling, missing data are selected at random in #' the same way that the other factor levels are sampled. Missing #' values are not used to determine the amount of data in the #' majority level (see example below). #' #' For any data with factor levels occurring with the same #' frequency as the majority level, all data will be retained. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_upsample" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-unsupervised #' #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the minority levels up to about 1000 each #' # 1000/2211 is approx 0.4523 #' step_upsample(class, over_ratio = 0.4523) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without upsample") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_upsample(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_jitter(width = 0.1, height = 0.1) + #' labs(title = "With upsample (with jittering)") step_upsample <- function(recipe, ..., over_ratio = 1, ratio = deprecated(), role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("upsample")) { if (lifecycle::is_present(ratio)) { lifecycle::deprecate_stop( "0.2.0", "step_downsample(ratio = )", "step_downsample(over_ratio = )" ) } check_number_whole(seed) add_step( recipe, step_upsample_new( terms = enquos(...), over_ratio = over_ratio, ratio = NULL, role = role, trained = trained, column = column, target = target, skip = skip, seed = seed, id = id, case_weights = NULL ) ) } step_upsample_new <- function(terms, over_ratio, ratio, role, trained, column, target, skip, seed, id, case_weights) { step( subclass = "upsample", terms = terms, over_ratio = over_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, id = id, seed = seed, case_weights = case_weights ) } #' @export prep.step_upsample <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) wts <- recipes::get_case_weights(info, training) were_weights_used <- recipes::are_weights_used(wts, unsupervised = TRUE) if (isFALSE(were_weights_used) || is.null(wts)) { wts <- rep(1, nrow(training)) } check_1_selected(col_name) check_column_factor(training, col_name) if (length(col_name) == 0) { majority <- 0 } else { obs_freq <- weighted_table(training[[col_name]], as.integer(wts)) majority <- max(obs_freq) } check_na(select(training, all_of(col_name))) step_upsample_new( terms = x$terms, ratio = x$ratio, over_ratio = x$over_ratio, role = x$role, trained = TRUE, column = col_name, target = floor(majority * x$over_ratio), skip = x$skip, id = x$id, seed = x$seed, case_weights = were_weights_used ) } supsamp <- function(x, wts, num) { n <- nrow(x) if (nrow(x) == num) { out <- x } else { # upsampling is done with replacement out <- x[sample(seq_len(n), max(num, n), replace = TRUE, prob = wts), ] } out } #' @export bake.step_upsample <- function(object, new_data, ...) { col_names <- names(object$column) check_new_data(col_names, object, new_data) if (length(col_names) == 0L) { # Empty selection return(new_data) } if (isTRUE(object$case_weights)) { wts_col <- purrr::map_lgl(new_data, hardhat::is_case_weights) wts <- new_data[[names(which(wts_col))]] wts <- as.integer(wts) } else { wts <- rep(1, nrow(new_data)) } if (any(is.na(new_data[[col_names]]))) { missing <- new_data[is.na(new_data[[col_names]]), ] } else { missing <- NULL } split_data <- split(new_data, new_data[[col_names]]) split_wts <- split(wts, new_data[[col_names]]) # Upsample with seed for reproducibility with_seed( seed = object$seed, code = { new_data <- purrr::map2_dfr( split_data, split_wts, supsamp, num = object$target ) if (!is.null(missing)) { new_data <- bind_rows(new_data, supsamp(missing, object$target)) } } ) new_data } #' @export print.step_upsample <- function(x, width = max(20, options()$width - 26), ...) { title <- "Up-sampling based on " print_step(x$column, x$terms, x$trained, title, width, case_weights = x$case_weights) invisible(x) } #' @rdname step_upsample #' @usage NULL #' @export tidy.step_upsample <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_upsample <- function(x, ...) { tibble::tibble( name = c("over_ratio"), call_info = list( list(pkg = "dials", fun = "over_ratio") ), source = "recipe", component = "step_upsample", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_upsample <- function(x, ...) { c("themis") } themis/R/bsmote.R0000644000176200001440000002147614744276166013413 0ustar liggesusers#' Apply borderline-SMOTE Algorithm #' #' `step_bsmote()` creates a *specification* of a recipe step that generate new #' examples of the minority class using nearest neighbors of these cases in the #' border region between classes. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param all_neighbors Type of two borderline-SMOTE method. Defaults to FALSE. #' See details. #' @param seed An integer that will be used as the seed when #' smote-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' This methods works the same way as [step_smote()], expect that instead of #' generating points around every point of of the minority class each point is #' first being classified into the boxes "danger" and "not". For each point the #' k nearest neighbors is calculated. If all the neighbors comes from a #' different class it is labeled noise and put in to the "not" box. If more then #' half of the neighbors comes from a different class it is labeled "danger. # Points will be generated around points labeled "danger". #' #' If all_neighbors = FALSE then points will be generated between nearest #' neighbors in its own class. If all_neighbors = TRUE then points will be #' generated between any nearest neighbors. See examples for visualization. #' #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_bsmote" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: #' a new over-sampling method in imbalanced data sets learning. In #' International Conference on Intelligent Computing, pages 878–887. Springer, #' 2005. #' #' @seealso [bsmote()] for direct implementation #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the minority levels up to about 1000 each #' # 1000/2211 is approx 0.4523 #' step_bsmote(class, over_ratio = 0.4523) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without SMOTE") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_bsmote(class, all_neighbors = FALSE) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With borderline-SMOTE, all_neighbors = FALSE") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_bsmote(class, all_neighbors = TRUE) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With borderline-SMOTE, all_neighbors = TRUE") step_bsmote <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, all_neighbors = FALSE, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("bsmote")) { check_number_whole(seed) add_step( recipe, step_bsmote_new( terms = enquos(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, all_neighbors = all_neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_bsmote_new <- function(terms, role, trained, column, over_ratio, neighbors, all_neighbors, predictors, skip, seed, id) { step( subclass = "bsmote", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, all_neighbors = all_neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_bsmote <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) check_number_whole(x$neighbors, arg = "neighbors", min = 1) check_bool(x$all_neighbors, arg = "all_neighbors") check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_type(training[, predictors], types = c("double", "integer")) check_na(select(training, all_of(c(col_name, predictors)))) step_bsmote_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, all_neighbors = x$all_neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_bsmote <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } new_data <- as.data.frame(new_data) predictor_data <- new_data[, col_names] # bsmote with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- bsmote_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio, all_neighbors = object$all_neighbors ) synthetic_data <- as_tibble(synthetic_data) } ) new_data <- na_splice(new_data, synthetic_data, object) new_data } #' @export print.step_bsmote <- function(x, width = max(20, options()$width - 26), ...) { title <- "BorderlineSMOTE based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_bsmote #' @usage NULL #' @export tidy.step_bsmote <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_bsmote <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors", "all_neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors"), list(pkg = "dials", fun = "all_neighbors") ), source = "recipe", component = "step_bsmote", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_bsmote <- function(x, ...) { c("themis") } themis/R/import-standalone-obj-type.R0000644000176200001440000002113414744045253017267 0ustar liggesusers# Standalone file: do not edit by hand # Source: https://github.com/r-lib/rlang/blob/HEAD/R/standalone-obj-type.R # Generated by: usethis::use_standalone("r-lib/rlang", "obj-type") # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-obj-type.R # last-updated: 2024-02-14 # license: https://unlicense.org # imports: rlang (>= 1.1.0) # --- # # ## Changelog # # 2024-02-14: # - `obj_type_friendly()` now works for S7 objects. # # 2023-05-01: # - `obj_type_friendly()` now only displays the first class of S3 objects. # # 2023-03-30: # - `stop_input_type()` now handles `I()` input literally in `arg`. # # 2022-10-04: # - `obj_type_friendly(value = TRUE)` now shows numeric scalars # literally. # - `stop_friendly_type()` now takes `show_value`, passed to # `obj_type_friendly()` as the `value` argument. # # 2022-10-03: # - Added `allow_na` and `allow_null` arguments. # - `NULL` is now backticked. # - Better friendly type for infinities and `NaN`. # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Prefixed usage of rlang functions with `rlang::`. # # 2022-06-22: # - `friendly_type_of()` is now `obj_type_friendly()`. # - Added `obj_type_oo()`. # # 2021-12-20: # - Added support for scalar values and empty vectors. # - Added `stop_input_type()` # # 2021-06-30: # - Added support for missing arguments. # # 2021-04-19: # - Added support for matrices and arrays (#141). # - Added documentation. # - Added changelog. # # nocov start #' Return English-friendly type #' @param x Any R object. #' @param value Whether to describe the value of `x`. Special values #' like `NA` or `""` are always described. #' @param length Whether to mention the length of vectors and lists. #' @return A string describing the type. Starts with an indefinite #' article, e.g. "an integer vector". #' @noRd obj_type_friendly <- function(x, value = TRUE) { if (is_missing(x)) { return("absent") } if (is.object(x)) { if (inherits(x, "quosure")) { type <- "quosure" } else { type <- class(x)[[1L]] } return(sprintf("a <%s> object", type)) } if (!is_vector(x)) { return(.rlang_as_friendly_type(typeof(x))) } n_dim <- length(dim(x)) if (!n_dim) { if (!is_list(x) && length(x) == 1) { if (is_na(x)) { return(switch( typeof(x), logical = "`NA`", integer = "an integer `NA`", double = if (is.nan(x)) { "`NaN`" } else { "a numeric `NA`" }, complex = "a complex `NA`", character = "a character `NA`", .rlang_stop_unexpected_typeof(x) )) } show_infinites <- function(x) { if (x > 0) { "`Inf`" } else { "`-Inf`" } } str_encode <- function(x, width = 30, ...) { if (nchar(x) > width) { x <- substr(x, 1, width - 3) x <- paste0(x, "...") } encodeString(x, ...) } if (value) { if (is.numeric(x) && is.infinite(x)) { return(show_infinites(x)) } if (is.numeric(x) || is.complex(x)) { number <- as.character(round(x, 2)) what <- if (is.complex(x)) "the complex number" else "the number" return(paste(what, number)) } return(switch( typeof(x), logical = if (x) "`TRUE`" else "`FALSE`", character = { what <- if (nzchar(x)) "the string" else "the empty string" paste(what, str_encode(x, quote = "\"")) }, raw = paste("the raw value", as.character(x)), .rlang_stop_unexpected_typeof(x) )) } return(switch( typeof(x), logical = "a logical value", integer = "an integer", double = if (is.infinite(x)) show_infinites(x) else "a number", complex = "a complex number", character = if (nzchar(x)) "a string" else "\"\"", raw = "a raw value", .rlang_stop_unexpected_typeof(x) )) } if (length(x) == 0) { return(switch( typeof(x), logical = "an empty logical vector", integer = "an empty integer vector", double = "an empty numeric vector", complex = "an empty complex vector", character = "an empty character vector", raw = "an empty raw vector", list = "an empty list", .rlang_stop_unexpected_typeof(x) )) } } vec_type_friendly(x) } vec_type_friendly <- function(x, length = FALSE) { if (!is_vector(x)) { abort("`x` must be a vector.") } type <- typeof(x) n_dim <- length(dim(x)) add_length <- function(type) { if (length && !n_dim) { paste0(type, sprintf(" of length %s", length(x))) } else { type } } if (type == "list") { if (n_dim < 2) { return(add_length("a list")) } else if (is.data.frame(x)) { return("a data frame") } else if (n_dim == 2) { return("a list matrix") } else { return("a list array") } } type <- switch( type, logical = "a logical %s", integer = "an integer %s", numeric = , double = "a double %s", complex = "a complex %s", character = "a character %s", raw = "a raw %s", type = paste0("a ", type, " %s") ) if (n_dim < 2) { kind <- "vector" } else if (n_dim == 2) { kind <- "matrix" } else { kind <- "array" } out <- sprintf(type, kind) if (n_dim >= 2) { out } else { add_length(out) } } .rlang_as_friendly_type <- function(type) { switch( type, list = "a list", NULL = "`NULL`", environment = "an environment", externalptr = "a pointer", weakref = "a weak reference", S4 = "an S4 object", name = , symbol = "a symbol", language = "a call", pairlist = "a pairlist node", expression = "an expression vector", char = "an internal string", promise = "an internal promise", ... = "an internal dots object", any = "an internal `any` object", bytecode = "an internal bytecode object", primitive = , builtin = , special = "a primitive function", closure = "a function", type ) } .rlang_stop_unexpected_typeof <- function(x, call = caller_env()) { abort( sprintf("Unexpected type <%s>.", typeof(x)), call = call ) } #' Return OO type #' @param x Any R object. #' @return One of `"bare"` (for non-OO objects), `"S3"`, `"S4"`, #' `"R6"`, or `"S7"`. #' @noRd obj_type_oo <- function(x) { if (!is.object(x)) { return("bare") } class <- inherits(x, c("R6", "S7_object"), which = TRUE) if (class[[1]]) { "R6" } else if (class[[2]]) { "S7" } else if (isS4(x)) { "S4" } else { "S3" } } #' @param x The object type which does not conform to `what`. Its #' `obj_type_friendly()` is taken and mentioned in the error message. #' @param what The friendly expected type as a string. Can be a #' character vector of expected types, in which case the error #' message mentions all of them in an "or" enumeration. #' @param show_value Passed to `value` argument of `obj_type_friendly()`. #' @param ... Arguments passed to [abort()]. #' @inheritParams args_error_context #' @noRd stop_input_type <- function(x, what, ..., allow_na = FALSE, allow_null = FALSE, show_value = TRUE, arg = caller_arg(x), call = caller_env()) { # From standalone-cli.R cli <- env_get_list( nms = c("format_arg", "format_code"), last = topenv(), default = function(x) sprintf("`%s`", x), inherit = TRUE ) if (allow_na) { what <- c(what, cli$format_code("NA")) } if (allow_null) { what <- c(what, cli$format_code("NULL")) } if (length(what)) { what <- oxford_comma(what) } if (inherits(arg, "AsIs")) { format_arg <- identity } else { format_arg <- cli$format_arg } message <- sprintf( "%s must be %s, not %s.", format_arg(arg), what, obj_type_friendly(x, value = show_value) ) abort(message, ..., call = call, arg = arg) } oxford_comma <- function(chr, sep = ", ", final = "or") { n <- length(chr) if (n < 2) { return(chr) } head <- chr[seq_len(n - 1)] last <- chr[n] head <- paste(head, collapse = sep) # Write a or b. But a, b, or c. if (n > 2) { paste0(head, sep, final, " ", last) } else { paste0(head, " ", final, " ", last) } } # nocov end themis/R/tunable.R0000644000176200001440000000061514744045253013533 0ustar liggesusers#' tunable methods for themis #' #' These functions define what parameters _can_ be tuned for specific steps. #' They also define the recommended objects from the `dials` package that can #' be used to generate new parameter values and other characteristics. #' @param x A recipe step object #' @param ... Not used. #' @name tunable_themis #' @return A tibble object. #' @keywords internal NULL themis/R/misc.R0000644000176200001440000000550114744045253013033 0ustar liggesusersstring2formula <- function(x) { out <- a ~ . out[[2]] <- rlang::sym(x) out } check_na <- function(data, step, call = caller_env()) { na_cols <- vapply(data, function(x) any(is.na(x)), FUN.VALUE = logical(1)) if (any(na_cols)) { cols <- paste(names(na_cols)[na_cols], collapse = ", ") cli::cli_abort( "Cannot have any missing values. NAs found in {cols}.", call = call ) } } check_2_levels_only <- function(data, col_name, call = caller_env()) { if (length(col_name) == 1 && length(levels(data[[col_name]])) != 2) { cli::cli_abort("The {.code {col_name}} must only have 2 levels.", call = call) } } check_1_selected <- function(x, call = caller_env()) { if (length(x) > 1) { cli::cli_abort( "The selector should select at most a single variable.", call = call ) } } check_numeric <- function(dat) { all_good <- vapply(dat, is.numeric, logical(1)) label <- "numeric" if (!all(all_good)) { cli::cli_abort("All columns for this function should be numeric.") } invisible(all_good) } check_column_factor <- function(data, column, call = caller_env()) { if (length(column) == 1 && !is.factor(data[[column]])) { cli::cli_abort("{.code {column}} should be a factor variable.", call = call) } } check_var <- function(var, df, call = caller_env()) { if (length(var) != 1) { cli::cli_abort( "Please select a single factor variable for {.arg var}.", call = call ) } var <- rlang::arg_match(var, names(df), error_call = call) column <- df[[var]] if (!(is.factor(column) || is.character(column))) { cli::cli_abort( "{.var {var}} should refer to a factor or character column, not {.obj_type_friendly {column}}.", call = call ) } } na_splice <- function(new_data, synthetic_data, object) { non_predictor <- setdiff(names(new_data), c(object$column, object$predictors)) if (length(non_predictor) == 0) { return(synthetic_data) } new_data[, non_predictor, drop = FALSE] na_data <- matrix( nrow = nrow(synthetic_data) - nrow(new_data), ncol = length(non_predictor) ) colnames(na_data) <- non_predictor na_data <- as.data.frame(na_data) res <- vec_cbind( synthetic_data, bind_rows(new_data[, non_predictor, drop = FALSE], na_data) ) res <- res[, names(new_data)] as_tibble(res) } #https://stackoverflow.com/questions/2547402/how-to-find-the-statistical-mode Mode <- function(x) { ux <- unique(x) ux[which.max(tabulate(match(x, ux)))] } weighted_table <- function(x, wts = NULL) { if (is.null(wts)) { wts <- rep(1, length(x)) } if (!is.factor(x)) { x <- factor(x) } hardhat::weighted_table(x, weights = wts) } get_from_info <- function(info, role, na_rm = TRUE) { res <- info$variable[info$role == role] if (na_rm) { res <- stats::na.omit(res) } res } themis/R/bsmote_impl.R0000644000176200001440000001073014744045253014412 0ustar liggesusers#' borderline-SMOTE Algorithm #' #' BSMOTE generates generate new examples of the minority class using nearest #' neighbors of these cases in the border region between classes. #' #' @inheritParams step_smote #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param all_neighbors Type of two borderline-SMOTE method. Defaults to FALSE. #' See details. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' This methods works the same way as [smote()], expect that instead of #' generating points around every point of of the minority class each point is #' first being classified into the boxes "danger" and "not". For each point the #' k nearest neighbors is calculated. If all the neighbors comes from a #' different class it is labeled noise and put in to the "not" box. If more then #' half of the neighbors comes from a different class it is labeled "danger. # Points will be generated around points labeled "danger". #' #' If `all_neighbors = FALSE` then points will be generated between nearest #' neighbors in its own class. If `all_neighbors = TRUE` then points will be #' generated between any nearest neighbors. See examples for visualization. #' #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns used in this step must be numeric with no missing data. #' #' @references Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: #' a new over-sampling method in imbalanced data sets learning. In #' International Conference on Intelligent Computing, pages 878–887. Springer, #' 2005. #' #' @seealso [step_bsmote()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- bsmote(circle_numeric, var = "class") #' #' res <- bsmote(circle_numeric, var = "class", k = 10) #' #' res <- bsmote(circle_numeric, var = "class", over_ratio = 0.8) #' #' res <- bsmote(circle_numeric, var = "class", all_neighbors = TRUE) bsmote <- function(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) { check_data_frame(df) check_var(var, df) check_number_whole(k, min = 1) check_number_decimal(over_ratio) check_bool(all_neighbors) predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -all_of(var))) bsmote_impl(df, var, k, over_ratio) } bsmote_impl <- function(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) { majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() for (i in seq_along(min_names)) { data_mat <- as.matrix(df[names(df) != var]) ids <- RANN::nn2(data_mat, k = k + 1, searchtype = "priority")$nn.idx min_class_in <- df[[var]] == min_names[i] danger_ids <- danger( x = rowSums(matrix((min_class_in)[ids], ncol = ncol(ids))) - 1, k = k ) if (sum(danger_ids) <= k) { cli::cli_abort( "Not enough danger observations of {.val {min_names[i]}} to perform BSMOTE." ) } if (all_neighbors == FALSE) { tmp_df <- as.data.frame( smote_data( data = data_mat[min_class_in, ], k = k, n_samples = samples_needed[i], smote_ids = which(danger_ids[min_class_in]) ) ) } if (all_neighbors == TRUE) { tmp_df <- as.data.frame( smote_data(data_mat, k, samples_needed[i], which(danger_ids)) ) } colnames(tmp_df) <- colnames(data_mat) tmp_df[[var]] <- min_names[i] out_dfs[[i]] <- tmp_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } danger <- function(x, k) { (x != k) & (k / 2 <= x) } themis/R/smote.R0000644000176200001440000001640614744276166013246 0ustar liggesusers#' Apply SMOTE Algorithm #' #' `step_smote()` creates a *specification* of a recipe step that generate new #' examples of the minority class using nearest neighbors of these cases. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param seed An integer that will be used as the seed when #' smote-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_smote" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @seealso [smote()] for direct implementation #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the minority levels up to about 1000 each #' # 1000/2211 is approx 0.4523 #' step_smote(class, over_ratio = 0.4523) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without SMOTE") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_smote(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With SMOTE") step_smote <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smote")) { check_number_whole(seed) add_step( recipe, step_smote_new( terms = enquos(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_smote_new <- function(terms, role, trained, column, over_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "smote", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_smote <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) check_number_whole(x$neighbors, arg = "neighbors", min = 1) check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_type(training[, predictors], types = c("double", "integer")) check_na(select(training, all_of(c(col_name, predictors)))) step_smote_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_smote <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } new_data <- as.data.frame(new_data) predictor_data <- new_data[, col_names] # smote with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- smote_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio ) synthetic_data <- as_tibble(synthetic_data) } ) new_data <- na_splice(new_data, synthetic_data, object) new_data } #' @export print.step_smote <- function(x, width = max(20, options()$width - 26), ...) { title <- "SMOTE based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_smote #' @usage NULL #' @export tidy.step_smote <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_smote <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_smote", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_smote <- function(x, ...) { c("themis") } themis/R/downsample.R0000644000176200001440000002111014744276166014254 0ustar liggesusers#' Down-Sample a Data Set Based on a Factor Variable #' #' `step_downsample()` creates a *specification* of a recipe step that will #' remove rows of a data set to make the occurrence of levels in a specific #' factor level equal. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param under_ratio A numeric value for the ratio of the #' majority-to-minority frequencies. The default value (1) means #' that all other levels are sampled down to have the same #' frequency as the least occurring level. A value of 2 would mean #' that the majority levels will have (at most) (approximately) #' twice as many rows than the minority level. #' @param ratio Deprecated argument; same as `under_ratio` #' @param target An integer that will be used to subsample. This #' should not be set by the user and will be populated by `prep`. #' @param seed An integer that will be used as the seed when downsampling. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' @details #' Down-sampling is intended to be performed on the _training_ set #' alone. For this reason, the default is `skip = TRUE`. #' #' If there are missing values in the factor variable that is used #' to define the sampling, missing data are selected at random in #' the same way that the other factor levels are sampled. Missing #' values are not used to determine the amount of data in the #' minority level #' #' For any data with factor levels occurring with the same #' frequency as the minority level, all data will be retained. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' Keep in mind that the location of down-sampling in the step #' may have effects. For example, if centering and scaling, #' it is not clear whether those operations should be conducted #' _before_ or _after_ rows are removed. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_downsample" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-unsupervised #' #' @family Steps for under-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the majority levels down to about 1000 each #' # 1000/259 is approx 3.862 #' step_downsample(class, under_ratio = 3.862) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without downsample") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_downsample(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With downsample") step_downsample <- function(recipe, ..., under_ratio = 1, ratio = deprecated(), role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("downsample")) { if (lifecycle::is_present(ratio)) { lifecycle::deprecate_stop( "0.2.0", "step_downsample(ratio = )", "step_downsample(under_ratio = )" ) } check_number_whole(seed) add_step( recipe, step_downsample_new( terms = enquos(...), under_ratio = under_ratio, ratio = NULL, role = role, trained = trained, column = column, target = target, skip = skip, seed = seed, id = id, case_weights = NULL ) ) } step_downsample_new <- function(terms, under_ratio, ratio, role, trained, column, target, skip, seed, id, case_weights) { step( subclass = "downsample", terms = terms, under_ratio = under_ratio, ratio = ratio, role = role, trained = trained, column = column, target = target, skip = skip, id = id, seed = seed, id = id, case_weights = case_weights ) } #' @export prep.step_downsample <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$under_ratio, arg = "under_ratio", min = 0) wts <- recipes::get_case_weights(info, training) were_weights_used <- recipes::are_weights_used(wts, unsupervised = TRUE) if (isFALSE(were_weights_used) || is.null(wts)) { wts <- rep(1, nrow(training)) } check_1_selected(col_name) check_column_factor(training, col_name) if (length(col_name) == 0) { minority <- 1 } else { obs_freq <- weighted_table(training[[col_name]], as.integer(wts)) minority <- min(obs_freq) } check_na(select(training, all_of(col_name))) step_downsample_new( terms = x$terms, under_ratio = x$under_ratio, ratio = x$ratio, role = x$role, trained = TRUE, column = col_name, target = floor(minority * x$under_ratio), skip = x$skip, seed = x$seed, id = x$id, case_weights = were_weights_used ) } subsamp <- function(x, wts, num) { n <- nrow(x) if (nrow(x) == num) { out <- x } else { # downsampling is done without replacement out <- x[sample(seq_len(n), min(num, n), prob = wts), ] } out } #' @export bake.step_downsample <- function(object, new_data, ...) { col_names <- names(object$column) check_new_data(col_names, object, new_data) if (length(col_names) == 0L) { # Empty selection return(new_data) } if (isTRUE(object$case_weights)) { wts_col <- purrr::map_lgl(new_data, hardhat::is_case_weights) wts <- new_data[[names(which(wts_col))]] wts <- as.integer(wts) } else { wts <- rep(1, nrow(new_data)) } if (any(is.na(new_data[[col_names]]))) { missing <- new_data[is.na(new_data[[col_names]]), ] } else { missing <- NULL } split_data <- split(new_data, new_data[[col_names]]) split_wts <- split(wts, new_data[[col_names]]) # Downsample with seed for reproducibility with_seed( seed = object$seed, code = { new_data <- purrr::map2_dfr( split_data, split_wts, subsamp, num = object$target ) if (!is.null(missing)) { new_data <- bind_rows(new_data, subsamp(missing, object$target)) } } ) new_data } #' @export print.step_downsample <- function(x, width = max(20, options()$width - 26), ...) { title <- "Down-sampling based on " print_step(x$column, x$terms, x$trained, title, width, case_weights = x$case_weights) invisible(x) } #' @rdname step_downsample #' @usage NULL #' @export tidy.step_downsample <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_downsample <- function(x, ...) { tibble::tibble( name = "under_ratio", call_info = list( list(pkg = "dials", fun = "under_ratio") ), source = "recipe", component = "step_downsample", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_downsample <- function(x, ...) { c("themis") } themis/R/themis-package.R0000644000176200001440000000165214744045253014765 0ustar liggesusers#' @keywords internal "_PACKAGE" ## usethis namespace: start #' @import rlang #' @importFrom dplyr all_of #' @importFrom dplyr bind_rows #' @importFrom dplyr mutate #' @importFrom dplyr select #' @importFrom glue glue #' @importFrom lifecycle deprecated #' @importFrom purrr map_dfr #' @importFrom purrr map_lgl #' @importFrom recipes add_step #' @importFrom recipes bake #' @importFrom recipes check_new_data #' @importFrom recipes check_type #' @importFrom recipes is_trained #' @importFrom recipes prep #' @importFrom recipes print_step #' @importFrom recipes rand_id #' @importFrom recipes recipes_eval_select #' @importFrom recipes sel2char #' @importFrom recipes step #' @importFrom rlang := #' @importFrom rlang caller_env #' @importFrom rlang enquos #' @importFrom ROSE ROSE #' @importFrom tibble as_tibble #' @importFrom tibble tibble #' @importFrom vctrs vec_cbind #' @importFrom withr with_seed ## usethis namespace: end NULL themis/R/adasyn.R0000644000176200001440000001605414744276166013375 0ustar liggesusers#' Apply Adaptive Synthetic Algorithm #' #' `step_adasyn()` creates a *specification* of a recipe step that generates #' synthetic positive instances using ADASYN algorithm. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param neighbors An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_adasyn" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references He, H., Bai, Y., Garcia, E. and Li, S. 2008. ADASYN: Adaptive #' synthetic sampling approach for imbalanced learning. Proceedings of #' IJCNN 2008. (IEEE World Congress on Computational Intelligence). IEEE #' International Joint Conference. pp.1322-1328. #' #' @seealso [adasyn()] for direct implementation #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the minority levels up to about 1000 each #' # 1000/2211 is approx 0.4523 #' step_adasyn(class, over_ratio = 0.4523) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without ADASYN") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_adasyn(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With ADASYN") step_adasyn <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("adasyn")) { check_number_whole(seed) add_step( recipe, step_adasyn_new( terms = enquos(...), role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_adasyn_new <- function(terms, role, trained, column, over_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "adasyn", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_adasyn <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) check_number_whole(x$neighbors, arg = "neighbors", min = 1) check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_type(training[, predictors], types = c("double", "integer")) check_na(select(training, all_of(c(col_name, predictors)))) step_adasyn_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_adasyn <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } new_data <- as.data.frame(new_data) predictor_data <- new_data[, col_names] # adasyn with seed for reproducibility with_seed( seed = object$seed, code = { synthetic_data <- adasyn_impl( predictor_data, object$column, k = object$neighbors, over_ratio = object$over_ratio ) synthetic_data <- as_tibble(synthetic_data) } ) new_data <- na_splice(new_data, synthetic_data, object) new_data } #' @export print.step_adasyn <- function(x, width = max(20, options()$width - 26), ...) { title <- "adasyn based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_adasyn #' @usage NULL #' @export tidy.step_adasyn <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_adasyn <- function(x, ...) { tibble::tibble( name = c("over_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "over_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_adasyn", component_id = x$id ) } #' S3 methods for tracking which additional packages are needed for steps. #' #' @param x A recipe step #' @return A character vector #' @rdname required_pkgs.step #' @keywords internal #' @export required_pkgs.step_adasyn <- function(x, ...) { c("themis") } themis/R/tomek.R0000644000176200001440000001306214744276166013231 0ustar liggesusers#' Remove Tomek’s Links #' #' `step_tomek()` creates a *specification* of a recipe step that removes #' majority class instances of tomek links. #' #' @inheritParams recipes::step_center #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The factor variable used to balance around must only have 2 levels. All #' other variables must be numerics with no missing data. #' #' A tomek link is defined as a pair of points from different classes and are #' each others nearest neighbors. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' @template case-weights-not-supported #' #' @references Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., #' 6:769-772, 1976. #' #'@seealso [tomek()] for direct implementation #' @family Steps for under-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' step_tomek(class) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without Tomek") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_tomek(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With Tomek") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) step_tomek <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("tomek")) { check_number_whole(seed) add_step( recipe, step_tomek_new( terms = enquos(...), role = role, trained = trained, column = column, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_tomek_new <- function(terms, role, trained, column, predictors, skip, seed, id) { step( subclass = "tomek", terms = terms, role = role, trained = trained, column = column, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_tomek <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_type(training[, predictors], types = c("double", "integer")) check_na(select(training, all_of(c(col_name, predictors)))) step_tomek_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_tomek <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } predictor_data <- new_data[, col_names] # tomek with seed for reproducibility with_seed( seed = object$seed, code = { tomek_data <- tomek_impl( df = predictor_data, var = object$column ) } ) if (length(tomek_data) > 0) { new_data <- new_data[-tomek_data, ] } new_data } #' @export print.step_tomek <- function(x, width = max(20, options()$width - 26), ...) { title <- "Tomek based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_tomek #' @usage NULL #' @export tidy.step_tomek <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @rdname required_pkgs.step #' @export required_pkgs.step_tomek <- function(x, ...) { c("themis") } themis/R/rose.R0000644000176200001440000002105514744276166013063 0ustar liggesusers#' Apply ROSE Algorithm #' #' `step_rose()` creates a *specification* of a recipe step that generates #' sample of synthetic data by enlarging the features space of minority and #' majority class example. Using [ROSE::ROSE()]. #' #' @inheritParams recipes::step_center #' @inheritParams step_upsample #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param minority_prop A numeric. Determines the of over-sampling of the #' minority class. Defaults to 0.5. #' @param minority_smoothness A numeric. Shrink factor to be multiplied by the #' smoothing parameters to estimate the conditional kernel density of the #' minority class. Defaults to 1. #' @param majority_smoothness A numeric. Shrink factor to be multiplied by the #' smoothing parameters to estimate the conditional kernel density of the #' majority class. Defaults to 1. #' @param seed An integer that will be used as the seed when #' rose-ing. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' The factor variable used to balance around must only have 2 levels. #' #' The ROSE algorithm works by selecting an observation belonging to class k #' and generates new examples in its neighborhood is determined by some matrix #' H_k. Smaller values of these arguments have the effect of shrinking the #' entries of the corresponding smoothing matrix H_k, Shrinking would be a #' cautious choice if there is a concern that excessively large neighborhoods #' could lead to blur the boundaries between the regions of the feature space #' associated with each class. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_rose" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a #' Package for Binary Imbalanced Learning. R Jorunal, 6:82–92. #' @references Menardi, G. and Torelli, N. (2014). Training and assessing #' classification rules with imbalanced data. Data Mining and Knowledge #' Discovery, 28:92–122. #' #' @family Steps for over-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' mutate(class = factor(class == "VF", labels = c("not VF", "VF"))) %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' step_rose(class) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without ROSE") #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_rose(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With ROSE") step_rose <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, minority_prop = 0.5, minority_smoothness = 1, majority_smoothness = 1, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("rose")) { check_number_decimal(minority_prop, min = 0) check_number_decimal(minority_smoothness, min = 0) check_number_decimal(majority_smoothness, min = 0) check_number_whole(seed) add_step( recipe, step_rose_new( terms = enquos(...), role = role, trained = trained, column = column, over_ratio = over_ratio, minority_prop = minority_prop, minority_smoothness = minority_smoothness, majority_smoothness = majority_smoothness, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_rose_new <- function(terms, role, trained, column, over_ratio, minority_prop, minority_smoothness, majority_smoothness, predictors, skip, seed, id) { step( subclass = "rose", terms = terms, role = role, trained = trained, column = column, over_ratio = over_ratio, minority_prop = minority_prop, minority_smoothness = minority_smoothness, majority_smoothness = majority_smoothness, predictors = predictors, skip = skip, seed = seed, id = id ) } #' @export prep.step_rose <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$over_ratio, arg = "over_ratio", min = 0) check_1_selected(col_name) check_column_factor(training, col_name) check_2_levels_only(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_na(select(training, all_of(col_name))) step_rose_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, over_ratio = x$over_ratio, minority_prop = x$minority_prop, minority_smoothness = x$minority_smoothness, majority_smoothness = x$majority_smoothness, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_rose <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } if (any(is.na(new_data[[object$column]]))) { missing <- new_data[is.na(new_data[[object$column]]), ] } else { missing <- NULL } new_data <- as.data.frame(new_data) predictor_data <- new_data[, col_names] # rose with seed for reproducibility majority_size <- max(table(predictor_data[[object$column]])) * 2 with_seed( seed = object$seed, code = { original_levels <- levels(predictor_data[[object$column]]) synthetic_data <- ROSE( string2formula(object$column), predictor_data, N = floor(majority_size * object$over_ratio), p = object$minority_prop, hmult.majo = object$majority_smoothness, hmult.mino = object$minority_smoothness ) synthetic_data <- synthetic_data$data synthetic_data[[object$column]] <- factor( synthetic_data[[object$column]], levels = original_levels ) synthetic_data <- as_tibble(synthetic_data) } ) new_data <- na_splice(new_data, synthetic_data, object) new_data } #' @export print.step_rose <- function(x, width = max(20, options()$width - 26), ...) { title <- "ROSE based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_rose #' @usage NULL #' @export tidy.step_rose <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_rose <- function(x, ...) { tibble::tibble( name = c("over_ratio"), call_info = list( list(pkg = "dials", fun = "over_ratio") ), source = "recipe", component = "step_rose", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_rose <- function(x, ...) { c("themis", "ROSE") } themis/R/nearmiss_impl.R0000644000176200001440000000561414744045253014747 0ustar liggesusers#' Remove Points Near Other Classes #' #' Generates synthetic positive instances using nearmiss algorithm. #' #' @inheritParams step_nearmiss #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' All columns used in this function must be numeric with no missing data. #' #' @references Inderjeet Mani and I Zhang. knn approach to unbalanced data #' distributions: a case study involving information extraction. In Proceedings #' of workshop on learning from imbalanced datasets, 2003. #' #' @seealso [step_nearmiss()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- nearmiss(circle_numeric, var = "class") #' #' res <- nearmiss(circle_numeric, var = "class", k = 10) #' #' res <- nearmiss(circle_numeric, var = "class", under_ratio = 1.5) nearmiss <- function(df, var, k = 5, under_ratio = 1) { check_data_frame(df) check_var(var, df) check_number_whole(k, min = 1) check_number_decimal(under_ratio) predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -all_of(var))) nearmiss_impl(df, var, ignore_vars = character(), k, under_ratio) } nearmiss_impl <- function(df, var, ignore_vars, k = 5, under_ratio = 1) { classes <- downsample_count(df, var, under_ratio) out_dfs <- list() deleted_rows <- integer() for (i in seq_along(classes)) { df_only <- df[, !names(df) %in% ignore_vars] class <- subset_to_matrix(df_only, var, names(classes)[i]) not_class <- subset_to_matrix(df_only, var, names(classes)[i], FALSE) if (nrow(not_class) <= k) { cli::cli_abort("Not enough danger observations of {.val {names(classes)[i]}} to perform NEARMISS.") } dists <- RANN::nn2( not_class[, !(colnames(not_class) %in% ignore_vars)], class[, !(colnames(class) %in% ignore_vars)], k = k )$nn.dists selected_ind <- order(rowMeans(dists)) <= (nrow(class) - classes[i]) deleted_rows <- c(deleted_rows, which(df[[var]] %in% names(classes)[i])[!selected_ind]) } if (length(deleted_rows) > 0) { df <- df[-deleted_rows, ] } df } downsample_count <- function(data, var, ratio) { min_count <- min(table(data[[var]])) ratio_target <- min_count * ratio which_class <- which(table(data[[var]]) > ratio_target) table(data[[var]])[which_class] - ratio_target } subset_to_matrix <- function(data, var, class, equal = TRUE) { if (equal) { return(as.matrix(data[data[[var]] == class, names(data) != var])) } else { return(as.matrix(data[data[[var]] != class, names(data) != var])) } } themis/R/tomek_impl.R0000644000176200001440000000306214744045253014240 0ustar liggesusers#' Remove Tomek's links #' #' Removed observations that are part of tomek links. #' #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' All columns used in this function must be numeric with no missing data. #' #' @references Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., #' 6:769-772, 1976. #' #' @seealso [step_tomek()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- tomek(circle_numeric, var = "class") tomek <- function(df, var) { check_data_frame(df) check_var(var, df) predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -all_of(var))) df[-tomek_impl(df, var), ] } tomek_impl <- function(df, var) { res <- RANN::nn2(df[names(df) != var], k = 2)$nn.idx # Make sure itself isn't counted as nearest neighbor for overlaps res <- dplyr::if_else(seq_len(nrow(res)) == res[, 2], res[, 1], res[, 2]) remove <- logical(nrow(df)) outcome <- df[[var]] for (class in unique(outcome)) { target <- which(outcome == class) neighbor <- res[target] neighbor_neighbor <- res[neighbor] tomek <- target == neighbor_neighbor & outcome[target] != outcome[neighbor] tomek_links <- c(target[tomek], neighbor[tomek]) remove[tomek_links] <- TRUE } which(remove) } themis/R/data.R0000644000176200001440000000064114744045253013011 0ustar liggesusers#' Synthetic Dataset With a Circle #' #' A random dataset with two classes one of which is inside a circle. Used for #' examples to show how the different methods handles borders. #' #' @format A data frame with 200 rows and 4 variables: #' \describe{ #' \item{x}{Numeric.} #' \item{y}{Numeric.} #' \item{class}{Factor, values "Circle" and "Rest".} #' \item{id}{character, ID variable.} #' } "circle_example" themis/R/nearmiss.R0000644000176200001440000001630414744276166013735 0ustar liggesusers#' Remove Points Near Other Classes #' #' `step_nearmiss()` creates a *specification* of a recipe step that removes #' majority class instances by undersampling points in the majority class based #' on their distance to other points in the same class. #' #' @inheritParams recipes::step_center #' @inheritParams step_downsample #' @inheritParams step_smote #' @param ... One or more selector functions to choose which #' variable is used to sample the data. See [recipes::selections] #' for more details. The selection should result in _single #' factor variable_. For the `tidy` method, these are not #' currently used. #' @param role Not used by this step since no new variables are #' created. #' @param column A character string of the variable name that will #' be populated (eventually) by the `...` selectors. #' @param seed An integer that will be used as the seed when #' applied. #' @return An updated version of `recipe` with the new step #' added to the sequence of existing steps (if any). For the #' `tidy` method, a tibble with columns `terms` which is #' the variable used to sample. #' #' @details #' This method retains the points from the majority class which have the #' smallest mean distance to the k nearest points in the minority class. #' #' All columns in the data are sampled and returned by [recipes::juice()] #' and [recipes::bake()]. #' #' All columns used in this step must be numeric with no missing data. #' #' When used in modeling, users should strongly consider using the #' option `skip = TRUE` so that the extra sampling is _not_ #' conducted outside of the training set. #' #' # Tidying #' #' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is retruned with #' columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected} #' \item{id}{character, id of this step} #' } #' #' ```{r, echo = FALSE, results="asis"} #' step <- "step_nearmiss" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` #' #' @template case-weights-not-supported #' #' @references Inderjeet Mani and I Zhang. knn approach to unbalanced data #' distributions: a case study involving information extraction. In Proceedings #' of workshop on learning from imbalanced datasets, 2003. #' #' @seealso [nearmiss()] for direct implementation #' @family Steps for under-sampling #' #' @export #' @examplesIf rlang::is_installed("modeldata") #' library(recipes) #' library(modeldata) #' data(hpc_data) #' #' hpc_data0 <- hpc_data %>% #' select(-protocol, -day) #' #' orig <- count(hpc_data0, class, name = "orig") #' orig #' #' up_rec <- recipe(class ~ ., data = hpc_data0) %>% #' # Bring the majority levels down to about 1000 each #' # 1000/259 is approx 3.862 #' step_nearmiss(class, under_ratio = 3.862) %>% #' prep() #' #' training <- up_rec %>% #' bake(new_data = NULL) %>% #' count(class, name = "training") #' training #' #' # Since `skip` defaults to TRUE, baking the step has no effect #' baked <- up_rec %>% #' bake(new_data = hpc_data0) %>% #' count(class, name = "baked") #' baked #' #' # Note that if the original data contained more rows than the #' # target n (= ratio * majority_n), the data are left alone: #' orig %>% #' left_join(training, by = "class") %>% #' left_join(baked, by = "class") #' #' library(ggplot2) #' #' ggplot(circle_example, aes(x, y, color = class)) + #' geom_point() + #' labs(title = "Without NEARMISS") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) #' #' recipe(class ~ x + y, data = circle_example) %>% #' step_nearmiss(class) %>% #' prep() %>% #' bake(new_data = NULL) %>% #' ggplot(aes(x, y, color = class)) + #' geom_point() + #' labs(title = "With NEARMISS") + #' xlim(c(1, 15)) + #' ylim(c(1, 15)) step_nearmiss <- function(recipe, ..., role = NA, trained = FALSE, column = NULL, under_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("nearmiss")) { check_number_whole(seed) add_step( recipe, step_nearmiss_new( terms = enquos(...), role = role, trained = trained, column = column, under_ratio = under_ratio, neighbors = neighbors, predictors = NULL, skip = skip, seed = seed, id = id ) ) } step_nearmiss_new <- function(terms, role, trained, column, under_ratio, neighbors, predictors, skip, seed, id) { step( subclass = "nearmiss", terms = terms, role = role, trained = trained, column = column, under_ratio = under_ratio, neighbors = neighbors, predictors = predictors, skip = skip, id = id, seed = seed, id = id ) } #' @export prep.step_nearmiss <- function(x, training, info = NULL, ...) { col_name <- recipes_eval_select(x$terms, training, info) check_number_decimal(x$under_ratio, arg = "under_ratio", min = 0) check_number_whole(x$neighbors, arg = "neighbors", min = 1) check_1_selected(col_name) check_column_factor(training, col_name) predictors <- setdiff(get_from_info(info, "predictor"), col_name) check_type(training[, predictors], types = c("double", "integer")) check_na(select(training, all_of(c(col_name, predictors)))) step_nearmiss_new( terms = x$terms, role = x$role, trained = TRUE, column = col_name, under_ratio = x$under_ratio, neighbors = x$neighbors, predictors = predictors, skip = x$skip, seed = x$seed, id = x$id ) } #' @export bake.step_nearmiss <- function(object, new_data, ...) { col_names <- unique(c(object$predictors, object$column)) check_new_data(col_names, object, new_data) if (length(object$column) == 0L) { # Empty selection return(new_data) } ignore_vars <- setdiff(names(new_data), col_names) # nearmiss with seed for reproducibility with_seed( seed = object$seed, code = { original_levels <- levels(new_data[[object$column]]) new_data <- nearmiss_impl( df = new_data, var = object$column, ignore_vars = ignore_vars, k = object$neighbors, under_ratio = object$under_ratio ) new_data[[object$column]] <- factor( new_data[[object$column]], levels = original_levels ) } ) new_data } #' @export print.step_nearmiss <- function(x, width = max(20, options()$width - 26), ...) { title <- "NEARMISS-1 based on " print_step(x$column, x$terms, x$trained, title, width) invisible(x) } #' @rdname step_nearmiss #' @usage NULL #' @export tidy.step_nearmiss <- function(x, ...) { if (is_trained(x)) { res <- tibble(terms = unname(x$column)) } else { term_names <- sel2char(x$terms) res <- tibble(terms = unname(term_names)) } res$id <- x$id res } #' @export #' @rdname tunable_themis tunable.step_nearmiss <- function(x, ...) { tibble::tibble( name = c("under_ratio", "neighbors"), call_info = list( list(pkg = "dials", fun = "under_ratio"), list(pkg = "dials", fun = "neighbors", range = c(1, 10)) ), source = "recipe", component = "step_nearmiss", component_id = x$id ) } #' @rdname required_pkgs.step #' @export required_pkgs.step_nearmiss <- function(x, ...) { c("themis") } themis/R/reexports.R0000644000176200001440000000027714744045253014140 0ustar liggesusers#' @importFrom generics tidy #' @export generics::tidy #' @importFrom generics required_pkgs #' @export generics::required_pkgs #' @importFrom generics tunable #' @export generics::tunable themis/R/smote_impl.R0000644000176200001440000000755614744045253014264 0ustar liggesusers#' SMOTE Algorithm #' #' SMOTE generates new examples of the minority class using nearest neighbors #' of these cases. #' #' @inheritParams step_smote #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. # #' All columns used in this function must be numeric with no missing data. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @seealso [step_smote()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- smote(circle_numeric, var = "class") #' #' res <- smote(circle_numeric, var = "class", k = 10) #' #' res <- smote(circle_numeric, var = "class", over_ratio = 0.8) smote <- function(df, var, k = 5, over_ratio = 1) { check_data_frame(df) check_var(var, df) check_number_whole(k, min = 1) check_number_decimal(over_ratio) predictors <- setdiff(colnames(df), var) check_numeric(df[, predictors]) check_na(select(df, -all_of(var))) smote_impl(df, var, k, over_ratio) } smote_impl <- function(df, var, k, over_ratio, call = caller_env()) { data <- split(df, df[[var]]) majority_count <- max(table(df[[var]])) ratio_target <- majority_count * over_ratio which_upsample <- which(table(df[[var]]) < ratio_target) samples_needed <- ratio_target - table(df[[var]])[which_upsample] min_names <- names(samples_needed) out_dfs <- list() for (i in seq_along(samples_needed)) { minority_df <- data[[min_names[i]]] minority <- as.matrix(minority_df[names(minority_df) != var]) if (nrow(minority) <= k) { cli::cli_abort("Not enough observations of {.val {min_names[i]}} to perform SMOTE.", call = call) } synthetic <- smote_data(minority, k = k, n_samples = samples_needed[i]) out_df <- as.data.frame(synthetic) names(out_df) <- setdiff(names(df), var) out_df_nrow <- min(nrow(out_df), 1) out_df[var] <- data[[names(samples_needed)[i]]][[var]][out_df_nrow] out_df <- out_df[names(df)] out_dfs[[i]] <- out_df } final <- rbind(df, do.call(rbind, out_dfs)) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } smote_data <- function(data, k, n_samples, smote_ids = seq_len(nrow(data))) { ids <- RANN::nn2(data, k = k + 1, searchtype = "priority")$nn.idx indexes <- rep(sample(smote_ids), length.out = n_samples) index_len <- tabulate(indexes, NROW(data)) out <- matrix(0, nrow = n_samples, ncol = ncol(data)) sampleids <- sample.int(k, n_samples, TRUE) runif_ids <- stats::runif(n_samples) iii <- 0 for (row_num in smote_ids) { index_selection <- iii + seq_len(index_len[row_num]) # removes itself as nearest neighbour id_knn <- ids[row_num, ids[row_num, ] != row_num] dif <- data[id_knn[sampleids[index_selection]], ] - data[rep(row_num, index_len[row_num]), ] gap <- dif * runif_ids[index_selection] out[index_selection, ] <- data[rep(row_num, index_len[row_num]), ] + gap iii <- iii + index_len[row_num] } out } themis/R/smotenc_impl.R0000644000176200001440000001447114744045253014577 0ustar liggesusers#' SMOTENC Algorithm #' #' SMOTENC generates new examples of the minority class using nearest neighbors #' of these cases, and can handle categorical variables #' #' @inheritParams step_smotenc #' @param df data.frame or tibble. Must have 1 factor variable and remaining #' numeric variables. #' @param var Character, name of variable containing factor variable. #' @param k An integer. Number of nearest neighbor that are used #' to generate the new examples of the minority class. #' #' @return A data.frame or tibble, depending on type of `df`. #' @export #' #' @details #' The parameter `neighbors` controls the way the new examples are created. #' For each currently existing minority class example X new examples will be #' created (this is controlled by the parameter `over_ratio` as mentioned #' above). These examples will be generated by using the information from the #' `neighbors` nearest neighbor of each example of the minority class. #' The parameter `neighbors` controls how many of these neighbor are used. # #' Columns can be numeric and categorical with no missing data. #' #' @references Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, #' W. P. (2002). Smote: Synthetic minority over-sampling technique. #' Journal of Artificial Intelligence Research, 16:321-357. #' #' @seealso [step_smotenc()] for step function of this method #' @family Direct Implementations #' #' @examples #' circle_numeric <- circle_example[, c("x", "y", "class")] #' #' res <- smotenc(circle_numeric, var = "class") #' #' res <- smotenc(circle_numeric, var = "class", k = 10) #' #' res <- smotenc(circle_numeric, var = "class", over_ratio = 0.8) smotenc <- function(df, var, k = 5, over_ratio = 1) { check_data_frame(df) check_var(var, df) check_number_whole(k, min = 1) check_number_decimal(over_ratio) check_na(select(df, -all_of(var))) smotenc_impl(df, var, k, over_ratio) } # Splits data and appends new minority instances smotenc_impl <- function(df, var, k, over_ratio) { # split data into list names by classes data <- split(df, df[[var]]) # Number of majority instances majority_count <- max(table(df[[var]])) # How many minority samples do we want in total? ratio_target <- majority_count * over_ratio # How many classes do we need to upsample (account for 2+ classes!) # Get the indices of those classes which_upsample <- which(table(df[[var]]) < ratio_target) # For each minorty class, determine how many more samples are needed samples_needed <- ratio_target - table(df[[var]])[which_upsample] # Just saving the names of those classes min_names <- names(samples_needed) # Create a list to save all the new minority classes out_dfs <- list() # Loop through all the minorty classes, this will only loop once if there is only one minorit class for (i in seq_along(samples_needed)) { # Extract the minority dataframe minority <- data[[min_names[i]]] # Ensure that we have more minority isntances than desired neighbors if (nrow(minority) <= k) { cli::cli_abort( "Not enough observations of {.var {min_names[i]}} to perform SMOTE." ) } # Run the smote algorithm (minority data, # of neighbors, # of sampeles needed) out_df <- smotenc_data(minority, k = k, n_samples = samples_needed[i]) out_dfs[[i]] <- out_df } # Bind all of the synthesized minority classes together final <- rbind(df, do.call(rbind, out_dfs)) # Make sure the levels are correct for every categorial variable (needed?) final[[var]] <- factor(final[[var]], levels = levels(df[[var]])) rownames(final) <- NULL final } # Uses nearest-neighbors and interpolation to generate new instances smotenc_data <- function(data, k, n_samples, smotenc_ids = seq_len(nrow(data))) { # Turning integer values into doubles integer_cols <- vapply(data, is.integer, FUN.VALUE = logical(1)) if (any(integer_cols)) { for (col in names(integer_cols)[integer_cols]) { data[[col]] <- as.double(data[[col]]) } } numeric_cols <- vapply(data, is.numeric, FUN.VALUE = logical(1)) # Runs a nearest neighbor search # outputs a matrix, each row is a minority instance and each column is a nearest neighbor # k is +1 because the sample is always a nearest neighbor to itself suppressWarnings( ids <- t(gower::gower_topn(x = data, y = data, n = k + 1, nthread = 1)$index) ) # shuffles minority indicies and repeats that shuffling until the desired number of samples is reached indexes <- rep(sample(smotenc_ids), length.out = n_samples) # tabulates how many times each minority instance is used index_len <- tabulate(indexes, NROW(data)) # Initialize matrix for newly generated samples out <- data[rep(smotenc_ids, length.out = n_samples), ] # For each new sample pick a random nearest neighbor to interpoate with (1 to k) sampleids <- sample.int(k, n_samples, TRUE) # pick distance along parameterized line between current sample and chosen nearest neighbor runif_ids <- stats::runif(n_samples) out_numeric <- as.matrix(out[numeric_cols]) out_factors <- as.matrix(out[!numeric_cols]) data_numeric <- as.matrix(data[numeric_cols]) data_factors <- as.matrix(data[!numeric_cols]) iii <- 0 for (row_num in smotenc_ids) { # List indices from 1:n where n is the number of times that sample is used to generate a new sample # iii shifts 1:n to fill in the rows of out (e.g. 1:3, 4:6, 7:8, etc.) index_selection <- iii + seq_len(index_len[row_num]) # removes itself as nearest neighbour id_knn <- ids[row_num, ids[row_num, ] != row_num] # need a total of index_len[row_num] new samples # calculates Xnew = X1 + t*(X1-Xnn) dif <- data_numeric[id_knn[sampleids[index_selection]], ] - data_numeric[rep(row_num, index_len[row_num]), ] gap <- dif * runif_ids[index_selection] out_numeric[index_selection, ] <- data_numeric[rep(row_num, index_len[row_num]), ] + gap # Replace categories with most frequent among nearest neighbors cat_to_upgrade <- data_factors[id_knn[sampleids[index_selection]], , drop = FALSE] cat_modes <- apply(cat_to_upgrade, 2, Mode) cat_replacement <- matrix( rep(cat_modes, length(index_selection)), nrow = length(index_selection), byrow = TRUE ) out_factors[index_selection, ] <- cat_replacement iii <- iii + index_len[row_num] } vec_cbind(out_numeric, out_factors)[names(data)] } themis/data/0000755000176200001440000000000014744045253012464 5ustar liggesusersthemis/data/circle_example.rda0000644000176200001440000001251014744045253016127 0ustar liggesusersBZh91AY&SYb8_pï^|UU+THQ e156SMI&&&ƆL)B4mOSS&SM4lOMOʛ L Tie=LQ6(ނ4a&d4F#55 Dѩ'"aa&#ɦ Shh=Fdj~&E=Lɩ6d50m&H d=53BzOByOFM&A&dѩ@&&@ SC@h2h444Pꩪ ~~4h&22 CC@ h ɠ4*ҥ$)O=jdad da#a2i` @ b0 ba4M14`!I)zDh@zA3F &iM4=F2h4hAFOSCFhyCj4mhhi di@%/bl&ܔ)cCbm,i Mi Л@m6 lhm lm1@؆mclIlm dBEl!!:@`e5pL*1\M,jI:mSSk 9B- "А̥I+Z_6OKz ס0M{_/.Wn`h0HKjt/HHF:V;C)TX2l'<4ܟ qИ`0^,曘ʙYHFRIT"xt1v: Œ$T[>^,)`>ݘ4BJNZbEiJu 6M¡]PT۞a\9!PzYpؙ6Y EbGJqAm)$w5>Y~;Q:yP],|䎜-I*З83 1^$2Z-Qd BlI=,]+"la5MQ1$T#;He(3 (Z+R Ӏ @%qqUz\9f8se&u1nI WG]&>A'YIwxJNYʄTvBI+ǫLE Vrp'0"@' IA: !`f5<`WɱRy/ DۧQl[_r!$ @Zibj |=V˗ kiݏ 1j}N`~l@CSmu#:-Co$amtq/W)Q\v.QLպ鰮}]C C3a|kARC>Nv`c9!zkJ`_Y1Mk#{3 H}[YL 'jlFC홬zjKWi~on9s꧗Nߠvݣ_.\ K,,,,ӧDDDDDDDDDYI$[NsRΥJGxƃ$h2I&$h2I&$h2I&$h2I&$h6"鰶gL2dɜwwp 1RsJV)JT)JRXRI$dMI$dMI$kQEMRJ–X,lliӧN;=I$dMI$n QELQD I$ I$ T3ӖA3Y%^zp\ע+Rw~|NznnnUZjնxVVQzJ8J*TwpjΤ8;"Sn+2vvvdɓ&Lѧ9񲔥5a6 Q|x 4˗.]0#;SϱN{Zjծwwwww,"@AR}>: '(5Ge4An(?ǵʋlfQg>n[ KTٳʻfW Og7yǽoBGd/Qb+aAʨ}ŷjEyZVĠr4n̡ƾElSLMY/@on:[,S*^e'~8V!q@Rj0hg?iI θ1lJs1,3e`0Rb>H(NQpM8tQ~&J$IʖNd@^Wv4FI W50-Dq~SPݏ8Sm_XeF1z« '%{T8b$EcfEhȫ~)h+6nޟeB/cI(FPɳySt >4;0z5Y]O-`h7 "1REiѣ'r"5ɷscܼXt*[FE eWpm_X߭ @GAƾbN1YioipTz)5Wdn'Uv#'a^.fujb*.Ÿ6>d s޻,"DiG,,/3^#(^e4wzH ~ҙwc~l*"QjIm0%0*#ߋ3o"HIKZٌ`e]'7Cji7dQx4۝ kkwYvu T/n+r. !I_TIL&Voxq{\>e2zI!,:lAeo\fGkGEq;{oW^#[_:h0R$N_qm?iMG{pwcUu=%;kCUŕ'! [![R-CMD-check](https://github.com/tidymodels/themis/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/themis/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/tidymodels/themis/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidymodels/themis?branch=main) [![CRAN status](https://www.r-pkg.org/badges/version/themis)](https://CRAN.R-project.org/package=themis) [![Downloads](http://cranlogs.r-pkg.org/badges/themis)](https://CRAN.R-project.org/package=themis) [![Lifecycle: maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) **themis** contains extra steps for the [`recipes`](https://CRAN.R-project.org/package=recipes) package for dealing with unbalanced data. The name **themis** is that of the [ancient Greek god](https://thishollowearth.wordpress.com/2012/07/02/god-of-the-week-themis/) who is typically depicted with a balance. ## Installation You can install the released version of themis from [CRAN](https://CRAN.R-project.org) with: ``` r install.packages("themis") ``` Install the development version from GitHub with: ``` r # install.packages("pak") pak::pak("tidymodels/themis") ``` ## Example Following is a example of using the [SMOTE](https://jair.org/index.php/jair/article/view/10302/24590) algorithm to deal with unbalanced data ``` r library(recipes) library(modeldata) library(themis) data("credit_data", package = "modeldata") credit_data0 <- credit_data %>% filter(!is.na(Job)) count(credit_data0, Job) #> Job n #> 1 fixed 2805 #> 2 freelance 1024 #> 3 others 171 #> 4 partime 452 ds_rec <- recipe(Job ~ Time + Age + Expenses, data = credit_data0) %>% step_impute_mean(all_predictors()) %>% step_smote(Job, over_ratio = 0.25) %>% prep() ds_rec %>% bake(new_data = NULL) %>% count(Job) #> # A tibble: 4 × 2 #> Job n #> #> 1 fixed 2805 #> 2 freelance 1024 #> 3 others 701 #> 4 partime 701 ``` ## Methods Below is some unbalanced data. Used for examples latter. ``` r example_data <- data.frame(class = letters[rep(1:5, 1:5 * 10)], x = rnorm(150)) library(ggplot2) example_data %>% ggplot(aes(class)) + geom_bar() ``` Bar chart with 5 columns. class on the x-axis and count on the y-axis. Class a has height 10, b has 20, c has 30, d has 40, and e has 50. ### Upsample / Over-sampling The following methods all share the tuning parameter `over_ratio`, which is the ratio of the minority-to-majority frequencies. | name | function | Multi-class | |----|----|----| | Random minority over-sampling with replacement | `step_upsample()` | :heavy_check_mark: | | Synthetic Minority Over-sampling Technique | `step_smote()` | :heavy_check_mark: | | Borderline SMOTE-1 | `step_bsmote(method = 1)` | :heavy_check_mark: | | Borderline SMOTE-2 | `step_bsmote(method = 2)` | :heavy_check_mark: | | Adaptive synthetic sampling approach for imbalanced learning | `step_adasyn()` | :heavy_check_mark: | | Generation of synthetic data by Randomly Over Sampling Examples | `step_rose()` | | By setting `over_ratio = 1` you bring the number of samples of all minority classes equal to 100% of the majority class. ``` r recipe(~., example_data) %>% step_upsample(class, over_ratio = 1) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` Bar chart with 5 columns. class on the x-axis and count on the y-axis. class a, b, c, d, and e all have a height of 50. and by setting `over_ratio = 0.5` we upsample any minority class with less samples then 50% of the majority up to have 50% of the majority. ``` r recipe(~., example_data) %>% step_upsample(class, over_ratio = 0.5) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` Bar chart with 5 columns. class on the x-axis and count on the y-axis. Class a has height 25, b has 25, c has 30, d has 40, and e has 50. ### Downsample / Under-sampling Most of the the following methods all share the tuning parameter `under_ratio`, which is the ratio of the majority-to-minority frequencies. | name | function | Multi-class | under_ratio | |----|----|----|----| | Random majority under-sampling with replacement | `step_downsample()` | :heavy_check_mark: | :heavy_check_mark: | | NearMiss-1 | `step_nearmiss()` | :heavy_check_mark: | :heavy_check_mark: | | Extraction of majority-minority Tomek links | `step_tomek()` | | | By setting `under_ratio = 1` you bring the number of samples of all majority classes equal to 100% of the minority class. ``` r recipe(~., example_data) %>% step_downsample(class, under_ratio = 1) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` Bar chart with 5 columns. class on the x-axis and count on the y-axis. Class a, b, c, d, and e all have a height of 10. and by setting `under_ratio = 2` we downsample any majority class with more then 200% samples of the minority class down to have to 200% samples of the minority. ``` r recipe(~., example_data) %>% step_downsample(class, under_ratio = 2) %>% prep() %>% bake(new_data = NULL) %>% ggplot(aes(class)) + geom_bar() ``` Bar chart with 5 columns. class on the x-axis and count on the y-axis. Class a has height 10, b, c, d, and e have ha height of 20. ## Contributing This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. - For questions and discussions about tidymodels packages, modeling, and machine learning, [join us on RStudio Community](https://forum.posit.co/new-topic?category_id=15&tags=tidymodels,question). - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/themis/issues). - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). themis/build/0000755000176200001440000000000014744301005012640 5ustar liggesusersthemis/build/partial.rdb0000644000176200001440000000007514744301005014767 0ustar liggesusersb```b`aab`b1g``d`aҬy@D?M7themis/man/0000755000176200001440000000000014744045253012326 5ustar liggesusersthemis/man/required_pkgs.step.Rd0000644000176200001440000000227114744045253016435 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/adasyn.R, R/bsmote.R, R/downsample.R, % R/nearmiss.R, R/rose.R, R/smote.R, R/smotenc.R, R/tomek.R, R/upsample.R \name{required_pkgs.step_adasyn} \alias{required_pkgs.step_adasyn} \alias{required_pkgs.step_bsmote} \alias{required_pkgs.step_downsample} \alias{required_pkgs.step_nearmiss} \alias{required_pkgs.step_rose} \alias{required_pkgs.step_smote} \alias{required_pkgs.step_smotenc} \alias{required_pkgs.step_tomek} \alias{required_pkgs.step_upsample} \title{S3 methods for tracking which additional packages are needed for steps.} \usage{ \method{required_pkgs}{step_adasyn}(x, ...) \method{required_pkgs}{step_bsmote}(x, ...) \method{required_pkgs}{step_downsample}(x, ...) \method{required_pkgs}{step_nearmiss}(x, ...) \method{required_pkgs}{step_rose}(x, ...) \method{required_pkgs}{step_smote}(x, ...) \method{required_pkgs}{step_smotenc}(x, ...) \method{required_pkgs}{step_tomek}(x, ...) \method{required_pkgs}{step_upsample}(x, ...) } \arguments{ \item{x}{A recipe step} } \value{ A character vector } \description{ S3 methods for tracking which additional packages are needed for steps. } \keyword{internal} themis/man/tomek.Rd0000644000176200001440000000203014744045253013727 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tomek_impl.R \name{tomek} \alias{tomek} \title{Remove Tomek's links} \usage{ tomek(df, var) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ Removed observations that are part of tomek links. } \details{ All columns used in this function must be numeric with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- tomek(circle_numeric, var = "class") } \references{ Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., 6:769-772, 1976. } \seealso{ \code{\link[=step_tomek]{step_tomek()}} for step function of this method Other Direct Implementations: \code{\link{adasyn}()}, \code{\link{bsmote}()}, \code{\link{nearmiss}()}, \code{\link{smote}()}, \code{\link{smotenc}()} } \concept{Direct Implementations} themis/man/smotenc.Rd0000644000176200001440000000434714744045253014275 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/smotenc_impl.R \name{smotenc} \alias{smotenc} \title{SMOTENC Algorithm} \usage{ smotenc(df, var, k = 5, over_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ SMOTENC generates new examples of the minority class using nearest neighbors of these cases, and can handle categorical variables } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. Columns can be numeric and categorical with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- smotenc(circle_numeric, var = "class") res <- smotenc(circle_numeric, var = "class", k = 10) res <- smotenc(circle_numeric, var = "class", over_ratio = 0.8) } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \seealso{ \code{\link[=step_smotenc]{step_smotenc()}} for step function of this method Other Direct Implementations: \code{\link{adasyn}()}, \code{\link{bsmote}()}, \code{\link{nearmiss}()}, \code{\link{smote}()}, \code{\link{tomek}()} } \concept{Direct Implementations} themis/man/step_rose.Rd0000644000176200001440000001351314744276170014627 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rose.R \name{step_rose} \alias{step_rose} \alias{tidy.step_rose} \title{Apply ROSE Algorithm} \usage{ step_rose( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, minority_prop = 0.5, minority_smoothness = 1, majority_smoothness = 1, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("rose") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{minority_prop}{A numeric. Determines the of over-sampling of the minority class. Defaults to 0.5.} \item{minority_smoothness}{A numeric. Shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the minority class. Defaults to 1.} \item{majority_smoothness}{A numeric. Shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the majority class. Defaults to 1.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when rose-ing.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_rose()} creates a \emph{specification} of a recipe step that generates sample of synthetic data by enlarging the features space of minority and majority class example. Using \code{\link[ROSE:ROSE]{ROSE::ROSE()}}. } \details{ The factor variable used to balance around must only have 2 levels. The ROSE algorithm works by selecting an observation belonging to class k and generates new examples in its neighborhood is determined by some matrix H_k. Smaller values of these arguments have the effect of shrinking the entries of the corresponding smoothing matrix H_k, Shrinking would be a cautious choice if there is a concern that excessively large neighborhoods could lead to blur the boundaries between the regions of the feature space associated with each class. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 1 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% mutate(class = factor(class == "VF", labels = c("not VF", "VF"))) \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% step_rose(class) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without ROSE") recipe(class ~ x + y, data = circle_example) \%>\% step_rose(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With ROSE") \dontshow{\}) # examplesIf} } \references{ Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. R Jorunal, 6:82–92. Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. Data Mining and Knowledge Discovery, 28:92–122. } \seealso{ Other Steps for over-sampling: \code{\link{step_adasyn}()}, \code{\link{step_bsmote}()}, \code{\link{step_smote}()}, \code{\link{step_smotenc}()}, \code{\link{step_upsample}()} } \concept{Steps for over-sampling} themis/man/circle_example.Rd0000644000176200001440000000107414744045253015573 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{circle_example} \alias{circle_example} \title{Synthetic Dataset With a Circle} \format{ A data frame with 200 rows and 4 variables: \describe{ \item{x}{Numeric.} \item{y}{Numeric.} \item{class}{Factor, values "Circle" and "Rest".} \item{id}{character, ID variable.} } } \usage{ circle_example } \description{ A random dataset with two classes one of which is inside a circle. Used for examples to show how the different methods handles borders. } \keyword{datasets} themis/man/smote.Rd0000644000176200001440000000427114744045253013750 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/smote_impl.R \name{smote} \alias{smote} \title{SMOTE Algorithm} \usage{ smote(df, var, k = 5, over_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ SMOTE generates new examples of the minority class using nearest neighbors of these cases. } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns used in this function must be numeric with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- smote(circle_numeric, var = "class") res <- smote(circle_numeric, var = "class", k = 10) res <- smote(circle_numeric, var = "class", over_ratio = 0.8) } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \seealso{ \code{\link[=step_smote]{step_smote()}} for step function of this method Other Direct Implementations: \code{\link{adasyn}()}, \code{\link{bsmote}()}, \code{\link{nearmiss}()}, \code{\link{smotenc}()}, \code{\link{tomek}()} } \concept{Direct Implementations} themis/man/step_bsmote.Rd0000644000176200001440000001546014744276170015153 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bsmote.R \name{step_bsmote} \alias{step_bsmote} \alias{tidy.step_bsmote} \title{Apply borderline-SMOTE Algorithm} \usage{ step_bsmote( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, all_neighbors = FALSE, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("bsmote") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{all_neighbors}{Type of two borderline-SMOTE method. Defaults to FALSE. See details.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when smote-ing.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_bsmote()} creates a \emph{specification} of a recipe step that generate new examples of the minority class using nearest neighbors of these cases in the border region between classes. } \details{ This methods works the same way as \code{\link[=step_smote]{step_smote()}}, expect that instead of generating points around every point of of the minority class each point is first being classified into the boxes "danger" and "not". For each point the k nearest neighbors is calculated. If all the neighbors comes from a different class it is labeled noise and put in to the "not" box. If more then half of the neighbors comes from a different class it is labeled "danger. If all_neighbors = FALSE then points will be generated between nearest neighbors in its own class. If all_neighbors = TRUE then points will be generated between any nearest neighbors. See examples for visualization. The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 3 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) \item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5) \item \code{all_neighbors}: All Neighbors (type: logical, default: FALSE) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the minority levels up to about 1000 each # 1000/2211 is approx 0.4523 step_bsmote(class, over_ratio = 0.4523) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without SMOTE") recipe(class ~ x + y, data = circle_example) \%>\% step_bsmote(class, all_neighbors = FALSE) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With borderline-SMOTE, all_neighbors = FALSE") recipe(class ~ x + y, data = circle_example) \%>\% step_bsmote(class, all_neighbors = TRUE) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With borderline-SMOTE, all_neighbors = TRUE") \dontshow{\}) # examplesIf} } \references{ Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: a new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing, pages 878–887. Springer, 2005. } \seealso{ \code{\link[=bsmote]{bsmote()}} for direct implementation Other Steps for over-sampling: \code{\link{step_adasyn}()}, \code{\link{step_rose}()}, \code{\link{step_smote}()}, \code{\link{step_smotenc}()}, \code{\link{step_upsample}()} } \concept{Steps for over-sampling} themis/man/rmd/0000755000176200001440000000000014744045253013110 5ustar liggesusersthemis/man/rmd/tunable-args.Rmd0000644000176200001440000000145314744045253016143 0ustar liggesusers```{r, include = FALSE} get_dials <- function(x) { if (any(names(x) == "range")) { cl <- rlang::call2(x$fun, .ns = x$pkg, range = x$range) } else { cl <- rlang::call2(x$fun, .ns = x$pkg) } rlang::eval_tidy(cl) } get_param_list <- function(x) { args <- formals(x) params <- getS3method("tunable", x)(list()) %>% dplyr::mutate( default = args[name], dials = purrr::map(call_info, get_dials), label = purrr::map_chr(dials, ~ .x$label), type = purrr::map_chr(dials, ~ .x$type), item = glue::glue("- `{name}`: {label} (type: {type}, default: {default})\n\n") ) params$item } ``` # Tuning Parameters ```{r echo = FALSE} param <- get_param_list(step) ``` This step has `r length(param)` tuning parameters: ```{r echo = FALSE, results = "asis"} param ``` themis/man/nearmiss.Rd0000644000176200001440000000342414744045253014441 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nearmiss_impl.R \name{nearmiss} \alias{nearmiss} \title{Remove Points Near Other Classes} \usage{ nearmiss(df, var, k = 5, under_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{under_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled down to have the same frequency as the least occurring level. A value of 2 would mean that the majority levels will have (at most) (approximately) twice as many rows than the minority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ Generates synthetic positive instances using nearmiss algorithm. } \details{ All columns used in this function must be numeric with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- nearmiss(circle_numeric, var = "class") res <- nearmiss(circle_numeric, var = "class", k = 10) res <- nearmiss(circle_numeric, var = "class", under_ratio = 1.5) } \references{ Inderjeet Mani and I Zhang. knn approach to unbalanced data distributions: a case study involving information extraction. In Proceedings of workshop on learning from imbalanced datasets, 2003. } \seealso{ \code{\link[=step_nearmiss]{step_nearmiss()}} for step function of this method Other Direct Implementations: \code{\link{adasyn}()}, \code{\link{bsmote}()}, \code{\link{smote}()}, \code{\link{smotenc}()}, \code{\link{tomek}()} } \concept{Direct Implementations} themis/man/step_smote.Rd0000644000176200001440000001305214744276170015004 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/smote.R \name{step_smote} \alias{step_smote} \alias{tidy.step_smote} \title{Apply SMOTE Algorithm} \usage{ step_smote( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smote") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when smote-ing.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_smote()} creates a \emph{specification} of a recipe step that generate new examples of the minority class using nearest neighbors of these cases. } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 2 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) \item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the minority levels up to about 1000 each # 1000/2211 is approx 0.4523 step_smote(class, over_ratio = 0.4523) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without SMOTE") recipe(class ~ x + y, data = circle_example) \%>\% step_smote(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With SMOTE") \dontshow{\}) # examplesIf} } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \seealso{ \code{\link[=smote]{smote()}} for direct implementation Other Steps for over-sampling: \code{\link{step_adasyn}()}, \code{\link{step_bsmote}()}, \code{\link{step_rose}()}, \code{\link{step_smotenc}()}, \code{\link{step_upsample}()} } \concept{Steps for over-sampling} themis/man/step_smotenc.Rd0000644000176200001440000001254614744276170015334 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/smotenc.R \name{step_smotenc} \alias{step_smotenc} \alias{tidy.step_smotenc} \title{Apply SMOTENC algorithm} \usage{ step_smotenc( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("smotenc") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when smote-ing.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_smotenc()} creates a \emph{specification} of a recipe step that generate new examples of the minority class using nearest neighbors of these cases. Gower's distance is used to handle mixed data types. For categorical variables, the most common category along neighbors is chosen. } \details{ The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. Columns can be numeric and categorical with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 2 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) \item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) orig <- count(hpc_data, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data) \%>\% step_impute_knn(all_predictors()) \%>\% # Bring the minority levels up to about 1000 each # 1000/2211 is approx 0.4523 step_smotenc(class, over_ratio = 0.4523) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") \dontshow{\}) # examplesIf} } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \seealso{ \code{\link[=smotenc]{smotenc()}} for direct implementation Other Steps for over-sampling: \code{\link{step_adasyn}()}, \code{\link{step_bsmote}()}, \code{\link{step_rose}()}, \code{\link{step_smote}()}, \code{\link{step_upsample}()} } \concept{Steps for over-sampling} themis/man/adasyn.Rd0000644000176200001440000000335514744045253014102 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/adasyn_impl.R \name{adasyn} \alias{adasyn} \title{Adaptive Synthetic Algorithm} \usage{ adasyn(df, var, k = 5, over_ratio = 1) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ Generates synthetic positive instances using ADASYN algorithm. } \details{ All columns used in this function must be numeric with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- adasyn(circle_numeric, var = "class") res <- adasyn(circle_numeric, var = "class", k = 10) res <- adasyn(circle_numeric, var = "class", over_ratio = 0.8) } \references{ Chawla, N. V., Bowyer, K. W., Hall, L. O., and Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. Journal of Artificial Intelligence Research, 16:321-357. } \seealso{ \code{\link[=step_adasyn]{step_adasyn()}} for step function of this method Other Direct Implementations: \code{\link{bsmote}()}, \code{\link{nearmiss}()}, \code{\link{smote}()}, \code{\link{smotenc}()}, \code{\link{tomek}()} } \concept{Direct Implementations} themis/man/step_downsample.Rd0000644000176200001440000001303014744276170016022 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/downsample.R \name{step_downsample} \alias{step_downsample} \alias{tidy.step_downsample} \title{Down-Sample a Data Set Based on a Factor Variable} \usage{ step_downsample( recipe, ..., under_ratio = 1, ratio = deprecated(), role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("downsample") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{under_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled down to have the same frequency as the least occurring level. A value of 2 would mean that the majority levels will have (at most) (approximately) twice as many rows than the minority level.} \item{ratio}{Deprecated argument; same as \code{under_ratio}} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{target}{An integer that will be used to subsample. This should not be set by the user and will be populated by \code{prep}.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when downsampling.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_downsample()} creates a \emph{specification} of a recipe step that will remove rows of a data set to make the occurrence of levels in a specific factor level equal. } \details{ Down-sampling is intended to be performed on the \emph{training} set alone. For this reason, the default is \code{skip = TRUE}. If there are missing values in the factor variable that is used to define the sampling, missing data are selected at random in the same way that the other factor levels are sampled. Missing values are not used to determine the amount of data in the minority level For any data with factor levels occurring with the same frequency as the minority level, all data will be retained. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. Keep in mind that the location of down-sampling in the step may have effects. For example, if centering and scaling, it is not clear whether those operations should be conducted \emph{before} or \emph{after} rows are removed. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 1 tuning parameters: \itemize{ \item \code{under_ratio}: Under-Sampling Ratio (type: double, default: 1) } } \section{Case weights}{ This step performs an unsupervised operation that can utilize case weights. To use them, see the documentation in \link[recipes:case_weights]{recipes::case_weights} and the examples on \code{tidymodels.org}. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the majority levels down to about 1000 each # 1000/259 is approx 3.862 step_downsample(class, under_ratio = 3.862) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without downsample") recipe(class ~ x + y, data = circle_example) \%>\% step_downsample(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With downsample") \dontshow{\}) # examplesIf} } \seealso{ Other Steps for under-sampling: \code{\link{step_nearmiss}()}, \code{\link{step_tomek}()} } \concept{Steps for under-sampling} themis/man/themis-package.Rd0000644000176200001440000000257214744045253015505 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/themis-package.R \docType{package} \name{themis-package} \alias{themis} \alias{themis-package} \title{themis: Extra Recipes Steps for Dealing with Unbalanced Data} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} A dataset with an uneven number of cases in each class is said to be unbalanced. Many models produce a subpar performance on unbalanced datasets. A dataset can be balanced by increasing the number of minority cases using SMOTE 2011 \href{https://arxiv.org/abs/1106.1813}{arXiv:1106.1813}, BorderlineSMOTE 2005 \doi{10.1007/11538059_91} and ADASYN 2008 \url{https://ieeexplore.ieee.org/document/4633969}. Or by decreasing the number of majority cases using NearMiss 2003 \url{https://www.site.uottawa.ca/~nat/Workshop2003/jzhang.pdf} or Tomek link removal 1976 \url{https://ieeexplore.ieee.org/document/4309452}. } \seealso{ Useful links: \itemize{ \item \url{https://github.com/tidymodels/themis} \item \url{https://themis.tidymodels.org} \item Report bugs at \url{https://github.com/tidymodels/themis/issues} } } \author{ \strong{Maintainer}: Emil Hvitfeldt \email{emil.hvitfeldt@posit.co} (\href{https://orcid.org/0000-0002-0679-1945}{ORCID}) Other contributors: \itemize{ \item Posit Software, PBC [copyright holder, funder] } } \keyword{internal} themis/man/reexports.Rd0000644000176200001440000000077214744045253014656 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexports.R \docType{import} \name{reexports} \alias{reexports} \alias{tidy} \alias{required_pkgs} \alias{tunable} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{generics}{\code{\link[generics]{required_pkgs}}, \code{\link[generics]{tidy}}, \code{\link[generics]{tunable}}} }} themis/man/step_nearmiss.Rd0000644000176200001440000001254114744276170015500 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nearmiss.R \name{step_nearmiss} \alias{step_nearmiss} \alias{tidy.step_nearmiss} \title{Remove Points Near Other Classes} \usage{ step_nearmiss( recipe, ..., role = NA, trained = FALSE, column = NULL, under_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("nearmiss") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{under_ratio}{A numeric value for the ratio of the majority-to-minority frequencies. The default value (1) means that all other levels are sampled down to have the same frequency as the least occurring level. A value of 2 would mean that the majority levels will have (at most) (approximately) twice as many rows than the minority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_nearmiss()} creates a \emph{specification} of a recipe step that removes majority class instances by undersampling points in the majority class based on their distance to other points in the same class. } \details{ This method retains the points from the majority class which have the smallest mean distance to the k nearest points in the minority class. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 2 tuning parameters: \itemize{ \item \code{under_ratio}: Under-Sampling Ratio (type: double, default: 1) \item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the majority levels down to about 1000 each # 1000/259 is approx 3.862 step_nearmiss(class, under_ratio = 3.862) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without NEARMISS") + xlim(c(1, 15)) + ylim(c(1, 15)) recipe(class ~ x + y, data = circle_example) \%>\% step_nearmiss(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With NEARMISS") + xlim(c(1, 15)) + ylim(c(1, 15)) \dontshow{\}) # examplesIf} } \references{ Inderjeet Mani and I Zhang. knn approach to unbalanced data distributions: a case study involving information extraction. In Proceedings of workshop on learning from imbalanced datasets, 2003. } \seealso{ \code{\link[=nearmiss]{nearmiss()}} for direct implementation Other Steps for under-sampling: \code{\link{step_downsample}()}, \code{\link{step_tomek}()} } \concept{Steps for under-sampling} themis/man/figures/0000755000176200001440000000000014744271147013775 5ustar liggesusersthemis/man/figures/README-unnamed-chunk-3-1.png0000644000176200001440000003650514744271147020502 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_8IDATx ]U}?_7  E( F $gac}w=~w  @nMa @d @U@VnF  @[n` @ @U``>Zv(u1dȐhooϾ7=ׯ_lݺoCpAŖ-[»╯~/3{Sb#Fu4BeG=3^}ӛ?b{4}'abÆ މOoߕ~/ڦ?0,{zz 1?g @) 9 @@chc~&@S@Ls @M PZ' @ 9 @NN0  @@s6 @@h` @4& 6l @::4'@hL@m @u uiN ИN 6ĢE4>c*ۋ/g}6&N{we @ tK}ꫯ:{x'Ck&oPig @\@~8SOtX`Azѿ;wn̙3'..l @Ghy7gW::ׯ_,Y$&LD?~[n%nʾqW!L C ycR 0 qJ)e6-GF9R}C-DŽ̢"rb+[nΝ@׾XbE5s,1rXzue;Y09hРxIuO|"c-$pm5oiPۦxj%hlwe_n!sJG>G?=\:::*ioo?>WrXn]}@@]d-o}Vm+P;x\ SZ^LKj 1vXfMei}ܸqm+ @hzmkko|#dzO=T44iRL˗/ts޼y1y) @@EO?u]}?8p`v̙3cƌ1f̘?~|L628+ @hzMdGyd}]6 LǦNSL-[Ĉ#.  @@%vm}{z>}Y @/{@Oh @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zӸ80  C1n>/  @@t: @@}h}^Z @4( 6t @&@hP@m @ yiM Р  @@Қ @AA@ @' 5 @@hN'@O@Kk @N PZ @  : @>>/  @@t: @@}h}^Z @4( 6t @&@hP@m @ yiM Р@~.^xq{アjժ.m @O[>_c͚5ٳgǬYb…1}XlY @ )7pC瞕\ti,X n߿̝;7̙\pA @r tKݶm[\veWWqW,Y&Lg9qĘ?~xZyGk*V/0bĈOP}[|U {fE엷tKMW8ַQGe,+VQFU92V^]N+O=Tvs'Æ w孯ڪmy="~v7oޜ3ַ@ 1tvZ9ӳΝ+W^zs> .j(̊|PM>l|#}C /Č3ǎIIƍ` @Mf\N=׾ǏuŕW^˗/ςybɝM}'@(@hYs̙Y@3fLJMwc @\ۿu!:ujL2%lEK6 @ZN"}Y @/!  @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а0 @@Җ @aaB @# ֣- @@hÄ: @G@GK[ @І u@ PZ @   @-m  @@& @@=h=Z @4, 6L @zz%@hX@mP @hiK а{  Gp=)0z|xdm2pv=V&pEԷ=w.nܸ1lR3v ˽n\nZmoXUڭzc";xܩ{ >A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @-Kqw3</^{oZjcv @K[8瞋1ٳc֬Yp˜>}z,[r  @@6{J۶mˮn^tEqƻ7;K. ĭsƜ9s .hO CM*k?pDӎ%KĄ 'Nhڗj1hРM@[[[eߝj(̊M/oiz|ui6m믿>۽bŊ5jTg9rd^Vn+*n8+V/谔W@}ն坙6ͬnܸ1h wuWşǘ1ci.6 @rj ֭^z).g}rs @@@M:r8蠃?i~o @x>.8o~oP @x#EFtڵqgҋv  @`5_|1m۶ns6 @9< @j+VСCѣ @/Ps=ò{@wFНG 3C=>֯_?x\s5VX!@ #Ps=#v}olܸ1կ_q; @T @O7n\qM>lذ3Q`ԷpҖPm[Dm 'mo{{{j>hlYZK/N8!A @@5g/@JAzyW-R5m @] @>?c=͛7ꪫO8@ @`{h:k^4yw}c̘18:8s: @] *C9$~~XxqC=4>s @T |4ҟٟŪU+gqFL6-fϞjU @9}ٱt8ΆW\qEv]wݵp @5=uָ{'/;?2ӟt_>tk @Nj^|uŚ5kw&@ Sh[[[|??$Ξқӧ{@qZ @"PSM}ߎޟp@5*=&M4} @@-5:J`-~G|goF-  @LZyae_ @@ON!@ P- V&@hT^ @T "  @* 6W @hm @ M9 @@Z-b @hSyuN P- V&@hT^ @T "  @* 6W @hm @ M9 @@Z-b @@_~bŊ;LhqƪUv8f @@%yqYg3<]tQ\qٳgǬYb…1}XlY @ l:::⦛nBAvZ|򓟌O?=֮] ,[o5s΍9s\a @@ wb7o6lL,Y&Lg:8qĘ?~o1ʾ~!T_`})$ շW[-@gVn{yKhzǕW^'tRFoȑzvZy;* 6ml[)z[ږW3+g7e[h-[+_Jl۶-.lLhڹСC;7G}teʕ~Mz[ږW3+gwH"7__{_|q 4(رcc͚5qU @O[藾??UeҤIhѢX|y͋ɓ'w @M Gy$[*W_}u3gƌ3b̘11~6mZ @ 4=va[-nԩ1eʔ#F쪙 @%hzũ-җ @rh͐ @VV) @ @ a  @@hR @" ¨ @ZZ#@(D@-Q' @ JiG PZN @j@kҎ @F @* * @@!h!:!@U@UJ; @BBuB PZv @0 @VV) @ @ a  @@hR @" ¨ @ZZ#@(D@-Q' @ JiG PZN @j@kҎ @F @* * @@!h!:!@U@UJ; @BBuB PZJH_#0tо3>8S-oVm+Pۯ_\^@mۖ;!'3R5ʵQ5ʷu=җl޼LT}[tU {fE<8=< @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @-ЭtÆ 0ŋǽV @ywygٳgǬYb…1}XlY6 @%-tɒ%qg-]4,X]w]|O1gΜ.ml @K``wLgƍqի+ 'NW|0xʾO?=gʶ 93T_mնY?z.R#8"ď~.YbE5/M8G6mZ 4m]oy뫶j[^rϬݭ["uK 1tvZԧ>}u\r+ub^VBoy뫶j[^rϬw -ǎk֬Nƍl[!@(@I&ŢEb呮~Λ7/&O\>e3"@Sϙ3gƌ3b̘11~HxZ @+ЭN2uԘ2eJlٲ%F! @%+H_ @ =5C @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @j @M@ʫs @jZ6 @@SЦ @Z@M T:'@@El @4U@m*  @Vh~E.wշU[-@g?.2$wwȑ#U*-U9LFmpjCmKU&SD};::vw.nڴ)۷ ]3S_mնY?Er)1A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @ТEG + 8H PZ @r\  @@ @@  @@hѢ#@@sy$@(Z@-ZT @h. @E E @ W@q @hhQ @ < @- -? @\4A @Z".^8XjU @@ x={v̚5+.\ӧOe˖ @)0kҥ`[1wܘ3gN\pv= @@/dɒ0aB>ĉc]/_>leAC l[)A?>/]|饗*V{#͊z{_~ 8kHWsxf-.rÆ e cǎ'x25kĸq*i%]!*ʕ+#Um%5M 5-Joܛ{Eoxն7Ʊ.-OYI&ŢE"Byb  @VȏMus̙1cƌ3fL?>{ʽ{ @=uԘ2eJlٲ%Fу @;zFpn ѣGZXm! @T/U9M hE◿e0u_ 쮻B_J3(P@-SW'?InH5kڍ֯QM#ܴiSWUV^%/?߻m5 qYl٪Iw5ںukV[oջVhSM_|Dwvvn앖yۦa=}7`k+)C Ek{O<C!;묳cMS0].R?7pC˿KD\ϫZo&?#nx駳k[nT{8~T=#9&۸qcy;,8~Kf=;NѲY\ztԏ~H j]@ݪ%[n޼9OG}tL6->OX j㉊-[{9ϧn3<xZR-{=={nI_ߣ3ZM`w2=أn67eV-&rXvm_ec .Fi[P ?q+^hCe6НS[D <و~e E 084hP<#Y/Y r|Ȟz;~x|SOcr8 W@ַ5?*#'_[D ,Kw Hb?4,Xn]9|3xW{3, @pn9 @`ss @n  4 @@wY @) & P@.(/_^Ksm ' O$ )KHƞ%@\m @> 6=G`ժUٕ|zjy;΋1c8v @\mB }"'7tStIBfgyfO2%ΝsxHYçM&Me]s7g[ԧ"#w ^ h/(! /b򗿌AeMW('?Yիc}o}[qaeñr3g?C#[OuQq9D"wޙ^DzI ?)9s0ȝ?yŋǃ>)tx`v󪫮c=6 G#=te4]i_ԩS+[<G@oHW?CݼysiaÆiٳggW5ӕD886Ce}}ё @7 ZJ@K |o|#.ww="^$/~G%3]1M˳>^xaСC+M{CSC _OǶm^xޖ?-hoq вy{"B=]kSN:+{|Ap ''}ƍ^D/~1{tE3ꫯ_W} nɓCc- ,M^ԛe@*g};w6 O͆ +cǎpz>=A@[@G(VQ!@2< @@вU| @-. x  P6l5 @@ -^ #@M@-[E͇ h @efZ~'v IENDB`themis/man/figures/README-unnamed-chunk-4-1.png0000644000176200001440000003731214744271147020500 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_:3IDATx \e}?_.; P*H j1l#P*7 Xx* M) S-%JS4 ȅ\wI2dΜ{۝|33}& @$з0 @2 @@ ` @ @n@ @P? @*п[֬Y9T& WF׻G}-[no߾1`ؼysxWrj9"˝׆ h8,{wkd;obnݺu< }߅@z2dH_׻iUy96a*|}~M PZ#  @@h @N PZG @( fw @< @FF0 @'  @5 5ٝ @>>?&@Q@ @ y4 @@h`v'@O}ׯ [Z^hQ|xZfƌΕ 7T羽{#nۆ^t~JKv`/?Zk~{޶m[6%x㍱{g~|#GzMsjooN8!WbŊXvm]_؛7o/Sjmmm~Ul4鏋t"v}>,P{M^7#g 86LMz{:{dj1zXzui~رe3 @hxMgmoģ>=ew1q홖-[ܹscҤIS6" @@/?_}K\pAvSsƌ1}5jT7.NZ @ 4<&:*1֬YШUi۔)SbٽÆ KL @t>} @/{@Oh @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-h-Z%@[@P @hٗ @nnB  @" ֢e_ @к 5@ PZ}  @@& @@-kٹ߿ׯt}ӧOf3p'~S}ն>f|t[[[֭ĶmۚT@MoݭC] ׾}+l~ӗi捊GZ j Ugj[[+yI ݉\ݺuknKOj[6,6m`/Qz^No?-6<~^,j- @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/ 6 P(P4 @@ _#=$@J@-T9  hH  @@ UN!@45C @@B` @/?q[nE}+Wa @ tk}衇K_R^$9k֬9sf,X MK.-m3C P<5tvo޻t%Kn}Ɯ9sbq1C P,n ۷o/<:⊒ŋcYL+'L+mO3=X<#uOi}İa^_i~OԶa˖ T,/_#F(>|xZf~ iSN9%immmŠX?gXu:w]Їz[q'Zq {<ٴi[T}ַvH˿=ʟ8H_ӊ+_\1c^wYkDž;!]HWu]wps#QtСZ_s=@1}ѣwxARzqرcwY @Z_g@SL_i_bܸqvڸꪫbٲeY;wnL4sW  @ (Z,9cƌ,5* SN @Z\ۿ@6eʔ}bРA˪}#GV;@kF۷ooNe\T<^ Uq8f̘شiS_CK$a+Nz>Ě5k#5" x.*R5wK8p wY|  @@o @].f  @/ 6 @]0 @4^@m# @t@`%@hxcG @" v0K xƎ@ E@a @h @, @@; @@ Y @ 7v @.h  @@o @].f  @/ 6 @]0 @4^@m# @t@`%@hxcG @" v0K xƎ@ E@a @h @, @@; @@ Y @ 7v @.h  @/=>NZhQw}rʝYA P,n ?Ϗ>/ļyJf͊3gƂ bڴit63 @!m߾=;yAx;f|C%Kn}Ɯ9sbq6['@!>}W_ o֭#dA4Xxq?> iy„ ;Mҥ_|1fӾ \L [] 5.mWrO{@+x.j*yo?yk駟7nn![|1bD.1|XjUi9~qWqqG(0tH_b ?L5jT1fT\TȲG}7lPjoW3@S㡇:+ׯ_ttt.-}cqǗ֍9ҋJ;ΤRO_b 6,W_}Xۃ|ӟރGyHO|{:f7xp7 !oxC (@lOgf̘ӧOt8ҩS> @ |g@ׯ_^Ӕ)Sbɱy杶u< @@1%vR2}/lw @h=;t=#@ ' Ћ^\|C'@Ih/. @zB@ u$@b  @=! c @zڋo @@{B1  @X@7t @@O= @^, : @'ОPwL @@/@{q  UГN:)֯_SԩSwZo @] u?G?Q_ qwơZZg @@%o}k{yغuk̛7/Wj-:?_Zg @@%t}Gy${rK 6R{ @(P1v}wߝ ]tiJǎuy @:~1}xWwjN[ouV @(:_e|QFЎ;pX @ PU]vmqW~W9 @T}@|p?ܚ @@3K/49x0ڿMgE878 @"^Oo]fMuY;EH;XA K/۷oe3]ߜ~;XI @ T@ @@U˗ cȑuwF @:~="qT#@ؕ@R֭'x"AK @ Pu=#wj~wlذ!կ 7ܰv+ @ T@ع=}u횎5qεַ;U +UkP֪WͣoIU||R %:6 @RҥKwzCF׏쥆M P@tꫯ-ozӛ]zWpH @fO>2|HNJ+"9'fHc'@NouE̘1#;XlYW^x!{fG  ?ϳWpYz˟t sύ|3> @@U۷ot^{-E*o2 @@1 {}~s< 6mqWI'TL"@]zue/@4iR3&F裏;/i @U C_(-ZŔ1* @T}4ҟʕ+3gyfL:5f͚ІF @b T@9XdI+cT @ Tu ~˖-qSO=։O}Snݺӑ>9  @O3=?nݚ|9ڵkc- @إ@U-\}RgK=8M @*Dz<0FЉ'f> D @M ?~L/#=~w~؇ @@&PuM{yg_ @@՗q!@ P. X&@hP^ @ "  @* 6W @he @  8 @@Z.b @hCy5N P. X&@hP^ @ "  @* 6W @he @  8 @@Z.b @@_y啸c; hѢEq}ʕ+wf @@%u]qgdz>_|q\y%Yf̙3c1mڴXtii @ o:::oB~z|38#֬YϏn-s̉ٳgDž^ni @@wcС7mׯL/^Ǐg8a„7o7tSp uiC--)~WAoqj[\b,ݔ*M s۶mqUW'ov?#J>|xZfG?ҺAƍKf/Ůj(M-4u`ww}袋>Lh\̾s1:+Vĺu:}]"oq뫶j[\b,݁VD!mذ!o&kKbYFW.u0͏;l @x@/g/Hg=;'… cٲe~Ν;7&MԹw @ 4O?>hu뭷H3f̈ӧǨQbܸq1u>f @'zgo;)Sɓ{D ݬ'@(@h5NmmmL @{@h @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@hR#@E@ͅQ# @ Jُ @ 4F @T+ V+e? @\\5B PZ @r@sa @@ݱYkkke==#U]mնY}rtdcԼx5:"Qy-V=Fmjo;r=җlڴ T}[tU {dy8"{@+H  @* yl$@[@[T{ @Њ<6 @- -= @hE  @@ @@E" @y yj @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T{ @Њ<6 @- -= @hE  @@ @@E" @y yj @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H @#<-Zw_\rmV @KM+_Ju];Κ5+fΜ ,iӦҥKwn @X@/^gyf[n%K믏}sO}*fϞ> @%п;aÆ袋bժUC`:~r b޼yi桇|3ΈۯlÇ/ {շW[-@Gm*"uK=#N'?١3˗/#F֥uJOf_+WXJ[W#ww1tBu=;ѣcեi~رe3 @:qXpa,[,ϹsƤIlD @%1cFL>=Fƍt @@O4hPiL PAoqj[\b,tbԣt1bĈRV*-qWqqGt{Ӈz\ moMpm5aÆگ_(u=\ZN3SNSN9.|Kf^w}##̵^{W^yUU 3 {ww*n&{+㵵Ő!CvSѣGǓO>Yիcر4ΐv=KbŊHAմk۷w&&*i9ƏG+ l۶Mm[`UretҤT}GmĉpHyP9wܘ4iR-k @Y*:9cƌ>}z5*ƍ]roa5O Ѓ=@ӸL'O͛7ǰaz¡  @ChdQ5} @/У  @r @ @ʫq @r\2 @@CІj @\@-L P'@(@E, @4T@m(  @r @ @ʫq @r\2 @@CІj @\WZnM'㬳jn/wܱ}lhM~8,8[sz[>:f̘}v'ZR =/?qd{΀tr<~GGGl۶-5,T_SRmsūm@#  @@ٙAfo[I'q衇f}qǶu7=P\~qb7FzݺuqyE{{{_>{~ѧOVRa鯰<ꪸ+믏szC{tn}/k*H2k֬H1Gɥ^_kSO=k>`G7/t6g@kj_\zk<3YHٺuk 8C W>V<ҥ 6QGuhSjXdI{ae>ko#V ]Je]}OH'?ꭚrM6ŴiwwcS?N@߾_ؼysp{֐G5@zTۮ>qAT3mmmet":ޢG4IxG[{WiLuVj2+VĚ5k"[`ACOd;?OW_B~8ǷΘ0j*J8#[n~Q8ڍ؍8ԛ8Æ Ξoo7fgyfX:ӝ^b*ɟėw]M/B2@#"]Mb̘1njFhc^h>rTͫiNcԨQ; 43\}#@%'r)q7')dM}gwɓcΜ9?{̃>:?+ӟt:ujL81 _~yM7sN^|򓟌X3-P$]$@9^z :PO|UV~ַ֧3cŊg?3<3~aEGGG6#=sύtF4ѻ+ >0#-"EH-R($@y>dgϞS'w"_򗑾-Z=PЙΖtAΫ:;,~#tI?Mh:[[ٶ)SdgRw.a'x+hj<:G@+C}înڴ)D' ?dȐ8wxܬYh Gyd\p>}{GGgms15 @V ДrHv6koĥ^uUySO=sO|_?'aÆ)T?_W"W.,,XOzj}%>OO6ǎK_RFfza5\o<+2~Y3I&e^+[:& J^JWZ |pz|zgѣG9]O9кV @&/ @&|*j< @&@@G(Z @&/ @& Chr { @ E @\@m @hhIENDB`themis/man/figures/lifecycle-questioning.svg0000644000176200001440000000171414744045253021020 0ustar liggesuserslifecyclelifecyclequestioningquestioning themis/man/figures/lifecycle-stable.svg0000644000176200001440000000167414744045253017732 0ustar liggesuserslifecyclelifecyclestablestable themis/man/figures/README-unnamed-chunk-5-1.png0000644000176200001440000003630014744271147020475 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_8)IDATx }?% 7hcvs^v 3{wϜs;+;;3e篗`!@ P'uz/C @ @  @U@+#@@  @U@+#@@  @U{]_/z:Z1^k׮gϞa˖-ݷQZF٫Wcǎm۶ZvOuz6o\b{---Wϱ_c t=/F~7fK,17_>l߾ZanݚaQIx@>}յkVlRl?lذjX1bjKM+ނN @Y YI @&&&; @d% f% @КD ~ @j@kb @@VhV!@I@N @Y YI @&&&; @d% f% @КD ~ @jk]~}7o^-Z(<aժUJlْ<駟۶m+m @`u 7o_ [n%|aᢋ. ˖-h+6m \pA={v뮻5\vn? @_.^J;S!t0wpwiӦ>;}ĕ{7;6\n 7n o  @/нC oVxG/v}75*<҃%KN;~?|7n\y 7~'{m+ХKA4@ݓߡ={6Ѭ~W oC4c]Q%jznݺV[@=%zV\X6`$7kb[i_yj}ȑ%y!+rgjj￿CPquvZ:iX];i;wjie w:ޱcGd{]ht-l߾d>}w9vOpk7֭[oLt6_մjgW,jګWTZmC W.7x`y nATj @@@G}ٰ|a̘1 _%SLǏ>hWmVL'xb  @$Щos9/pC=4L2%ی3B=W\&N4^%OO~!h_~妩Sǫ|/xxh\ @S!\<vwh'@hpN=m  @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 4T] @T@h!@A@U @6Z @r@s@% @@u @PuI P]@n @ 9]߿!a uƝ YkgYtǎliӦԉklnZhjul; um+R,jڽ{zLo-aKKKGoPhg_u-Nj*UԵXeYԴ[n/T @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U@MH  @RT @Y Y @ U{jk΍+W K.x.]qUl+ ,[l)o?# @CS%K~ԫ֮]z%L:5tI{Z PN Ǐ+.E]MN/%>|x馛ڵ@ P,N o;.|Cj9yx$>cIP4iR۷ol @h|mx }ݻ{âE '^~pw3g!C+Ü9sN|r-0lذ>:;um«i4H]Ytͩ0 @1cT!KB*o@KiIqѣGذaC&P,6_]մjgW,jsT=ģݺu ;[z @'#$s@; @4@XFC  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @* + @y y  @B@B  @*W5h6&i:mS֮մLk뷫gQ۷ ֭+O҃P欯6_]մjgW,jڳgT ;wLPe뙨kkxQǶP׶"_GMZ3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J@-T  P|54 @@B`  @@_C3 @J:iҤ~v9sf2eJ6 @ؕ@]m,m?~>So {.5۷|0~m @HH vXꪫ–-[¶m?uVG}{_>W @4'̛7/y{'/?m @RRhg>Cɑe˖%[Pz1 @] @p5kִ3 w_6 @h+Psç>ax[  @@@Mtݺu77tS8RD @ ]0 1"_zoZ  @F#|++ JFwSQc9f7/ @@Mш瀮]6|3iڑ@ PE믇;w76 @@W^ @@tʕU' 4holꫯ?}iW-?4^եK0z?B P<QG)9o.nڴ)|ӟN.ta??׾$j< @4@ ۷o/w ?¿˿$-7tŋr!Uycdž+"'^50nܸ@ И5c= ~~/lܸ1|_ wygm]zux7Ü9s„ =mɒ%N+o5jTx+͛CKKKO|>~Y n:kUMqF|uGMkx<VkN_2s;򓟄xi3ɕ-xxz|;wݻMw^X~}ŅFSN-4)- @-x@X|yVXE[j|WWg5  @)K`+KW7ŷ௺0k֬L0  @:[PƋvuO 6Tܞ'  @[0aB6mZrFaҤI=K#@hhߞܣs̘1m>񏇓O>9\} 3!!@hl/'?O0,Z(# G{FG P5\˒#/Ln6TI  5EK. q?crC=س4: @-[<G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d-=%K˗qƅ>}첋 -[ێ80x @zUWnݺ#Go=\}a̘1z---aԩᤓN*o?sв @tj}gÛoo%bwvtҥaᦛn* @RSG֮]6o\^/=Xxq@{mI&}+W 6lHw% 0J tީ? 3SN.@/96@ExY4fSڵk7x#9joaѢEN/r;̙3Ð!Cʻ~K_ gNy .,yCm S+6_MԵEM7nܘ ө4*M.BK.$įQx1R<--W\qE8xN[oUjԺ9 WW5mk5,Vm /ۿ W^ye8Sv5ưbŊhgi"rHo^<֫+4֭[j>&󮀺6O6_MԵEM{ ө]jUkv3ů̙3'B>>tg͚N=Ըj!@(@š5k__=|0cƌУGZ?3+[xҨQ @8@/v)^>}g<`wuw՟m @!Щs@;< @4@8 FB P/^^ @ @  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@@  @U@+#@ެ j֩Wn$$l5 5mDյdQӖT ov56Z7O-[D][k4c5m:)z5իW*Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@h֢#@H@Sy4 @d- f-? @T4G# @@ @-Zx≰jժ۲eK7o^xöm۪ @:=r-o ,]tQXlY;M6 . ̞=;u]k ;wl  @/Щtҥaܹ;ӦM g}v۩{aرv[ظqc?~l @h|9ė^z)kwsQ?nHK, vZy{ƍ+o{[ow-q6[gϞ=}tv|Wi6_]i)Ur0p P卿~kVZW^y|[J>}… +tӧz^ if/{OU.MMOV-@۷Ғ>%7xm7zfOB~k}[n ׯ/L $лwn+4!C$N1}v!~=zHlڷSСC36oޜlb: @H9?y饗/8ķ=0eʔUg̘+"L81hJ>>y0bĈG{ @<qP3X>h_~s|wsDwwX7v-zJ~<}%]n]Q`m⅔|mkSaÆ 68_ul=[o믷JF- *_[] @4@6G .  @ m @ 怪K @hu- @99 @ZF  @@h$@. VB .  @ m @ 怪K @hu- @9t%~uـs gO<>!'?0zww{ti@x \wua…W^ 8BCcdž /0\ve{ti@o=̘1#̟?GCr;vȔ;۾}{RNd&Of0]mRd6{j鸀q3 @ {\O-N?зo߂pƏF ><]1.u'N vX>%j鸀s@;n @{!T @ 7  !fqƆA P@r1hm\jbkj-pm5k4$͎@ īofj:@;nVg^:,YQVp_|yXvm-@˲eޥnݚmvSĭ+Vzcv| / /CŽB_җBR_w^8sr}=/3τ}sO~g?φq^!7! 7xc1b?*rSoNCuɝHկ.]w ~5ūuկ~o&O{?ZwL'ᬳJn2lذV x'3x Ukb@WnpQGGy$;o} #2w/^}fS0aB2ɡC&/Z(ŭ{|+/^XvI'%%QKq.]|GL`q'ceٳg'O|Ï~#, ~p 7ɓ'38#p a޼y Ö-[BC")W?W̺#9B/>֭[Y˶3fLii{K}AEH5 }'W$}^S)?lv|wwX +ӟ49wP,nQ9гg$⑳;k)?χc ԶxʌvTUaoŋzmNO%/֭[MN92!} _^zxRR\xgi>}Sd.\}| -nMGrY<]&& {[ 0[yMQN"^a5U)CyذaCǢchǼM K@O'@蘀1/{ @쥀N 1c^&@Kt/=}O?F@mRE7@R-$@ 4U} @Tp4 w?O·dɒ0iҤpYg%Ӷ-[[o5]}CMv]fM曓A%}]tE~7lݺ5lS}hG@B@3 ̟??|# /R?38cSG?|4ncX5kV5]s=72eJ=ztoLpWLJ??K;q]hG@A@S M?ɼ>O&G@_T p@pQG%m?o&GL~ .Hڏ<Ȱ}y'h B<"!@,! Pl;w BKː!Cq}ܹ=yOgΜ-Z̙oڴ)~/<|[Jc9&iOJ>" @`1/aÆ~~v27o~zm}s9yrKrT3׿=p&zIxAJ;vlZ ШhVƸ(@~?O6%| =X?:蠤->gƍaƌ!o|W_ _8㏇իWӧ xgy&^BV @hPhK.$?Cx')9B/"jM|דͿկu]<O>nKxƣmxn޽{ /^{-#G~  Pmr "Cd b=zQF%G2[쭗 &O-ote%G4>BiӦ%3#F?泟l>k׮][: ,hT.ygθ Ptx5z|xv " 6H!  8t_y @D@mB @`_@J' @A)a @t_y @D@mB @`_@J' @A)a @%:IENDB`themis/man/figures/lifecycle-experimental.svg0000644000176200001440000000171614744045253021152 0ustar liggesuserslifecyclelifecycleexperimentalexperimental themis/man/figures/lifecycle-deprecated.svg0000644000176200001440000000171214744045253020551 0ustar liggesuserslifecyclelifecycledeprecateddeprecated themis/man/figures/lifecycle-superseded.svg0000644000176200001440000000171314744045253020615 0ustar liggesusers lifecyclelifecyclesupersededsuperseded themis/man/figures/logo.png0000644000176200001440000006174514744045253015455 0ustar liggesusersPNG  IHDRޫhgAMA a cHRMz&u0`:pQ<bKGDtIME:\ JbIDATxw$u~2j炙x`gZЈEʭ*n݈{c۸{7{"(V$E`03x?mf{SUw7~s~{5N~] pO g Λ][5HV ,Y [@iz[?[W? bL n1-SrԀA YOd*@ V^e" D#OG#Y4bD^e0J0\:PSТo4:`x >;_$I3n7CF y,ZGL3+ `8f"$.Wk@ 3 !+{?>x"F 4L8Ӌl-. "# &KS31٦q*$%%JϤq9<8LiұS?mFy,jlJ$&,fn͍,˓5.0 WY5BA,!& eH:n:)pZh60B`.Z9'gD2)50["f, NËlt)0U}A䕁Ae_mm7'&@xd2̞ `2q;8( t "// /#>Eu)*T";<[<l%9wia"s`x9ax0Aw 3!'CD!P7uml1f{(2>K2&+ꘉ~n0;AZ"Ԫo\D^s#rqD:feci%5,[\S#s$c2?LǮZ"fLE51:'̺{yfsZ5,PgLx5$[\գtq|j:f([lgLءo\D^uqhQLuz5,Px-Z ҲEXD^uvd~3S]ְ-&axՌ n#c2©1d~ -*-!*ZƩ% G5[|U$["+[tv6'[t-^؅m1lQ@\ZO?C14 TTUoBuy1^?"O&UE=-s4OHAK{Mw}d~$BDcK+BPQZHA*rh,nΛx"h/Hˇ2+.G- Mhl eH`Yd~.fH&܁*&|6ʠDQ68b6&P>V@&s%$M L.O"זcRFUJ3(XӃMZfZS& $)8VffI! XDe,3riD@P-oB|`8l703:;t8O`:N$p"'WP'y]R)n5wL0)3Xb5V]BXfu ܐAb{AZhyLN ! Ɲx% l*nh59a`ʁul(IU$I^ 箶 ǐe㷚+SaRjboENN1(.@,B(݃Ɨ} Ai'{`߿h0 |C%FbjUMM:jYLj?z:>̕^ALb&k\mౣ۱ՌiNgYfʲ"TUpx<.2}л]v`( ª*8qh'&EIBI6pB =NGQ﵈<'KM4vKfJHyo8,޷?SB54Ҋ[ VŽ\ɵ.,8J}<|poAߐ[_(gJjJq:l roSxc$y0d+!6q+w:HRD$D2E,``8ȵNvm2'yBR,csHP[l{Ѽ0!jJU$IPWЗ{Tuyؿ_!HnG7yp#%>7jËe8Vv7б]E]GՔ!MwDb+$ zFǓ2J@>7.;^nFNZ&Efצ*#Hr=≤Fz )xV~"bY v_'i pʋI$Sn &C$O[^!% 'Qd(D"⻿ y`7Dx2Eg0zxƆJ*hn3HXj1Ɲ.zr5 !(-R\X@ ^ϺK% !JD^$ UQV䡼7 LH&zV\v+ 5T+lf*&ڱ30x TU!nyAMe1.]t !壉ȇA.)!H&U:C PqvڰM 3sR |\oc6)$S~5e7K )IN,uR_Sl  IpR%>7DM""M%H*VL8Vb$%>7Ey&lxFPWSn#t{Jb6QW[-$IuڊDW>RMm}%bdZF5<}nRJ<.V; شIpΞ)MDFME h;-]+=L٤")U%^%)j FB7H8%'pb\St22B#I*(.PRapO{W$km2zInJB?*Df)h#78yjP݂?!RS;y I_')J|n(QplBe%>ʋH*(٬߂ ORUb n634TUbvLCjǚ Ņ\#)UY%EaF%"dR( K~Ըy  G h S]U{ NY/.&\P0%YB@N4@$&Ed8idֈ6 ,OjSx">ań$J Œ?'pڭ86J@O0X'M93f=s?OM5|t{ٳ/}!NFwCqaUuPUIJ<\y7k[A(8* .ZX (i觻`$F2YOC0) nb7n@%b$ɥzKN` {g"޻EQhS c,, )#t9lV0ރ;/7ni{)Kw&݃N||owAMe1;j1P[00Gx~_^fmov)J—?>#~ܻttg@;awqVk_xs~GFo={~6C#k7 UE6GmSOƝv.^mΐR6Taǎޙw 7IEiM(G dIv->xp8lꗩJ[gDr"yFaجf|LކYجfÖ&Z;Ω M4_{[X5e86zF{Ur"8m=|q/'.b1څ?w.U7%>n4st6 #۬TWF`O!TUаmjH$\L ^E`#$??O*؁m߾.]oYH #۱Y|>oLhHu@Kg?CA`YR)}^eED"1t2 AeYNVnrI?Nb]=tNTnP8J<䷞=}[z. N)BPuQR%Hq;xë:^tȔIȾ"[:"BmU)fƧ BɲĈ?79Fʳ'PGտd$Ƿqح8f}n9҈fɤaF ^GYa|1.eEz] h6YWS,K/b4 !(+-p;_VMe1-a=r^.h対 ڻ]Tw:=fs:'Wڞaj_n{r޶DmUu/ڮ"l.%OrNی7eY&󽟼NOmS |_/`^ÑHC!~`7kJ8{aY: \6.nj66xXzKۭtܸ;. LB}m9VD2ًrbTc6A"eCk 5?l:NJY x˨G춧;.M2!uH$FKXeqBFA:2ȮA)rTUΒL_ inhK dGcx2ե>[ F[nks#ǒBka ,G,wfTJnP[U,Ip!NɒR):F&ҜH&V2kJ=zKC m}#1Y pZ3-s6T[i]m 9G3Rp9픕 jA,xO$1xO8=dl9 t*qj+o[H&UJ(Zܹ׻lQEQ(.,`d4H(@c9y^-"0gyΆɤP^#R #ZɗJCm9nGUUzPUo- ՈL{x.>Hal6L$zc}.K}D"1;sDZ@qƉ&t8{3kQUCS$̱lwb.ÛL^7+l(dRPU5-'ZŠbo77XrgB*/BQzHhO<ȝNTRQZ㢭)8$a6+X&*K 3YU_#Dcq WX|L2Z*x䁽W}iMWQVHoNYtGM[g?o!Y^%<N|^7 8ql7%ܞY~"J<)-ϼVn$I )/Yr!S`}D*ŝN{`m7> ncMj;?x{=4BͦLCvgOmS ks;(ͦv^~cٿk~uڻ {VBk[oƔ/+R+Ïoғλ5vm\hk$Fa΍?(U|=ݭ`x$Hb&RUQ|9FaJI?'PU$KH*/?׸tezG R`$ApUUٱWZI ~1y  L-r9{rw#2_sC$g=LeY?{}Zr spf6UPV{?y߻R)M]v6TҸC;xoCÿL{X*Tʁ]KsV[e,Yx!* /ߦwʥx<,KڳMH|\ݖ^҂A^};7i:iI⯿KZ5ֻTWuc5ɾٻGpm~S$I$y`7W=8!E0Ȯv 8a2q rBueñpso3e"&0-m=muܺ7Z~M{ i|BjfώHID2ݎ>_n^wK7U`1hM<Hel'r'lFJzF[=ry}86|YJvspf }n0Wo3W; ~lX|4x[UH*\U-FU ª+2B`HɄYHB:apyS*b/Ly\!f")*ILGfd"LH25kЫBPsm}&}Civfb&eZ,g CtEs[VR%d. +IҜʌvLYf4iYR{] +'IゑHx>N֚" =N]kDO *c۱M;bq=cI}|>Cɮjϣ[VE&KKvkD}TUяn  A,+w:QqhG=C~WL:qN;;ޭ5~BΐKDB@I?vcp$ XF,KX[=\Վj1qkDrV+@~ X;s7yF Ho1-,fF8qڭxvXIB _^87RY㧯}Dߐ0' :*fE;xܺ?=E ACU 2S34iLӫ !rA~`7ꭋ\ka)ȰiGwk-x<~|;:PBrQd7@$Yb;ACQ~9sꊏ$H-[J> :Nڊ"!d iȹ|5B`1)<~n^9KK>, EPdyk SGPQO_ta<%tin[s'_UUV _xOݷw!ZNX^j* PQDyQɔxb=Z&EdzR6gCy!d2' #0S}ƛgnTuix CUNcvۼq~F0:H]e1+u6|'.pWZ30J ,+k` $L&\Vdb]I^:u!o'Gwov6m=C"˲̰?w~.|飔 zA5d.38}/^xjJOej+o& \v^;}Af.4 ` 7 H0M~BUIYb2i*tTZ/Vpv;cc%ʟ|$ךx {2B0ٻi;\[so]vԚ'!YصCl(ǯ.gȫl6ϜωCWo_o""@$#z]=ղ vn&L5faCew;HZ=p$)/5]#||TzyV|GȒ˥[8V:5"1F_6 x#[9TZXv-u?ݞ̾bRvWȑFJ}\o;?Ad |t캘?Y(6@K 1Li\Fd=4R^[O-\A4o|"fvmν^aFa(ɔiʸ< W)&ByQvshg=dtsW[ FbJn,#۩,r1}Ó*̐nbaͯ.-^HA.9B'Q~96Ք-hh֊@ 18❏nf.}n+(PUnt3x "3@Y)P])70/P,ZŃ}c%W:g +:J0%H-U!.+DF%s7T;0:nMNywz=Ȏ߻xuںQd]tpNv/6WqpJ Ȍ^v+x2gҕAeF,+ʋ=Q!hjニt2*(+*#qHן_%@8:;|WvlwO,n5w:&|٘}').lFK77Wsp#],Hh^(/.[O\F"!8k#{[ޯޟr 2A'hF3Bʊ<ܿo3[+4#яeb6זrj+};,ϸ\[ZW+\EU^\N+#La{!tB7$grnk(LW??y#6ՖqV7Tƙa?^d*E2.Y /؞cRd>_H'i:֕3w>C~-{ii@PQA:T0)2kJ8m=T!,qkI Dc͑:t"Q.ܼ٤hDq=}Mzhd>^n M ϭ)"CAAe X ɔݚT+2 %lo``$ȏ^>H Y!eYbV EaOطmH-]s59$7;gh4faj\iɐ@75/V'5eIa&2Tzi"}R7[9w>S?yɔŤh̦idxSmU%,Ol3kJ0:w3dY+ 'Oo&]} -5zDMy]ܺۛJ6y:9נ릡=ij*z*p ̋] l,ҊIH "cY̙lIU՚ɤɔl㲓RUu ĐgFtҏBl럺9S[,\o֚@PQEn5ĴvVKcXw{H, WO$5{={!{Q,֌OGgn38alxLFEM8ln,fSjax,:,t˒H|?I9_y~NB`YfZ;f^;}X~O-,},p5Jfne_R:y$x7``8Hmy!=~"ѕ4axL8{MD zG b"q,f+mIĥ[eNʇ r6֔ϿsF#ZƆ nm` 7TrR`! %Tx򇰘Lv-vy<{4N[o;?ħOc4^ $qkCGal+g{C%ɔJ$X}@k<}T\K*n"2'o'qj+}n$%H*fh0%sgOT8fl\ȿO$II =NNl? g|$I CX-iOسU}oX_ޭ5u&!s,j11a/r9 鏍 9( @2T8yx 4qf%m[ $L#\PwHxѣIRwv|O.HH h i˛4q;H$S90;ܿ 9RU5 uٛJxB7bv3Sx!ΦR6[PZN_nn $a%,DZL 20݉ڭxVJ|ndIc;&+zBn>; Fx[t20d4!K @ByQyBKZ(8sfcBuC9h +\Es7e"Ij61T!p٭v7[߷ O^=G(}+Krc}ü-MqZL]6F5K0Xg͵e$Se$!j:B߹ZþmvtP͢eA[ͦB(_?"_EFCxnc;xkc+CbKѵē 9kʋOgSM ";LJˇ,:]yIBphw>٬ر!x dBhM\-2Ls{?oxn#zڴ1B~5=6JeI;Ž!2$a6!߷B7]#4Ԕɇ |pavmbZ.lݞq3$ay|} 7ۀ1h, .nGQd>I1׬'7w?~;mieGwo8H(3 !Z|>4msfѣQU 5+IhܪR/7+nOg`tR{z9WZ8g{6. /3tikG?X7.vo⭶/ݿ3Gwo$; Dص=[j8vO"f$;]alVKz+BdQO޷ҭviK = 4h9 ^F:l"3LiYHJ>A+d˷;2ݬ̉X o8m9HJUyq$Iē)F7n.#K$+ӔL)6 7-UǷxUFޗ*=E\+ onqRst3 /7$mV H$RM&+ QMp -pQQ⡭k_wD"ɞ\5{|-]|#|xa8Sg}ILԙS20DUOY?zb ͂݇j) 2BJq;m8lBDs\|!\Ԗ/ά*3a_Ͻ}6 ކ7N}t0q`{hIA%FC F豊ِW,k:{3:F x/܄iNescM)DB[Ǖͱݛx]Ȳ8}l+g.jFkϔf$I$U/5{s5NuEVN vŌj>Pʲv`6)KWHbxP$lWǥތl[}wMK몊pڭo'Hb6+4#VL R][l*DA&*#e>c^ B04f¤(Y^{݃I۷b:{+U*{Pus*xJI`} XL&BhŬAؓX0$ RuFe %1&)Z:)xpR)_}کjȮzFpm^۾A?NeٞK͵eBQbD&=ٲ~HBJ#q$ fI`X>QQ5!ICH,Q-(I7 Vr=>ؔV5hpZ0{ӕE<"\( $IB"/7tZudj\^oyb箶_Ȯ A[vUOS{ãqxJU%cXƛI)ݺ⦣,DcL[y_-ʩ({$'4St* + [C 尮,IZbElI組:@(ʭ=E`L=yH&sn[@Z.jז&]#}37$़B.Y_#ӊNBfO߁"˜ښφ` p@ 콈ilx=1?H>U!ػu掾exBry_0'œB2BZЉ(Bs;{0N]C8G!4[# p@WrJԑ$d*Ep /I*fQrRqr3WZI,$o 9 j|!}.@%~`7;7UyeH7F\q;kX50l`_?M9OSQnP^!u-p{K5ך:QU٬`f j2tJA|Y0ʏ_=[n37RUޭ5&"IR14g7@]}e^>ܜsUyҹ:B)a-lNI1$3hdj*lR5 kvԲTFH( *عB^iͤh3$3 dRMP`ʦejZԯL1;ji9ya{3M+^~xKIc}%7[r2cꙪlXIcC_l" d: Tֺ2n,[hf`k|heJ%Mc* i1π9.`l%  ODgJBp`[-Oݿ~%ԒUB(2Na_{HJUIαjnXVNv Y~ *h x>p" yπ=3eIiwc E#$ ?SJe/=yĿ}%T!RS=h&f3Rk>ٙLf\/.{bM{&C`PhzɯfF#{Ug+& \حNBʗ&O,i8Ulz{xhz$5jq{S̸oӀ c<1i)?hr u+F8 9qx<@8_:CWȒVPZS'b̑lIͩ[نasg>BF0L&)ɬbh_7&mV˲ᡤ b6 xdW_z0,7saur]趺 Z;yBHs$œZUǻI1qQRXA95h#Y@^A0Bs_m,{ons ҝ z)jڧ{Wx5`i߽GHx 4K@4 JM>se!Ӌl|ZpdM7}i"6s 1ӃEcaab/t\|!*={FbK:_~ 6זrj+Db9Ϊ$)UQ^6{<[-v >lVl;WhCIqudeo{V}C~bdލ+mNN&>GVs;pI8_XEdp?Y$`R?%t{y]L oɏ^:h0,=}샔ij?xy][7l­YhYdKFӗq{0[h~,pƝE;YDvA[z:8K"Le`2tVOЎ:/{߹D$XrW$n૟8NIaooq{UU,fJ^:uȏ10Nt,K ڌ`:L0+~0 _B"'!22A'l<~j ǯ僋ͨBh*!wR??].n߫lRh;y__0W94Xlsn\>g DK r)cYEd3p?)[̖RB% KDb-BDEcGwPQ8Kƙ%'eIyVNަӦT߸_:'=_o|ξa{yXz -чK^w }WDz,"{/wwJ eglzYωh*!Op]^x-.tBd(rdWDuYxUV*p;m8O=7&.Ye~\v$Řo@߸6`Vo`LXkqq"feUe^nj+ IT5w׸tx,%t޽ĭ=sЭ <~l7VbRd9QvMMk4ЇhWNJ,"ۀie-9d!I<,Y(+yC;7V" IrN;/5snFY7{6$0) f٤h}L vAEbjJ)/`3~7@4n!!>TI$)$dJj,f.bRuUT(D<^ Ɩvko qgJ@^ fue\gd*A('%ZY8{ff뢦26֖RU尢a.nNzGI9$E^uWSYv`14",fl3 YH]I.z$ ڿ})b>}$w].`Z !ߏI1txp9<so\T>YU\W'1?B-tl<%YbvI16 b/5e>*Ky\جf)(Bߠ!;5p@,gfR͖ ezML?}vB+%4-ZE1) &YQ2dۤh*"Kgj*4bI#AڻKsG?Cd]t+8 3&o+ [<ʘl>ql1-._Z^[ࠢKIBÆŬ  0Á0#0px" MLH,>*4SXUf˒,Kij5gSلlj1e9)w*K$ GC(pX<.9>皣/Θ]![.%uL->FRXbujԗIŹ|"G8C.MV9Σ~NZI^s$ͤ#4{TJrDn`ʿsm [vx1f 7+qu5ȍ ;N-j![\XϘ1߉cՍſ`VPZhtX+X/ɘ Y\XwlF-9lq]aϓ**ڌc2?X}UJ`\Řl6dBCڰ@wњ7FX#6l-FҲŰ![s,9A\XS#5|Y'bF78-F58e :T\X_j"n%L$bXK\oI2?{l&7cD-9dQbV@h:-.@Ø̯Uhwf͑D@= ;N&זY@suS \6F2cl$|dJ$c񵑖9hM=+)dʵd(w/ &WְoFMb-Vlq,-sue~zY7]{"L-dL8!x@({\u"[.(Y![K򷳔-je} HYկXg2B~5Dښk|-.@׊&T\0@͌Kf:VO G+m2?+,4k߿%ckr 2`({1_ . /#&?F#V!\xϋ^0[ƺ2eAeFd _jd~yW1٢oK_25`5 clAd{/\8QNM+ yſwbZf('QI4q;s]C,iAA lifecyclelifecyclearchivedarchived themis/man/figures/lifecycle-defunct.svg0000644000176200001440000000170414744045253020102 0ustar liggesuserslifecyclelifecycledefunctdefunct themis/man/figures/lifecycle-maturing.svg0000644000176200001440000000170614744045253020302 0ustar liggesuserslifecyclelifecyclematuringmaturing themis/man/figures/README-unnamed-chunk-6-1.png0000644000176200001440000003524714744271147020507 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_6IDATx u}_M6ټ"ixE AVc)P9Q+6`pliJ9ъ"=ڨJS^ƼCym?l6{ν{gܙ?w7ߝ;wȮD @AC!@  @* 6 @P @ @mg @ @ t4tolcǎZj&F===W݁}DÇ!CĶml6*^Nc0˵Kٚ @JJ0  @j@k5 @@h` @& gk @**T'@M@ @U UN PZ  @@S @66?[ @T) V : @@mhm~&@R@Lu @@׮]=P{xٲeo;VZW @%Аo~3.xⳟl,ZWq…q?]tQ,_  @@ tvrK28?GϏc=wqG :4nXxq\~  @Itذa}-F-[bƍ?3gg*<؃o>ǑGٻlf!C+F{H?Gi|:蠶8[ k&N{H5jT?mۖ[4>w7pCuY~vpܸqz4sgrĈ*=^;::`}8}C{,[hn;SߪT֕wv|p|ySCh֭[b׮]qWf}J=3gO=H_iʕaÆҢ2vrNgMZEsT4՛ƶUFj`,b|;;;swސ7!uwwǟرccbiʔ)f͚S.!@h?|3q1e]gi5kV;<1c쫚 @m"PZ#} @_!?#$@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @JJ#@(D@-Q# @ JG PZF @*@+R @F @T* V* @@!h!!@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @JJ#@(D@-Q# @ JG PZF @*@+R @F @T* V* @@!h!!@T@TJ= @BB5B PZz @0j @RR) @ @ a @@hR @" ¨ @J:*,:;;cԨQҝǐ!CbĈ1vئeƶ/6Oˊߝ;wB\ݾ}{r '6oGMk1-mضۈy}z @hXݲeK\uU1t=M7+V &d3g@{ @hH}+g_=xg뮋iӦ6O Ц ;>cZf͚Xre|߈W^yer  @'А3|&#!s֭[cҥ1jԨ'?^xa|w7Ǎ7ػ1ӻlf!Cb4Ի 5ƶ釨1˼!t_1cF}1qĬG)pзm1w&bӦMfv 3&o۶m۽&TCm xMm{E]r{1tݺuaÆMoHڹsgfO<1*E\Mo[ L64Z{flklmM=+b|;;;sҐ{@Ճc͑zjov @=ꨣsύyEOOO7.,XкzN _N;-Wty睗]6ND z DN/|4 @[)h{;: @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]@; @   @. ֝ @r  @u@Nl @hy @ u' @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]@; @   @. ֝ @r  @u@Nl @hy @ u' @r\< @@к @@Za @h݉ @\@-0O Pwv@ P. k'@Zwb; @(@5 @]{(xFCX 1lذ#ДǏo~Tƶvfm6ӯ"wǎiyAq92n6mjth&aÆv;$ƶ}mm:"wĈH-@ڵ+B>w@kjzkljƶƫ6b|]ˮvT'@I@ @ ՊO PZ  @@S @&&> @T+ V+> @@MhM|6&@V@VL} @КlL PZ @5 5٘ @ZZ1  @j@k1 @@hb @$ gc @jj'@I@ @ ՊO PZ  @@S @&&> @T+ V+> @@MhM|6&@V@VL} @КlL PZ @5 5٘ @ZZ1  @j8qqƽvկ~5̙z+ @'Һ|'[\dI\{1rRqر#瞘>}z:3 @rG\rIlݺ5o<@ 6ÇW\qE:3 @rɓ{^9zk3&=e @rrh{ov&tkyY SN-_e @@;3Ν֭۫q  @}*[G&Mڣ@ #PQݰaC뮻.74 @@@E7n\q(5 @#PW_ϏW^y% 7MgE;J1 @)r?'>jz^$V @CbŊصkW͔? V @hgg'4 @5 T@_}5jTL8qٸqc<q)QwٲeK/ 'te @K:cƌ{@lW]uU :tpxcq7ƴiۍu @m Pq]dIرxO~|м矏+"Ə}cwܑn-/^_~yW @68{}s{V:W^ye^:_:R09sf>t -O3۶m˾J+wC )-zGO?(V5צ;flk&lmM!kV@uDSN{g_Rx}G+MgEKSz} WXhQ請3G{5;vl/Vx*]*m`-Tض` Eo:7U@?#zzzzJ?kN;w}53MS M駟)䖦X׭[WZG?ok [3L+Ps83c֭1f̘ @@k L'].Ofꫯf2zWog0 @PJ({1w7K\rIoFH{>w @@I3陟۷o>_ڰaÆXfMi+ @\hO?=.쓐J-ӧ{@q @T"PQM }_iӦ#:k֬ӐL @*ФIӟ4=o}[W~W*ُ: @2hB3/~ @"P%4n @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}о"  @* ֕W @}:h宮6lXwS 8qbioHq1Y6[KƶF1===j;#Vrk׮m>pƷrVil[m*ﯱܪk1|.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Z@-ZT{ @h.B @ТEG + ($@(Zmokn6y>}z @zM7Ŋ+b„ ̙3s4 @==>quŴi @k֬+Wƣ>vZz{HUFRW6Hx7zvc| 5 @]Aon!CvPs=[nKƨQⓟd\x~-]뮻]6SLi<۾}dE즓yӠ3fw'NxG7߼G=sWW{!+ze3/`{o5+GVaâkP@׭[6l >􆤝;wС-JS\m۶Ң@x _cklW\A}cyصkWqꩧܞ+$@hIA=zQGŹ͋7n\,X%!u @2A ~wyq,Vm @ZU`P/짉 @"?#$@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ !vB P@K^  @" 6N @JhI+ @@CІ0  @@I@-Ix%@hf;!@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ !vB P@K^  @" 6N @JhI+ @@CІ0  @@I@-Ix%@hf;!@( %  @ @l' @%$ @!hC @$ $ @4D@m @ВW @ a @ZJ tuuŰa lQS.0qf `|kkMmP 35EoOOO\T+l9k׶\ur[U46bVnՊ5Cw >G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @- -= @\4G! @@hѢ#@@sy @-tٲeo;VZUi @=.\0x㢋.˗7 @)Qdcն/c=wqG :4nXxq\~6> @@ j}c̙YL^'pB<{нK/;#bȑf_`Ĉ|cklWtb1o{7nܸXzurcѢE8z:o}kڮmP356ƶƮ[p^j6lXر===1jԨ43gΜx߻./~ѻlfAi7軷0*cǎXvmtY?+Hg&L]Ukr|jxÇ}V:eʔx'{;f͚:ujrIgHϒ\2RP5/k׮=B}m54iPZmҘRvil[i*PIޔ>of͚O=TO>/"?3}QКK,??|0M֚8餓b޼y>+)hI{g;S%?؝vtGر#vY`jwb_S M?ئ#29{y+Ζ @66iR38#:&n"pki¶M*0eʔ8묳I{[oFy䑵4a&x['Nl5c @p  @_@m1X੧gC' f{7P:15Eh~fG@;᯿zvhQE=ܼys}Ѹk#>Zuo^ziƍ|s!C! Jv [tip qu}zKŋFv /=>OG4 F衇Hꫯq79s?7|s?SDΫj,=nxg}lPz׻~TO>#9o&.uwwlj'umƌYm~O/bc6~;Y}#hZtU*M\sMnzG"=TZUSܲeK\tEq);3gN|Cjʾ}b֭q衇![5@z|_x8]k3Ç.֙Us\}h6ٳgn;wLegZM&rX~}ɟIc?xCOdUCwǺu"lɒ%=54g&Ha3&?tY P ݘ6mZ1"~d3f)COϮDs1qqŭ_ײKMHHD˳il/얨#H4ۿ1a„f#WM (꼚v,]qgQMKsI=vlܜ*FC~6lqƵ߁9شiSvw V @plF 00t`n"@:@8 @ L@ @( fD }xg?x+? A 0XKY>qF~ h6g@mD >  UVM7ݔL8N<Ľv.p tH)Ow饗aMzg'N38#GtOM#}8A,O}S1iҤcYf}#@%'[n%:H!3}l /W≮}o?J>;|Μ91k֬,`^{Y;_c;;{[A@ 84WXbEg?#FdMg(zG>իWǛ1cF>>+W>3~_u\pY;v}'tR\r%Έ o~3 >0#-"MH-2PI@ \x%ŋބӟ4ײeG:?<;/|!N>,ofKiJgFәַ-YgIp.a/x+hjzxtVHg?G߮nٲ%Dw; n…Ytf4?>.첬{,~%Kdmtߨ$ h+M)pGfg3;77qW{'O?ַӟtr!Ythwww?cP.kqUWEj+|0֬Y\sM<$|l;@&@|t袋3ezCR`~޿oLgLK/ŕW^ͧ9jԨҗLg57oޜ.92x8g]v鉣:*k?hUFJ? hZޡxN<99眸⋳K>6N//է3E_LWJod;fRg;vlNH$ZIZiHg%_},<_;sCKq )S*N~=,U 4{ @vp F @\@m= @@ 6 h @M@mu< @&@|t nh!@4{ @v[k8k݌IENDB`themis/man/figures/README-unnamed-chunk-2-1.png0000644000176200001440000003752014744271147020477 0ustar liggesusersPNG  IHDRz4iCCPkCGColorSpaceGenericRGB8U]hU>+$΃Ԧ5lRфem,lAݝi&3i)>A['!j-P(G 3k~s ,[%,-:t} }-+*&¿ gPG݅ج8"eŲ]A b ;l õWϙ2_E,(ۈ#Zsێ<5)"E6N#ӽEkۃO0}*rUt.iei #]r >cU{t7+ԙg߃xuWB_-%=^ t0uvW9 %/VBW'_tMۓP\>@y0`D i|[` hh)Tj0B#ЪhU# ~yhu fp#1I/I"0! 'Sdd:J5ǖ"sdy#R7wAgdJ7kʕn^:}nWFVst$gj-tԝr_װ_7Z ~V54V }o[G=Nd>-UlaY5V}xg[?k&>srq߀].r_r_qsGjy4k iQܟBZ-<(d=dKO a/zv7]ǰod}sn?TF'|3Nn#I?"mzv~K=گsl<b|_|4>?pߋQrib 2* (Ѧh{28oIyes8';Z9h6g>xRx'b8ՃWOϫ[xn%|^z}%x c8eXIfMM*i_:IDATx \e}?&y'@P$R`@KЪh & };Rw X8QBK Ar4ECH!!e7w&L=g}3;wfڶf  @>Gq @@ Чhr; @9@ Чhr; @9@ Ч}zf͚ʡU1lذQGGGŦM6D{?hР2dHlܸ1|*^rƴG^%35jh8,So]w5^}?[ieXz[vm+wC۷#^G֭v|Z}Uz]Nc0Kr}|}~&@Q@Lq @M PZ# @ ٛ @FF0  @@7 @@h` @' go @'@O@ @5 5)N PZ  @j@kS @>vnu?M#dv|gL<9xe˖E:9o޼2eJ @$K|3qWgt syJ|Κ5+fΜcǎ &K3C P<Dva??ƚ5k7u3m6mZL:5wqԨQi @}@z{].ӧ @ zH PZ @u u @-e  @@&T @@-h-Z @- M @ZZ%@[@P @h)K PZ7  @j@kR @nnB @" ֢, @@h݄* @E@EKY @к U@ PZ @u u @-e  @@&T @@-h-Z @- M @ZZ%@[@P @h)K PZ7  @j@kR @nnB @" ֢, @@h݄* @EP= Mi6e6CmiP}9،{wttd2dHlٲMu:y<_ |heN m+Rmm~=3Ri˭4Zښx=~mۑh͛7Ggg3׏=:yC;Ύc[Mc;jԨذaCtuuGu9ww?Zy<_,j- @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/ 6i! @PhSg @/G?Q]vEw+Wf @ iK_R^$9gΜ={v,\0f̘K.-m3C P!^y 'Dz6m_#RX1K מ>|x9}h@;]~; 6O~ַի!/(>S#=+Vċ/ؽg+^}Xn]f PK\x^Fs>Nᶥ,h.7xݚ<79r7!{3ĉ'矏3gfǍқƏ@  ?fztO'|r|k_ &+]vY,[, ͋)St @ =fرcP:}JF }@}iӦԩScƍ޽ @@1ۧii\Ԟ-=rc[ܱM=c|薢p#ꪫF+}R>a„D  @hxM||;nݺHMSN7ֳy @bI*e<] @+{@ZF h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu h;& @` xu U;.֭[׫c[A @`{[ٽ?i?xcذaݛ+n8K @$P1o:+6n7oKutt~J @$P1ftIq 7ĨQ*g @h=LҥK=P:~ @خ@[n3g/ܫO>9n^ @ P.Pu˿G>cǎݦ@ PA+ċ/\rI @@e>tѱ~ʵJ @ : 3όg}6 Ίr!op(  @D"@#f͚8{&^$V @@ /֭[[Mn+  @СC @[|>|xnu7F @:|=#qT#@؞@Rk׮G}4AK @ Pu=C{Uw;֯__Wk  @UǏv[b1"~mmm1lذ~k[V }Akݲ#80wc[ܱM=c|;;;+"U@~YY㢋.c=AܘθlGuz]{Wlذ!֭[]21cdXKe[,h^z[fll7> )] [R @Ft^4rM+ !@hZѣ_n!'7Żx{C ij @@s T@{HWrN81VX?5 @(0F͚5+:XlY򗿌{..j#@Ug;g,}OYg}7u @Zh[f:_^񫯾cQ, @P><}g?xG0>o?A\~qq6 @)PUM;\uULǎ>g  @Pwx<~ƢEwtA@ @\3#zjY&,X7|s 4(nƘ;wnwyn  @It|'FuqÆ nݺHt1q,|&MoCqu5\SZ<Ҳ2#F\!ئǞ{Y|-8mY^4;|nٲ%.8c=3fL}GUVG>RZ7lذxJf^HΛ7oxs^[ ^9WƶcWiqغuk|ڔΎ3Sggg >{1yGDztO+Vkv/C ܴiSvjZsSO]3u5ƶ1C؎>yoob]v . 5jܸqzRKf @''_rv)sύtֳ{T  @@@>죖vD7mڴ:ujvQvTz @4 @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o> ֭|W-Zw}w\6+ @%gtÆ |%nm̙gώ ƌ3bҥl@ P,> /N;-֮]ޒ%KbqWg?ԧ>sݦ @b Ew֯_cժUC`:q4rIbiSO=ܳlu:th-4ѣqZVw6M=nܸXzuis?~|i  @@5N<9XlYR)SG @%~5kV̜93Ǝ&Lt @}@=HӴibԩq5jTM  @ (ЧtG~& @ =C @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @r @ @ʫr @r\2 @@CІ @\@-L P*'@(@E, @4T@m(  @W4r[[[ib-[c[\b/~w[.6, rOm{7zmWX*-pnc GmWg߮^\r^Ξ}0_p5k{Ʒolmqݳ<~wZɩĊ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @- -> @hE  @@U @@E" @y y @ZF @мEG PQ@c# @@hޢ#@( V䱑 @ o4oQ @T@+H  @* yl$@[@[T} @Њ<6 @-tѢEqwʕ+ @L9sbٱp˜1cF,]Ɉ4 @ O<+%KĂ oAō7s΍;֪'@h~ /'f3yM4)ϟ ݲeg)cذae32dH;9{h|;W=w7X4k]|y3ԾѣGǪUJi~\z饥uzkr!坝;wvW5myƷN&64c['` ~:x*53^ZN3ӧON8.|Kf^c="  {k]v%㥗^j&kgLîkrM>@u4re1b k7n\P ~h;T ?FU^_ױrʪ+Z/R888#[ ں?.qV]v8쳣3֭[}Whkkk.K {G.K.$7oϏkJΝwKxi|MT̙i{)Fx/R0 _z\yqI' pbuhk__ 7& hm^MW:qM7SO=͛7СCTho_գ:*JkG Kwׯ;,kfk.dɒv>kM+J颋.~[~G\PShVMYrÆ 1cƌ8#b})۪Q;'0h*6nUdH<ƶO?t~.5Hޘ{znIǵhf2eJ(5k]v)͛N٪+T X"֬YW… ~&:$^~Ho4{{|ήM"JgL|E?ϳ{G$TG3w2dH3}݈Coc9&; :jԨ{>lov#>H_yie3 +bq|_|;%&$S ?"4W]uU^nja- t?u]7\5&8kj^t}M;Du5,}KwȑucHW1 +1zuLW_=Нs @N t'F sιً @`'Н @ ;f/ @@wnF }y˖-2? Y' /KJ%@m @@|RX+W??eg:|qa:x$e#<C}1a„lٳggwm8o:I"}9A.s9'Ǝ8V @m6ZB }# '_}Bfԧ~WN7xc"}_wӟӧOɓ'g/3όߋO~ @mAD[^_W1dȐ ='J_jUo}+>l}:~0VX}gO~8ӲtPtuue>?<:HgDS۳3" "ބ"4ɟdΝ۫{/~HE)t_v/:* #]OS:3δoVmڴiٙvz[A@S ãq@:9r7l Ky{#FSN9e̙LgFS=Csʼ͂:;;-oyKV *hv дz׻"C=}㤓N38#D޳{liv7}K_>>Lo,+׿uvgxF[n%&9eʔ]v%{'|H$ZIZiHg%{,7ʬ[.;:nܸ^t}' XA&@|4 P46ChrH @E@6ChrH @E@6ChrH @E@6ChrH @E4A͠IENDB`themis/man/step_adasyn.Rd0000644000176200001440000001224614744276170015140 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/adasyn.R \name{step_adasyn} \alias{step_adasyn} \alias{tidy.step_adasyn} \title{Apply Adaptive Synthetic Algorithm} \usage{ step_adasyn( recipe, ..., role = NA, trained = FALSE, column = NULL, over_ratio = 1, neighbors = 5, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("adasyn") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{neighbors}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_adasyn()} creates a \emph{specification} of a recipe step that generates synthetic positive instances using ADASYN algorithm. } \details{ All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. All columns used in this step must be numeric with no missing data. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 2 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) \item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5) } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the minority levels up to about 1000 each # 1000/2211 is approx 0.4523 step_adasyn(class, over_ratio = 0.4523) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without ADASYN") recipe(class ~ x + y, data = circle_example) \%>\% step_adasyn(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With ADASYN") \dontshow{\}) # examplesIf} } \references{ He, H., Bai, Y., Garcia, E. and Li, S. 2008. ADASYN: Adaptive synthetic sampling approach for imbalanced learning. Proceedings of IJCNN 2008. (IEEE World Congress on Computational Intelligence). IEEE International Joint Conference. pp.1322-1328. } \seealso{ \code{\link[=adasyn]{adasyn()}} for direct implementation Other Steps for over-sampling: \code{\link{step_bsmote}()}, \code{\link{step_rose}()}, \code{\link{step_smote}()}, \code{\link{step_smotenc}()}, \code{\link{step_upsample}()} } \concept{Steps for over-sampling} themis/man/bsmote.Rd0000644000176200001440000000620214744045253014106 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bsmote_impl.R \name{bsmote} \alias{bsmote} \title{borderline-SMOTE Algorithm} \usage{ bsmote(df, var, k = 5, over_ratio = 1, all_neighbors = FALSE) } \arguments{ \item{df}{data.frame or tibble. Must have 1 factor variable and remaining numeric variables.} \item{var}{Character, name of variable containing factor variable.} \item{k}{An integer. Number of nearest neighbor that are used to generate the new examples of the minority class.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{all_neighbors}{Type of two borderline-SMOTE method. Defaults to FALSE. See details.} } \value{ A data.frame or tibble, depending on type of \code{df}. } \description{ BSMOTE generates generate new examples of the minority class using nearest neighbors of these cases in the border region between classes. } \details{ This methods works the same way as \code{\link[=smote]{smote()}}, expect that instead of generating points around every point of of the minority class each point is first being classified into the boxes "danger" and "not". For each point the k nearest neighbors is calculated. If all the neighbors comes from a different class it is labeled noise and put in to the "not" box. If more then half of the neighbors comes from a different class it is labeled "danger. If \code{all_neighbors = FALSE} then points will be generated between nearest neighbors in its own class. If \code{all_neighbors = TRUE} then points will be generated between any nearest neighbors. See examples for visualization. The parameter \code{neighbors} controls the way the new examples are created. For each currently existing minority class example X new examples will be created (this is controlled by the parameter \code{over_ratio} as mentioned above). These examples will be generated by using the information from the \code{neighbors} nearest neighbor of each example of the minority class. The parameter \code{neighbors} controls how many of these neighbor are used. All columns used in this step must be numeric with no missing data. } \examples{ circle_numeric <- circle_example[, c("x", "y", "class")] res <- bsmote(circle_numeric, var = "class") res <- bsmote(circle_numeric, var = "class", k = 10) res <- bsmote(circle_numeric, var = "class", over_ratio = 0.8) res <- bsmote(circle_numeric, var = "class", all_neighbors = TRUE) } \references{ Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: a new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing, pages 878–887. Springer, 2005. } \seealso{ \code{\link[=step_bsmote]{step_bsmote()}} for step function of this method Other Direct Implementations: \code{\link{adasyn}()}, \code{\link{nearmiss}()}, \code{\link{smote}()}, \code{\link{smotenc}()}, \code{\link{tomek}()} } \concept{Direct Implementations} themis/man/step_tomek.Rd0000644000176200001440000001030714744276170014774 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tomek.R \name{step_tomek} \alias{step_tomek} \alias{tidy.step_tomek} \title{Remove Tomek’s Links} \usage{ step_tomek( recipe, ..., role = NA, trained = FALSE, column = NULL, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("tomek") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when applied.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_tomek()} creates a \emph{specification} of a recipe step that removes majority class instances of tomek links. } \details{ The factor variable used to balance around must only have 2 levels. All other variables must be numerics with no missing data. A tomek link is defined as a pair of points from different classes and are each others nearest neighbors. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. When used in modeling, users should strongly consider using the option \code{skip = TRUE} so that the extra sampling is \emph{not} conducted outside of the training set. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Case weights}{ The underlying operation does not allow for case weights. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% step_tomek(class) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without Tomek") + xlim(c(1, 15)) + ylim(c(1, 15)) recipe(class ~ x + y, data = circle_example) \%>\% step_tomek(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_point() + labs(title = "With Tomek") + xlim(c(1, 15)) + ylim(c(1, 15)) \dontshow{\}) # examplesIf} } \references{ Tomek. Two modifications of cnn. IEEE Trans. Syst. Man Cybern., 6:769-772, 1976. } \seealso{ \code{\link[=tomek]{tomek()}} for direct implementation Other Steps for under-sampling: \code{\link{step_downsample}()}, \code{\link{step_nearmiss}()} } \concept{Steps for under-sampling} themis/man/step_upsample.Rd0000644000176200001440000001265014744276170015506 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/upsample.R \name{step_upsample} \alias{step_upsample} \alias{tidy.step_upsample} \title{Up-Sample a Data Set Based on a Factor Variable} \usage{ step_upsample( recipe, ..., over_ratio = 1, ratio = deprecated(), role = NA, trained = FALSE, column = NULL, target = NA, skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("upsample") ) } \arguments{ \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} \item{...}{One or more selector functions to choose which variable is used to sample the data. See \link[recipes:selections]{recipes::selections} for more details. The selection should result in \emph{single factor variable}. For the \code{tidy} method, these are not currently used.} \item{over_ratio}{A numeric value for the ratio of the minority-to-majority frequencies. The default value (1) means that all other levels are sampled up to have the same frequency as the most occurring level. A value of 0.5 would mean that the minority levels will have (at most) (approximately) half as many rows than the majority level.} \item{ratio}{Deprecated argument; same as \code{over_ratio}.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} \item{column}{A character string of the variable name that will be populated (eventually) by the \code{...} selectors.} \item{target}{An integer that will be used to subsample. This should not be set by the user and will be populated by \code{prep}.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{seed}{An integer that will be used as the seed when upsampling.} \item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of existing steps (if any). For the \code{tidy} method, a tibble with columns \code{terms} which is the variable used to sample. } \description{ \code{step_upsample()} creates a \emph{specification} of a recipe step that will replicate rows of a data set to make the occurrence of levels in a specific factor level equal. } \details{ Up-sampling is intended to be performed on the \emph{training} set alone. For this reason, the default is \code{skip = TRUE}. If there are missing values in the factor variable that is used to define the sampling, missing data are selected at random in the same way that the other factor levels are sampled. Missing values are not used to determine the amount of data in the majority level (see example below). For any data with factor levels occurring with the same frequency as the majority level, all data will be retained. All columns in the data are sampled and returned by \code{\link[recipes:juice]{recipes::juice()}} and \code{\link[recipes:bake]{recipes::bake()}}. } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is retruned with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected} \item{id}{character, id of this step} } } \section{Tuning Parameters}{ This step has 1 tuning parameters: \itemize{ \item \code{over_ratio}: Over-Sampling Ratio (type: double, default: 1) } } \section{Case weights}{ This step performs an unsupervised operation that can utilize case weights. To use them, see the documentation in \link[recipes:case_weights]{recipes::case_weights} and the examples on \code{tidymodels.org}. } \examples{ \dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(recipes) library(modeldata) data(hpc_data) hpc_data0 <- hpc_data \%>\% select(-protocol, -day) orig <- count(hpc_data0, class, name = "orig") orig up_rec <- recipe(class ~ ., data = hpc_data0) \%>\% # Bring the minority levels up to about 1000 each # 1000/2211 is approx 0.4523 step_upsample(class, over_ratio = 0.4523) \%>\% prep() training <- up_rec \%>\% bake(new_data = NULL) \%>\% count(class, name = "training") training # Since `skip` defaults to TRUE, baking the step has no effect baked <- up_rec \%>\% bake(new_data = hpc_data0) \%>\% count(class, name = "baked") baked # Note that if the original data contained more rows than the # target n (= ratio * majority_n), the data are left alone: orig \%>\% left_join(training, by = "class") \%>\% left_join(baked, by = "class") library(ggplot2) ggplot(circle_example, aes(x, y, color = class)) + geom_point() + labs(title = "Without upsample") recipe(class ~ x + y, data = circle_example) \%>\% step_upsample(class) \%>\% prep() \%>\% bake(new_data = NULL) \%>\% ggplot(aes(x, y, color = class)) + geom_jitter(width = 0.1, height = 0.1) + labs(title = "With upsample (with jittering)") \dontshow{\}) # examplesIf} } \seealso{ Other Steps for over-sampling: \code{\link{step_adasyn}()}, \code{\link{step_bsmote}()}, \code{\link{step_rose}()}, \code{\link{step_smote}()}, \code{\link{step_smotenc}()} } \concept{Steps for over-sampling} themis/man/tunable_themis.Rd0000644000176200001440000000225114744045253015620 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/adasyn.R, R/bsmote.R, R/downsample.R, % R/nearmiss.R, R/rose.R, R/smote.R, R/smotenc.R, R/tunable.R, R/upsample.R \name{tunable.step_adasyn} \alias{tunable.step_adasyn} \alias{tunable.step_bsmote} \alias{tunable.step_downsample} \alias{tunable.step_nearmiss} \alias{tunable.step_rose} \alias{tunable.step_smote} \alias{tunable.step_smotenc} \alias{tunable_themis} \alias{tunable.step_upsample} \title{tunable methods for themis} \usage{ \method{tunable}{step_adasyn}(x, ...) \method{tunable}{step_bsmote}(x, ...) \method{tunable}{step_downsample}(x, ...) \method{tunable}{step_nearmiss}(x, ...) \method{tunable}{step_rose}(x, ...) \method{tunable}{step_smote}(x, ...) \method{tunable}{step_smotenc}(x, ...) \method{tunable}{step_upsample}(x, ...) } \arguments{ \item{x}{A recipe step object} \item{...}{Not used.} } \value{ A tibble object. } \description{ These functions define what parameters \emph{can} be tuned for specific steps. They also define the recommended objects from the \code{dials} package that can be used to generate new parameter values and other characteristics. } \keyword{internal} themis/DESCRIPTION0000644000176200001440000000340514744304332013257 0ustar liggesusersPackage: themis Title: Extra Recipes Steps for Dealing with Unbalanced Data Version: 1.0.3 Authors@R: c( person("Emil", "Hvitfeldt", , "emil.hvitfeldt@posit.co", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-0679-1945")), person(given = "Posit Software, PBC", role = c("cph", "fnd")) ) Description: A dataset with an uneven number of cases in each class is said to be unbalanced. Many models produce a subpar performance on unbalanced datasets. A dataset can be balanced by increasing the number of minority cases using SMOTE 2011 , BorderlineSMOTE 2005 and ADASYN 2008 . Or by decreasing the number of majority cases using NearMiss 2003 or Tomek link removal 1976 . License: MIT + file LICENSE URL: https://github.com/tidymodels/themis, https://themis.tidymodels.org BugReports: https://github.com/tidymodels/themis/issues Depends: R (>= 3.6), recipes (>= 1.1.0) Imports: cli, gower, lifecycle (>= 1.0.3), dplyr, generics (>= 0.1.0), purrr, RANN, rlang (>= 1.1.0), ROSE, tibble, withr, glue, hardhat, vctrs Suggests: covr, dials (>= 1.2.0), ggplot2, modeldata, testthat (>= 3.0.0) Config/Needs/website: tidyverse/tidytemplate Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.2 NeedsCompilation: no Packaged: 2025-01-22 23:40:53 UTC; emilhvitfeldt Author: Emil Hvitfeldt [aut, cre] (), Posit Software, PBC [cph, fnd] Maintainer: Emil Hvitfeldt Repository: CRAN Date/Publication: 2025-01-23 00:10:02 UTC