fastDummies/ 0000755 0001762 0000144 00000000000 14743531446 012546 5 ustar ligges users fastDummies/tests/ 0000755 0001762 0000144 00000000000 14742467541 013713 5 ustar ligges users fastDummies/tests/testthat/ 0000755 0001762 0000144 00000000000 14743531446 015550 5 ustar ligges users fastDummies/tests/testthat/test-columns-warnings-errors.R 0000644 0001762 0000144 00000023240 13515737037 023471 0 ustar ligges users context("Checks dummy_cols for warnings and errors") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) error_data <- data.frame(numbers = 1:10, number2 = 11:20, stringsAsFactors = FALSE) test <- structure(list( Theory = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 6L, 6L, 6L, 9L, 9L, NA, 1L, 1L, NA, NA, NA, NA, NA, 1L, 1L, 6L, NA, 1L, 1L, 1L, NA, NA, 1L, 1L, NA, 2L, NA, 1L, 1L, 4L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, 2L, NA, NA, NA, NA, 9L, 9L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA, NA, 1L, 1L, 1L, 8L, 1L, NA, 6L, 1L, 1L, 1L, NA, NA, NA, NA, NA, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, NA, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, NA, 1L, 8L, NA, 8L, 8L, NA, NA, NA, NA, 2L, 1L, 2L, 10L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, NA, NA, NA, NA, NA, NA, 1L, 
NA, 9L, NA, NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, NA, 1L, NA, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, NA, 9L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 2L, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, 7L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("Behaviourism", "Behaviourism, Cognitive", "Behaviourism, Gestalt", "Behaviourism, Psychodynamic", "Behaviourism, Psychodynamic, Cognitive", "Cognitive", "Functionalism", "Gestalt", "Psychodynamic", "Structuralism"), class = "factor"), Format = structure(c(1L, 1L, 24L, 1L, 1L, 1L, 1L, 2L, 1L, 10L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 2L, 1L, 1L, 19L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 11L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 8L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 7L, 1L, 1L, 15L, 1L, 5L, 25L, 5L, 24L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 1L, 1L, 1L, 1L, 20L, 1L, 18L, 12L, 1L, 1L, NA, 20L, 20L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 25L, 15L, 16L, 15L, 15L, 1L, 1L, 1L, 1L, 19L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 12L, 5L, 5L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 14L, 1L, 1L, 1L, 1L, 14L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 12L, NA, 15L, 1L, NA, NA, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 14L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 22L, 1L, 21L, 
23L, 5L, 1L, 1L, 1L, 1L, 10L, 1L, 1L, 1L, 1L, 5L, 17L, 1L, 17L, 6L, 1L, 1L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 18L, 1L, 21L, 18L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 24L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L), .Label = c("16mm", "16mm, 35mm", "16mm, 35mm, VHS", "16mm, AVI", "16mm, Digital", "16mm, DVD", "16mm, DVD, Betacam SP", "16mm, DVD, Digital, Betacam SP", "16mm, DVD, Mini-DV", "16mm, MP4", "16mm, MPG", "16mm, VHS", "16mm, VHS, AVI", "16mm, VHS, Digital", "16mm, VHS, DVD", "16mm, VHS, DVD, Digital, AVI", "35mm", "8mm", "8mm, 16mm", "DVD", "DVD, AVI", "Mini-DV", "MPG", "VHS", "VHS, DVD, Digital"), class = "factor")), .Names = c("Theory", "Format"), row.names = c(NA, -427L), class = c("tbl_df", "tbl", "data.frame")) test_that("Error on stop conditions", { expect_error(dummy_cols(error_data)) expect_error(dummy_cols(error_data), paste0("No character or factor columns found. 
", "Please use select_columns to choose columns.")) }) test_that("Including non-existing in select_columns leads to warning", { expect_warning(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = c("gender", "fake"))) expect_warning(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = c("fake", "gender"))) expect_warning(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = c("fake", "gender", "fake"))) }) test_that("Only having non-existing column in select_columns returns error", { expect_error(dummy_cols(fastDummies_example, select_columns = "number")) expect_error(dummy_cols(fastDummies_example[, "numbers", drop = FALSE], select_columns = "number")) expect_error(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = "gen")) expect_error(dummy_cols(fastDummies_example, select_columns = "")) expect_error(dummy_cols(no_dummies_needed, select_columns = "")) expect_error(dummy_cols(crime, select_columns = "")) expect_error(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = "")) }) test_that("no errors or warnings", { expect_silent(dummy_cols(fastDummies_example)) expect_silent(dummy_cols(no_dummies_needed)) expect_silent(dummy_cols(no_dummies_needed_DT)) expect_silent(dummy_cols(no_dummies_needed_tibble)) expect_silent(dummy_cols(crime)) expect_silent(dummy_cols(crime_full)) expect_silent(dummy_cols(crime_DT)) expect_silent(dummy_cols(crime_full_DT)) expect_silent(dummy_cols(crime_tibble)) expect_silent(dummy_cols(crime_full_tibble)) expect_silent(dummy_cols(fastDummies_example)) expect_silent(dummy_cols(fastDummies_example_DT)) expect_silent(dummy_cols(fastDummies_example_tibble)) expect_silent(dummy_cols(fastDummies_full)) expect_silent(dummy_cols(fastDummies_full_DT)) expect_silent(dummy_cols(fastDummies_full_tibble)) expect_silent(dummy_cols(test, select_columns = "Theory", split = ", ")) expect_silent(dummy_cols(test, select_columns = "Theory", split = 
",")) }) test_that("error if both remove options are true", { expect_error(dummy_cols(no_dummies_needed, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(no_dummies_needed_DT, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(no_dummies_needed_tibble, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime_full, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime_DT, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime_full_DT, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime_tibble, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(crime_full_tibble, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_example, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_full, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_full_DT, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) expect_error(dummy_cols(fastDummies_full_tibble, remove_first_dummy = TRUE, remove_most_frequent_dummy = TRUE)) }) fastDummies/tests/testthat/test-columns-value-order.R 0000644 0001762 0000144 00000030632 13760474244 022557 0 ustar ligges users context("Order of dummy column values are right") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) split_example <- data.frame(owner = 1:4, pets = c("dog", "dog, cat, hamster", "cat", 
"hamster"), stringsAsFactors = FALSE) numeric_order <- data.frame(photos = c(1, 5, 7, 2, 40, 23, 12, 6, 1)) test_that("Order of dummy columns (e.g. 0,0,1,0) is right", { expect_equal(dummy_cols(1:3)$.data_1, c(1, 0, 0)) expect_equal(dummy_cols(1:3)$.data_2, c(0, 1, 0)) expect_equal(dummy_cols(1:3)$.data_3, c(0, 0, 1)) expect_equal(dummy_cols(c("a", "b", "c"))$.data_a, c(1, 0, 0)) expect_equal(dummy_cols(c("a", "b", "c"))$.data_b, c(0, 1, 0)) expect_equal(dummy_cols(c("a", "b", "c"))$.data_c, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example)$gender_female, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example)$gender_female, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender")$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example)$animals_cat, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals", remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) expect_named(dummy_cols(numeric_order), c("photos", "photos_1", "photos_2", "photos_5", "photos_6", "photos_7", "photos_12", "photos_23", "photos_40")) # Splitter test expect_equal(dummy_cols(split_example, split = ",")$pets_dog, c(1, 1, 0, 0)) expect_equal(dummy_cols(split_example, split = ",")$pets_cat, c(0, 1, 1, 0)) expect_equal(dummy_cols(split_example, split = ",")$pets_hamster, c(0, 1, 0, 1)) expect_equal(dummy_cols(fastDummies_example_DT)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT)$gender_female, c(0, 0, 1)) 
expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender")$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender", remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT)$animals_cat, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals", remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble)$gender_female, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender")$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender", remove_first_dummy = TRUE)$gender_male, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble)$animals_cat, c(0, 0, 1)) expect_equal(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals", remove_first_dummy = TRUE)$animals_dog, c(1, 1, 0)) }) test_that("Order of non-dummy columns is same", { gender_list <- factor(c("male", "male", "female")) animals_list <- factor(c("dog", "dog", "cat")) expect_equal(dummy_cols(fastDummies_example)$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example, select_columns = 
"gender")$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals")$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE)$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals", remove_first_dummy = TRUE)$numbers, fastDummies_example$numbers) expect_equal(dummy_cols(fastDummies_example)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example, select_columns = "animals", remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT)$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE)$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender")$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT, 
select_columns = "animals")$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender", remove_first_dummy = TRUE)$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals", remove_first_dummy = TRUE)$numbers, fastDummies_example_DT$numbers) expect_equal(dummy_cols(fastDummies_example_DT)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_DT)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT, remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "gender", remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_DT, select_columns = "animals", remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble)$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE)$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender")$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = 
"animals")$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender", remove_first_dummy = TRUE)$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals", remove_first_dummy = TRUE)$numbers, fastDummies_example_tibble$numbers) expect_equal(dummy_cols(fastDummies_example_tibble)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals")$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals", remove_first_dummy = TRUE)$gender, gender_list) expect_equal(dummy_cols(fastDummies_example_tibble)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble, remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals")$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "gender", remove_first_dummy = TRUE)$animals, animals_list) expect_equal(dummy_cols(fastDummies_example_tibble, select_columns = "animals", remove_first_dummy = TRUE)$animals, animals_list) }) fastDummies/tests/testthat/test-rows-right-values.R 0000644 0001762 0000144 00000007155 13224606707 022256 0 ustar ligges users context("dummy_rows_returns right data set") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("dummy_rows return expected data.frame", { expect_equal(dummy_rows(no_dummies_needed), no_dummies_needed) 
expect_equal(dummy_rows(no_dummies_needed, select_columns = "animals"), no_dummies_needed) expect_equal(dummy_rows(no_dummies_needed, select_columns = "food"), no_dummies_needed) expect_equal(dummy_rows(no_dummies_needed, select_columns = c("animals", "food")), no_dummies_needed) expect_equal(dummy_rows(no_dummies_needed, dummy_indicator = TRUE), cbind(no_dummies_needed, dummy_indicator = rep(0, 4))) expect_equal(dummy_rows(no_dummies_needed, dummy_indicator = TRUE, select_columns = "animals"), cbind(no_dummies_needed, dummy_indicator = rep(0, 4))) expect_equal(dummy_rows(no_dummies_needed, dummy_indicator = TRUE, select_columns = "food"), cbind(no_dummies_needed, dummy_indicator = rep(0, 4))) expect_equal(dummy_rows(no_dummies_needed, dummy_indicator = TRUE, select_columns = c("animals", "food")), cbind(no_dummies_needed, dummy_indicator = rep(0, 4))) # fastDummies_example data - FULL expect_equal(dummy_rows(fastDummies_example), fastDummies_full) expect_equal(dummy_rows(fastDummies_example, select_columns = c("gender", "animals", "dates")), fastDummies_full) expect_equal(dummy_rows(fastDummies_example, dummy_indicator = TRUE), cbind(fastDummies_full, dummy_indicator = c(0, 0, 0, 1, 1, 1, 1, 1))) expect_equal(dummy_rows(fastDummies_example, dummy_indicator = TRUE, select_columns = c("gender", "animals", "dates")), cbind(fastDummies_full, dummy_indicator = c(0, 0, 0, 1, 1, 1, 1, 1))) # fastDummies_example data - not full expect_equal(dummy_rows(fastDummies_example, select_columns = "animals"), fastDummies_example) expect_equal(dummy_rows(fastDummies_example, select_columns = "gender"), fastDummies_example) expect_equal(dummy_rows(fastDummies_example, select_columns = "dates"), fastDummies_example) expect_equal(dummy_rows(fastDummies_example, select_columns = "animals", dummy_indicator = TRUE), cbind(fastDummies_example, dummy_indicator = rep(0, 3))) expect_equal(dummy_rows(fastDummies_example, select_columns = "gender", dummy_indicator = TRUE), 
cbind(fastDummies_example, dummy_indicator = rep(0, 3))) expect_equal(dummy_rows(fastDummies_example, select_columns = "dates", dummy_indicator = TRUE), cbind(fastDummies_example, dummy_indicator = rep(0, 3))) # Crime dataset expect_equal(dummy_rows(crime, select_columns = c("city", "year")), crime_full) expect_equal(dummy_rows(crime), crime) expect_equal(dummy_rows(crime, select_columns = c("year", "city")), crime_full) expect_equal(dummy_rows(crime, select_columns = "city"), crime) expect_equal(dummy_rows(crime, select_columns = "year"), crime) }) fastDummies/tests/testthat/test-rows-dimensions.R 0000644 0001762 0000144 00000011047 13313007342 021774 0 ustar ligges users context("dummy_rows returns proper dimensions") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("dummy_rows returns same number of columns as inputted", { expect_equal(ncol(dummy_rows(no_dummies_needed)), ncol(no_dummies_needed)) expect_equal(ncol(dummy_rows(fastDummies_example)), ncol(fastDummies_example)) expect_equal(ncol(dummy_rows(crime)), ncol(crime)) expect_equal(ncol(dummy_rows(no_dummies_needed_DT)), ncol(no_dummies_needed_DT)) expect_equal(ncol(dummy_rows(fastDummies_example_DT)), ncol(fastDummies_example_DT)) expect_equal(ncol(dummy_rows(crime_DT)), ncol(crime_DT)) expect_equal(ncol(dummy_rows(no_dummies_needed_tibble)), ncol(no_dummies_needed_tibble)) expect_equal(ncol(dummy_rows(fastDummies_example_tibble)), ncol(fastDummies_example_tibble)) expect_equal(ncol(dummy_rows(crime_tibble)), ncol(crime_tibble)) # With dummy_indicator TRUE expect_equal(ncol(dummy_rows(no_dummies_needed, dummy_indicator = TRUE)), ncol(no_dummies_needed) + 1) expect_equal(ncol(dummy_rows(fastDummies_example, dummy_indicator = TRUE)), ncol(fastDummies_example) + 1) expect_equal(ncol(dummy_rows(crime, dummy_indicator = TRUE)), ncol(crime) + 1) expect_equal(ncol(dummy_rows(no_dummies_needed_DT, dummy_indicator = TRUE)), ncol(no_dummies_needed_DT) + 1) 
# Remaining column-count expectations of the test_that() block that is opened
# before this span: with dummy_indicator = TRUE, dummy_rows() is expected to
# return exactly one more column than its input.
expect_equal(
  ncol(dummy_rows(fastDummies_example_DT, dummy_indicator = TRUE)),
  ncol(fastDummies_example_DT) + 1
)
expect_equal(
  ncol(dummy_rows(crime_DT, dummy_indicator = TRUE)),
  ncol(crime_DT) + 1
)
expect_equal(
  ncol(dummy_rows(no_dummies_needed_tibble, dummy_indicator = TRUE)),
  ncol(no_dummies_needed_tibble) + 1
)
expect_equal(
  ncol(dummy_rows(fastDummies_example_tibble, dummy_indicator = TRUE)),
  ncol(fastDummies_example_tibble) + 1
)
expect_equal(
  ncol(dummy_rows(crime_tibble, dummy_indicator = TRUE)),
  ncol(crime_tibble) + 1
)
})

# Row counts returned by dummy_rows() for the data.frame, data.table (_DT) and
# tibble (_tibble) fixtures. Each statement sits on its own line so that the
# inline comment below cannot swallow the expectations that follow it.
test_that("Number of rows is as expected", {
  # data.frame fixtures
  expect_equal(nrow(dummy_rows(no_dummies_needed)), 4)
  expect_equal(nrow(dummy_rows(fastDummies_example)), 8)
  expect_equal(nrow(dummy_rows(crime)), 3)
  expect_equal(
    nrow(dummy_rows(crime, select_columns = c("city", "year"))),
    4
  )

  # data.table fixtures
  expect_equal(nrow(dummy_rows(no_dummies_needed_DT)), 4)
  expect_equal(nrow(dummy_rows(fastDummies_example_DT)), 8)
  expect_equal(nrow(dummy_rows(crime_DT)), 3)
  expect_equal(
    nrow(dummy_rows(crime_DT, select_columns = c("city", "year"))),
    4
  )

  # tibble fixtures
  expect_equal(nrow(dummy_rows(no_dummies_needed_tibble)), 4)
  expect_equal(nrow(dummy_rows(fastDummies_example_tibble)), 8)
  expect_equal(nrow(dummy_rows(crime_tibble)), 3)
  expect_equal(
    nrow(dummy_rows(crime_tibble, select_columns = c("city", "year"))),
    4
  )

  # With dummy_indicator TRUE
  expect_equal(
    nrow(dummy_rows(no_dummies_needed, dummy_indicator = TRUE)),
    4
  )
  expect_equal(
    nrow(dummy_rows(fastDummies_example, dummy_indicator = TRUE)),
    8
  )
  expect_equal(nrow(dummy_rows(crime, dummy_indicator = TRUE)), 3)
  expect_equal(
    nrow(dummy_rows(crime, dummy_indicator = TRUE,
                    select_columns = c("city", "year"))),
    4
  )
  expect_equal(
    nrow(dummy_rows(no_dummies_needed_DT, dummy_indicator = TRUE)),
    4
  )
  expect_equal(
    nrow(dummy_rows(fastDummies_example_DT, dummy_indicator = TRUE)),
    8
  )
  expect_equal(nrow(dummy_rows(crime_DT, dummy_indicator = TRUE)), 3)
  expect_equal(
    nrow(dummy_rows(crime_DT, dummy_indicator = TRUE,
                    select_columns = c("city", "year"))),
    4
  )
expect_equal(nrow(dummy_rows(no_dummies_needed_tibble, dummy_indicator = TRUE)), 4) expect_equal(nrow(dummy_rows(fastDummies_example_tibble, dummy_indicator = TRUE)), 8) expect_equal(nrow(dummy_rows(crime_tibble, dummy_indicator = TRUE)), 3) expect_equal(nrow(dummy_rows(crime_tibble, dummy_indicator = TRUE, select_columns = c("city", "year"))), 4) }) fastDummies/tests/testthat/test-rows-warnings-errors.R 0000644 0001762 0000144 00000007120 13533224532 022771 0 ustar ligges users context("Checks dummy_rows for warnings, errors, and silent") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) error_data <- data.frame(numbers = 1:10, number2 = 11:20, stringsAsFactors = FALSE) test_that("Error on stop conditions", { expect_error(dummy_rows(error_data)) expect_error(dummy_rows(error_data), paste0("No character, factor, or Date columns found.", " Please use select_columns")) }) test_that("There are warnings or errors", { # One column data.frame or vector expect_error(dummy_rows(fastDummies_example$gender)) expect_error(dummy_rows(fastDummies_example$dates)) expect_error(dummy_rows(fastDummies_example$numbers)) expect_error(dummy_rows(fastDummies_example[, "gender", drop = FALSE])) expect_error(dummy_rows(fastDummies_example[, "dates", drop = FALSE])) expect_error(dummy_rows(fastDummies_example[, "numbers", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_tibble$gender)) expect_error(dummy_rows(fastDummies_example_tibble$dates)) expect_error(dummy_rows(fastDummies_example_tibble$numbers)) expect_error(dummy_rows(fastDummies_example_tibble[, "gender", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_tibble[, "dates", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_tibble[, "numbers", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_DT$gender)) expect_error(dummy_rows(fastDummies_example_DT$dates)) expect_error(dummy_rows(fastDummies_example_DT$numbers)) expect_error(dummy_rows(fastDummies_example_DT[, 
"gender", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_DT[, "dates", drop = FALSE])) expect_error(dummy_rows(fastDummies_example_DT[, "numbers", drop = FALSE])) }) test_that("There are no warnings or errors", { expect_silent(dummy_cols(crime)) expect_silent(dummy_cols(crime$city)) expect_silent(dummy_cols(crime$year)) expect_silent(dummy_cols(crime_full)) expect_silent(dummy_cols(fastDummies_example)) expect_silent(dummy_cols(fastDummies_full)) expect_silent(dummy_cols(no_dummies_needed)) expect_silent(dummy_cols(crime_DT)) expect_silent(dummy_cols(crime_DT$city)) expect_silent(dummy_cols(crime_DT$year)) expect_silent(dummy_cols(crime_full_DT)) expect_silent(dummy_cols(fastDummies_example_DT)) expect_silent(dummy_cols(fastDummies_full_DT)) expect_silent(dummy_cols(no_dummies_needed_DT)) expect_silent(dummy_cols(crime_tibble)) expect_silent(dummy_cols(crime_tibble$city)) expect_silent(dummy_cols(crime_tibble$year)) expect_silent(dummy_cols(crime_full_tibble)) expect_silent(dummy_cols(fastDummies_example_tibble)) expect_silent(dummy_cols(fastDummies_full_tibble)) expect_silent(dummy_cols(no_dummies_needed_tibble)) expect_silent(dummy_rows(crime)) expect_silent(dummy_rows(crime_full)) expect_silent(dummy_rows(fastDummies_example)) expect_silent(dummy_rows(fastDummies_full)) expect_silent(dummy_rows(no_dummies_needed)) expect_silent(dummy_rows(crime_DT)) expect_silent(dummy_rows(crime_full_DT)) expect_silent(dummy_rows(fastDummies_example_DT)) expect_silent(dummy_rows(fastDummies_full_DT)) expect_silent(dummy_rows(no_dummies_needed_DT)) expect_silent(dummy_rows(crime_tibble)) expect_silent(dummy_rows(crime_full_tibble)) expect_silent(dummy_rows(fastDummies_example_tibble)) expect_silent(dummy_rows(fastDummies_full_tibble)) expect_silent(dummy_rows(no_dummies_needed_tibble)) }) fastDummies/tests/testthat/test-rows-indicator-values.R 0000644 0001762 0000144 00000004326 13224606712 023106 0 ustar ligges users context("dummy_indicator is only integers 0 and 
1") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("dummy_indicator is binary column", { # With dummy_indicator TRUE expect_true( (unique(dummy_rows(no_dummies_needed, dummy_indicator = TRUE)$dummy_indicator) %in% c(0)) ) expect_true(all(unique(dummy_rows(fastDummies_example, dummy_indicator = TRUE)$dummy_indicator) %in% 0:1)) expect_true(all(unique(dummy_rows(crime, dummy_indicator = TRUE)$dummy_indicator) %in% 0:1)) # with set dummy_value expect_true(all(unique(dummy_rows(no_dummies_needed, dummy_indicator = TRUE, dummy_value = "test")$dummy_indicator) %in% 0:1)) expect_true(all(unique(dummy_rows(fastDummies_example, dummy_indicator = TRUE, dummy_value = "test")$dummy_indicator) %in% 0:1)) expect_true(all(unique(dummy_rows(crime, dummy_indicator = TRUE, dummy_value = "test")$dummy_indicator) %in% 0:1)) # With columns selected TRUE expect_true(all(unique(dummy_rows(no_dummies_needed, dummy_indicator = TRUE, select_columns = "animals")$dummy_indicator) %in% 0)) expect_true(all(unique(dummy_rows(fastDummies_example, dummy_indicator = TRUE, select_columns = "dates")$dummy_indicator) %in% 0)) expect_true(all(unique(dummy_rows(fastDummies_example, dummy_indicator = TRUE, select_columns = "animals")$dummy_indicator) %in% 0)) expect_true(all(unique(dummy_rows(crime, dummy_indicator = TRUE, select_columns = "crime")$dummy_indicator) %in% 0:1)) }) fastDummies/tests/testthat/test-return-type.R 0000644 0001762 0000144 00000005737 13533224451 021151 0 ustar ligges users context("test-return-type") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("tibble input returns tibble", { expect_is(dummy_cols(tibble::as_tibble(crime)), "tbl_df") expect_is(dummy_cols(tibble::as_tibble(crime[, "city", drop = FALSE])), "tbl_df") expect_is(dummy_cols(tibble::as_tibble(crime[, "year", drop = FALSE])), "tbl_df") expect_is(dummy_cols(tibble::as_tibble(crime_full)), "tbl_df") 
# Remaining expectations of the tibble test_that() block that is opened before
# this span: the result must still inherit from "tbl_df".
expect_is(
  dummy_cols(tibble::as_tibble(fastDummies_example)),
  "tbl_df"
)
expect_is(
  dummy_cols(tibble::as_tibble(fastDummies_full)),
  "tbl_df"
)
expect_is(
  dummy_cols(tibble::as_tibble(no_dummies_needed)),
  "tbl_df"
)
expect_is(
  dummy_rows(tibble::as_tibble(crime)),
  "tbl_df"
)
expect_is(
  dummy_rows(tibble::as_tibble(crime_full)),
  "tbl_df"
)
expect_is(
  dummy_rows(tibble::as_tibble(fastDummies_example)),
  "tbl_df"
)
expect_is(
  dummy_rows(tibble::as_tibble(fastDummies_full)),
  "tbl_df"
)
expect_is(
  dummy_rows(tibble::as_tibble(no_dummies_needed)),
  "tbl_df"
)
})

# Plain data.frame (or bare vector) input: both dummy_cols() and dummy_rows()
# are expected to hand back something inheriting from "data.frame".
test_that("data.frame input returns data.frame", {
  # dummy_cols(), including single-column vector input
  expect_is(dummy_cols(crime), "data.frame")
  expect_is(dummy_cols(crime$city), "data.frame")
  expect_is(dummy_cols(crime$year), "data.frame")
  expect_is(dummy_cols(crime_full), "data.frame")
  expect_is(dummy_cols(fastDummies_example), "data.frame")
  expect_is(dummy_cols(fastDummies_full), "data.frame")
  expect_is(dummy_cols(no_dummies_needed), "data.frame")

  # dummy_rows()
  expect_is(dummy_rows(crime), "data.frame")
  expect_is(dummy_rows(crime_full), "data.frame")
  expect_is(dummy_rows(fastDummies_example), "data.frame")
  expect_is(dummy_rows(fastDummies_full), "data.frame")
  expect_is(dummy_rows(no_dummies_needed), "data.frame")
})

# data.table input: the result is expected to inherit from "data.table".
# Each call converts its fixture afresh rather than sharing one converted
# object between expectations.
test_that("data.table input returns data.table", {
  expect_is(
    dummy_cols(data.table::as.data.table(crime)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(crime$city)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(crime$year)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(crime_full)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(fastDummies_example)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(fastDummies_full)),
    "data.table"
  )
  expect_is(
    dummy_cols(data.table::as.data.table(no_dummies_needed)),
    "data.table"
  )
  expect_is(
    dummy_rows(data.table::as.data.table(crime)),
    "data.table"
  )
  expect_is(
    dummy_rows(data.table::as.data.table(crime_full)),
    "data.table"
  )
expect_is(dummy_rows(data.table::as.data.table(fastDummies_example)), "data.table") expect_is(dummy_rows(data.table::as.data.table(fastDummies_full)), "data.table") expect_is(dummy_rows(data.table::as.data.table(no_dummies_needed)), "data.table") }) fastDummies/tests/testthat/test-columns.R 0000644 0001762 0000144 00000034023 14742475171 020333 0 ustar ligges users context("Makes correct dummy columns") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) most_frequent <- data.frame(animal = c("dog", "cat", "cat", "gorilla", "gorilla", "gorilla"), day = c("monday", "tuesday", "wednesday", "wednesday", "friday", "saturday"), hour = 1:6) sort_order_example <- data.frame(numbers = 1:12, month = c("February", "January", "March", "July", "June", "May", "April", "August", "October", "September", "December", "November")) sort_order_example$month <- factor(sort_order_example$month, levels = as.character(sort_order_example$month), ordered = TRUE) sort_order_example2 <- sort_order_example sort_order_example2$month <- as.character(sort_order_example2$month) fastDummies_example2 <- fastDummies_example fastDummies_example2$gender <- as.character(fastDummies_example2$gender) fastDummies_example2$animals <- as.character(fastDummies_example2$animals) test_that("The correct dummy columns are made - default", { expect_equal(dummy_cols(sort_order_example, return_generated_variables = TRUE), c("month_February", "month_January", "month_March", "month_July", "month_June", "month_May", "month_April", "month_August", "month_October", "month_September", "month_December", "month_November")) expect_named(dummy_cols(sort_order_example), c("numbers", "month", "month_February", "month_January", "month_March", "month_July", "month_June", "month_May", "month_April", "month_August", "month_October", "month_September", "month_December", "month_November")) expect_named(dummy_cols(c("a", "b")), c(".data", ".data_a", ".data_b")) expect_named(dummy_cols(1:2), c(".data", 
".data_1", ".data_2")) expect_named(dummy_cols(sort_order_example2), c("numbers", "month", "month_April", "month_August", "month_December", "month_February", "month_January", "month_July", "month_June", "month_March", "month_May", "month_November", "month_October", "month_September")) expect_named(dummy_cols(fastDummies_example), c("numbers", "gender", "animals", "dates", "gender_female", "gender_male", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example2), c("numbers", "gender", "animals", "dates", "gender_female", "gender_male", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example[, "gender", drop = FALSE]), c("gender", "gender_female", "gender_male")) expect_named(dummy_cols(fastDummies_example[, "animals", drop = FALSE]), c("animals", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example2[, "gender", drop = FALSE]), c("gender", "gender_female", "gender_male")) expect_named(dummy_cols(fastDummies_example2[, "animals", drop = FALSE]), c("animals", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example[, "numbers", drop = FALSE]), c("numbers", "numbers_1", "numbers_2", "numbers_3")) }) test_that("The correct dummy columns are made - select_columns", { expect_named(dummy_cols(fastDummies_example[, "gender", drop = FALSE], select_columns = "gender"), c("gender", "gender_female", "gender_male")) expect_named(dummy_cols(fastDummies_example, select_columns = "numbers"), c("numbers", "gender", "animals", "dates", "numbers_1", "numbers_2", "numbers_3")) expect_named(dummy_cols(fastDummies_example[, "animals", drop = FALSE]), c("animals", "animals_cat", "animals_dog")) # animal first in select_columns expect_named(dummy_cols(fastDummies_example, select_columns = c("animals", "gender")), c("numbers", "gender", "animals", "dates", "animals_cat", "animals_dog", "gender_female", "gender_male")) # gender first in select_columns expect_named(dummy_cols(fastDummies_example, select_columns = 
c("gender", "animals")), c("numbers", "gender", "animals", "dates", "gender_female", "gender_male", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example, select_columns = "animals"), c("numbers", "gender", "animals", "dates", "animals_cat", "animals_dog")) expect_named(dummy_cols(fastDummies_example, select_columns = "gender"), c("numbers", "gender", "animals", "dates", "gender_female", "gender_male")) expect_named(dummy_cols(fastDummies_example, select_columns = c("gender", "numbers")), c("numbers", "gender", "animals", "dates", "gender_female", "gender_male", "numbers_1", "numbers_2", "numbers_3")) }) test_that("Remove first dummy leads to proper dummy columns being made", { expect_named(dummy_cols(fastDummies_example[, "gender", drop = FALSE], remove_first_dummy = TRUE), c("gender", "gender_male")) expect_named(dummy_cols(fastDummies_example[, "numbers", drop = FALSE], remove_first_dummy = TRUE), c("numbers", "numbers_2", "numbers_3")) expect_named(dummy_cols(fastDummies_example[, "animals", drop = FALSE], remove_first_dummy = TRUE), c("animals", "animals_dog")) expect_named(dummy_cols(fastDummies_example, remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "gender_male", "animals_dog")) expect_named(dummy_cols(fastDummies_example, select_columns = c("gender", "animals"), remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "gender_male", "animals_dog")) expect_named(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "gender_male")) expect_named(dummy_cols(fastDummies_example, select_columns = "animals", remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "animals_dog")) expect_named(dummy_cols(fastDummies_example, select_columns = "numbers", remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "numbers_2", "numbers_3")) expect_named(dummy_cols(fastDummies_example, select_columns = 
c("animals", "numbers"), remove_first_dummy = TRUE), c("numbers", "gender", "animals", "dates", "animals_dog", "numbers_2", "numbers_3")) }) test_that("remove_most_frequent_dummy works", { expect_named(dummy_cols(.data = data.frame(X = as.factor(c("a", "b", "b", "c", "c"))), remove_most_frequent_dummy = TRUE), c("X", "X_a", "X_c")) expect_named(dummy_cols(most_frequent, remove_most_frequent_dummy = TRUE), c("animal", "day", "hour", "animal_cat", "animal_dog", "day_friday", "day_monday", "day_saturday", "day_tuesday")) expect_named(dummy_cols(most_frequent, select_columns = c("animal", "day"), remove_most_frequent_dummy = TRUE), c("animal", "day", "hour", "animal_cat", "animal_dog", "day_friday", "day_monday", "day_saturday", "day_tuesday")) expect_named(dummy_cols(most_frequent, select_columns = "animal", remove_most_frequent_dummy = TRUE), c("animal", "day", "hour", "animal_cat", "animal_dog")) expect_named(dummy_cols(most_frequent, select_columns = "day", remove_most_frequent_dummy = TRUE), c("animal", "day", "hour", "day_friday", "day_monday", "day_saturday", "day_tuesday")) expect_named(dummy_cols(most_frequent, select_columns = "hour", remove_most_frequent_dummy = TRUE), c("animal", "day", "hour", "hour_2", "hour_3", "hour_4", "hour_5", "hour_6")) }) test_that("remove_selected_columns works", { expect_named(dummy_cols(most_frequent, remove_selected_columns = TRUE), c("hour", "animal_cat", "animal_dog", "animal_gorilla", "day_friday", "day_monday", "day_saturday", "day_tuesday", "day_wednesday")) expect_named(dummy_cols(most_frequent, select_columns = c("animal", "day"), remove_selected_columns = TRUE), c("hour", "animal_cat", "animal_dog", "animal_gorilla", "day_friday", "day_monday", "day_saturday", "day_tuesday", "day_wednesday")) expect_named(dummy_cols(most_frequent, select_columns = "animal", remove_selected_columns = TRUE), c("day", "hour", "animal_cat", "animal_dog", "animal_gorilla")) expect_named(dummy_cols(most_frequent, select_columns = "day", 
remove_selected_columns = TRUE), c("animal", "hour", "day_friday", "day_monday", "day_saturday", "day_tuesday", "day_wednesday")) expect_named(dummy_cols(most_frequent, select_columns = "hour", remove_selected_columns = TRUE), c( "animal", "day", "hour_1", "hour_2", "hour_3", "hour_4", "hour_5", "hour_6")) }) fastDummies/tests/testthat/test-columns-type.R 0000644 0001762 0000144 00000003065 13760474162 021312 0 ustar ligges users context("Original Columns don't change type") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) fastDummies_example_character <- fastDummies_example fastDummies_example_character$animals <- as.character(fastDummies_example_character$animals) test_that("Original columns keep same type", { expect_is(dummy_cols(fastDummies_example)$numbers, "integer") expect_is(dummy_cols(fastDummies_example)$animals, "factor") expect_is(dummy_cols(fastDummies_example_character)$animals, "character") expect_is(dummy_cols(fastDummies_example)$dates, "Date") expect_is(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$numbers, "integer") expect_is(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$animals, "factor") expect_is(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)$dates, "Date") expect_is(dummy_cols(c("a", "b", "c"))$.data, "character") expect_is(dummy_cols(c(1.1, 1.2, 1.3))$.data, "numeric") }) test_that("New columns are integer", { expect_is(dummy_cols(fastDummies_example)$gender_male, "integer") expect_is(dummy_cols(fastDummies_example, select_columns = "numbers")$numbers_1, "integer") expect_is(dummy_cols(fastDummies_example, select_columns = "dates")[, "dates_2012-01-01"], "integer") expect_is(dummy_cols(c("a", "b", "c"))$.data_a, "integer") expect_is(dummy_cols(c(1.1, 1.2, 1.3))$.data_1.1, "integer") }) fastDummies/tests/testthat/test-ignore_na.R 0000644 0001762 0000144 00000005711 14444355170 020611 0 ustar ligges users context("test-ignore_na") na_test <- data.frame(numbers = 1:5, animals 
= c("cat", "dog", NA, "dog", NA), stringsAsFactors = FALSE) most_frequent <- data.frame(animal = c("dog", NA, "cat", NA, "gorilla", "gorilla"), day = c("monday", "tuesday", "wednesday", "wednesday", "friday", "saturday"), hour = 1:6) test_that("ignore-na parameter works", { expect_named(dummy_cols(na_test), c("numbers", "animals", "animals_cat", "animals_dog", "animals_NA")) expect_named(dummy_cols(na_test, ignore_na = TRUE), c("numbers", "animals", "animals_cat", "animals_dog")) expect_named(dummy_cols(most_frequent, select_columns = "animal"), c("animal", "day", "hour", "animal_cat", "animal_dog", "animal_gorilla", "animal_NA")) expect_named(dummy_cols(most_frequent, select_columns = "animal", ignore_na = TRUE), c("animal", "day", "hour", "animal_cat", "animal_dog", "animal_gorilla")) expect_equal(dummy_cols(most_frequent, select_columns = "animal")$animal_cat, c(0, NA, 1, NA ,0 ,0)) expect_equal(dummy_cols(most_frequent, ignore_na = TRUE)$animal_cat, c(0, NA, 1, NA ,0 ,0)) expect_equal(dummy_cols(most_frequent, select_columns = "animal")$animal_gorilla, c(0, NA, 0, NA, 1, 1)) expect_equal(dummy_cols(most_frequent, ignore_na = TRUE)$animal_gorilla, c(0, NA, 0, NA, 1, 1)) expect_equal(dummy_cols(most_frequent, select_columns = "animal")$animal_NA, c(0, 1, 0, 1, 0, 0)) expect_equal(dummy_cols(na_test)$animals_cat, c(1, 0, NA, 0, NA)) expect_equal(dummy_cols(na_test)$animals_dog, c(0, 1, NA, 1, NA)) expect_equal(dummy_cols(na_test)$animals_NA, c(0, 0, 1, 0, 1)) expect_equal(dummy_cols(na_test, ignore_na = TRUE)$animals_cat, c(1, 0, NA, 0, NA)) expect_equal(dummy_cols(na_test, ignore_na = TRUE)$animals_dog, c(0, 1, NA, 1, NA)) }) fastDummies/tests/testthat/test-columns-dimensions.R 0000644 0001762 0000144 00000022643 14742477061 022506 0 ustar ligges users context("dummy_cols returns proper dimensions") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("dummy_cols returns same number of rows as inputted", { 
expect_equal(nrow(dummy_cols(no_dummies_needed)), nrow(no_dummies_needed)) expect_equal(nrow(dummy_cols(crime)), nrow(crime)) expect_equal(nrow(dummy_cols(fastDummies_example)), nrow(fastDummies_example)) # With remove_first_dummy = TRUE expect_equal(nrow(dummy_cols(no_dummies_needed, remove_first_dummy = TRUE)), nrow(no_dummies_needed)) expect_equal(nrow(dummy_cols(crime, remove_first_dummy = TRUE)), nrow(crime)) expect_equal(nrow(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)), nrow(fastDummies_example)) # With select_columns expect_equal(nrow(dummy_cols(no_dummies_needed, select_columns = "animals")), nrow(no_dummies_needed)) expect_equal(nrow(dummy_cols(crime, select_columns = "city")), nrow(crime)) expect_equal(nrow(dummy_cols(crime, select_columns = "crime")), nrow(crime)) expect_equal(nrow(dummy_cols(crime, select_columns = c("crime", "city"))), nrow(crime)) expect_equal(nrow(dummy_cols(fastDummies_example, select_columns = c("numbers", "dates"))), nrow(fastDummies_example)) expect_equal(nrow(dummy_cols(fastDummies_example, select_columns = c("numbers", "gender"))), nrow(fastDummies_example)) expect_equal(nrow(dummy_cols(fastDummies_example, select_columns = "dates")), nrow(fastDummies_example)) }) test_that("dummy_cols returns same number of rows as inputted - vector", { expect_equal(nrow(dummy_cols(fastDummies_example$gender)), length(fastDummies_example$gender)) expect_equal(nrow(dummy_cols(fastDummies_example$numbers)), length(fastDummies_example$numbers)) expect_equal(nrow(dummy_cols(fastDummies_example$dates)), length(fastDummies_example$dates)) expect_equal(nrow(dummy_cols(1:100)), 100) expect_equal(nrow(dummy_cols(fastDummies_example[, "gender", drop = FALSE])), nrow(fastDummies_example[, "gender", drop = FALSE])) expect_equal(nrow(dummy_cols(fastDummies_example[, "numbers", drop = FALSE])), nrow(fastDummies_example[, "numbers", drop = FALSE])) expect_equal(nrow(dummy_cols(fastDummies_example[, "dates", drop = FALSE])), 
nrow(fastDummies_example[, "dates", drop = FALSE])) }) test_that("dummy_cols returns expected number of columns", { expect_equal(ncol(dummy_cols(no_dummies_needed)), 6) expect_equal(ncol(dummy_cols(no_dummies_needed, remove_first_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(no_dummies_needed, select_columns = "animals")), 4) expect_equal(ncol(dummy_cols(no_dummies_needed, select_columns = "animals", remove_first_dummy = TRUE)), 3) expect_equal(ncol(dummy_cols(crime)), 5) expect_equal(ncol(dummy_cols(crime, remove_first_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "city")), 5) expect_equal(ncol(dummy_cols(crime, select_columns = "city", remove_first_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "year")), 5) expect_equal(ncol(dummy_cols(crime, select_columns = "year", remove_first_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = c("city", "year"))), 7) expect_equal(ncol(dummy_cols(crime, select_columns = c("city", "year"), remove_first_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example)), 8) expect_equal(ncol(dummy_cols(fastDummies_example, remove_first_dummy = TRUE)), 6) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "gender")), 6) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "gender", remove_first_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "dates")), 6) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "dates", remove_first_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = c("dates", "gender", "numbers"))), 11) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = c("dates", "gender", "numbers"), remove_first_dummy = TRUE)), 8) }) test_that("returns expected number of columns - remove most common", { expect_equal(ncol(dummy_cols(no_dummies_needed, remove_most_frequent_dummy = TRUE)), 4) 
expect_equal(ncol(dummy_cols(no_dummies_needed, select_columns = "animals", remove_most_frequent_dummy = TRUE)), 3) expect_equal(ncol(dummy_cols(crime, remove_most_frequent_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "city", remove_most_frequent_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "year", remove_most_frequent_dummy = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = c("city", "year"), remove_most_frequent_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, remove_most_frequent_dummy = TRUE)), 6) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "gender", remove_most_frequent_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "dates", remove_most_frequent_dummy = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = c("dates", "gender", "numbers"), remove_most_frequent_dummy = TRUE)), 8) }) test_that("returns expected number of columns - remove selected columns", { expect_equal(ncol(dummy_cols(no_dummies_needed, remove_selected_columns = TRUE)), 4) expect_equal(ncol(dummy_cols(no_dummies_needed, select_columns = "animals", remove_selected_columns = TRUE)), 3) expect_equal(ncol(dummy_cols(crime, remove_selected_columns = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "city", remove_selected_columns = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = "year", remove_selected_columns = TRUE)), 4) expect_equal(ncol(dummy_cols(crime, select_columns = c("city", "year"), remove_selected_columns = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, remove_selected_columns = TRUE)), 6) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "gender", remove_selected_columns = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, select_columns = "dates", remove_selected_columns = TRUE)), 5) expect_equal(ncol(dummy_cols(fastDummies_example, 
select_columns = c("dates", "gender", "numbers"), remove_selected_columns = TRUE)), 8) }) test_that("dummy_cols returns expected number of columns - vector ", { expect_equal(ncol(dummy_cols(fastDummies_example$numbers)), 4) expect_equal(ncol(dummy_cols(fastDummies_example$animals)), 3) expect_equal(ncol(dummy_cols(fastDummies_example$dates)), 3) expect_equal(ncol(dummy_cols(fastDummies_example[, "gender", drop = FALSE])), 3) expect_equal(ncol(dummy_cols(fastDummies_example[, "numbers", drop = FALSE])), 4) expect_equal(ncol(dummy_cols(fastDummies_example[, "dates", drop = FALSE])), 3) expect_equal(ncol(dummy_cols(1:100)), 101) }) fastDummies/tests/testthat/test-rows-type.R 0000644 0001762 0000144 00000003062 13223024363 020605 0 ustar ligges users context("Columns don't change type") load(system.file("testdata", "fastDummies_data.rda", package = "fastDummies")) test_that("Columns keep same type", { expect_is(dummy_rows(no_dummies_needed)$animals, "factor") expect_is(dummy_rows(no_dummies_needed, dummy_indicator = TRUE)$animals, "factor") expect_is(dummy_rows(no_dummies_needed, select_columns = "animals", dummy_indicator = TRUE)$animals, "factor") expect_is(dummy_rows(crime)$city, "factor") expect_is(dummy_rows(crime)$year, "numeric") expect_is(dummy_rows(crime)$crime, "integer") expect_is(dummy_rows(crime, dummy_indicator = TRUE)$city, "factor") expect_is(dummy_rows(crime, dummy_indicator = TRUE)$dummy_indicator, "integer") expect_is(dummy_rows(crime, dummy_indicator = TRUE)$year, "numeric") expect_is(dummy_rows(crime, select_columns = "city", dummy_indicator = TRUE)$city, "factor") expect_is(dummy_rows(crime, select_columns = "year", dummy_indicator = TRUE)$year, "numeric") expect_is(dummy_rows(fastDummies_example)$dates, "Date") expect_is(dummy_rows(fastDummies_example, select_columns = "dates")$dates, "Date") expect_is(dummy_rows(fastDummies_example, select_columns = "dates", dummy_indicator = TRUE)$dates, "Date") }) 
fastDummies/tests/testthat/test-columns-split.R 0000644 0001762 0000144 00000006073 13630563351 021461 0 ustar ligges users
# Fixture with a single multi-valued column separated by ";".
ID <- 1:4
pets <- c("dog", "cat;dog;mouse", "dog;mouse", "cat")
df <- data.frame("ID" = ID, "pets" = pets, stringsAsFactors = FALSE)

# Fixture with two multi-valued columns separated by ", ".
split_test <- data.frame(
  Theory = c(
    "Behaviourism",
    "Behaviourism, Cognitive",
    "Behaviourism, Gestalt",
    "Behaviourism, Psychodynamic",
    "Behaviourism, Psychodynamic, Cognitive"
  ),
  Format = c(
    "16mm",
    "16mm, 35mm",
    "16mm, 35mm, VHS",
    "16mm, 35mm, VHS",
    "35mm, VHS"
  )
)

test_that("split parameter works", {
  pets_dummies <- dummy_cols(df, split = ";")
  expect_named(
    pets_dummies,
    c("ID", "pets", "pets_cat", "pets_dog", "pets_mouse")
  )
  expect_equal(pets_dummies$pets_dog, c(1, 1, 1, 0))
  expect_equal(pets_dummies$pets_cat, c(0, 1, 0, 1))
  expect_equal(pets_dummies$pets_mouse, c(0, 1, 1, 0))

  # The same results are expected whether or not the separator carries the
  # trailing space, so both separators are checked against one table.
  separators <- c(",", ", ")
  expected_names <- c(
    "Theory", "Format",
    "Theory_Behaviourism", "Theory_Cognitive",
    "Theory_Gestalt", "Theory_Psychodynamic",
    "Format_16mm", "Format_35mm", "Format_VHS"
  )
  expected_dummies <- list(
    Theory_Behaviourism = c(1, 1, 1, 1, 1),
    Theory_Cognitive = c(0, 1, 0, 0, 1),
    Theory_Gestalt = c(0, 0, 1, 0, 0),
    Theory_Psychodynamic = c(0, 0, 0, 1, 1),
    Format_16mm = c(1, 1, 1, 1, 0),
    Format_35mm = c(0, 1, 1, 1, 1),
    Format_VHS = c(0, 0, 1, 1, 1)
  )

  for (separator in separators) {
    expect_named(dummy_cols(split_test, split = separator), expected_names)
  }
  for (separator in separators) {
    split_dummies <- dummy_cols(split_test, split = separator)
    for (dummy_name in names(expected_dummies)) {
      expect_equal(split_dummies[[dummy_name]], expected_dummies[[dummy_name]])
    }
  }
})
fastDummies/tests/testthat/test-omit-colname-prefix.R 0000644 0001762 0000144 00000002772 14126414510 022522 0 ustar ligges users
# Hand-built tibble (class set directly) with three columns to recode.
sample_data <- structure(
  list(
    colA = c("a", "a", "a", "b", "b", "c", "c", "c", "c", "c"),
    colB = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3),
    colC = c(
      "val1", "val2", "val3", "val1", "val2",
      "val7", "val2", "val4", "val6", "val8"
    )
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)

test_that("omit_colname_prefix works", {
  recoded <- dummy_cols(
    sample_data,
    c("colC"),
    remove_selected_columns = TRUE,
    omit_colname_prefix = TRUE
  )
  expect_named(
    recoded,
    c("colA", "colB", "val1", "val2", "val3", "val4", "val6", "val7", "val8")
  )
})

test_that("omit_colname_prefix does not remove prefix when >1 select_columns", {
  recoded <- dummy_cols(
    sample_data,
    c("colB", "colC"),
    remove_selected_columns = TRUE,
    omit_colname_prefix = TRUE
  )
  expect_named(
    recoded,
    c(
      "colA",
      "colB_1", "colB_2", "colB_3",
      "colC_val1", "colC_val2", "colC_val3", "colC_val4",
      "colC_val6", "colC_val7", "colC_val8"
    )
  )
})
fastDummies/tests/testthat.R 0000644 0001762 0000144 00000000106 13220107164 015651 0 ustar ligges users
library(testthat)
library(fastDummies)

test_check("fastDummies")
fastDummies/tests/spelling.R 0000644 0001762 0000144 00000000244 13417630134 015637 0 ustar ligges users
# Spell check runs only when the optional 'spelling' package is installed.
if (requireNamespace('spelling', quietly = TRUE)) {
  spelling::spell_check_test(
    vignettes = TRUE,
    error = FALSE,
    skip_on_cran = TRUE
  )
}
fastDummies/MD5 0000644 0001762 0000144 00000004474 14743531446 013067 0 ustar ligges users
31417c5fbbbcce4b9ce480aa4b581b65 *DESCRIPTION
731c092528258cd0a1d1af148b130fcc *LICENSE c3b8266565c53588af78f606e2f6c1a1 *NAMESPACE d607cbb2e25e99ed1e5f1e335055117d *NEWS.md aba2eb8865729e8864428e0e15b6c4f1 *R/dummy_cols.R c58e1e78e303aaf419a13f12374e5789 *R/dummy_rows.R 4e3eb4a9139c11b91db8884466ec259b *R/utils.R f51f8279e520506688f390effd9cab98 *build/vignette.rds 3a5798b82ecdee3e46b14bdcdb16e394 *inst/WORDLIST 65b93c57978617cb3eb27b24c025209c *inst/doc/making-dummy-rows.R 4127b091a7465de103eb985fbf8cc9c1 *inst/doc/making-dummy-rows.Rmd 00c3ccbf572c6b1a1342f6f59645351e *inst/doc/making-dummy-rows.html 86fcbcdfaade0cc21c020d749e72a5d9 *inst/doc/making-dummy-variables.R 436801a7e4f2ccbedfe4d549ffd92953 *inst/doc/making-dummy-variables.Rmd 2233f4cec56968e1b7697c39967fbbcf *inst/doc/making-dummy-variables.html c892a894a1799ef27acc7c68cb7524c0 *inst/testdata/fastDummies_data.rda 7fcf4ee915bc796243a577484e427f7a *inst/testdata/makes_test_data.R f434a82fe746b49c4a7e1cabc2d21392 *man/dummy_cols.Rd eaa7b74c18b58e0591e27789974163ff *man/dummy_columns.Rd 7aa6228ca30619a91d4f58d35144a6d9 *man/dummy_rows.Rd 332173f20d20942c02019350ea4dcadc *tests/spelling.R d3109c4c8489c587f56bd1073b0fd985 *tests/testthat.R 112317ef16c3018fc45ef0758a6bf32a *tests/testthat/test-columns-dimensions.R 0b5f9c1a3d23180271616aebd958be3c *tests/testthat/test-columns-split.R dc77896c943767aa2239caae004998c7 *tests/testthat/test-columns-type.R 1b276bfac55e88ec146fbc73b7354f56 *tests/testthat/test-columns-value-order.R be51abdf8c4f52bae204d00f1b80b982 *tests/testthat/test-columns-warnings-errors.R 0dcc393e6e5c5b9bf09f92f01e68b42e *tests/testthat/test-columns.R b00ba7cd33b5da5b37691ce5a88ee4fe *tests/testthat/test-ignore_na.R f2cc65a610485f44d3d1beab40b929a7 *tests/testthat/test-omit-colname-prefix.R 5d92bb1b05a42673a36704a22cd053d4 *tests/testthat/test-return-type.R a81d22fe189de37ede8af3324172dbbe *tests/testthat/test-rows-dimensions.R fa8dea1dd629feb4cf7135ebb8a42de5 *tests/testthat/test-rows-indicator-values.R 0a5982a4ede02c780ce8c7a321fd993b 
*tests/testthat/test-rows-right-values.R a1ab5ae1ae0e4420364c82cb32d1f896 *tests/testthat/test-rows-type.R 1a155f7f426ef6b9ec024199120df181 *tests/testthat/test-rows-warnings-errors.R 4127b091a7465de103eb985fbf8cc9c1 *vignettes/making-dummy-rows.Rmd 436801a7e4f2ccbedfe4d549ffd92953 *vignettes/making-dummy-variables.Rmd fastDummies/R/ 0000755 0001762 0000144 00000000000 14742470001 012733 5 ustar ligges users fastDummies/R/dummy_cols.R 0000644 0001762 0000144 00000024335 14742477224 015256 0 ustar ligges users #' Fast creation of dummy variables #' #' Quickly create dummy (binary) columns from character and #' factor type columns in the inputted data (and numeric columns if specified.) #' This function is useful for statistical analysis when you want binary #' columns rather than character columns. #' #' @family dummy functions #' @seealso \code{\link{dummy_rows}} For creating dummy rows #' #' @param .data #' An object with the data set you want to make dummy columns from. #' @param select_columns #' Vector of column names that you want to create dummy variables from. #' If NULL (default), uses all character and factor columns. #' @param remove_first_dummy #' Removes the first dummy of every variable such that only n-1 dummies remain. #' This avoids multicollinearity issues in models. #' @param remove_most_frequent_dummy #' Removes the most frequently observed category such that only n-1 dummies #' remain. If there is a tie for most frequent, will remove the first #' (by alphabetical order) category that is tied for most frequent. #' @param ignore_na #' If TRUE, ignores any NA values in the column. If FALSE (default), then it #' will make a dummy column for value_NA and give a 1 in any row which has a #' NA value. #' @param split #' A string to split a column when multiple categories are in the cell. For #' example, if a variable is Pets and the rows are "cat", "dog", and "turtle", #' each of these pets would become its own dummy column. 
#' If one row is "cat, dog",
#' then with a split value of "," this row would have a value of 1 for both
#' the cat and dog dummy columns.
#' @param remove_selected_columns
#' If TRUE (not default), removes the columns used to generate the dummy columns.
#' @param omit_colname_prefix
#' If TRUE (not default) and `length(select_columns) == 1`, omit pre-pending the
#' name of `select_columns` to the names of the newly generated dummy columns
#'
#' @param return_generated_variables
#' If TRUE (not default), returns a vector of the names of the variables that
#' would be generated. Does not modify the inputted data.
#'
#' @return
#' A data.frame (or tibble or data.table, depending on input data type) with
#' same number of rows as inputted data and original columns plus the newly
#' created dummy columns.
#' @export
#' @examples
#' crime <- data.frame(
#'   city = c("SF", "SF", "NYC"),
#'   year = c(1990, 2000, 1990),
#'   crime = 1:3
#' )
#' dummy_cols(crime)
#' # Include year column
#' dummy_cols(crime, select_columns = c("city", "year"))
#' # Remove first dummy for each pair of dummy columns made
#' dummy_cols(crime,
#'   select_columns = c("city", "year"),
#'   remove_first_dummy = TRUE
#' )
dummy_cols <- function(.data,
                       select_columns = NULL,
                       remove_first_dummy = FALSE,
                       remove_most_frequent_dummy = FALSE,
                       ignore_na = FALSE,
                       split = NULL,
                       remove_selected_columns = FALSE,
                       omit_colname_prefix = FALSE,
                       return_generated_variables = FALSE) {
  stopifnot(
    is.null(select_columns) || is.character(select_columns),
    select_columns != "",
    is.logical(remove_first_dummy), length(remove_first_dummy) == 1,
    is.logical(remove_selected_columns)
  )

  if (remove_first_dummy && remove_most_frequent_dummy) {
    stop("Select either 'remove_first_dummy' or 'remove_most_frequent_dummy' to proceed.")
  }

  # A bare vector is wrapped into a one-column data.frame named ".data".
  if (is.vector(.data)) {
    .data <- data.frame(.data = .data, stringsAsFactors = FALSE)
  }

  # Remember the input flavour so the result can be converted back at the end.
  data_type <- check_type(.data)

  if (!data.table::is.data.table(.data)) {
    .data <- data.table::as.data.table(.data)
  }

  # Captured after the conversions above so that, for vector input, the
  # wrapper column itself is not reported as a generated variable.
  old_columns <- names(.data)

  # Grabs column names that are character or factor class -------------------
  if (!is.null(select_columns)) {
    in_data <- select_columns %in% names(.data)
    cols_not_in_data <- select_columns[!in_data]
    char_cols <- select_columns[in_data]

    if (length(char_cols) == 0) {
      stop("select_columns is/are not in data. Please check data and spelling.")
    }
  } else if (ncol(.data) == 1) {
    char_cols <- names(.data)
  } else {
    # vapply() is well defined for zero columns, so empty input reaches the
    # "No character or factor columns found" error below instead of failing
    # on an out-of-bounds index.
    is_categorical <- vapply(
      .data,
      function(column) {
        any(class(column) %in% c("character", "factor", "ordered"))
      },
      logical(1)
    )
    char_cols <- names(.data)[is_categorical]
  }

  if (length(char_cols) == 0 && is.null(select_columns)) {
    stop(paste0(
      "No character or factor columns found. ",
      "Please use select_columns to choose columns."
    ))
  }

  if (!is.null(select_columns) && length(cols_not_in_data) > 0) {
    # cols_not_in_data is an unnamed character vector, so the values
    # themselves (not their names) are what must be reported.
    warning(
      paste0(
        "NOTE: The following select_columns input(s) ",
        "is not a column in data.\n"
      ),
      paste0(cols_not_in_data, collapse = "\t")
    )
  }

  for (col_name in char_cols) {
    col_values <- .data[[col_name]]
    col_chr <- as.character(col_values)
    na_rows <- which(is.na(col_values))

    # If factor type, order by assigned levels
    if (is.factor(col_values)) {
      unique_vals <- levels(col_values)
      if (length(na_rows) > 0) {
        unique_vals <- c(unique_vals, NA)
      }
      # Else by alphabetical order.
    } else {
      unique_vals <- stringr::str_sort(unique(col_values),
        na_last = TRUE,
        locale = "en_US",
        numeric = TRUE
      )
    }
    unique_vals <- as.character(unique_vals)

    # If there is a split value, splits up the unique_vals by that value
    # and keeps only the unique ones.
    if (!is.null(split)) {
      unique_vals <- unique(trimws(unlist(strsplit(unique_vals, split = split))))
    }

    if (ignore_na) {
      unique_vals <- unique_vals[!is.na(unique_vals)]
    }

    if (remove_most_frequent_dummy) {
      # This branch rebuilds unique_vals from a frequency table of the raw
      # column, replacing whatever was computed above.
      vals <- as.character(.data[[col_name]])
      vals <- data.frame(sort(table(vals), decreasing = TRUE),
        stringsAsFactors = FALSE
      )
      # If there is a actual most frequent value, drop that value. Else,
      # if there is a tie, drop the one that's first alphabetically.
      top_vals <- vals[vals$Freq %in% max(vals$Freq), ]
      other_vals <- vals$vals[!vals$Freq %in% max(vals$Freq)]
      other_vals <- as.character(other_vals)
      top_vals <- top_vals[stringr::str_order(top_vals$vals,
        na_last = TRUE,
        locale = "en_US",
        numeric = TRUE
      ), ]
      if (nrow(top_vals) == 1) {
        top_vals <- NULL
      } else {
        top_vals <- as.character(top_vals$vals[2:nrow(top_vals)])
      }

      unique_vals <- c(top_vals, other_vals)
      unique_vals <- stringr::str_sort(unique_vals,
        na_last = TRUE,
        locale = "en_US",
        numeric = TRUE
      )
    }

    if (remove_first_dummy) {
      unique_vals <- unique_vals[-1]
    }

    # Nothing left to encode for this column. Without this guard paste0()
    # recycles the empty vector to "" and a junk "<col_name>_" column of
    # zeros would be created.
    if (length(unique_vals) == 0) {
      next
    }

    dummy_names <- paste0(col_name, "_", unique_vals)
    data.table::alloc.col(.data, ncol(.data) + length(unique_vals))
    .data[, dummy_names] <- 0L

    # Split every cell once per column rather than once per value/position.
    if (!is.null(split)) {
      split_pieces <- lapply(strsplit(col_chr, split = split), trimws)
    }

    for (value_index in seq_along(unique_vals)) {
      unique_value <- unique_vals[value_index]
      dummy_name <- dummy_names[value_index]

      data.table::set(.data,
        i = which(data.table::chmatch(col_chr, unique_value, nomatch = 0) == 1L),
        j = dummy_name,
        value = 1L
      )

      # Sets NA values to NA, only for columns that are not the NA columns
      if (!is.na(unique_value)) {
        data.table::set(.data,
          i = na_rows,
          j = dummy_name,
          value = NA
        )
      }

      if (!is.null(split)) {
        # A row gets a 1 when any of its trimmed pieces equals the value.
        # An NA cell splits to a single NA piece, so the NA dummy is flagged
        # only on NA rows and stays an integer column.
        has_value <- vapply(
          split_pieces,
          function(pieces) unique_value %in% pieces,
          logical(1)
        )
        data.table::set(.data,
          i = which(has_value),
          j = dummy_name,
          value = 1L
        )
      }
    }
  }

  if (remove_selected_columns) {
    # NOTE(review): single-index subsetting is treated as column selection
    # here (data.frame semantics); confirm this still holds if the package
    # ever becomes data.table-aware.
    .data <- .data[-which(names(.data) %in% char_cols)]
  }

  .data <- fix_data_type(.data, data_type)

  if (omit_colname_prefix) {
    if (length(select_columns) == 1) {
      if (length(unique_vals) > 0) {
        # Match the generated names exactly and strip only the leading
        # prefix, so values containing regex metacharacters and unrelated
        # columns that merely contain the prefix text are handled correctly.
        prefix <- paste0(select_columns, "_")
        new_col_index <- names(.data) %in% paste0(prefix, unique_vals)
        names(.data)[new_col_index] <- substring(
          names(.data)[new_col_index],
          nchar(prefix) + 1L
        )
      }
    } else {
      message("Can't omit the colname prefix when recoding more than one column.")
      message("Returning prefixed dummy columns.")
    }
  }

  if (return_generated_variables) {
    new_columns <- names(.data)
    return(new_columns[!new_columns %in% old_columns])
  }

  .data
}

#' Fast creation of dummy variables
#'
#' dummy_columns() quickly creates dummy (binary) columns from character and
#' factor type columns in the inputted data. This function is useful for
#' statistical analysis when you want binary columns rather than
#' character columns.
# Helpers used to remember the container class of the user's input and to
# convert the working object back to that class before returning it.

# Classify `.data` as one of three tags: "is_data_table", "is_tibble" or
# "is_data_frame". The data.table test runs first, then the tibble test;
# anything else is tagged as a data.frame.
check_type <- function(.data) {
  if (data.table::is.data.table(.data)) {
    return("is_data_table")
  }
  if (tibble::is_tibble(.data)) {
    return("is_tibble")
  }
  "is_data_frame"
}

# Convert `.data` to the container named by `data_type` (a tag produced by
# check_type()). "is_data_frame" and "is_tibble" trigger a conversion; any
# other tag returns `.data` untouched.
fix_data_type <- function(.data, data_type) {
  if (data_type == "is_data_frame") {
    return(as.data.frame(.data, stringsAsFactors = FALSE))
  }
  if (data_type == "is_tibble") {
    return(tibble::as_tibble(.data))
  }
  .data
}
#' @param dummy_value
#' Value of the row for columns that are not selected.
#' Default is a value of NA.
#' @param dummy_indicator
#' Adds binary column to say if row is dummy or not (i.e. included in
#' original data or not)
#'
#' @return
#' A data.frame (or tibble or data.table, depending on input data type) with
#' same number of columns as inputted data and original rows plus the newly
#' created dummy rows
#' @export
#' @examples
#' crime <- data.frame(city = c("SF", "SF", "NYC"),
#'     year = c(1990, 2000, 1990),
#'     crime = 1:3)
#'
#' dummy_rows(crime)
#' # Include year column
#' dummy_rows(crime, select_columns = c("city", "year"))
#' # Make dummy value 0
#' dummy_rows(crime, select_columns = c("city", "year"),
#'     dummy_value = 0)
#' # Add a dummy indicator
#' dummy_rows(crime, select_columns = c("city", "year"),
#'     dummy_indicator = TRUE)
dummy_rows <- function(.data,
                       select_columns = NULL,
                       dummy_value = NA,
                       dummy_indicator = FALSE) {

  stopifnot(is.null(select_columns) || is.character(select_columns),
            select_columns != "",
            is.logical(dummy_indicator),
            length(dummy_indicator) == 1,
            length(dummy_value) == 1)

  if (is.atomic(.data) || ncol(.data) == 1) {
    stop("Cannot make dummy rows of a vector of one column data.frame/table.")
  }

  # Remember the input container so the result can be converted back to it.
  data_type <- check_type(.data)
  if (!data.table::is.data.table(.data)) {
    .data <- data.table::as.data.table(.data)
  }

  # Picks the columns whose values define the combinations ------------------
  if (is.null(select_columns)) {
    is_category_col <- vapply(
      .data,
      function(column) {
        inherits(column, c("character", "factor", "Date", "ordered"))
      },
      logical(1)
    )
    char_cols <- names(.data)[is_category_col]
    if (length(char_cols) == 0) {
      stop("No character, factor, or Date columns found. Please use select_columns")
    }
  } else {
    # A repeated name would be counted twice in the combination total.
    char_cols <- unique(select_columns)
    cols_not_in_data <- char_cols[!char_cols %in% names(.data)]
    if (length(cols_not_in_data) > 0) {
      stop(
        "The following select_columns input(s) is not a column in data: ",
        paste(cols_not_in_data, collapse = ", ")
      )
    }
  }

  other_cols <- names(.data)[!names(.data) %in% char_cols]

  # Finds how many possible combinations of the variables there are.
  # This will be the number of rows in the scratch table.
  n_unique <- vapply(
    char_cols,
    function(col) as.numeric(data.table::uniqueN(.data[[col]])),
    numeric(1)
  )
  total_length <- prod(n_unique)

  # Makes an empty data.table with right # of rows and columns. -------------
  temp_table <- data.table::data.table(matrix(nrow = total_length,
                                              ncol = ncol(.data)))
  data.table::setnames(temp_table, names(.data))

  # Fills in all possible combination rows ----------------------------------
  # Each column gets its unique values cycled down the table; sorting by that
  # column before filling the next one is what makes the cycles line up into
  # every combination.
  for (col in char_cols) {
    data.table::set(temp_table,
                    j = col,
                    value = rep(unique(.data[[col]]),
                                times = total_length / n_unique[[col]]))
    data.table::setorderv(temp_table, col)
  }

  # Fills the non-combination columns with the dummy value ------------------
  for (col in other_cols) {
    data.table::set(temp_table,
                    j = col,
                    value = rep(dummy_value, nrow(temp_table)))
  }

  if (dummy_indicator) {
    # set() adds columns by reference. When the caller passed a data.table,
    # `.data` is still the caller's object, so work on a copy to avoid
    # silently adding a column to it.
    .data <- data.table::copy(.data)
    data.table::set(.data, j = "dummy_indicator", value = 0L)
    data.table::set(temp_table,
                    j = "dummy_indicator",
                    value = rep(1L, nrow(temp_table)))
  }

  # Removes rows that were in original data. --------------------------------
  # Rows are compared through a pasted key. A separator is required so that
  # e.g. ("a", "bc") and ("ab", "c") do not produce the same key, and the
  # columns are unnamed so a column called "sep" or "collapse" is not taken
  # as an argument of paste().
  row_key <- function(dt) {
    key_cols <- unname(as.list(dt[, char_cols, with = FALSE]))
    do.call(paste, c(key_cols, sep = "\x1f"))
  }
  is_new_row <- !(row_key(temp_table) %in% row_key(.data))
  # A lone symbol in `i` is looked up in the calling scope, not among columns.
  temp_table <- temp_table[is_new_row]

  # Stacks new data on old data
  if (nrow(temp_table) > 0) {
    .data <- data.table::rbindlist(list(.data, temp_table),
                                   use.names = TRUE,
                                   fill = TRUE)
  }

  fix_data_type(.data, data_type)
}
One would indicate if the animal is a dog, and the other would indicate if the animal is a cat. Each row would get a value of 1 in the column indicating which animal they are, and 0 in the other column. animals | dog | cat --- | --- | --- dog | 1 | 0 dog | 1 | 0 cat | 0 | 1 In the function dummy_cols, the names of these new columns are concatenated to the original column and separated by an underscore. animals | animals_dog | animals_cat --- | --- | --- dog | 1 | 0 dog | 1 | 0 cat | 0 | 1 With an example like this, it is fairly easy to make the dummy columns yourself. `dummy_cols()` automates the process, and is useful when you have many columns to general dummy variables from or with many categories within the column. ```{r setup, echo=TRUE} fastDummies_example <- data.frame(numbers = 1:3, gender = c("male", "male", "female"), animals = c("dog", "dog", "cat"), dates = as.Date(c("2012-01-01", "2011-12-31", "2012-01-01")), stringsAsFactors = FALSE) knitr::kable(fastDummies_example) ``` The object **fastDummies_example** has two character type columns, one integer column, and a Date column. By default, `dummy_cols()` will make dummy variables from factor or character columns only. This is because in most cases those are the only types of data you want dummy variables from. If those are the only columns you want, then the function takes your data set as the first parameter and returns a data.frame with the newly created variables appended to the end of the original data. ```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example) knitr::kable(results) ``` In some situations, you would want columns with types other than factor and character to generate dummy variables. For example, a column of years would be numeric but could be well-suited for making into dummy variables depending on your analysis. Use the *select_columns* parameter to select specific columns to make dummy variables from. 
```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example, select_columns = "numbers") knitr::kable(results) ``` The final option for `dummy_cols()` is *remove_first_dummy* which by default is FALSE. If TRUE, it removes the first dummy variable created from each column. This is done to avoid multicollinearity in a multiple regression model caused by included all dummy variables. The "first" dummy variable is the one at the top of the rows (i.e. the first value that is not NA). ```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example, remove_first_dummy = TRUE) knitr::kable(results) ``` fastDummies/vignettes/making-dummy-rows.Rmd 0000644 0001762 0000144 00000007223 13223763627 020615 0 ustar ligges users --- title: "Making dummy rows with dummy_rows()" author: "Jacob Kaplan" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Making dummy rows with dummy_rows()} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- When dealing with data, there are often missing rows. While truly handling missing data is far beyond the scope of this package, the function `dummy_rows()` lets you add those missing rows back into the data. The function takes all character, factor, and Date columns, finds all possible combinations of their values, and adds the rows that are not in the original data set. Any columns not used in creating the combinations (e.g. numeric) are given a value of NA (unless otherwise specified with *dummy_value*). Lets start with a simple example. ```{r echo=TRUE} fastDummies_example <- data.frame(numbers = 1:3, gender = c("male", "male", "female"), animals = c("dog", "dog", "cat"), dates = as.Date(c("2012-01-01", "2011-12-31", "2012-01-01")), stringsAsFactors = FALSE) knitr::kable(fastDummies_example) ``` This data set has four columns: two character, one Date, and one numeric. The function by default will use the character and Date columns in creating the combinations. 
First, a small amount of math to explain the combinations. Each column has two distinct values - gender: male & female; animals: dog & cat; dates: 2011-12-31 & 2012-01-01. To find the number of possible combinations, multiply the number of unique values in each column together. 2 \* 2 \* 2 = 8. ```{r echo=TRUE} results <- fastDummies::dummy_rows(fastDummies_example) knitr::kable(results) ``` When we run the function we can see that there are indeed 8 rows possible, and that the 5 rows missing from the original data have been added. To explicitly see which rows are new, set the *dummy_indicator* parameter to TRUE. This provides a column called dummy_indicator with a value of 0 if the row is in the original data and 1 if it was added. ```{r echo=TRUE} results <- fastDummies::dummy_rows(fastDummies_example, dummy_indicator = TRUE) knitr::kable(results) ``` By default, columns not used for making the combinations are given a value of NA in the new rows. You can choose the value given with the parameter *dummy_value*. It takes an input, a string or single number. ```{r echo=TRUE} results1 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = 0) results2 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = "new value") knitr::kable(results1) knitr::kable(results2) ``` The parameter *select_columns* lets you choose which columns to use when making the combinations. It accepts a string or vector of column names. This can come in handy when you want to include a numeric column, such as years, when making the combinations. A new data set will help demonstrate this. This data set shows (imaginary) crime in New York City and San Francisco during 1990 and 2000. The problem is that there is no row for New York City for 2000. We want to add that row. 
```{r echo = TRUE} crime <- data.frame(city = c("SF", "SF", "NYC"), year = c(1990, 2000, 1990), crime = 1:3) knitr::kable(crime) ``` Using the default parameters for `dummy_rows()` doesn't give us what we want since it only selects the city column. We need to select both city and year to get all the combinations we want. ```{r echo=TRUE} results <- fastDummies::dummy_rows(crime, select_columns = c("city", "year")) knitr::kable(results) ``` fastDummies/NAMESPACE 0000644 0001762 0000144 00000000157 14742475434 013773 0 ustar ligges users # Generated by roxygen2: do not edit by hand export(dummy_cols) export(dummy_columns) export(dummy_rows) fastDummies/LICENSE 0000644 0001762 0000144 00000000054 13465035615 013547 0 ustar ligges users YEAR: 2019 COPYRIGHT HOLDER: Jacob Kaplan fastDummies/NEWS.md 0000644 0001762 0000144 00000006027 14742475114 013647 0 ustar ligges users # fastDummies 1.7.5 * Now includes ordered factors in dummy_cols() by default. * Adds parameter `return_generated_variables` which will return a vector of strings with the new column names generated by `dummy_cols()`. # fastDummies 1.7.4 * Remove .onAttach message. # fastDummies 1.7.3 * Fix .onAttach message. # fastDummies 1.7.2 * Adds onattach message. # fastDummies 1.7.0 * Add option to omit colname prefix when only one column is recoded. Thanks to @teofiln for the PR. # fastDummies 1.6.3 * Fix bug where inputting a vector or a one column data.frame returned an issue. Now will convert the vector to a data.frame and return that. Will name each column ".data_". Closes #23 by @Garyf20. * Fix bug where `remove_most_frequent_dummy` wasn't working right when there was a tie for which value was the most frequent. Closes #22 by eden70. # fastDummies 1.6.2 * Dummy columns are now returned in alphabetical order including numeric order (e.g. photos_2 is before photos_11). # fastDummies 1.6.1 * Bug fixes. * Dummy columns are now returned in alphabetical order. 
# fastDummies 1.6.0 * Adds the parameter `remove_selected_columns` to `dummy_columns()`. If TRUE (not default), removes the columns which are used to create the dummy columns. # fastDummies 1.5.0 * Removes `sort_columns` parameter. Now by default will order by level if the variable is a factor type. * Fix bug where `split` parameter didn't work properly. * If value is NA, sets to NA in dummy column rather than value of 0. Closes #18 by @DLustenBerger. # fastDummies 1.4.1 * Fix bug when column is factor type when using `split` parameter. Thanks to Matthew Sigal for submitting issue on GitHub. # fastDummies 1.4.0 * Adds option to ignore NA values in dummy_cols (doesn't make a variable_NA) column is selected. Thanks to juribep5 for the GitHub suggestion. * Adds `split` parameter in dummy_cols to handle if a row has multiple categories. Thanks to Matthew Sigal and Andrew Fernandes for the GitHub suggestion. # fastDummies 1.3.0 * Adds option to sort dummy columns following the order of the original factor variable. Thanks to Patrick Baylis for the pull request with the code for this feature! # fastDummies 1.2.0 * Adds option to exclude the most frequently observed category rather than the first category as is default. Thanks to GitHub user S-UP for the suggestion! # fastDummies 1.1.0 * Thanks to GitHub user yu45020 dummy_cols() is now about >20% faster and much more memory efficient. * Both dummy_cols() and dummy_rows() now return the same data type inputted e.g. data.frame input returns data.frame, tibble returns tibble. * Fix documentation that incorrectly said default value for new dummy rows is 0. It is in fact a value of NA. # fastDummies 1.0.0 * Reduces number of parameter that were in previous version. * Significant speed increases for both dummy_cols() and dummy_rows() functions. * dummy_cols() now accepts numeric columns. 
fastDummies/inst/ 0000755 0001762 0000144 00000000000 14743475713 013527 5 ustar ligges users fastDummies/inst/doc/ 0000755 0001762 0000144 00000000000 14743475713 014274 5 ustar ligges users fastDummies/inst/doc/making-dummy-variables.Rmd 0000644 0001762 0000144 00000010036 13223763615 021276 0 ustar ligges users --- title: "Making dummy variables with dummy_cols()" author: "Jacob Kaplan" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Making dummy variables with dummy_cols()} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Dummy variables (or binary variables) are commonly used in statistical analyses and in more simple descriptive statistics. A dummy column is one which has a value of one when a categorical event occurs and a zero when it doesn't occur. In most cases this is a feature of the event/person/object being described. For example, if the dummy variable was for occupation being an R programmer, you can ask, "is this person an R programmer?" When the answer is yes, they get a value of 1, when it is no, they get a value of 0. We'll start with a simple example and then go into using the function `dummy_cols()`. You can also use the function `dummy_columns()` which is identical to `dummy_cols()`. Imagine you have a data set about animals in a local shelter. One of the columns in your data is what animal it is: dog or cat. ```{r echo=FALSE} knitr::kable(data.frame(animals = c("dog", "dog", "cat"))) ``` To make dummy columns from this data, you would need to produce two new columns. One would indicate if the animal is a dog, and the other would indicate if the animal is a cat. Each row would get a value of 1 in the column indicating which animal they are, and 0 in the other column. animals | dog | cat --- | --- | --- dog | 1 | 0 dog | 1 | 0 cat | 0 | 1 In the function dummy_cols, the names of these new columns are concatenated to the original column and separated by an underscore. 
animals | animals_dog | animals_cat --- | --- | --- dog | 1 | 0 dog | 1 | 0 cat | 0 | 1 With an example like this, it is fairly easy to make the dummy columns yourself. `dummy_cols()` automates the process, and is useful when you have many columns to general dummy variables from or with many categories within the column. ```{r setup, echo=TRUE} fastDummies_example <- data.frame(numbers = 1:3, gender = c("male", "male", "female"), animals = c("dog", "dog", "cat"), dates = as.Date(c("2012-01-01", "2011-12-31", "2012-01-01")), stringsAsFactors = FALSE) knitr::kable(fastDummies_example) ``` The object **fastDummies_example** has two character type columns, one integer column, and a Date column. By default, `dummy_cols()` will make dummy variables from factor or character columns only. This is because in most cases those are the only types of data you want dummy variables from. If those are the only columns you want, then the function takes your data set as the first parameter and returns a data.frame with the newly created variables appended to the end of the original data. ```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example) knitr::kable(results) ``` In some situations, you would want columns with types other than factor and character to generate dummy variables. For example, a column of years would be numeric but could be well-suited for making into dummy variables depending on your analysis. Use the *select_columns* parameter to select specific columns to make dummy variables from. ```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example, select_columns = "numbers") knitr::kable(results) ``` The final option for `dummy_cols()` is *remove_first_dummy* which by default is FALSE. If TRUE, it removes the first dummy variable created from each column. This is done to avoid multicollinearity in a multiple regression model caused by included all dummy variables. The "first" dummy variable is the one at the top of the rows (i.e. 
the first value that is not NA). ```{r echo=TRUE} results <- fastDummies::dummy_cols(fastDummies_example, remove_first_dummy = TRUE) knitr::kable(results) ``` fastDummies/inst/doc/making-dummy-variables.R 0000644 0001762 0000144 00000002176 14743475712 020771 0 ustar ligges users ## ----echo=FALSE--------------------------------------------------------------- knitr::kable(data.frame(animals = c("dog", "dog", "cat"))) ## ----setup, echo=TRUE--------------------------------------------------------- fastDummies_example <- data.frame(numbers = 1:3, gender = c("male", "male", "female"), animals = c("dog", "dog", "cat"), dates = as.Date(c("2012-01-01", "2011-12-31", "2012-01-01")), stringsAsFactors = FALSE) knitr::kable(fastDummies_example) ## ----echo=TRUE---------------------------------------------------------------- results <- fastDummies::dummy_cols(fastDummies_example) knitr::kable(results) ## ----echo=TRUE---------------------------------------------------------------- results <- fastDummies::dummy_cols(fastDummies_example, select_columns = "numbers") knitr::kable(results) ## ----echo=TRUE---------------------------------------------------------------- results <- fastDummies::dummy_cols(fastDummies_example, remove_first_dummy = TRUE) knitr::kable(results) fastDummies/inst/doc/making-dummy-rows.Rmd 0000644 0001762 0000144 00000007223 13223763627 020327 0 ustar ligges users --- title: "Making dummy rows with dummy_rows()" author: "Jacob Kaplan" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Making dummy rows with dummy_rows()} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- When dealing with data, there are often missing rows. While truly handling missing data is far beyond the scope of this package, the function `dummy_rows()` lets you add those missing rows back into the data. 
The function takes all character, factor, and Date columns, finds all possible combinations of their values, and adds the rows that are not in the original data set. Any columns not used in creating the combinations (e.g. numeric) are given a value of NA (unless otherwise specified with *dummy_value*). Let's start with a simple example. ```{r echo=TRUE} fastDummies_example <- data.frame(numbers = 1:3, gender = c("male", "male", "female"), animals = c("dog", "dog", "cat"), dates = as.Date(c("2012-01-01", "2011-12-31", "2012-01-01")), stringsAsFactors = FALSE) knitr::kable(fastDummies_example) ``` This data set has four columns: two character, one Date, and one numeric. The function by default will use the character and Date columns in creating the combinations. First, a small amount of math to explain the combinations. Each column has two distinct values - gender: male & female; animals: dog & cat; dates: 2011-12-31 & 2012-01-01. To find the number of possible combinations, multiply the number of unique values in each column together. 2 \* 2 \* 2 = 8. ```{r echo=TRUE} results <- fastDummies::dummy_rows(fastDummies_example) knitr::kable(results) ``` When we run the function we can see that there are indeed 8 rows possible, and that the 5 rows missing from the original data have been added. To explicitly see which rows are new, set the *dummy_indicator* parameter to TRUE. This provides a column called dummy_indicator with a value of 0 if the row is in the original data and 1 if it was added. ```{r echo=TRUE} results <- fastDummies::dummy_rows(fastDummies_example, dummy_indicator = TRUE) knitr::kable(results) ``` By default, columns not used for making the combinations are given a value of NA in the new rows. You can choose the value given with the parameter *dummy_value*. It takes an input, a string or single number. 
```{r echo=TRUE} results1 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = 0) results2 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = "new value") knitr::kable(results1) knitr::kable(results2) ``` The parameter *select_columns* lets you choose which columns to use when making the combinations. It accepts a string or vector of column names. This can come in handy when you want to include a numeric column, such as years, when making the combinations. A new data set will help demonstrate this. This data set shows (imaginary) crime in New York City and San Francisco during 1990 and 2000. The problem is that there is no row for New York City for 2000. We want to add that row. ```{r echo = TRUE} crime <- data.frame(city = c("SF", "SF", "NYC"), year = c(1990, 2000, 1990), crime = 1:3) knitr::kable(crime) ``` Using the default parameters for `dummy_rows()` doesn't give us what we want since it only selects the city column. We need to select both city and year to get all the combinations we want. ```{r echo=TRUE} results <- fastDummies::dummy_rows(crime, select_columns = c("city", "year")) knitr::kable(results) ``` fastDummies/inst/doc/making-dummy-variables.html 0000644 0001762 0000144 00000046113 14743475712 021533 0 ustar ligges users
Dummy variables (or binary variables) are commonly used in statistical analyses and in more simple descriptive statistics. A dummy column is one which has a value of one when a categorical event occurs and a zero when it doesn’t occur. In most cases this is a feature of the event/person/object being described. For example, if the dummy variable was for occupation being an R programmer, you can ask, “is this person an R programmer?” When the answer is yes, they get a value of 1, when it is no, they get a value of 0.
We’ll start with a simple example and then go into using the function
dummy_cols()
. You can also use the function
dummy_columns()
which is identical to
dummy_cols()
.
Imagine you have a data set about animals in a local shelter. One of the columns in your data is what animal it is: dog or cat.
animals |
---|
dog |
dog |
cat |
To make dummy columns from this data, you would need to produce two new columns. One would indicate if the animal is a dog, and the other would indicate if the animal is a cat. Each row would get a value of 1 in the column indicating which animal they are, and 0 in the other column.
animals | dog | cat |
---|---|---|
dog | 1 | 0 |
dog | 1 | 0 |
cat | 0 | 1 |
In the function dummy_cols, the names of these new columns are concatenated to the original column and separated by an underscore.
animals | animals_dog | animals_cat |
---|---|---|
dog | 1 | 0 |
dog | 1 | 0 |
cat | 0 | 1 |
With an example like this, it is fairly easy to make the dummy
columns yourself. dummy_cols()
automates the process, and
is useful when you have many columns to generate dummy variables from or
with many categories within the column.
fastDummies_example <- data.frame(numbers = 1:3,
gender = c("male", "male", "female"),
animals = c("dog", "dog", "cat"),
dates = as.Date(c("2012-01-01", "2011-12-31",
"2012-01-01")),
stringsAsFactors = FALSE)
knitr::kable(fastDummies_example)
numbers | gender | animals | dates |
---|---|---|---|
1 | male | dog | 2012-01-01 |
2 | male | dog | 2011-12-31 |
3 | female | cat | 2012-01-01 |
The object fastDummies_example has two character
type columns, one integer column, and a Date column. By default,
dummy_cols()
will make dummy variables from factor or
character columns only. This is because in most cases those are the only
types of data you want dummy variables from. If those are the only
columns you want, then the function takes your data set as the first
parameter and returns a data.frame with the newly created variables
appended to the end of the original data.
numbers | gender | animals | dates | gender_female | gender_male | animals_cat | animals_dog |
---|---|---|---|---|---|---|---|
1 | male | dog | 2012-01-01 | 0 | 1 | 0 | 1 |
2 | male | dog | 2011-12-31 | 0 | 1 | 0 | 1 |
3 | female | cat | 2012-01-01 | 1 | 0 | 1 | 0 |
In some situations, you would want columns with types other than factor and character to generate dummy variables. For example, a column of years would be numeric but could be well-suited for making into dummy variables depending on your analysis. Use the select_columns parameter to select specific columns to make dummy variables from.
results <- fastDummies::dummy_cols(fastDummies_example, select_columns = "numbers")
knitr::kable(results)
numbers | gender | animals | dates | numbers_1 | numbers_2 | numbers_3 |
---|---|---|---|---|---|---|
1 | male | dog | 2012-01-01 | 1 | 0 | 0 |
2 | male | dog | 2011-12-31 | 0 | 1 | 0 |
3 | female | cat | 2012-01-01 | 0 | 0 | 1 |
The final option for dummy_cols()
is
remove_first_dummy which by default is FALSE. If TRUE, it
removes the first dummy variable created from each column. This is done
to avoid multicollinearity in a multiple regression model caused by
including all dummy variables. The “first” dummy variable is the one at
the top of the rows (i.e. the first value that is not NA).
results <- fastDummies::dummy_cols(fastDummies_example, remove_first_dummy = TRUE)
knitr::kable(results)
numbers | gender | animals | dates | gender_male | animals_dog |
---|---|---|---|---|---|
1 | male | dog | 2012-01-01 | 1 | 1 |
2 | male | dog | 2011-12-31 | 1 | 1 |
3 | female | cat | 2012-01-01 | 0 | 0 |
When dealing with data, there are often missing rows. While truly
handling missing data is far beyond the scope of this package, the
function dummy_rows()
lets you add those missing rows back
into the data.
The function takes all character, factor, and Date columns, finds all possible combinations of their values, and adds the rows that are not in the original data set. Any columns not used in creating the combinations (e.g. numeric) are given a value of NA (unless otherwise specified with dummy_value).
Let's start with a simple example.
fastDummies_example <- data.frame(numbers = 1:3,
gender = c("male", "male", "female"),
animals = c("dog", "dog", "cat"),
dates = as.Date(c("2012-01-01", "2011-12-31",
"2012-01-01")),
stringsAsFactors = FALSE)
knitr::kable(fastDummies_example)
numbers | gender | animals | dates |
---|---|---|---|
1 | male | dog | 2012-01-01 |
2 | male | dog | 2011-12-31 |
3 | female | cat | 2012-01-01 |
This data set has four columns: two character, one Date, and one numeric. The function by default will use the character and Date columns in creating the combinations. First, a small amount of math to explain the combinations. Each column has two distinct values - gender: male & female; animals: dog & cat; dates: 2011-12-31 & 2012-01-01. To find the number of possible combinations, multiply the number of unique values in each column together. 2 * 2 * 2 = 8.
numbers | gender | animals | dates |
---|---|---|---|
1 | male | dog | 2012-01-01 |
2 | male | dog | 2011-12-31 |
3 | female | cat | 2012-01-01 |
NA | female | cat | 2011-12-31 |
NA | male | cat | 2011-12-31 |
NA | female | dog | 2011-12-31 |
NA | male | cat | 2012-01-01 |
NA | female | dog | 2012-01-01 |
When we run the function we can see that there are indeed 8 rows possible, and that the 5 rows missing from the original data have been added.
To explicitly see which rows are new, set the dummy_indicator parameter to TRUE. This provides a column called dummy_indicator with a value of 0 if the row is in the original data and 1 if it was added.
results <- fastDummies::dummy_rows(fastDummies_example, dummy_indicator = TRUE)
knitr::kable(results)
numbers | gender | animals | dates | dummy_indicator |
---|---|---|---|---|
1 | male | dog | 2012-01-01 | 0 |
2 | male | dog | 2011-12-31 | 0 |
3 | female | cat | 2012-01-01 | 0 |
NA | female | cat | 2011-12-31 | 1 |
NA | male | cat | 2011-12-31 | 1 |
NA | female | dog | 2011-12-31 | 1 |
NA | male | cat | 2012-01-01 | 1 |
NA | female | dog | 2012-01-01 | 1 |
By default, columns not used for making the combinations are given a value of NA in the new rows. You can choose the value given with the parameter dummy_value. It takes a single value, either a string or a number.
results1 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = 0)
results2 <- fastDummies::dummy_rows(fastDummies_example, dummy_value = "new value")
knitr::kable(results1)
numbers | gender | animals | dates |
---|---|---|---|
1 | male | dog | 2012-01-01 |
2 | male | dog | 2011-12-31 |
3 | female | cat | 2012-01-01 |
0 | female | cat | 2011-12-31 |
0 | male | cat | 2011-12-31 |
0 | female | dog | 2011-12-31 |
0 | male | cat | 2012-01-01 |
0 | female | dog | 2012-01-01 |
numbers | gender | animals | dates |
---|---|---|---|
1 | male | dog | 2012-01-01 |
2 | male | dog | 2011-12-31 |
3 | female | cat | 2012-01-01 |
new value | female | cat | 2011-12-31 |
new value | male | cat | 2011-12-31 |
new value | female | dog | 2011-12-31 |
new value | male | cat | 2012-01-01 |
new value | female | dog | 2012-01-01 |
The parameter select_columns lets you choose which columns to use when making the combinations. It accepts a string or vector of column names. This can come in handy when you want to include a numeric column, such as years, when making the combinations. A new data set will help demonstrate this. This data set shows (imaginary) crime in New York City and San Francisco during 1990 and 2000. The problem is that there is no row for New York City for 2000. We want to add that row.
crime <- data.frame(city = c("SF", "SF", "NYC"),
year = c(1990, 2000, 1990),
crime = 1:3)
knitr::kable(crime)
city | year | crime |
---|---|---|
SF | 1990 | 1 |
SF | 2000 | 2 |
NYC | 1990 | 3 |
Using the default parameters for dummy_rows()
doesn’t
give us what we want since it only selects the city column. We need to
select both city and year to get all the combinations we want.
city | year | crime |
---|---|---|
SF | 1990 | 1 |
SF | 2000 | 2 |
NYC | 1990 | 3 |
NYC | 2000 | NA |