dplyr/0000755000176200001440000000000015140333347011407 5ustar liggesusersdplyr/tests/0000755000176200001440000000000015137234433012553 5ustar liggesusersdplyr/tests/testthat/0000755000176200001440000000000015140333347014411 5ustar liggesusersdplyr/tests/testthat/test-generics.R0000644000176200001440000001316715137161765017331 0ustar liggesuserstest_that("row_slice recomputes groups", { gf <- group_by(data.frame(g = c(1, 1, 2, 2, 3, 3)), g) out <- dplyr_row_slice(gf, c(1L, 3L, 5L)) expect_equal(group_data(out)$.rows, list_of(1L, 2L, 3L)) out <- dplyr_row_slice(gf, c(4L, 3L)) expect_equal(group_data(out)$.rows, list_of(c(1L, 2L))) }) test_that("row_slice preserves empty groups if requested", { gf <- group_by(data.frame(g = c(1, 1, 2, 2, 3, 3)), g, .drop = FALSE) out <- dplyr_row_slice(gf, c(3L, 4L)) expect_equal(group_data(out)$.rows, list_of(integer(), c(1L, 2L), integer())) }) # dplyr_col_modify -------------------------------------------------------- test_that("empty cols returns input", { df <- data.frame(x = 1) expect_equal(dplyr_col_modify(df, list()), df) }) test_that("applies tidyverse recycling rules", { expect_equal( dplyr_col_modify(data.frame(x = 1:2), list(y = 1)), data.frame(x = 1:2, y = c(1, 1)) ) expect_equal( dplyr_col_modify(data.frame(x = integer()), list(y = 1)), data.frame(x = integer(), y = integer()) ) expect_error( dplyr_col_modify(data.frame(x = 1:4), list(y = 1:2)), class = "vctrs_error_recycle_incompatible_size" ) }) test_that("can add, remove, and replace columns", { df <- data.frame(x = 1, y = 2) expect_equal(dplyr_col_modify(df, list(y = NULL)), data.frame(x = 1)) expect_equal(dplyr_col_modify(df, list(y = 3)), data.frame(x = 1, y = 3)) expect_equal( dplyr_col_modify(df, list(z = 3)), data.frame(x = 1, y = 2, z = 3) ) }) test_that("doesn't expand row names", { df <- data.frame(x = 1:10) out <- dplyr_col_modify(df, list(y = 1)) expect_equal(.row_names_info(out, 1), -10) }) test_that("preserves existing row names", { df <- data.frame(x = c(1, 2), row.names = c("a", "b")) out <- dplyr_col_modify(df, list(y = 1)) expect_equal(row.names(df), c("a", "b")) }) test_that("reconstruct method gets a data frame", { seen_df <- NULL local_methods( dplyr_reconstruct.dplyr_foobar = function(data, template) { if (is.data.frame(data)) { seen_df <<- TRUE } NextMethod() } ) df <- foobar(data.frame(x = 1)) seen_df <- FALSE dplyr_col_modify(df, list(y = 2)) expect_true(seen_df) seen_df <- FALSE dplyr_row_slice(df, 1) expect_true(seen_df) }) # dplyr_reconstruct ------------------------------------------------------- test_that("classes are restored", { expect_identical( dplyr_reconstruct(tibble(), data.frame()), data.frame() ) expect_identical( dplyr_reconstruct(tibble(), tibble()), tibble() ) expect_identical( dplyr_reconstruct(tibble(), new_data_frame(class = "foo")), new_data_frame(class = "foo") ) }) test_that("attributes of `template` are kept", { expect_identical( dplyr_reconstruct(new_tibble(list(), nrow = 1), new_data_frame(foo = 1)), new_data_frame(n = 1L, foo = 1) ) }) test_that("compact row names are retained", { data <- vec_rbind(tibble(a = 1), tibble(a = 2)) template <- tibble() x <- dplyr_reconstruct(data, template) expect <- tibble(a = c(1, 2)) expect_identical(x, expect) # Explicitly ensure internal row name structure is identical expect_identical( .row_names_info(x, type = 0L), .row_names_info(expect, type = 0L) ) }) test_that("dplyr_reconstruct() strips attributes before dispatch", { local_methods( dplyr_reconstruct.dplyr_foobar = function(data, template) { out <<- data } ) df <- foobar(data.frame(x = 1), foo = "bar") out <- NULL dplyr_reconstruct(df, df) expect_identical(out, data.frame(x = 1)) df <- foobar(data.frame(x = 1, row.names = "a"), foo = "bar") out <- NULL dplyr_reconstruct(df, df) expect_identical(out, data.frame(x = 1, row.names = "a")) }) test_that("`dplyr_reconstruct()` can't guarantee attribute ordering of `template`", { # This used to be `expect_identical()`, but attribute ordering may change # during reconstruction. Attributes should be viewed as a map, so this is fine # (#7797). df <- vctrs::data_frame(x = 1) expect_mapequal( attributes(dplyr_reconstruct(df, df)), attributes(df) ) }) test_that("`dplyr_reconstruct()` doesn't modify the original `data` in place", { data <- new_data_frame(list(x = 1), foo = "bar") template <- vctrs::data_frame(x = 1) out <- dplyr_reconstruct(data, template) expect_null(attr(out, "foo")) expect_identical(attr(data, "foo"), "bar") }) test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", { skip_if( getRversion() >= "4.6.0", "Can't call `ATTRIB()` or `SET_ATTRIB()` anymore." ) skip_if_no_lazy_character() dplyr_attributes <- function(x) { .Call(ffi_test_dplyr_attributes, x) } dplyr_set_attributes <- function(x, attributes) { .Call(ffi_test_dplyr_set_attributes, x, attributes) } df <- vctrs::data_frame(x = 1) attributes <- attributes(df) attributes$row.names <- new_lazy_character(function() "a") attributes <- as.pairlist(attributes) df_with_lazy_row_names <- dplyr_set_attributes(df, attributes) # Ensure `data` row names aren't materialized x <- dplyr_reconstruct(df_with_lazy_row_names, df) attributes <- dplyr_attributes(df_with_lazy_row_names) expect_false(lazy_character_is_materialized(attributes$row.names)) # `data` row names should also propagate into the result unmaterialized attributes <- dplyr_attributes(x) expect_false(lazy_character_is_materialized(attributes$row.names)) # Ensure `template` row names aren't materialized x <- dplyr_reconstruct(df, df_with_lazy_row_names) attributes <- dplyr_attributes(df_with_lazy_row_names) expect_false(lazy_character_is_materialized(attributes$row.names)) }) dplyr/tests/testthat/helper-s3.R0000644000176200001440000000233015106134104016324 0ustar liggesuserslocal_methods <- function(..., .frame = caller_env()) { local_bindings(..., .env = global_env(), .frame = .frame) } local_foo_df <- function(frame = caller_env()) { local_methods( .frame = frame, group_by.foo_df = function(.data, ...) { out <- NextMethod() if (missing(...)) { class(out) <- c("foo_df", class(out)) } else { class(out) <- c("grouped_foo_df", class(out)) } out }, ungroup.grouped_foo_df = function(x, ...) { out <- NextMethod() class(out) <- c("foo_df", class(out)) out } ) } new_ctor <- function(base_class) { function(x = list(), ..., class = NULL) { if (inherits(x, "tbl_df")) { tibble::new_tibble(x, class = c(class, base_class), nrow = nrow(x)) } else if (is.data.frame(x)) { structure(x, class = c(class, base_class, "data.frame"), ...) } else { structure(x, class = c(class, base_class), ...) } } } foobar <- new_ctor("dplyr_foobar") foobaz <- new_ctor("dplyr_foobaz") quux <- new_ctor("dplyr_quux") # For testing reconstructing methods that break invariants by adding # new columns new_dispatched_quux <- function(x) { out <- quux(x) out$dispatched <- rep(TRUE, nrow(out)) out } dplyr/tests/testthat/test-group-data.R0000644000176200001440000000623615106134104017553 0ustar liggesusers# group_data -------------------------------------------------------------- test_that("group_data() returns a data frame", { df <- data.frame(x = 1:3) gd <- group_data(df) expect_s3_class(gd, "data.frame", exact = TRUE) expect_equal(gd$.rows, list_of(1:3)) }) test_that("group_data() returns a tibble", { df <- tibble(x = 1:3) gd <- group_data(df) expect_s3_class(gd, "tbl_df") expect_equal(gd, tibble(".rows" := list_of(1:3))) }) test_that("group_data() returns a tibble", { df <- tibble(x = c(1, 1, 2)) gf <- group_by(df, x) gd <- group_data(gf) expect_s3_class(gd, "tbl_df") expect_equal( gd, tibble(x = c(1, 2), ".rows" := list_of(1:2, 3L)), ignore_attr = TRUE ) }) test_that("group_data( group_by(x) expect_equal(n_groups(df), 3L) expect_equal(group_size(df), rep(10, 3)) }) # n_groups ---------------------------------------------------------------- test_that("n_groups respects zero-length groups (#341)", { df <- tibble(x = factor(1:3, levels = 1:4)) |> group_by(x, .drop = FALSE) expect_equal(n_groups(df), 4) }) dplyr/tests/testthat/test-group-nest.R0000644000176200001440000000360615106134104017611 0ustar liggesuserstest_that("group_nest() works", { grouped <- group_by(starwars, species, homeworld) gdata <- group_data(grouped) res <- group_nest(starwars, species, homeworld) expect_type(pull(res), "list") expect_equal( attr(pull(res), "ptype"), vec_slice(select(starwars, -species, -homeworld), 0L) ) expect_equal(res[1:2], structure(gdata[1:2], .drop = NULL)) nested <- bind_rows(!!!res$data) expect_equal( names(nested), setdiff(names(starwars), c("species", "homeworld")) ) }) test_that("group_nest() can keep the grouping variables", { grouped <- group_by(starwars, species, homeworld) gdata <- group_data(grouped) res <- group_nest(starwars, species, homeworld, keep = TRUE) nested <- bind_rows(!!!res$data) expect_equal(names(nested), names(starwars)) }) test_that("group_nest() works on grouped data frames", { grouped <- group_by(starwars, species, homeworld) gdata <- group_data(grouped) res <- group_nest(grouped) expect_type(pull(res), "list") expect_equal(res[1:2], structure(gdata[1:2], .drop = NULL)) expect_equal( names(bind_rows(!!!res$data)), setdiff(names(starwars), c("species", "homeworld")) ) res <- group_nest(grouped, keep = TRUE) expect_type(pull(res), "list") expect_equal(attr(pull(res), "ptype"), vec_slice(starwars, 0L)) expect_equal(res[1:2], structure(gdata[1:2], .drop = NULL)) expect_equal(names(bind_rows(!!!res$data)), names(starwars)) }) test_that("group_nest.grouped_df() warns about `...`", { expect_snapshot({ group_nest(group_by(mtcars, cyl), cyl) }) }) test_that("group_nest() works if no grouping column", { res <- group_nest(iris) expect_equal(res$data, list(iris)) expect_equal(names(res), "data") }) test_that("group_nest() respects .drop", { nested <- tibble(f = factor("b", levels = c("a", "b", "c")), x = 1, y = 2) |> group_nest(f, .drop = TRUE) expect_equal(nrow(nested), 1L) }) dplyr/tests/testthat/test-utils.R0000644000176200001440000000447315106134104016651 0ustar liggesusers# ------------------------------------------------------------------------------ # quo_is_variable_reference() test_that("quo_is_variable_reference handles .data", { expect_true(quo_is_variable_reference(quo(x))) expect_true(quo_is_variable_reference(quo(.data$x))) expect_true(quo_is_variable_reference(quo(.data[["x"]]))) quo <- new_quosure(quote(.data[[identity("x")]])) expect_false(quo_is_variable_reference(quo)) }) # ------------------------------------------------------------------------------ # list_flatten() test_that("`list_flatten()` is a no-op on flattened lists", { x <- list(1, 2) expect_identical(list_flatten(x), x) }) test_that("`list_flatten()` flattens list elements", { x <- list(list(1, 2), 3, list(4)) expect_identical(list_flatten(x), list(1, 2, 3, 4)) }) test_that("`list_flatten()` doesn't try to be generic", { my_list <- function(...) structure(list(...), class = c("my_list", "list")) x <- my_list(list(1, 2), 3, my_list(4)) expect_identical(list_flatten(x), list(1, 2, 3, 4)) # The no-op case returns a bare list too x <- my_list(1, 2) expect_identical(list_flatten(x), list(1, 2)) }) test_that("`list_flatten()` only retains inner names of flattened elements", { x <- list(a = list(1, b = 2), 3, list(d = 4), e = 5, f = list(1)) expect_identical(list_flatten(x), list(1, b = 2, 3, d = 4, e = 5, 1)) }) test_that("`list_flatten()` can work recursively", { x <- list(list(list(1, 2), 3), 4) # Not by default expect_identical(list_flatten(x), list(list(1, 2), 3, 4)) expect_identical(list_flatten(x, recursive = TRUE), list(1, 2, 3, 4)) }) test_that("recursive `list_flatten()` handles names correctly", { x <- list(a = list(b = list(1), c = list(d = 2), 3, e = 4), f = 5) expect_identical( list_flatten(x, recursive = TRUE), list(1, d = 2, 3, e = 4, f = 5) ) }) test_that("`list_flatten()` accepts a predicate `fn` to selectively flatten", { is_flattenable <- function(x) !is_named(x) x <- list( a = list(list(1), list(b = 2), 3), c = 4, d = list(e = 5), f = list(6) ) expect_identical( list_flatten(x, fn = is_flattenable), list(list(1), list(b = 2), 3, c = 4, d = list(e = 5), 6) ) expect_identical( list_flatten(x, fn = is_flattenable, recursive = TRUE), list(1, list(b = 2), 3, c = 4, d = list(e = 5), 6) ) }) dplyr/tests/testthat/test-rename.R0000644000176200001440000000516315106134104016755 0ustar liggesuserstest_that("rename() handles deprecated `.data` pronoun", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical(rename(tibble(x = 1), y = .data$x), tibble(y = 1)) }) test_that("arguments to rename() don't match vars_rename() arguments (#2861)", { df <- tibble(a = 1) expect_identical(rename(df, var = a), tibble(var = 1)) expect_identical( rename(group_by(df, a), var = a), group_by(tibble(var = 1), var) ) expect_identical(rename(df, strict = a), tibble(strict = 1)) expect_identical( rename(group_by(df, a), strict = a), group_by(tibble(strict = 1), strict) ) }) test_that("rename() to UTF-8 column names", { skip_if_not(l10n_info()$"UTF-8") df <- tibble(a = 1) |> rename("\u5e78" := a) expect_equal(colnames(df), "\u5e78") }) test_that("can rename() with strings and character vectors", { vars <- c(foo = "cyl", bar = "am") expect_identical(rename(mtcars, !!!vars), rename(mtcars, foo = cyl, bar = am)) expect_identical(rename(mtcars, !!vars), rename(mtcars, foo = cyl, bar = am)) }) test_that("rename preserves grouping", { gf <- group_by(tibble(g = 1:3, x = 3:1), g) i <- count_regroups(out <- rename(gf, h = g)) expect_equal(i, 0) expect_equal(group_vars(out), "h") }) test_that("can rename with duplicate columns", { df <- tibble(x = 1, x = 2, y = 1, .name_repair = "minimal") expect_named(df |> rename(x2 = 2), c("x", "x2", "y")) }) test_that("rename() ignores duplicates", { df <- tibble(x = 1) expect_named(rename(df, a = x, b = x), "b") }) # rename_with ------------------------------------------------------------- test_that("can select columns", { df <- tibble(x = 1, y = 2) expect_named(df |> rename_with(toupper, 1), c("X", "y")) df <- tibble(x = 1, y = 2) expect_named(df |> rename_with(toupper, x), c("X", "y")) }) test_that("passes ... along", { df <- tibble(x = 1, y = 2) expect_named( df |> rename_with(gsub, 1, pattern = "x", replacement = "X"), c("X", "y") ) }) test_that("can't create duplicated names", { df <- tibble(x = 1, y = 2) expect_error( df |> rename_with(~ rep_along(.x, "X")), class = "vctrs_error_names" ) }) test_that("`.fn` result type is checked (#6561)", { df <- tibble(x = 1) fn <- function(x) 1L expect_snapshot(error = TRUE, { rename_with(df, fn) }) }) test_that("`.fn` result size is checked (#6561)", { df <- tibble(x = 1, y = 2) fn <- function(x) c("a", "b", "c") expect_snapshot(error = TRUE, { rename_with(df, fn) }) }) test_that("can't rename in `.cols`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { rename_with(df, toupper, .cols = c(y = x)) }) }) dplyr/tests/testthat/test-locale.R0000644000176200001440000000112215106134104016734 0ustar liggesuserstest_that("`dplyr_legacy_locale()` is `FALSE` by default", { expect_false(dplyr_legacy_locale()) }) test_that("`dplyr_legacy_locale()` respects `dplyr.legacy_locale`", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) expect_true(dplyr_legacy_locale()) local_options(dplyr.legacy_locale = 1) expect_snapshot(error = TRUE, { dplyr_legacy_locale() }) }) test_that("`dplyr_legacy_locale()` treats `dplyr.legacy_locale` as deprecated", { local_options(dplyr.legacy_locale = TRUE) expect_snapshot({ dplyr_legacy_locale() }) }) dplyr/tests/testthat/test-mutate.R0000644000176200001440000006063615106134104017013 0ustar liggesuserstest_that("empty mutate returns input", { df <- tibble(x = 1) gf <- group_by(df, x) expect_equal(mutate(df), df) expect_equal(mutate(df, .by = x), df) expect_equal(mutate(gf), gf) expect_equal(mutate(df, !!!list()), df) expect_equal(mutate(df, !!!list(), .by = x), df) expect_equal(mutate(gf, !!!list()), gf) }) test_that("rownames preserved", { df <- data.frame(x = c(1, 2), row.names = c("a", "b")) df <- mutate(df, y = 2) expect_equal(row.names(df), c("a", "b")) df <- mutate(df, y = 2, .by = x) expect_equal(row.names(df), c("a", "b")) }) test_that("mutations applied progressively", { df <- tibble(x = 1) expect_equal(df |> mutate(y = x + 1, z = y + 1), tibble(x = 1, y = 2, z = 3)) expect_equal(df |> mutate(x = x + 1, x = x + 1), tibble(x = 3)) expect_equal(df |> mutate(x = 2, y = x), tibble(x = 2, y = 2)) df <- data.frame(x = 1, y = 2) expect_equal( df |> mutate(x2 = x, x3 = x2 + 1), df |> mutate(x2 = x + 0, x3 = x2 + 1) ) }) test_that("length-1 vectors are recycled (#152)", { df <- tibble(x = 1:4) expect_equal(mutate(df, y = 1)$y, rep(1, 4)) expect_error(mutate(df, y = 1:2)) }) test_that("can remove variables with NULL (#462)", { df <- tibble(x = 1:3, y = 1:3) gf <- group_by(df, x) expect_equal(df |> mutate(y = NULL), df[1]) expect_equal(gf |> mutate(y = NULL), gf[1]) # even if it doesn't exist expect_equal(df |> mutate(z = NULL), df) # or was just created expect_equal(df |> mutate(z = 1, z = NULL), df) # regression test for https://github.com/tidyverse/dplyr/issues/4974 expect_equal( mutate(data.frame(x = 1, y = 1), z = 1, x = NULL, y = NULL), data.frame(z = 1) ) }) test_that("mutate() names pronouns correctly (#2686)", { expect_named(mutate(tibble(x = 1), .data$x), "x") expect_named(mutate(tibble(x = 1), .data[["x"]]), "x") }) test_that("mutate() supports unquoted values", { df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) expect_identical(mutate(df, out = !!1), mutate(df, out = 1)) expect_identical(mutate(df, out = !!(1:5)), mutate(df, out = 1:5)) expect_identical(mutate(df, out = !!quote(1:5)), mutate(df, out = 1:5)) gdf <- group_by(df, g) expect_identical(mutate(gdf, out = !!1), mutate(gdf, out = 1)) }) test_that("assignments don't overwrite variables (#315)", { df <- tibble(x = 1, y = 2) out <- df |> mutate(z = { x <- 10 x }) expect_equal(out, tibble(x = 1, y = 2, z = 10)) }) test_that("can mutate a data frame with zero columns", { df <- new_data_frame(n = 2L) expect_equal(mutate(df, x = 1), data.frame(x = c(1, 1))) }) test_that("mutate() handles symbol expressions", { df <- tibble(x = structure(1, class = "alien")) res <- mutate(df, y = x) expect_identical(df$x, res$y) gf <- group_by(df, x) res <- mutate(df, y = x) expect_identical(df$x, res$y) }) test_that("mutate() supports constants (#6056, #6305)", { df <- data.frame(x = 1:10, g = rep(1:2, each = 5)) y <- 1:10 z <- 1:5 expect_identical(df |> mutate(y = !!y) |> pull(y), y) expect_identical(df |> group_by(g) |> mutate(y = !!y) |> pull(y), y) expect_identical(df |> rowwise() |> mutate(y = !!y) |> pull(y), y) expect_snapshot({ (expect_error(df |> mutate(z = !!z))) (expect_error(df |> group_by(g) |> mutate(z = !!z))) (expect_error(df |> rowwise() |> mutate(z = !!z))) }) # `.env$` is used for per group evaluation expect_identical(df |> mutate(y = .env$y) |> pull(y), y) expect_identical( df |> group_by(g) |> mutate(z = .env$z) |> pull(z), c(z, z) ) expect_snapshot({ (expect_error(df |> group_by(g) |> mutate(y = .env$y))) (expect_error(df |> rowwise() |> mutate(y = .env$y))) }) }) test_that("can't overwrite column active bindings (#6666)", { skip_if(getRversion() < "3.6.3", message = "Active binding error changed") df <- tibble(g = 1:2, x = 3:4) gdf <- group_by(df, g) # The error seen here comes from trying to `<-` to an active binding when # the active binding function has 0 arguments. expect_snapshot(error = TRUE, { mutate(df, y = { x <<- 2 x }) }) expect_snapshot(error = TRUE, { mutate(df, .by = g, y = { x <<- 2 x }) }) expect_snapshot(error = TRUE, { mutate(gdf, y = { x <<- 2 x }) }) }) test_that("assigning with `<-` doesn't affect the mask (#6666)", { df <- tibble(g = 1:2, x = 3:4) gdf <- group_by(df, g) out <- mutate(df, .by = g, y = { x <- x + 2L x }) expect_identical(out$x, c(3L, 4L)) expect_identical(out$y, c(5L, 6L)) out <- mutate(gdf, y = { x <- x + 2L x }) expect_identical(out$x, c(3L, 4L)) expect_identical(out$y, c(5L, 6L)) }) test_that("`across()` inline expansions that use `<-` don't affect the mask (#6666)", { df <- tibble(g = 1:2, x = 3:4) out <- df |> mutate( across(x, function(col) { col <- col + 2L col }), .by = g ) expect_identical(out$x, c(5L, 6L)) }) test_that("can't share local variables across expressions (#6666)", { df <- tibble(x = 1:2, y = 3:4) expect_snapshot(error = TRUE, { mutate( df, x2 = { foo <- x x }, y2 = { foo } ) }) }) # column types ------------------------------------------------------------ test_that("glue() is supported", { expect_equal( tibble(x = 1) |> mutate(y = glue("")), tibble(x = 1, y = glue("")) ) }) test_that("mutate disambiguates NA and NaN (#1448)", { df <- tibble(x = c(1, NA, NaN)) out <- mutate(df, y = x * 1) expect_equal(out$y, df$x) }) test_that("mutate preserves names (#1689, #2675)", { df <- tibble(a = 1:3) out1 <- df |> mutate(b = setNames(1:3, letters[1:3])) out2 <- df |> mutate(b = setNames(as.list(1:3), letters[1:3])) expect_named(out1$b, letters[1:3]) expect_named(out2$b, letters[1:3]) }) test_that("mutate handles matrix columns", { df <- data.frame(a = rep(1:3, each = 2), b = 1:6) df_regular <- mutate(df, b = scale(b)) df_grouped <- mutate(group_by(df, a), b = scale(b)) df_rowwise <- mutate(rowwise(df), b = scale(b)) expect_equal(dim(df_regular$b), c(6, 1)) expect_equal(dim(df_grouped$b), c(6, 1)) expect_equal(dim(df_rowwise$b), c(6, 1)) }) test_that("mutate handles data frame columns", { df <- data.frame("a" = c(1, 2, 3), "b" = c(2, 3, 4), "base_col" = c(3, 4, 5)) res <- mutate(df, new_col = data.frame(x = 1:3)) expect_equal(res$new_col, data.frame(x = 1:3)) res <- mutate(group_by(df, a), new_col = data.frame(x = a)) expect_equal(res$new_col, data.frame(x = 1:3)) res <- mutate(rowwise(df), new_col = data.frame(x = a)) expect_equal(res$new_col, data.frame(x = 1:3)) }) test_that("unnamed data frames are automatically unspliced (#2326, #3630)", { expect_identical( tibble(a = 1) |> mutate(tibble(b = 2)), tibble(a = 1, b = 2) ) expect_identical( tibble(a = 1) |> mutate(tibble(b = 2), tibble(b = 3)), tibble(a = 1, b = 3) ) expect_identical( tibble(a = 1) |> mutate(tibble(b = 2), c = b), tibble(a = 1, b = 2, c = 2) ) }) test_that("named data frames are packed (#2326, #3630)", { df <- tibble(x = 1) out <- df |> mutate(y = tibble(a = x)) expect_equal(out, tibble(x = 1, y = tibble(a = 1))) }) test_that("unchop only called for when multiple groups", { df <- data.frame(g = 1, x = 1:5) out <- mutate(df, x = ts(x, start = c(1971, 1), frequency = 52)) expect_s3_class(out$x, "ts") gdf <- group_by(df, g) out <- mutate(gdf, x = ts(x, start = c(1971, 1), frequency = 52)) expect_s3_class(out$x, "ts") }) # output types ------------------------------------------------------------ test_that("mutate preserves grouping", { gf <- group_by(tibble(x = 1:2, y = 2), x) i <- count_regroups(out <- mutate(gf, x = 1)) expect_equal(i, 1L) expect_equal(group_vars(out), "x") expect_equal(nrow(group_data(out)), 1) i <- count_regroups(out <- mutate(gf, z = 1)) expect_equal(i, 0) expect_equal(group_data(out), group_data(gf)) }) test_that("mutate works on zero-row grouped data frame (#596)", { dat <- data.frame(a = numeric(0), b = character(0), stringsAsFactors = TRUE) res <- dat |> group_by(b, .drop = FALSE) |> mutate(a2 = a * 2) expect_type(res$a2, "double") expect_s3_class(res, "grouped_df") expect_equal(res$a2, numeric(0)) expect_type(group_rows(res), "list") expect_equal(attr(group_rows(res), "ptype"), integer()) expect_equal(group_data(res)$b, factor(character(0))) }) test_that("mutate preserves class of zero-row rowwise (#4224, #6303)", { # Each case needs to test both x and identity(x) because these flow # through two slightly different pathways. rf <- rowwise(tibble(x = character(0))) out <- mutate(rf, x2 = identity(x), x3 = x) expect_equal(out$x2, character()) expect_equal(out$x3, character()) # including list-of classes of list-cols where possible rf <- rowwise(tibble(x = list_of(.ptype = character()))) out <- mutate(rf, x2 = identity(x), x3 = x) expect_equal(out$x2, character()) expect_equal(out$x3, character()) # an empty list is turns into a logical (aka unspecified) rf <- rowwise(tibble(x = list())) out <- mutate(rf, x2 = identity(x), x3 = x) expect_equal(out$x2, logical()) expect_equal(out$x3, logical()) # with the empty list case, `x` is `logical()`, not a random logical of length # 1 that happens to get recycled to length 0 (#7710) rf <- rowwise(tibble(x = list())) out <- mutate(rf, x2 = { expect_identical(x, logical()) x }) }) test_that("mutate works on empty data frames (#1142)", { df <- data.frame() res <- df |> mutate() expect_equal(nrow(res), 0L) expect_equal(length(res), 0L) res <- df |> mutate(x = numeric()) expect_equal(names(res), "x") expect_equal(nrow(res), 0L) expect_equal(length(res), 1L) }) test_that("mutate handles 0 rows rowwise (#1300)", { res <- tibble(y = character()) |> rowwise() |> mutate(z = 1) expect_equal(nrow(res), 0L) }) test_that("rowwise mutate gives expected results (#1381)", { f <- function(x) ifelse(x < 2, NA_real_, x) res <- tibble(x = 1:3) |> rowwise() |> mutate(y = f(x)) expect_equal(res$y, c(NA, 2, 3)) }) test_that("rowwise mutate un-lists existing size-1 list-columns (#6302)", { # Existing column rf <- rowwise(tibble(x = as.list(1:3))) out <- mutate(rf, y = x) expect_equal(out$y, 1:3) # New column rf <- rowwise(tibble(x = 1:3)) out <- mutate(rf, y = list(1), z = y) expect_identical(out$z, c(1, 1, 1)) # Column of data 1-row data frames rf <- rowwise(tibble(x = list(tibble(a = 1), tibble(a = 2)))) out <- mutate(rf, y = x) expect_identical(out$y, tibble(a = c(1, 2))) # Preserves known list-of type rf <- rowwise(tibble(x = list_of(.ptype = character()))) out <- mutate(rf, y = x) expect_identical(out$y, character()) # Errors if it's not a length-1 list df <- rowwise(tibble(x = list(1, 2:3))) expect_snapshot(mutate(df, y = x), error = TRUE) }) test_that("grouped mutate does not drop grouping attributes (#1020)", { d <- data.frame(subject = c("Jack", "Jill"), id = c(2, 1)) |> group_by(subject) a1 <- names(attributes(d)) a2 <- names(attributes(d |> mutate(foo = 1))) expect_equal(setdiff(a1, a2), character(0)) }) test_that("mutate() hands list columns with rowwise magic to follow up expressions (#4845)", { test <- rowwise(tibble(x = 1:2)) expect_identical( test |> mutate(a = list(1)) |> mutate(b = list(a + 1)), test |> mutate(a = list(1), b = list(a + 1)) ) }) test_that("mutate keeps zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal(group_size(mutate(df, z = 2)), c(2, 2, 0)) }) # other ------------------------------------------------------------------- test_that("no utf8 invasion (#722)", { skip_if_not(l10n_info()$"UTF-8") skip_if_not_installed("lobstr") source("utf-8.txt", local = TRUE, encoding = "UTF-8") }) test_that("mutate() to UTF-8 column names", { df <- tibble(a = 1) |> mutate("\u5e78" := a) expect_equal(colnames(df), c("a", "\u5e78")) }) test_that("Non-ascii column names in version 0.3 are not duplicated (#636)", { local_non_utf8_encoding() df <- tibble(a = "1", b = "2") names(df) <- c("a", enc2native("\u4e2d")) res <- df |> mutate_all(as.numeric) expect_equal(names(res), as_utf8_character(names(df))) }) test_that("mutate coerces results from one group with all NA values (#1463) ", { df <- tibble(x = c(1, 2), y = c(1, NA)) res <- df |> group_by(x) |> mutate(z = ifelse(y > 1, 1, 2)) expect_true(is.na(res$z[2])) expect_type(res$z, "double") }) test_that("grouped subsets are not lazy (#3360)", { make_call <- function(x) { quo(!!x) } res <- tibble(name = 1:2, value = letters[1:2]) |> rowwise() |> mutate(call = list(make_call(value))) |> pull() expect_identical(res, list(make_call("a"), make_call("b"))) res <- tibble(name = 1:2, value = letters[1:2]) |> group_by(name) |> mutate(call = list(make_call(value))) |> pull() expect_identical(res, list(make_call("a"), make_call("b"))) }) test_that("mutate() evaluates expression for empty groups", { df <- tibble(f = factor(c("a", "b"), levels = c("a", "b", "c"))) gf <- group_by(df, f, .drop = FALSE) count <- 0 mutate(gf, x = { count <<- count + 1 }) expect_equal(count, 3L) }) test_that("DataMask$add() forces chunks (#4677)", { df <- tibble(bf10 = 0.244) |> mutate( bf01 = 1 / bf10, log_e_bf10 = log(bf10), log_e_bf01 = log(bf01) ) expect_equal(df$log_e_bf01, log(1 / 0.244)) }) test_that("DataMask uses fresh copies of group id / size variables (#6762)", { df <- tibble(x = 1:2) fn <- function() { df <- tibble(a = 1) # Otherwise, this nested `mutate()` can modify the same # id/size variable as the outer one, which causes havoc mutate(df, b = a + 1) } out <- mutate(df, y = { fn() x }) expect_identical(out$x, 1:2) expect_identical(out$y, 1:2) }) test_that("mutate() correctly auto-names expressions (#6741)", { df <- tibble(a = 1L) expect_identical(mutate(df, -a), tibble(a = 1L, "-a" = -1L)) foo <- "foobar" expect_identical(mutate(df, foo), tibble(a = 1L, foo = "foobar")) a <- 2L expect_identical(mutate(df, a), tibble(a = 1L)) df <- tibble(a = 1L, "a + 1" = 5L) a <- 2L expect_identical(mutate(df, a + 1), tibble(a = 1L, "a + 1" = 2)) }) # .by ------------------------------------------------------------------------- test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- mutate(df, x = mean(x), .by = g) expect_identical(out$g, df$g) expect_identical(out$x, c(3, 3, 2, 3, 2)) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains bare data.frame class", { df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- mutate(df, x = mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains data frame attributes (#6100)", { # With data.frames or tibbles df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) tbl <- as_tibble(df) attr(df, "foo") <- "bar" attr(tbl, "foo") <- "bar" out <- mutate(df, x = mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") out <- mutate(tbl, x = mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") }) test_that("can `NULL` out the `.by` column", { df <- tibble(x = 1:3) expect_identical( mutate(df, x = NULL, .by = x), new_tibble(list(), nrow = 3) ) }) test_that("catches `.by` with grouped-df", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { mutate(gdf, .by = x) }) }) test_that("catches `.by` with rowwise-df", { df <- tibble(x = 1) rdf <- rowwise(df) expect_snapshot(error = TRUE, { mutate(rdf, .by = x) }) }) # .before, .after, .keep ------------------------------------------------------ test_that(".keep = 'unused' keeps variables explicitly mentioned", { df <- tibble(x = 1, y = 2) out <- mutate(df, x1 = x + 1, y = y, .keep = "unused") expect_named(out, c("y", "x1")) }) test_that(".keep = 'used' not affected by across() or pick()", { df <- tibble(x = 1, y = 2, z = 3, a = "a", b = "b", c = "c") # This must evaluate every column in order to figure out if should # be included in the set or not, but that shouldn't be counted for # the purposes of "used" variables out <- mutate(df, across(where(is.numeric), identity), .keep = "unused") expect_named(out, names(df)) out <- mutate(df, pick(where(is.numeric)), .keep = "unused") expect_named(out, names(df)) }) test_that(".keep = 'used' keeps variables used in expressions", { df <- tibble(a = 1, b = 2, c = 3, x = 1, y = 2) out <- mutate(df, xy = x + y, .keep = "used") expect_named(out, c("x", "y", "xy")) }) test_that(".keep = 'none' only keeps grouping variables", { df <- tibble(x = 1, y = 2) gf <- group_by(df, x) expect_named(mutate(df, z = 1, .keep = "none"), "z") expect_named(mutate(gf, z = 1, .keep = "none"), c("x", "z")) }) test_that(".keep = 'none' retains original ordering (#5967)", { df <- tibble(x = 1, y = 2) expect_named(df |> mutate(y = 1, x = 2, .keep = "none"), c("x", "y")) # even when grouped gf <- group_by(df, x) expect_named(gf |> mutate(y = 1, x = 2, .keep = "none"), c("x", "y")) }) test_that("can use .before and .after to control column position", { df <- tibble(x = 1, y = 2) expect_named(mutate(df, z = 1), c("x", "y", "z")) expect_named(mutate(df, z = 1, .before = 1), c("z", "x", "y")) expect_named(mutate(df, z = 1, .after = 1), c("x", "z", "y")) # but doesn't affect order of existing columns df <- tibble(x = 1, y = 2) expect_named(mutate(df, x = 1, .after = y), c("x", "y")) }) test_that("attributes of bare data frames are retained when `.before` and `.after` are used (#6341)", { # We require `[` methods to be in charge of keeping extra attributes for all # data frame subclasses (except for data.tables) df <- vctrs::data_frame(x = 1, y = 2) attr(df, "foo") <- "bar" out <- mutate(df, z = 3, .before = x) expect_identical(attr(out, "foo"), "bar") }) test_that(".keep and .before/.after interact correctly", { df <- tibble(x = 1, y = 1, z = 1, a = 1, b = 2, c = 3) |> group_by(a, b) expect_named(mutate(df, d = 1, x = 2, .keep = "none"), c("x", "a", "b", "d")) expect_named( mutate(df, d = 1, x = 2, .keep = "none", .before = "a"), c("x", "d", "a", "b") ) expect_named( mutate(df, d = 1, x = 2, .keep = "none", .after = "a"), c("x", "a", "d", "b") ) }) test_that("dropping column with `NULL` then readding it retains original location", { df <- tibble(x = 1, y = 2, z = 3, a = 4) df <- group_by(df, z) expect_named( mutate(df, y = NULL, y = 3, .keep = "all"), c("x", "y", "z", "a") ) expect_named( mutate(df, b = a, y = NULL, y = 3, .keep = "used"), c("y", "z", "a", "b") ) expect_named( mutate(df, b = a, y = NULL, y = 3, .keep = "unused"), c("x", "y", "z", "b") ) # It isn't treated as a "new" column expect_named( mutate(df, y = NULL, y = 3, .keep = "all", .before = x), c("x", "y", "z", "a") ) }) test_that("setting a new column to `NULL` works with `.before` and `.after` (#6563)", { df <- tibble(x = 1, y = 2, z = 3, a = 4) expect_named(mutate(df, b = NULL, .before = 1), names(df)) expect_named(mutate(df, b = 1, b = NULL, .before = 1), names(df)) expect_named( mutate(df, b = NULL, b = 1, .before = 1), c("b", "x", "y", "z", "a") ) expect_named( mutate(df, b = NULL, c = 1, .after = 2), c("x", "y", "c", "z", "a") ) }) test_that(".keep= always retains grouping variables (#5582)", { df <- tibble(x = 1, y = 2, z = 3) |> group_by(z) expect_equal( df |> mutate(a = x + 1, .keep = "none"), tibble(z = 3, a = 2) |> group_by(z) ) expect_equal( df |> mutate(a = x + 1, .keep = "all"), tibble(x = 1, y = 2, z = 3, a = 2) |> group_by(z) ) expect_equal( df |> mutate(a = x + 1, .keep = "used"), tibble(x = 1, z = 3, a = 2) |> group_by(z) ) expect_equal( df |> mutate(a = x + 1, .keep = "unused"), tibble(y = 2, z = 3, a = 2) |> group_by(z) ) }) test_that("mutate() preserves the call stack on error (#5308)", { foobar <- function() stop("foo") stack <- NULL expect_error( withCallingHandlers( error = function(...) stack <<- sys.calls(), mutate(mtcars, foobar()) ) ) expect_true(some(stack, is_call, "foobar")) }) test_that("dplyr data mask can become obsolete", { lazy <- function(x) { list(enquo(x)) } df <- tibble( x = 1:2 ) res <- df |> rowwise() |> mutate(y = lazy(x), .keep = "unused") expect_equal(names(res), c("x", "y")) expect_error(eval_tidy(res$y[[1]])) }) test_that("mutate() deals with 0 groups (#5534)", { df <- data.frame(x = numeric()) |> group_by(x) expect_equal( mutate(df, y = x + 1), data.frame(x = numeric(), y = numeric()) |> group_by(x) ) expect_snapshot({ mutate(df, y = max(x)) }) }) test_that("functions are not skipped in data pronoun (#5608)", { f <- function(i) i + 1 df <- tibble(a = list(f), b = 1) two <- df |> rowwise() |> mutate(res = .data$a(.data$b)) |> pull(res) expect_equal(two, 2) }) test_that("mutate() casts data frame results to common type (#5646)", { df <- data.frame(x = 1:2, g = 1:2) |> group_by(g) res <- df |> mutate(if (g == 1) data.frame(y = 1) else data.frame(y = 1, z = 2)) expect_equal(res$z, c(NA, 2)) }) test_that("mutate() supports empty list columns in rowwise data frames (#5804", { res <- tibble(a = list()) |> rowwise() |> mutate(n = lengths(a)) expect_equal(res$n, integer()) }) test_that("mutate() fails on named empty arguments (#5925)", { expect_error( mutate(tibble(), bogus = ) ) }) # Error messages ---------------------------------------------------------- test_that("mutate() give meaningful errors", { expect_snapshot({ tbl <- tibble(x = 1:2, y = 1:2) # setting column to NULL makes it unavailable (expect_error(tbl |> mutate(y = NULL, a = sum(y)))) (expect_error( tbl |> group_by(x) |> mutate(y = NULL, a = sum(y)) )) # incompatible column type (expect_error(tibble(x = 1) |> mutate(y = mean))) # Unsupported type" df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) (expect_error(df |> mutate(out = env(a = 1)))) (expect_error( df |> group_by(g) |> mutate(out = env(a = 1)) )) (expect_error( df |> rowwise() |> mutate(out = rnorm) )) # incompatible types across groups (expect_error( data.frame(x = rep(1:5, each = 3)) |> group_by(x) |> mutate(val = ifelse(x < 3, "foo", 2)) )) # mixed nulls (expect_error( tibble(a = 1:3, b = 4:6) |> group_by(a) |> mutate(if (a == 1) NULL else "foo") )) (expect_error( tibble(a = 1:3, b = 4:6) |> group_by(a) |> mutate(if (a == 2) NULL else "foo") )) # incompatible size (expect_error( data.frame(x = c(2, 2, 3, 3)) |> mutate(int = 1:5) )) (expect_error( data.frame(x = c(2, 2, 3, 3)) |> group_by(x) |> mutate(int = 1:5) )) (expect_error( data.frame(x = c(2, 3, 3)) |> group_by(x) |> mutate(int = 1:5) )) (expect_error( data.frame(x = c(2, 2, 3, 3)) |> rowwise() |> mutate(int = 1:5) )) (expect_error( tibble(y = list(1:3, "a")) |> rowwise() |> mutate(y2 = y) )) (expect_error( data.frame(x = 1:10) |> mutate(y = 11:20, y = 1:2) )) # .data pronoun (expect_error( tibble(a = 1) |> mutate(c = .data$b) )) (expect_error( tibble(a = 1:3) |> group_by(a) |> mutate(c = .data$b) )) # obsolete data mask lazy <- function(x) list(enquo(x)) res <- tbl |> rowwise() |> mutate(z = lazy(x), .keep = "unused") (expect_error( eval_tidy(res$z[[1]]) )) # Error that contains { (expect_error( tibble() |> mutate(stop("{")) )) }) }) test_that("mutate() errors refer to expressions if not named", { expect_snapshot({ (expect_error(mutate(mtcars, 1:3))) (expect_error(mutate(group_by(mtcars, cyl), 1:3))) }) }) test_that("`mutate()` doesn't allow data frames with missing or empty names (#6758)", { df1 <- new_data_frame(set_names(list(1), "")) df2 <- new_data_frame(set_names(list(1), NA_character_)) expect_snapshot(error = TRUE, { mutate(df1) }) expect_snapshot(error = TRUE, { mutate(df2) }) }) dplyr/tests/testthat/test-relocate.R0000644000176200001440000001270315106134104017302 0ustar liggesusers# ------------------------------------------------------------------------------ # relocate() test_that(".before and .after relocate individual cols", { df <- tibble(x = 1, y = 2) expect_named(relocate(df, x, .after = y), c("y", "x")) expect_named(relocate(df, y, .before = x), c("y", "x")) }) test_that("can move blocks of variables", { df <- tibble(x = 1, a = "a", y = 2, b = "a") expect_named(relocate(df, where(is.character)), c("a", "b", "x", "y")) expect_named( relocate(df, where(is.character), .after = where(is.numeric)), c("x", "y", "a", "b") ) }) test_that("don't lose non-contiguous variables", { df <- tibble(a = 1, b = 1, c = 1, d = 1, e = 1) expect_named(relocate(df, b, .after = c(a, c, e)), c("a", "c", "d", "e", "b")) expect_named(relocate(df, e, .before = c(b, d)), c("a", "e", "b", "c", "d")) }) test_that("no .before/.after moves to front", { df <- tibble(x = 1, y = 2) expect_named(relocate(df, y), c("y", "x")) }) test_that("can only supply one of .before and .after", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { relocate(df, .before = 1, .after = 1) }) }) test_that("before and after are defused with context", { local_fn <- identity expect_identical( names(relocate(mtcars, 3, .before = local_fn(5))), names(relocate(mtcars, 3, .before = 5)) ) expect_identical( names(relocate(mtcars, 3, .after = local_fn(5))), names(relocate(mtcars, 3, .after = 5)) ) }) test_that("relocate() respects order specified by ... (#5328)", { df <- tibble(a = 1, x = 1, b = 1, z = 1, y = 1) expect_equal( names(relocate(df, x, y, z, .before = x)), c("a", "x", "y", "z", "b") ) expect_equal( names(relocate(df, x, y, z, .after = last_col())), c("a", "b", "x", "y", "z") ) expect_equal( names(relocate(df, x, a, z)), c("x", "a", "z", "b", "y") ) }) test_that("relocate() can rename (#5569)", { df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") expect_equal( relocate(df, ffff = f), tibble(ffff = "a", a = 1, b = 1, c = 1, d = "a", e = "a") ) expect_equal( relocate(df, ffff = f, .before = c), tibble(a = 1, b = 1, ffff = "a", c = 1, d = "a", e = "a") ) expect_equal( relocate(df, ffff = f, .after = c), tibble(a = 1, b = 1, c = 1, ffff = "a", d = "a", e = "a") ) }) test_that("`relocate()` retains the last duplicate when renaming while moving (#6209)", { # To enforce the invariant that `ncol(.data) == ncol(relocate(.data, ...))`. # Also matches `rename()` behavior. df <- tibble(x = 1) expect_named(relocate(df, a = x, b = x), "b") expect_identical( relocate(df, a = x, b = x), rename(df, a = x, b = x) ) df <- tibble(x = 1, y = 2) expect_named(relocate(df, a = x, b = y, c = x), c("b", "c")) expect_identical( relocate(df, a = x, b = y, c = x), select(rename(df, a = x, b = y, c = x), b, c) ) }) test_that("attributes of bare data frames are retained (#6341)", { # We require `[` methods to be in charge of keeping extra attributes for all # data frame subclasses (except for data.tables) df <- vctrs::data_frame(x = 1, y = 2) attr(df, "foo") <- "bar" out <- relocate(df, y, .before = x) expect_identical(attr(out, "foo"), "bar") }) # ------------------------------------------------------------------------------ # eval_relocate() test_that("works with zero column data frames (#6167)", { data <- tibble() expr <- expr(any_of("b")) expect_identical( eval_relocate(expr, data), set_names(integer()) ) }) test_that("works with `before` and `after` `everything()`", { data <- tibble(w = 1, x = 2, y = 3, z = 4) expr <- expr(c(y, z)) expr_everything <- expr(everything()) expect_identical( eval_relocate(expr, data, before = expr_everything), c(y = 3L, z = 4L, w = 1L, x = 2L) ) expect_identical( eval_relocate(expr, data, after = expr_everything), c(w = 1L, x = 2L, y = 3L, z = 4L) ) }) test_that("moves columns to the front when neither `before` nor `after` are specified", { data <- tibble(x = 1, y = 2, z = 3) expr <- expr(c(z, y)) expect_identical( eval_relocate(expr, data), c(z = 3L, y = 2L, x = 1L) ) }) test_that("Empty `before` selection moves columns to front", { data <- tibble(x = 1, y = 2, z = 3) expr <- expr(y) before <- expr(where(is.character)) expect_identical( eval_relocate(expr, data, before = before), c(y = 2L, x = 1L, z = 3L) ) }) test_that("Empty `after` selection moves columns to end", { data <- tibble(x = 1, y = 2, z = 3) expr <- expr(y) after <- expr(where(is.character)) expect_identical( eval_relocate(expr, data, after = after), c(x = 1L, z = 3L, y = 2L) ) }) test_that("Empty `before` and `after` selections work with 0-col data frames", { data <- tibble() expr <- expr(any_of("a")) expr_is_character <- expr(where(is.character)) expect_identical( eval_relocate(expr, data, before = expr_is_character), set_names(integer()) ) expect_identical( eval_relocate(expr, data, after = expr_is_character), set_names(integer()) ) }) test_that("retains the last duplicate when renaming while moving (#6209)", { # To enforce the invariant that relocating can't change the number of columns data <- tibble(x = 1) expr <- expr(c(a = x, b = x)) expect_identical( eval_relocate(expr, data), c(b = 1L) ) data <- tibble(x = 1, y = 2) expr <- expr(c(a = x, b = y, c = x)) expect_identical( eval_relocate(expr, data), c(b = 2L, c = 1L) ) }) dplyr/tests/testthat/test-pick.R0000644000176200001440000004140415137161765016453 0ustar liggesusers# ------------------------------------------------------------------------------ # pick() + mutate() test_that("can pick columns from the data", { df <- tibble(x1 = 1, y = 2, x2 = 3, z = 4) expect <- df[c("z", "x1", "x2")] out <- mutate(df, sel = pick(z, starts_with("x"))) expect_identical(out$sel, expect) out <- mutate(df, sel = pick_wrapper(z, starts_with("x"))) expect_identical(out$sel, expect) }) test_that("can use namespaced call to `pick()`", { df <- tibble(x = 1, y = "y") expect_identical( mutate(df, z = dplyr::pick(where(is.character))), mutate(df, z = pick(where(is.character))) ) }) test_that("returns separate data frames for each group", { fn <- function(x) { x[["x"]] + mean(x[["z"]]) } df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5, z = 11:15) gdf <- group_by(df, g) expect <- mutate(gdf, res = x + mean(z)) out <- mutate(gdf, res = fn(pick(x, z))) expect_identical(out, expect) out <- mutate(gdf, res = fn(pick_wrapper(x, z))) expect_identical(out, expect) }) test_that("returns a tibble", { df <- data.frame(x = 1) out <- mutate(df, y = pick(x)) expect_s3_class(out$y, "tbl_df") out <- mutate(df, y = pick_wrapper(x)) expect_s3_class(out$y, "tbl_df") }) test_that("with `rowwise()` data, leaves list-cols unwrapped (#5951, #6264)", { # Because this most closely mimics macro expansion of: # pick(x) -> tibble(x = x) df <- tibble(x = list(1, 2:3, 4:5), y = 1:3) rdf <- rowwise(df) expect_snapshot(error = TRUE, { mutate(rdf, z = pick(x, y)) }) expect_snapshot(error = TRUE, { mutate(rdf, z = pick_wrapper(x, y)) }) }) test_that("selectors won't select grouping columns", { df <- tibble(g = 1, x = 2) gdf <- group_by(df, g) out <- mutate(gdf, y = pick(everything())) expect_named(out$y, "x") out <- mutate(gdf, y = pick_wrapper(everything())) expect_named(out$y, "x") }) test_that("selectors won't select rowwise 'grouping' columns", { df <- tibble(g = 1, x = 2) rdf <- rowwise(df, g) out <- mutate(rdf, y = pick(everything())) expect_named(out$y, "x") out <- mutate(rdf, y = pick_wrapper(everything())) expect_named(out$y, "x") }) test_that("can't explicitly select grouping columns (#5460)", { # Related to removing the mask layer from the quosure environments df <- tibble(g = 1, x = 2) gdf <- group_by(df, g) expect_snapshot(error = TRUE, { mutate(gdf, y = pick(g)) }) expect_snapshot(error = TRUE, { mutate(gdf, y = pick_wrapper(g)) }) }) test_that("`all_of()` is evaluated in the correct environment (#5460)", { # Related to removing the mask layer from the quosure environments df <- tibble(g = 1, x = 2, y = 3) # We expect an "object not found" error, but we don't control that # so we aren't going to snapshot it, especially since the call reported # by those kinds of errors changed in R 4.3. expect_error(mutate(df, z = pick(all_of(y)))) expect_error(mutate(df, z = pick_wrapper(all_of(y)))) y <- "x" expect <- df["x"] out <- mutate(df, z = pick(all_of(y))) expect_identical(out$z, expect) out <- mutate(df, z = pick_wrapper(all_of(y))) expect_identical(out$z, expect) }) test_that("empty selections create 1 row tibbles (#6685)", { # This makes the result recyclable against other inputs, and ensures that # a `pick(NULL)` call can be used in a `group_by()` wrapper to # "group by nothing". It is a slight departure from viewing `pick()` as a # pure macro expansion into `tibble()`. Instead it is more like an expansion # into: # size <- vctrs::vec_size_common(..., .absent = 1L) # out <- vctrs::vec_recycle_common(..., .size = size) # tibble::new_tibble(out, nrow = size) df <- tibble(g = c(1, 1, 2), x = c(2, 3, 4)) gdf <- group_by(df, g) out <- mutate(gdf, y = pick(starts_with("foo"))) expect_identical(out$y, new_tibble(list(), nrow = 3L)) out <- mutate(gdf, y = pick_wrapper(starts_with("foo"))) expect_identical(out$y, new_tibble(list(), nrow = 3L)) }) test_that("must supply at least one selector to `pick()`", { df <- tibble(x = c(2, 3, 4)) expect_snapshot(error = TRUE, { mutate(df, y = pick()) }) expect_snapshot(error = TRUE, { mutate(df, y = pick_wrapper()) }) }) test_that("the tidyselection and column extraction are evaluated on the current data", { # Because `pick()` is viewed as macro expansion, and the expansion inherits # typical dplyr semantics df <- tibble(g = c(1, 2, 2), x = 1:3) gdf <- group_by(df, g) expect_snapshot(error = TRUE, { # Expands to `tibble(x = x)` mutate(gdf, x = NULL, y = pick(x)) }) expect_snapshot(error = TRUE, { # Does actual `eval_select()` call per group mutate(gdf, x = NULL, y = pick_wrapper(x)) }) # Can select newly created columns out <- mutate(gdf, y = x + 1L, z = pick(x, y)) expect_identical(out[c("x", "y")], out$z) out <- mutate(gdf, y = x + 1L, z = pick_wrapper(x, y)) expect_identical(out[c("x", "y")], out$z) df <- tibble(x = 1) expect <- tibble(x = tibble(x = tibble(x = 1)), y = tibble(x = x)) out <- mutate(df, x = pick(x), x = pick(x), y = pick(x)) expect_identical(out, expect) out <- mutate( df, x = pick_wrapper(x), x = pick_wrapper(x), y = pick_wrapper(x) ) expect_identical(out, expect) }) test_that("can call different `pick()` expressions in different groups", { df <- tibble(g = c(1, 2), x = 1:2, y = 3:4) gdf <- group_by(df, g) expect <- tibble(x = c(1L, NA), y = c(NA, 4L)) out <- mutate(gdf, z = if (g == 1) pick(x) else pick(y)) expect_identical(out$z, expect) out <- mutate(gdf, z = if (g == 1) pick_wrapper(x) else pick_wrapper(y)) expect_identical(out$z, expect) }) test_that("can call `pick()` from a user defined function", { df <- tibble(a = 1, b = 2, c = 3) gdf <- group_by(df, a) # Hardcoded variables in expression my_pick <- function() pick(a, c) out <- mutate(df, d = my_pick()) expect_identical(out$d, df[c("a", "c")]) # Hardcoded `all_of()` using a local variable my_pick <- function() { x <- c("a", "c") pick(all_of(x)) } out <- mutate(df, d = my_pick()) expect_identical(out$d, df[c("a", "c")]) expect_snapshot(error = TRUE, { mutate(gdf, d = my_pick()) }) # Dynamic `all_of()` using user supplied variable my_pick <- function(x) { pick(all_of(x)) } y <- c("a", "c") out <- mutate(df, d = my_pick(y)) expect_identical(out$d, df[c("a", "c")]) expect_snapshot(error = TRUE, { mutate(gdf, d = my_pick(y)) }) }) test_that("wrapped `all_of()` and `where()` selections work", { df <- tibble(a = 1, b = "x", c = 3) my_pick <- function(x) { pick(all_of(x)) } out <- mutate(df, x = my_pick("a"), y = my_pick("b")) expect_identical(out$x, df["a"]) expect_identical(out$y, df["b"]) my_pick2 <- function(x) { pick(all_of(x)) } out <- mutate(df, x = my_pick("a"), y = my_pick2("b")) expect_identical(out$x, df["a"]) expect_identical(out$y, df["b"]) my_where <- function(fn) { pick(where(fn)) } out <- mutate(df, x = my_where(is.numeric), y = my_where(is.character)) expect_identical(out$x, df[c("a", "c")]) expect_identical(out$y, df["b"]) }) test_that("`pick()` expansion evaluates on the full data", { # To ensure tidyselection is consistent across groups df <- tibble(g = c(1, 1, 2, 2), x = c(0, 0, 1, 1), y = c(1, 1, 0, 0)) gdf <- group_by(df, g) # Doesn't select any columns. Returns a 1 row tibble per group (#6685). out <- mutate(gdf, y = pick(where(~ all(.x == 0)))) expect_identical(out$y, new_tibble(list(), nrow = 4L)) # `pick()` evaluation fallback evaluates on the group specific data, # forcing potentially different results per group. out <- mutate(gdf, z = pick_wrapper(where(~ all(.x == 0)))) expect_named(out$z, c("x", "y")) expect_identical(out$z$x, c(0, 0, NA, NA)) expect_identical(out$z$y, c(NA, NA, 0, 0)) }) test_that("`pick()` expansion/tidyselection happens outside the data mask", { # `pick()` expressions are evaluated in the caller environment of the verb. # This is intentional to avoid theoretical per-group differences in what # `pick()` should return. df <- tibble(x = 1, y = 2, z = 3) a <- "z" expect <- df["z"] out <- mutate(df, foo = { a <- "x" pick(all_of(a)) }) expect_identical(out$foo, expect) # `pick()`'s evaluation fallback also performs the tidy-selection # in the calling environment of the verb out <- mutate(df, foo = { a <- "x" pick_wrapper(all_of(a)) }) expect_identical(out$foo, expect) }) test_that("errors correctly outside mutate context", { expect_snapshot(error = TRUE, { pick() }) expect_snapshot(error = TRUE, { pick(a, b) }) }) test_that("can assign `pick()` to new function", { # Will run the evaluation version of `pick()` pick2 <- pick df <- tibble(x = 1, y = 2) out <- mutate(df, z = pick2(y)) expect_identical(out$z, df["y"]) }) test_that("selection on rowwise data frames uses full list-cols, but actual evaluation unwraps them", { df <- tibble(x = list(1:2, 2:4, 5)) df <- rowwise(df) # i.e. can select based on list-ness of the column. # Expands to `y = list(tibble(x = x))` where `x` is `1:2`, `2:4`, `5` like it # would be if you called that directly. out <- mutate(df, y = list(pick(where(is.list)))) expect_identical(out$y, map(df$x, ~ tibble(x = .x))) }) test_that("when expansion occurs, error labels use the pre-expansion quosure", { df <- tibble(g = c(1, 2, 2), x = c(1, 2, 3)) # Fails in common type casting of the group chunks, # which references the auto-named column name expect_snapshot(error = TRUE, { mutate(df, if (cur_group_id() == 1L) pick(x) else "x", .by = g) }) }) test_that("doesn't allow renaming", { expect_snapshot(error = TRUE, { mutate(data.frame(x = 1), pick(y = x)) }) expect_snapshot(error = TRUE, { mutate(data.frame(x = 1), pick_wrapper(y = x)) }) }) # ------------------------------------------------------------------------------ # pick() + summarise()/reframe() test_that("can `pick()` inside `reframe()`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(1, 1, 1, 2, 2), y = c(1, 1, 1, 2, 1)) gdf <- group_by(df, g) expect_key <- df[c(1, 4, 5), c("x", "y")] expect_count <- c(3L, 1L, 1L) out <- reframe(df, vec_count(pick(x, y), sort = "count")) expect_identical(out$key, expect_key) expect_identical(out$count, expect_count) out <- reframe(df, vec_count(pick_wrapper(x, y), sort = "count")) expect_identical(out$key, expect_key) expect_identical(out$count, expect_count) expect_key <- df[c(1, 4, 3, 5), c("x", "y")] expect_count <- c(2L, 1L, 1L, 1L) out <- reframe(gdf, vec_count(pick(x, y), sort = "count")) expect_identical(out$key, expect_key) expect_identical(out$count, expect_count) out <- reframe(gdf, vec_count(pick_wrapper(x, y), sort = "count")) expect_identical(out$key, expect_key) expect_identical(out$count, expect_count) }) test_that("empty selections recycle to the size of any other column", { df <- tibble(x = 1:5) # Returns size 1 tibbles that stay the same size (#6685) out <- summarise(df, sum = sum(x), y = pick(starts_with("foo"))) expect_identical(out$sum, 15L) expect_identical(out$y, new_tibble(list(), nrow = 1L)) out <- summarise(df, sum = sum(x), y = pick_wrapper(starts_with("foo"))) expect_identical(out$sum, 15L) expect_identical(out$y, new_tibble(list(), nrow = 1L)) # Returns size 1 tibbles that recycle to size 0 because of `empty` (#6685) out <- reframe(df, empty = integer(), y = pick(starts_with("foo"))) expect_identical(out$empty, integer()) expect_identical(out$y, new_tibble(list(), nrow = 0L)) out <- reframe(df, empty = integer(), y = pick_wrapper(starts_with("foo"))) expect_identical(out$empty, integer()) expect_identical(out$y, new_tibble(list(), nrow = 0L)) }) test_that("uses 'current' columns of `summarize()` and `reframe()`", { df <- tibble(x = 1:5, y = 6:10) # Uses size of current version of `x` expect_x <- 15L expect_z <- tibble(x = 15L) out <- summarise(df, x = sum(x), z = pick(x)) expect_identical(out$x, expect_x) expect_identical(out$z, expect_z) out <- summarise(df, x = sum(x), z = pick_wrapper(x)) expect_identical(out$x, expect_x) expect_identical(out$z, expect_z) # Adding in `y` forces recycling expect_x <- vec_rep(15L, 5) expect_z <- tibble(x = 15L, y = 6:10) out <- reframe(df, x = sum(x), z = pick(x, y)) expect_identical(out$x, expect_x) expect_identical(out$z, expect_z) out <- reframe(df, x = sum(x), z = pick_wrapper(x, y)) expect_identical(out$x, expect_x) expect_identical(out$z, expect_z) }) test_that("can select completely new columns in `summarise()`", { df <- tibble(x = 1:5) out <- mutate(df, y = x + 1, z = pick(y)) expect_identical(out["y"], out$z) out <- mutate(df, y = x + 1, z = pick_wrapper(y)) expect_identical(out["y"], out$z) }) # ------------------------------------------------------------------------------ # pick() + arrange() test_that("can `arrange()` with `pick()` selection", { df <- tibble(x = c(2, 2, 1), y = c(3, 1, 3)) expect <- df[c(3, 2, 1), ] expect_identical(arrange(df, pick(x, y)), expect) expect_identical(arrange(df, pick_wrapper(x, y)), expect) expect_identical(arrange(df, pick(x), y), expect) expect_identical(arrange(df, pick_wrapper(x), y), expect) }) test_that("`pick()` errors in `arrange()` are useful", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { arrange(df, pick(y)) }) expect_snapshot(error = TRUE, { arrange(df, foo(pick(x))) }) }) # ------------------------------------------------------------------------------ # pick() + filter() / filter_out() test_that("can `pick()` inside `filter()` / `filter_out()`", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) expect <- df[c(1, 4), ] out <- filter(df, vec_detect_complete(pick(x, y))) expect_identical(out, expect) out <- filter(df, vec_detect_complete(pick_wrapper(x, y))) expect_identical(out, expect) expect <- df[c(2, 3), ] out <- filter_out(df, vec_detect_complete(pick(x, y))) expect_identical(out, expect) out <- filter_out(df, vec_detect_complete(pick_wrapper(x, y))) expect_identical(out, expect) }) test_that("`filter()` / `filter_out()` with `pick()` that uses invalid tidy-selection errors", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) expect_snapshot(error = TRUE, { filter(df, pick(x, a)) }) expect_snapshot(error = TRUE, { filter(df, pick_wrapper(x, a)) }) expect_snapshot(error = TRUE, { filter_out(df, pick(x, a)) }) expect_snapshot(error = TRUE, { filter_out(df, pick_wrapper(x, a)) }) }) test_that("`filter()` / `filter_out()` that doesn't use `pick()` result correctly errors", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) expect_snapshot(error = TRUE, { filter(df, pick(x, y)$x) }) expect_snapshot(error = TRUE, { filter(df, pick_wrapper(x, y)$x) }) expect_snapshot(error = TRUE, { filter_out(df, pick(x, y)$x) }) expect_snapshot(error = TRUE, { filter_out(df, pick_wrapper(x, y)$x) }) }) # ------------------------------------------------------------------------------ # pick() + group_by() test_that("`pick()` can be used inside `group_by()` wrappers", { df <- tibble(a = 1:3, b = 2:4, c = 3:5) tidyselect_group_by <- function(data, groups) { group_by(data, pick({{ groups }})) } tidyselect_group_by_wrapper <- function(data, groups) { group_by(data, pick_wrapper({{ groups }})) } expect_identical( tidyselect_group_by(df, c(a, c)), group_by(df, a, c) ) expect_identical( tidyselect_group_by_wrapper(df, c(a, c)), group_by(df, a, c) ) # Empty selections group by nothing (#6685) expect_identical( tidyselect_group_by(df, NULL), df ) expect_identical( tidyselect_group_by_wrapper(df, NULL), df ) }) # ------------------------------------------------------------------------------ # expand_pick() test_that("`pick()` doesn't expand across anonymous function boundaries", { df <- tibble(x = 1, y = 2) by <- compute_by(by = NULL, data = df, error_call = current_env()) mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) # With inline `function() { }` calls (this also handles native R anonymous functions) quo <- dplyr_quosures(z = function() pick(y, x))$z expect_identical(expand_pick(quo, mask), quo) # With `~` anonymous functions quos <- dplyr_quosures(z = ~ pick(y, x))$z expect_identical(expand_pick(quo, mask), quo) }) test_that("`pick()` expands embedded quosures", { df <- tibble(x = 1, y = 2) by <- compute_by(by = NULL, data = df, error_call = current_env()) mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) wrapper <- function(x) { dplyr_quosures(z = dense_rank({{ x }})) } quo <- wrapper(pick(x, y))$z out <- expand_pick(quo, mask) expect_identical( quo_get_expr(quo_get_expr(out)[[2L]]), expr(asNamespace("dplyr")$dplyr_pick_tibble(x = x, y = y)) ) }) dplyr/tests/testthat/test-bind-rows.R0000644000176200001440000001640015106134104017406 0ustar liggesuserstest_that("bind_rows() handles simple inputs", { df1 <- tibble(x = 1:2, y = letters[1:2]) df2 <- tibble(x = 3:4, y = letters[3:4]) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = 1:4, y = letters[1:4])) }) test_that("bind_rows() reorders columns to match first df", { df1 <- tibble(x = 1, y = 2) df2 <- tibble(y = 1, x = 2) expect_named(bind_rows(df1, df2), c("x", "y")) }) test_that("bind_rows() returns union of columns", { df1 <- tibble(x = 1) df2 <- tibble(y = 2) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = c(1, NA), y = c(NA, 2))) }) test_that("bind_rows() handles zero column data frames (#2175)", { df1 <- tibble(.rows = 1) df2 <- tibble(x = 1) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = c(NA, 1))) }) test_that("bind_rows() handles zero row data frames (#597)", { df1 <- tibble(x = numeric()) df2 <- tibble(y = 1) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = NA_real_, y = 1)) }) test_that("bind_rows() ignores NULL (#2056)", { df <- tibble(a = 1) expect_equal(bind_rows(df, NULL), df) expect_equal(bind_rows(list(df, NULL)), df) }) test_that("bind_rows() creates a column of identifiers (#1337)", { df1 <- tibble(x = 1:2) df2 <- tibble(x = 3) # with out <- bind_rows(a = df1, b = df2, .id = "id") expect_equal(out, tibble(id = c("a", "a", "b"), x = 1:3)) out <- bind_rows(list(a = df1, b = df2), .id = "id") expect_equal(out, tibble(id = c("a", "a", "b"), x = 1:3)) # or without names out <- bind_rows(df1, df2, .id = "id") expect_equal(out, tibble(id = c("1", "1", "2"), x = 1:3)) }) test_that("bind_rows deduplicates row names", { df1 <- data.frame(x = 1:2, row.names = c("a", "b")) df2 <- data.frame(x = 3:4, row.names = c("a", "c")) out <- bind_rows(df1, df2) expect_equal(rownames(out), c("a...1", "b", "a...3", "c")) }) test_that("bind_rows respects the drop attribute of grouped df", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) gg <- bind_rows(df, df) expect_equal(group_size(gg), c(4L, 4L, 0L)) }) # bind_rows() magic --------------------------------------------------- test_that("bind_rows() handles lists of data frames #1389", { df <- tibble(x = 1) res <- bind_rows(list(df, df), list(df, df)) expect_equal(nrow(res), 4) }) test_that("bind_rows() ignores empty lists (#2826)", { df <- tibble(x = 1:10) expect_equal(bind_rows(list(df, list())), df) }) test_that("bind_rows() accepts lists of dataframe-like lists as first argument", { ll <- list(a = 1, b = 2) expect_equal(bind_rows(list(ll)), tibble(a = 1, b = 2)) expect_equal(bind_rows(list(ll, ll)), tibble(a = c(1, 1), b = c(2, 2))) }) test_that("bind_rows() can handle lists (#1104)", { ll <- list(list(x = 1, y = "a"), list(x = 2, y = "b")) out <- bind_rows(ll) expect_equal(out, tibble(x = c(1, 2), y = c("a", "b"))) out <- bind_rows(ll[[1]], ll[2]) expect_equal(out, tibble(x = c(1, 2), y = c("a", "b"))) }) test_that("bind_rows() handles 0-length named list (#1515)", { x <- set_names(list()) expect_equal(bind_rows(x), tibble()) }) test_that("bind_rows() handles tibbles + vectors", { out <- bind_rows( tibble(a = 1, b = 2), c(a = 3, b = 4) ) expect_equal(out, tibble(a = c(1, 3), b = c(2, 4))) out <- bind_rows( a = c(a = 1, b = 2), b = c(a = 3, b = 4), .id = "id" ) expect_equal(out, tibble(id = c("a", "b"), a = c(1, 3), b = c(2, 4))) }) test_that("bind_rows() only flattens S3 lists that inherit from list (#3924)", { df <- data.frame(x = 1, y = 2) lst1 <- structure(list(df, df, df), class = "special_lst") expect_snapshot(bind_rows(lst1), error = TRUE) lst2 <- structure(list(df, df, df), class = c("special_lst", "list")) expect_equal(bind_rows(lst2), bind_rows(df, df, df)) }) test_that("bind_rows() handles named list", { x <- list(x = 1, y = 2, z = 3) expect_equal(bind_rows(x), tibble(x = 1, y = 2, z = 3)) }) test_that("bind_rows() handles empty names in a list (#7100)", { x <- rep(list(data.frame(x = 1)), times = 5) names(x) <- paste0("id_", 1:5) names(x)[c(3, 5)] <- NA_character_ x <- bind_rows(x, .id = "id") # If names are missing, bind_rows will replace with index expect_identical(x$id, c("id_1", "id_2", "3", "id_4", "5")) }) test_that("bind_rows() validates lists (#5417)", { out <- bind_rows(list(x = 1), list(x = 1, y = 1:2)) expect_equal(out, tibble(x = c(1, 1, 1), y = c(NA, 1:2))) expect_snapshot(bind_rows(list(x = 1), list(x = 1:3, y = 1:2)), error = TRUE) }) test_that("bind_rows() handles missing, null, and empty elements (#5429)", { x <- list(a = NULL, b = NULL) y <- list(a = "B", b = 2) l <- list(x, y) expect_identical( bind_rows(l), tibble(a = "B", b = 2) ) x <- list(a = NULL, b = 1) y <- list(a = "B", b = 2) l <- list(x, y) expect_identical( bind_rows(l), tibble(b = c(1, 2), a = c(NA, "B")) ) x <- list(a = character(0), b = 1) y <- list(a = "B", b = 2) l <- list(x, y) expect_identical( bind_rows(l), tibble(a = "B", b = 2) ) }) test_that("bind_rows(.id= NULL) does not set names (#5089)", { out <- bind_rows(list(a = tibble(x = 1:2))) expect_equal(attr(out, "row.names"), 1:2) out <- bind_rows(x = c(a = 1)) expect_identical(attr(out, "row.names"), 1L) }) # Column coercion -------------------------------------------------------------- test_that("bind_rows() promotes integer to numeric", { df1 <- tibble(a = 1L, b = 1L) df2 <- tibble(a = 1, b = 1L) res <- bind_rows(df1, df2) expect_type(res$a, "double") expect_type(res$b, "integer") }) test_that("bind_rows() coerces factor when levels don't match", { df1 <- data.frame(a = factor("a")) df2 <- data.frame(a = factor("b")) res <- bind_rows(df1, df2) expect_equal(res$a, factor(c("a", "b"))) }) test_that("bind_rows() handles complex. #933", { df1 <- tibble(x = 1 + 1i) df2 <- tibble(x = 2 + 1i) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = c(1 + 1i, 2 + 1i))) }) test_that("bind_rows() handles POSIXct (#1125)", { df1 <- data.frame(date = as.POSIXct(NA)) df2 <- data.frame(date = as.POSIXct("2015-05-05")) res <- bind_rows(df1, df2) expect_equal(nrow(res), 2L) expect_true(is.na(res$date[1])) }) test_that("bind_rows() accepts data frame columns (#2015)", { df1 <- tibble(x = 1, y = tibble(a = 1, b = 1)) df2 <- tibble(x = 2, y = tibble(a = 2, b = 2)) out <- bind_rows(df1, df2) expect_equal(out, tibble(x = 1:2, y = tibble(a = 1:2, b = 1:2))) }) test_that("bind_rows() accepts difftime objects", { df1 <- data.frame(x = as.difftime(1, units = "hours")) df2 <- data.frame(x = as.difftime(1, units = "mins")) res <- bind_rows(df1, df2) expect_equal(res$x, as.difftime(c(3600, 60), units = "secs")) }) # Errors ------------------------------------------------------------ test_that("bind_rows() give informative errors", { expect_snapshot({ "invalid .id" df1 <- tibble(x = 1:3) df2 <- tibble(x = 4:6) (expect_error(bind_rows(df1, df2, .id = 5))) "invalid type" ll <- list(tibble(a = 1:5), env(a = 1)) (expect_error(bind_rows(ll))) df1 <- tibble(a = factor("a")) df2 <- tibble(a = 1L) (expect_error(bind_rows(df1, df2))) "unnamed vectors" (expect_error(bind_rows(1:2))) }) }) dplyr/tests/testthat/test-colwise-select.R0000644000176200001440000001337615106134104020435 0ustar liggesusersdf <- tibble(x = 0L, y = 0.5, z = 1) test_that("can select/rename all variables", { expect_identical(select_all(df), df) expect_identical(select_all(df, toupper), set_names(df, c("X", "Y", "Z"))) expect_identical(select_all(df, toupper), rename_all(df, toupper)) }) test_that("can select/rename with predicate", { expect_identical(select_if(df, is_integerish), select(df, x, z)) expect_identical( select_if(df, is_integerish, toupper), set_names(df[c("x", "z")], c("X", "Z")) ) expect_identical( rename_if(df, is_integerish, toupper), set_names(df, c("X", "y", "Z")) ) }) test_that("can take list, but only containing single function", { expect_identical( select_if(df, list(~ is_integerish(.)), list(~ toupper(.))), set_names(df[c("x", "z")], c("X", "Z")) ) expect_identical( rename_if(df, list(~ is_integerish(.)), list(~ toupper(.))), set_names(df, c("X", "y", "Z")) ) }) test_that("can select/rename with vars()", { expect_identical(select_at(df, vars(x:y)), df[-3]) expect_identical( select_at(df, vars(x:y), toupper), set_names(df[-3], c("X", "Y")) ) expect_identical( rename_at(df, vars(x:y), toupper), set_names(df, c("X", "Y", "z")) ) }) test_that("select variants can use grouping variables (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) expect_identical( select(tbl, gr1), select_at(tbl, vars(gr1)) ) expect_identical( select_all(tbl), tbl ) expect_identical( select_if(tbl, is.integer), tbl ) }) test_that("select_if keeps grouping cols", { by_species <- iris |> group_by(Species) expect_silent(df <- by_species |> select_if(is.numeric)) expect_equal(df, by_species[c(5, 1:4)]) }) test_that("select_if() handles non-syntactic colnames", { df <- tibble(`x 1` = 1:3) expect_identical(select_if(df, is_integer)[[1]], 1:3) }) test_that("select_if() handles quoted predicates", { expected <- select_if(mtcars, is_integerish) expect_identical(select_if(mtcars, "is_integerish"), expected) expect_identical(select_if(mtcars, ~ is_integerish(.x)), expected) }) test_that("rename_all() works with grouped data (#3363)", { df <- data.frame(a = 1, b = 2) out <- df |> group_by(a) |> rename_all(toupper) expect_identical(out, group_by(data.frame(A = 1, B = 2), A)) }) test_that("scoping (#3426)", { interface <- function(.tbl, .funs = list()) { impl(.tbl, .funs = .funs) } impl <- function(.tbl, ...) { select_all(.tbl, ...) } expect_identical( interface(mtcars, .funs = toupper), select_all(mtcars, .funs = list(toupper)) ) }) test_that("rename variants can rename a grouping variable (#3351)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) res <- rename(tbl, GR1 = gr1, GR2 = gr2, X = x) expect_identical( rename_at(tbl, vars(everything()), toupper), res ) expect_identical( rename_all(tbl, toupper), res ) expect_identical( rename_if(tbl, is.integer, toupper), res ) }) test_that("select_all does not change the order of columns (#3351)", { tbl <- group_by(tibble(x = 1:4, y = 4:1), y) expect_identical(select_all(tbl), tbl) tbl <- group_by(tibble(x = 1:4, y = 4:1), x) expect_identical(select_all(tbl), tbl) tbl <- group_by(tibble(x = 1:4, y = 4:1, z = 1:4), y) expect_identical(select_all(tbl), tbl) }) test_that("mutate_all does not change the order of columns (#3351)", { tbl <- group_by(tibble(x = 1:4, y = 1:4), y) expect_message( expect_identical(names(mutate_all(tbl, identity)), names(tbl)), "ignored" ) tbl <- group_by(tibble(x = 1:4, y = 1:4), x) expect_message( expect_identical(names(mutate_all(tbl, identity)), names(tbl)), "ignored" ) tbl <- group_by(tibble(x = 1:4, y = 1:4, z = 1:4), y) expect_message( expect_identical(names(mutate_all(tbl, identity)), names(tbl)), "ignored" ) }) test_that("select_if() and rename_if() handles logical (#4213)", { ids <- "Sepal.Length" expect_identical( select_if(iris, !names(iris) %in% ids), iris |> select(-Sepal.Length) ) expect_identical( rename_if(iris, !names(iris) %in% ids, toupper), rename_at(iris, setdiff(names(iris), "Sepal.Length"), toupper) ) }) test_that("rename_at() handles empty selection (#4324)", { expect_identical( mtcars |> rename_at(vars(contains("fake_col")), ~ paste0("NewCol.", .)), mtcars ) }) test_that("rename_all/at() call the function with simple character vector (#4459)", { fun <- function(x) case_when(x == 'mpg' ~ 'fuel_efficiency', .default = x) out <- rename_all(mtcars, fun) expect_equal(names(out)[1L], 'fuel_efficiency') out <- rename_at(mtcars, vars(everything()), fun) expect_equal(names(out)[1L], 'fuel_efficiency') }) test_that("select_if() discards the column when predicate gives NA (#4486)", { out <- tibble(mycol = c("", "", NA)) |> select_if(~ !all(. == "")) expect_identical( out, tibble::new_tibble(list(), nrow = 3L) ) }) # Errors ------------------------------------------------------------------ test_that("colwise select() / rename() give meaningful errors", { expect_snapshot({ df <- tibble(x = 0L, y = 0.5, z = 1) # colwise rename() (expect_error( df |> rename_all() )) (expect_error( df |> rename_if(is_integerish) )) (expect_error( df |> rename_at(vars(x:y)) )) (expect_error( df |> rename_all(list(tolower, toupper)) )) # colwise select() (expect_error( df |> select_all(list(tolower, toupper)) )) (expect_error( df |> select_if(function(.x) 1) )) (expect_error( df |> select_if(function(.x) c(TRUE, TRUE)) )) (expect_error( data.frame() |> select_all(.funs = 42) )) }) }) dplyr/tests/testthat/test-defunct.R0000644000176200001440000000034415106134104017132 0ustar liggesuserstest_that("generate informative errors", { expect_snapshot(error = TRUE, { combine() src_mysql() src_postgres() src_sqlite() src_local() src_df() tbl_df() as.tbl() add_rownames() }) }) dplyr/tests/testthat/test-na-if.R0000644000176200001440000000402315106134104016472 0ustar liggesuserstest_that("scalar y replaces all matching x", { x <- c(0, 1, 0) expect_identical(na_if(x, 0), c(NA, 1, NA)) expect_identical(na_if(x, 1), c(0, NA, 0)) }) test_that("`y` can be a vector the same length as `x` (matching SQL NULLIF)", { x <- c(0, 1, 0) y <- c(0, 1, 2) expect_identical(na_if(x, y), c(NA, NA, 0)) }) test_that("`NA` replacing itself is a no-op", { expect_identical(na_if(NA, NA), NA) }) test_that("missing values are allowed to equal each other, so `NaN`s can be standardized", { expect_identical(na_if(NaN, NaN), NA_real_) }) test_that("missing values equal each other in partially incomplete data frame rows", { x <- tibble( x = c(2, 1, NA, 1), y = c(1, NA, NA, NA), z = c(3, NaN, NA, NaN) ) y <- tibble(x = 1, y = NA, z = NaN) expect <- vec_assign(x, i = c(2, 4), value = NA) expect_identical(na_if(x, y), expect) }) test_that("works when there are missings in either input", { expect_identical(na_if(c(1, NA, 2), 1), c(NA, NA, 2)) expect_identical(na_if(c(1, NA, 2), c(1, NA, NA)), c(NA, NA, 2)) }) test_that("works with data frames", { x <- tibble(a = c(1, 99, 99, 99), b = c("x", "NA", "bar", "NA")) y <- tibble(a = 99, b = "NA") expect_identical( na_if(x, y), x[c(1, NA, 3, NA), ] ) }) test_that("works with rcrd types", { x <- new_rcrd(list(a = c(1, 99, 99, 99), b = c("x", "NA", "bar", "NA"))) y <- new_rcrd(list(a = 99, b = "NA")) expect_identical( na_if(x, y), x[c(1, NA, 3, NA)] ) }) test_that("is type stable on `x`", { expect_identical(na_if(0L, 0), NA_integer_) expect_snapshot(error = TRUE, { na_if(0L, 1.5) }) }) test_that("is size stable on `x`", { expect_snapshot(error = TRUE, { na_if(1, integer()) }) expect_snapshot(error = TRUE, { na_if(1, c(1, 2)) }) expect_snapshot(error = TRUE, { na_if(c(1, 2, 3), c(1, 2)) }) }) test_that("requires vector types for `x` and `y`", { expect_snapshot(error = TRUE, { na_if(environment(), 1) }) expect_snapshot(error = TRUE, { na_if(1, environment()) }) }) dplyr/tests/testthat/test-colwise-filter.R0000644000176200001440000000524215106134104020434 0ustar liggesuserstest_that("filter_if()", { expect_identical(nrow(filter_if(mtcars, is_integerish, all_vars(. > 1))), 0L) expect_identical(nrow(filter_if(mtcars, is_integerish, all_vars(. > 0))), 7L) }) test_that("filter_at()", { sepal_large <- filter_at(iris, vars(starts_with("Sepal")), all_vars(. > 4)) sepal_large_expected <- filter(iris, Sepal.Length > 4, Sepal.Width > 4) expect_equal(sepal_large, sepal_large_expected) }) test_that("filter_all()", { expect_identical( filter_all(mtcars, any_vars(. > 200))$disp, mtcars$disp[mtcars$disp > 200] ) }) test_that("filter_at can filter by grouping variables (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) expect_identical( filter_at(tbl, vars(gr1), all_vars(. > 1)), filter(tbl, gr1 > 1) ) }) test_that("filter_if and filter_all includes grouping variables (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) res <- filter_all(tbl, all_vars(. > 1)) expect_true(all(res$gr1 > 1)) res <- filter_if(tbl, is.integer, all_vars(. > 1)) expect_true(all(res$gr1 > 1)) }) test_that("can supply functions to scoped filters", { exp <- as.list(mtcars[c(8, 9, 21), ]) out <- mtcars |> filter_at(c("cyl", "am"), ~ .x == 4 | .x == 0) expect_identical(as.list(out), exp) out <- mtcars |> filter_at(c("cyl", "am"), function(.x) .x == 4 | .x == 0) expect_identical(as.list(out), exp) }) test_that("colwise filter support .data$. in the quosure versions", { expect_identical( filter_if(iris, is.numeric, any_vars(.data$. > 4)), filter_if(iris, is.numeric, any_vars(. > 4)) ) expect_identical( filter_all(select(iris, -Species), any_vars(.data$. > 4)), filter_all(select(iris, -Species), any_vars(. > 4)) ) expect_identical( filter_at(iris, vars(contains(".")), any_vars(.data$. > 4)), filter_at(iris, vars(contains(".")), any_vars(. > 4)) ) }) test_that("all_exprs() creates intersection", { expect_identical(all_exprs(am == 1), quo(am == 1)) quo <- set_env(quo((!!quo(cyl == 2)) & (!!quo(am == 1))), base_env()) expect_identical(all_exprs(cyl == 2, am == 1), quo) }) test_that("any_exprs() creates union", { expect_identical(any_exprs(am == 1), quo(am == 1)) quo <- set_env(quo((!!quo(cyl == 2)) | (!!quo(am == 1))), base_env()) expect_identical(any_exprs(cyl == 2, am == 1), quo) }) # Errors ------------------------------------------------------------------ test_that("colwise filter() give meaningful errors", { expect_snapshot({ (expect_error(filter_if(mtcars, is_character, all_vars(. > 0)))) (expect_error(filter_all(mtcars, list(~ . > 0)))) }) }) dplyr/tests/testthat/test-colwise-mutate.R0000644000176200001440000002716515106134104020456 0ustar liggesuserstest_that("can use character vectors or bare functions", { df <- data.frame(x = 1:3) expect_equal(summarise_all(df, "mean"), data.frame(x = 2)) expect_equal(summarise_all(df, mean), data.frame(x = 2)) expect_equal(mutate_all(df, list(x = "mean")), data.frame(x = rep(2, 3))) expect_equal(mutate_all(df, list(x = mean)), data.frame(x = rep(2, 3))) }) test_that("default names are smallest unique set", { df <- data.frame(x = 1:3, y = 1:3) expect_named(summarise_at(df, vars(x:y), list(mean)), c("x", "y")) expect_named( summarise_at(df, vars(x), list(mean = mean, sd = sd)), c("mean", "sd") ) expect_named( summarise_at(df, vars(x:y), list(mean = mean, sd = sd)), c("x_mean", "y_mean", "x_sd", "y_sd") ) }) test_that("named arguments force complete names", { df <- data.frame(x = 1:3, y = 1:3) expect_named( summarise_at(df, vars(x:y), list(mean = mean)), c("x_mean", "y_mean") ) expect_named( summarise_at(df, vars(x = x), list(mean = mean, sd = sd)), c("x_mean", "x_sd") ) }) expect_classes <- function(tbl, expected) { classes <- unname(map_chr(tbl, class)) classes <- paste0(substring(classes, 0, 1), collapse = "") expect_equal(classes, expected) } test_that("can select colwise", { columns <- iris |> mutate_at(NULL, as.character) expect_classes(columns, "nnnnf") columns <- iris |> mutate_at(vars(starts_with("Petal")), as.character) expect_classes(columns, "nnccf") numeric <- iris |> mutate_at(c(1, 3), as.character) expect_classes(numeric, "cncnf") character <- iris |> mutate_at("Species", as.character) expect_classes(character, "nnnnc") }) test_that("can probe colwise", { predicate <- iris |> mutate_if(is.factor, as.character) expect_classes(predicate, "nnnnc") logical <- iris |> mutate_if(c(TRUE, FALSE, TRUE, TRUE, FALSE), as.character) expect_classes(logical, "cnccf") }) test_that("non syntactic colnames work", { df <- tibble(`x 1` = 1:3) expect_identical(summarise_at(df, "x 1", sum)[[1]], 6L) expect_identical(summarise_if(df, is.numeric, sum)[[1]], 6L) expect_identical(summarise_all(df, sum)[[1]], 6L) expect_identical(mutate_all(df, `*`, 2)[[1]], (1:3) * 2) }) test_that("empty selection does not select everything (#2009, #1989)", { expect_equal( tibble::remove_rownames(mtcars), tibble::remove_rownames(mutate_if(mtcars, is.factor, as.character)) ) }) test_that("predicate can be quoted", { expected <- mutate_if(mtcars, is_integerish, mean) expect_identical(mutate_if(mtcars, "is_integerish", mean), expected) expect_identical(mutate_if(mtcars, ~ is_integerish(.x), mean), expected) }) test_that("transmute verbs do not retain original variables", { expect_named( transmute_all(tibble(x = 1:3, y = 1:3), list(mean = mean, sd = sd)), c("x_mean", "y_mean", "x_sd", "y_sd") ) expect_named( transmute_if( tibble(x = 1:3, y = 1:3), is_integer, list(mean = mean, sd = sd) ), c("x_mean", "y_mean", "x_sd", "y_sd") ) expect_named( transmute_at( tibble(x = 1:3, y = 1:3), vars(x:y), list(mean = mean, sd = sd) ), c("x_mean", "y_mean", "x_sd", "y_sd") ) }) test_that("can rename with vars() (#2594)", { expect_identical( mutate_at(tibble(x = 1:3), vars(y = x), mean), tibble(x = 1:3, y = c(2, 2, 2)) ) }) test_that("selection works with grouped data frames (#2624)", { gdf <- group_by(iris, Species) expect_snapshot(out <- mutate_if(gdf, is.factor, as.character)) expect_identical(out, gdf) }) test_that("at selection works even if not all ops are named (#2634)", { df <- tibble(x = 1, y = 2) expect_identical( mutate_at(df, vars(z = x, y), list(~ . + 1)), tibble(x = 1, y = 3, z = 2) ) }) test_that("can use a purrr-style lambda", { expect_identical( summarise_at(mtcars, vars(1:2), ~ mean(.x)), summarise(mtcars, mpg = mean(mpg), cyl = mean(cyl)) ) }) test_that("mutate and transmute variants does not mutate grouping variable (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) res <- mutate(tbl, gr2 = sqrt(gr2), x = sqrt(x)) expect_message(expect_identical(mutate_all(tbl, sqrt), res), "ignored") expect_message(expect_identical(transmute_all(tbl, sqrt), res), "ignored") expect_message( expect_identical(mutate_if(tbl, is.integer, sqrt), res), "ignored" ) expect_message( expect_identical(transmute_if(tbl, is.integer, sqrt), res), "ignored" ) expect_identical(transmute_at(tbl, vars(-group_cols()), sqrt), res) expect_identical(mutate_at(tbl, vars(-group_cols()), sqrt), res) }) test_that("summarise variants does not summarise grouping variable (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) res <- summarise(tbl, gr2 = mean(gr2), x = mean(x)) expect_identical(summarise_all(tbl, mean), res) expect_identical(summarise_if(tbl, is.integer, mean), res) }) test_that("summarise_at removes grouping variables (#3613)", { d <- tibble(x = 1:2, y = 3:4, g = 1:2) |> group_by(g) res <- d |> group_by(g) |> summarise_at(-1, mean) expect_equal(names(res), c("g", "y")) }) test_that("group_by_(at,all) handle utf-8 names (#3829)", { local_non_utf8_encoding() name <- get_native_lang_string() tbl <- tibble(a = 1) |> setNames(name) res <- group_by_all(tbl) |> groups() expect_equal(res[[1]], sym(name)) res <- group_by_at(tbl, name) |> groups() expect_equal(res[[1]], sym(name)) }) test_that("*_(all,at) handle utf-8 names (#2967)", { local_non_utf8_encoding() name <- get_native_lang_string() tbl <- tibble(a = 1) |> setNames(name) res <- tbl |> mutate_all(list(as.character)) |> names() expect_equal(res, name) res <- tbl |> mutate_at(name, list(as.character)) |> names() expect_equal(res, name) res <- tbl |> summarise_all(list(as.character)) |> names() expect_equal(res, name) res <- tbl |> summarise_at(name, list(as.character)) |> names() expect_equal(res, name) res <- select_at(tbl, name) |> names() expect_equal(res, name) }) test_that("summarise_at with multiple columns AND unnamed functions works (#4119)", { res <- storms |> summarise_at(vars(wind, pressure), list(mean, median)) expect_equal(df_n_col(res), 4L) expect_equal( names(res), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2") ) res <- storms |> summarise_at(vars(wind, pressure), list(n = length, mean, median)) expect_equal(df_n_col(res), 6L) expect_equal( names(res), c( "wind_n", "pressure_n", "wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2" ) ) }) test_that("mutate_at with multiple columns AND unnamed functions works (#4119)", { res <- storms |> mutate_at(vars(wind, pressure), list(mean, median)) expect_equal(df_n_col(res), df_n_col(storms) + 4L) expect_equal( names(res), c(names(storms), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2")) ) }) test_that("colwise mutate have .data in scope of rlang lambdas (#4183)", { results <- list( iris |> mutate_if(is.numeric, ~ . / iris$Petal.Width), iris |> mutate_if(is.numeric, ~ . / Petal.Width), iris |> mutate_if(is.numeric, ~ . / .data$Petal.Width), iris |> mutate_if(is.numeric, list(~ . / iris$Petal.Width)), iris |> mutate_if(is.numeric, list(~ . / Petal.Width)), iris |> mutate_if(is.numeric, list(~ . / .data$Petal.Width)), iris |> mutate_if(is.numeric, ~ .x / iris$Petal.Width), iris |> mutate_if(is.numeric, ~ .x / Petal.Width), iris |> mutate_if(is.numeric, ~ .x / .data$Petal.Width), iris |> mutate_if(is.numeric, list(~ .x / iris$Petal.Width)), iris |> mutate_if(is.numeric, list(~ .x / Petal.Width)), iris |> mutate_if(is.numeric, list(~ .x / .data$Petal.Width)) ) for (i in 2:12) { expect_equal(results[[1]], results[[i]]) } }) test_that("can choose the name of vars with multiple funs (#4180)", { expect_identical( mtcars |> group_by(cyl) |> summarise_at(vars(DISP = disp), list(mean = mean, median = median)), mtcars |> group_by(cyl) |> summarise(DISP_mean = mean(disp), DISP_median = median(disp)) ) }) test_that("summarise_at() unquotes in lambda (#4287)", { df <- tibble::tibble(year = seq(2015, 2050, 5), P = 5.0 + 2.5 * year) year <- 2037 expect_equal( summarise_at(df, vars(-year), ~ approx(x = year, y = ., xout = !!year)$y), summarise(df, P = approx(x = year, y = P, xout = !!year)$y) ) }) test_that("mutate_at() unquotes in lambdas (#4199)", { df <- tibble(a = 1:10, b = runif(1:10), c = letters[1:10]) varname <- "a" symname <- rlang::sym(varname) quoname <- enquo(symname) expect_identical( df |> mutate(b = mean(!!quoname)), df |> mutate_at(vars(matches("b")), list(~ mean(!!quoname))) ) }) test_that("summarise_at() can refer to local variables and columns (#4304)", { # using local here in case someone wants to run the content of the test # as opposed to the test_that() call res <- local({ value <- 10 expect_identical( iris |> summarise_at("Sepal.Length", ~ sum(. / value)), iris |> summarise(Sepal.Length = sum(Sepal.Length / value)) ) }) }) test_that("colwise mutate handles formulas with constants (#4374)", { expect_identical( tibble(x = 12) |> mutate_all(~42), tibble(x = 42) ) expect_identical( tibble(x = 12) |> mutate_at("x", ~42), tibble(x = 42) ) }) test_that("colwise mutate handle named chr vectors", { res <- tibble(x = 1:10) |> mutate_at(c(y = "x"), mean) expect_identical(res, tibble(x = 1:10, y = 5.5)) }) test_that("colwise verbs deprecate quosures (#4330)", { expect_snapshot({ (expect_warning(mutate_at(mtcars, vars(mpg), quo(mean(.))))) (expect_warning(summarise_at(mtcars, vars(mpg), quo(mean(.))))) }) }) test_that("rlang lambda inherit from the data mask (#3843)", { res <- iris |> mutate_at( vars(starts_with("Petal")), ~ ifelse(Species == "setosa" & . < 1.5, NA, .) ) expected <- iris |> mutate( Petal.Length = ifelse( Species == "setosa" & Petal.Length < 1.5, NA, Petal.Length ), Petal.Width = ifelse( Species == "setosa" & Petal.Width < 1.5, NA, Petal.Width ) ) expect_identical(res, expected) res <- iris |> group_by(Species) |> mutate_at( vars(starts_with("Petal")), ~ ifelse(Species == "setosa" & . < 1.5, NA, .) ) expected <- iris |> group_by(Species) |> mutate( Petal.Length = ifelse( Species == "setosa" & Petal.Length < 1.5, NA, Petal.Length ), Petal.Width = ifelse( Species == "setosa" & Petal.Width < 1.5, NA, Petal.Width ) ) expect_identical(res, expected) }) test_that("_if isn't tripped up by columns named 'i' (#5330)", { test_df <- tibble(i = c("a", "b"), j = c(1, 2)) result_df <- test_df |> mutate_if(is.character, as.factor) expect_equal(result_df$i, as.factor(test_df$i)) expect_equal(result_df$j, test_df$j) }) # Errors -------------------------------------------- test_that("colwise mutate gives meaningful error messages", { expect_snapshot({ # column not found (expect_error( mutate_at(tibble(), "test", ~1) )) # not summarising grouping variables tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) tbl <- group_by(tbl, gr1) (expect_error( summarise_at(tbl, vars(gr1), mean) )) # improper additional arguments (expect_error( mutate_all(mtcars, length, 0, 0) )) (expect_error( mutate_all(mtcars, mean, na.rm = TRUE, na.rm = TRUE) )) }) }) dplyr/tests/testthat/test-defunct-each.R0000644000176200001440000000032015137161765020043 0ustar liggesuserstest_that("generate informative errors", { expect_snapshot(error = TRUE, { summarise_each() summarise_each_() mutate_each() mutate_each_() summarize_each() summarize_each_() }) }) dplyr/tests/testthat/test-count-tally.R0000644000176200001440000001451215106134104017757 0ustar liggesusers# count ------------------------------------------------------------------- test_that("count sorts output by keys by default", { # Due to usage of `summarise()` internally df <- tibble(x = c(2, 1, 1, 2, 1)) out <- count(df, x) expect_equal(out, tibble(x = c(1, 2), n = c(3, 2))) }) test_that("count can sort output by `n`", { df <- tibble(x = c(1, 1, 2, 2, 2)) out <- count(df, x, sort = TRUE) expect_equal(out, tibble(x = c(2, 1), n = c(3, 2))) }) test_that("count can rename grouping columns", { # But should it really allow this? df <- tibble(x = c(2, 1, 1, 2, 1)) out <- count(df, y = x) expect_equal(out, tibble(y = c(1, 2), n = c(3, 2))) }) test_that("informs if n column already present, unless overridden", { df1 <- tibble(n = c(1, 1, 2, 2, 2)) expect_message(out <- count(df1, n), "already present") expect_named(out, c("n", "nn")) # not a good idea, but supported expect_message(out <- count(df1, n, name = "n"), NA) expect_named(out, "n") expect_message(out <- count(df1, n, name = "nn"), NA) expect_named(out, c("n", "nn")) df2 <- tibble(n = c(1, 1, 2, 2, 2), nn = 1:5) expect_message(out <- count(df2, n), "already present") expect_named(out, c("n", "nn")) expect_message(out <- count(df2, n, nn), "already present") expect_named(out, c("n", "nn", "nnn")) }) test_that("name must be string", { df <- tibble(x = c(1, 2)) expect_snapshot(error = TRUE, count(df, x, name = 1)) expect_snapshot(error = TRUE, count(df, x, name = letters)) }) test_that("output includes empty levels with .drop = FALSE", { df <- tibble(f = factor("b", levels = c("a", "b", "c"))) out <- count(df, f, .drop = FALSE) expect_equal(out$n, c(0, 1, 0)) out <- count(group_by(df, f, .drop = FALSE)) expect_equal(out$n, c(0, 1, 0)) }) test_that("count preserves grouping", { df <- tibble(g = c(1, 2, 2, 2)) exp <- tibble(g = c(1, 2), n = c(1, 3)) expect_equal(df |> count(g), exp) expect_equal(df |> group_by(g) |> count(), exp |> group_by(g)) }) test_that("output preserves class & attributes where possible", { df <- data.frame(g = c(1, 2, 2, 2)) attr(df, "my_attr") <- 1 out <- df |> count(g) expect_s3_class(out, "data.frame", exact = TRUE) expect_equal(attr(out, "my_attr"), 1) out <- df |> group_by(g) |> count() expect_s3_class(out, "grouped_df") expect_equal(group_vars(out), "g") # summarise() currently drops attributes expect_null(attr(out, "my_attr")) }) test_that("works with dbplyr", { skip_if_not_installed("dbplyr") skip_if_not_installed("RSQLite") db <- dbplyr::memdb_frame(x = c(1, 1, 1, 2, 2)) df1 <- db |> count(x) |> as_tibble() expect_equal(df1, tibble(x = c(1, 2), n = c(3, 2))) df2 <- db |> add_count(x) |> as_tibble() expect_equal(df2, tibble(x = c(1, 1, 1, 2, 2), n = c(3, 3, 3, 2, 2))) }) test_that("dbplyr `count()` method has transient internal grouping (#6338, tidyverse/dbplyr#940)", { skip_if_not_installed("dbplyr") skip_if_not_installed("RSQLite") db <- dbplyr::memdb_frame( x = c(1, 1, 1, 2, 2), y = c("a", "a", "b", "c", "c") ) df <- db |> count(x, y) |> collect() expect <- tibble( x = c(1, 1, 2), y = c("a", "b", "c"), n = c(2L, 1L, 2L) ) expect_false(is_grouped_df(df)) expect_identical(df, expect) }) test_that("can only explicitly chain together multiple tallies", { expect_snapshot({ df <- data.frame(g = c(1, 1, 2, 2), n = 1:4) df |> count(g, wt = n) df |> count(g, wt = n) |> count(wt = n) df |> count(n) }) }) test_that("count() owns errors (#6139)", { expect_snapshot({ (expect_error(count(mtcars, new = 1 + ""))) (expect_error(count(mtcars, wt = 1 + ""))) }) }) test_that("count() `wt = n()` is deprecated", { df <- tibble(a = 1:5) expect_snapshot({ count(df, a, wt = n()) }) }) # tally ------------------------------------------------------------------- test_that("tally can sort output", { gf <- group_by(tibble(x = c(1, 1, 2, 2, 2)), x) out <- tally(gf, sort = TRUE) expect_equal(out, tibble(x = c(2, 1), n = c(3, 2))) }) test_that("weighted tally drops NAs (#1145)", { df <- tibble(x = c(1, 1, NA)) expect_equal(tally(df, x)$n, 2) }) test_that("tally() drops last group (#5199) ", { df <- data.frame(x = 1, y = 2, z = 3) res <- expect_message(df |> group_by(x, y) |> tally(wt = z), NA) expect_equal(group_vars(res), "x") }) test_that("tally() owns errors (#6139)", { expect_snapshot({ (expect_error(tally(mtcars, wt = 1 + ""))) }) }) test_that("tally() `wt = n()` is deprecated", { df <- tibble(a = 1:5) expect_snapshot({ tally(df, wt = n()) }) }) # add_count --------------------------------------------------------------- test_that("add_count preserves grouping", { df <- tibble(g = c(1, 2, 2, 2)) exp <- tibble(g = c(1, 2, 2, 2), n = c(1, 3, 3, 3)) expect_equal(df |> add_count(g), exp) expect_equal(df |> group_by(g) |> add_count(), exp |> group_by(g)) }) test_that("`.drop` is defunct", { df <- tibble(f = factor("b", levels = c("a", "b", "c"))) expect_snapshot(error = TRUE, { add_count(df, f, .drop = FALSE) }) }) test_that("add_count() `wt = n()` is deprecated", { df <- tibble(a = 1:5) expect_snapshot({ add_count(df, a, wt = n()) }) }) test_that("add_count() owns errors (#6139)", { expect_snapshot({ (expect_error(add_count(mtcars, new = 1 + ""))) (expect_error(add_count(mtcars, wt = 1 + ""))) }) }) # add_tally --------------------------------------------------------------- test_that("can add tallies of a variable", { df <- tibble(a = c(2, 1, 1)) expect_equal( df |> group_by(a) |> add_tally(), group_by(tibble(a = c(2, 1, 1), n = c(1, 2, 2)), a) ) }) test_that("add_tally can be given a weighting variable", { df <- data.frame(a = c(1, 1, 2, 2, 2), w = c(1, 1, 2, 3, 4)) out <- df |> group_by(a) |> add_tally(wt = w) expect_equal(out$n, c(2, 2, 9, 9, 9)) out <- df |> group_by(a) |> add_tally(wt = w + 1) expect_equal(out$n, c(4, 4, 12, 12, 12)) }) test_that("can override output column", { df <- data.frame(g = c(1, 1, 2, 2, 2), x = c(3, 2, 5, 5, 5)) expect_named(add_tally(df, name = "xxx"), c("g", "x", "xxx")) }) test_that("add_tally() owns errors (#6139)", { expect_snapshot({ (expect_error(add_tally(mtcars, wt = 1 + ""))) }) }) test_that("add_tally() `wt = n()` is deprecated", { df <- tibble(a = 1:5) expect_snapshot({ add_tally(df, wt = n()) }) }) dplyr/tests/testthat/test-sample.R0000644000176200001440000000747215106134104016774 0ustar liggesusers# Basic behaviour ------------------------------------------------------------- test_that("sample preserves class", { expect_s3_class(sample_n(mtcars, 1), "data.frame") expect_s3_class(sample_n(as_tibble(mtcars), 1), "tbl_df") expect_s3_class(sample_frac(mtcars, 1), "data.frame") expect_s3_class(sample_frac(as_tibble(mtcars), 1), "tbl_df") }) # Ungrouped -------------------------------------------------------------------- test_that("sample respects weight", { df <- data.frame(x = 1:2, y = c(0, 1)) expect_equal(sample_n(df, 1, weight = y)$x, 2) expect_equal(sample_frac(df, 0.5, weight = y)$x, 2) }) test_that("slice does not evaluate the expression in empty groups (#1438)", { res <- mtcars |> group_by(cyl) |> filter(cyl == 6) |> slice(1:2) expect_equal(nrow(res), 2L) expect_error( res <- mtcars |> group_by(cyl) |> filter(cyl == 6) |> sample_n(size = 3), NA ) expect_equal(nrow(res), 3L) }) # Grouped ---------------------------------------------------------------------- test_that("sampling grouped tbl samples each group", { sampled <- mtcars |> group_by(cyl) |> sample_n(2) expect_s3_class(sampled, "grouped_df") expect_equal(group_vars(sampled), "cyl") expect_equal(nrow(sampled), 6) expect_equal(map_int(group_rows(sampled), length), c(2, 2, 2)) }) test_that("grouped sample respects weight", { df2 <- tibble( x = rep(1:2, 100), y = rep(c(0, 1), 100), g = rep(1:2, each = 100) ) grp <- df2 |> group_by(g) expect_equal(sample_n(grp, 1, weight = y)$x, c(2, 2)) expect_equal(sample_frac(grp, 0.5, weight = y)$x, rep(2, nrow(df2) / 2)) }) test_that("grouped sample accepts NULL weight from variable (for saeSim)", { df <- tibble( x = rep(1:2, 10), y = rep(c(0, 1), 10), g = rep(1:2, each = 10) ) weight <- NULL expect_no_error(sample_n(df, nrow(df), weight = weight)) expect_no_error(sample_frac(df, weight = weight)) grp <- df |> group_by(g) expect_no_error(sample_n(grp, nrow(df) / 2, weight = weight)) expect_no_error(sample_frac(grp, weight = weight)) }) test_that("sample_n and sample_frac can call n() (#3413)", { df <- tibble( x = rep(1:2, 10), y = rep(c(0, 1), 10), g = rep(1:2, each = 10) ) gdf <- group_by(df, g) expect_equal(nrow(sample_n(df, n())), nrow(df)) expect_equal(nrow(sample_n(gdf, n())), nrow(gdf)) expect_equal(nrow(sample_n(df, n() - 2L)), nrow(df) - 2) expect_equal(nrow(sample_n(gdf, n() - 2L)), nrow(df) - 4) }) test_that("sample_n and sample_frac handles lazy grouped data frames (#3380)", { df1 <- data.frame(x = 1:10, y = rep(1:2, each = 5)) df2 <- data.frame(x = 6:15, z = 1:10) res <- df1 |> group_by(y) |> anti_join(df2, by = "x") |> sample_n(1) expect_equal(nrow(res), 1L) res <- df1 |> group_by(y) |> anti_join(df2, by = "x") |> sample_frac(0.2) expect_equal(nrow(res), 1L) }) # Errors -------------------------------------------- test_that("sample_*() gives meaningful error messages", { expect_snapshot({ df2 <- tibble( x = rep(1:2, 100), y = rep(c(0, 1), 100), g = rep(1:2, each = 100) ) grp <- df2 |> group_by(g) # base R error messages (expect_error( sample_n(grp, nrow(df2) / 2, weight = y) )) (expect_error( sample_frac(grp, 1, weight = y) )) # can't sample more values than obs (without replacement) (expect_error( mtcars |> group_by(cyl) |> sample_n(10) )) # unknown type (expect_error( sample_n(list()) )) (expect_error( sample_frac(list()) )) "# respects weight" df <- data.frame(x = 1:2, y = c(0, 1)) (expect_error( sample_n(df, 2, weight = y) )) (expect_error( sample_frac(df, 2) )) (expect_error( sample_frac(df |> group_by(y), 2) )) (expect_error( sample_frac(df, 1, weight = y) )) }) }) dplyr/tests/testthat/test-recode-values.R0000644000176200001440000002323015106134104020237 0ustar liggesuserstest_that("formula interface works as expected", { x <- c(1, 2, 0, NA, 0, NA, 5) y <- seq_along(x) z <- as.character(y) expect_identical( recode_values(x, 0 ~ "zero", NA ~ z, default = "default"), c("default", "default", "zero", "4", "zero", "6", "default") ) }) test_that("from/to vector interface works as expected", { x <- c("a", "b", "a", "c", NA, "d", NA, "e") # Lookup table # fmt: skip table <- tribble( ~from, ~to, "a", "A", "b", "B", "c", "C" ) expect_identical( recode_values(x, from = table$from, to = table$to), c("A", "B", "A", "C", NA, NA, NA, NA) ) expect_identical( recode_values(x, from = table$from, to = table$to, default = "0"), c("A", "B", "A", "C", "0", "0", "0", "0") ) expect_identical( replace_values(x, from = table$from, to = table$to), c("A", "B", "A", "C", NA, "d", NA, "e") ) }) test_that("from/to list of vectors interface works as expected", { x <- c("a", "b", "a", "c") # Lookup table # fmt: skip table <- tribble( ~from, ~to, c("a", "b"), "AB", c("c"), "C" ) # `from` is a list, `to` is not expect_identical(table$from, list(c("a", "b"), "c")) expect_identical(table$to, c("AB", "C")) expect_identical( recode_values(x, from = table$from, to = table$to), c("AB", "AB", "AB", "C") ) expect_identical( replace_values(x, from = table$from, to = table$to), c("AB", "AB", "AB", "C") ) # Lookup table # fmt: skip table <- tribble( ~from, ~to, "a", 1:4, "b", 5:8, "c", 9:12 ) # `to` is a list, `from` is not expect_identical(table$from, c("a", "b", "c")) expect_identical(table$to, list(1:4, 5:8, 9:12)) expect_identical( recode_values(x, from = table$from, to = table$to), c(1L, 6L, 3L, 12L) ) # Lookup table # fmt: skip table <- tribble( ~from, ~to, c("a", "b"), 1:4, c("c"), 5:8 ) # `from` is a list, `to` is a list expect_identical(table$from, list(c("a", "b"), "c")) expect_identical(table$to, list(1:4, 5:8)) expect_identical( recode_values(x, from = table$from, to = table$to), c(1L, 2L, 3L, 8L) ) }) test_that("when `from` is a list, `to` must recycle to the same size as that list", { expect_identical( recode_values(1:2, from = list(1, 2:3), to = 0), c(0, 0) ) expect_snapshot(error = TRUE, { recode_values(1, from = list(1, 2, 3), to = c(1, 2)) }) }) test_that("`NA` is considered unmatched unless handled explicitly", { # Like `inner_join(unmatched = "error")`. # We think it would be exponentially more complex to try and add some kind of # additional `missing` argument that handles missing values separately from # `unmatched` values. It's kind of nice that you have to be explicit in your # lookup table about whether or not you are expecting a missing value when # you've opted into the strict world of `unmatched = "error"`. x <- c("a", "b", "a", NA, "c") # Lookup table # fmt: skip table <- tribble( ~from, ~to, "a", "A", "b", "B", "c", "C" ) expect_snapshot(error = TRUE, { recode_values(x, from = table$from, to = table$to, unmatched = "error") }) table <- add_row(table, from = NA, to = NA) expect_identical( recode_values(x, from = table$from, to = table$to, unmatched = "error"), c("A", "B", "A", NA, "C") ) }) test_that("`NA` is matched exactly", { # With logical `NA` x <- c(1, NA) expect_identical(recode_values(x, NA ~ 0), c(NA, 0)) expect_identical(recode_values(x, from = NA, to = 0), c(NA, 0)) expect_identical(replace_values(x, NA ~ 0), c(1, 0)) expect_identical(replace_values(x, from = NA, to = 0), c(1, 0)) # With typed `NA` expect_identical(recode_values(x, NA_real_ ~ 0), c(NA, 0)) expect_identical(recode_values(x, from = NA_real_, to = 0), c(NA, 0)) expect_identical(replace_values(x, NA_real_ ~ 0), c(1, 0)) expect_identical(replace_values(x, from = NA_real_, to = 0), c(1, 0)) # `NA_real_` vs `NaN` x <- c(1, NA, NaN) expect_identical( recode_values(x, from = c(NA, NaN), to = c(2, 3)), c(NA, 2, 3) ) expect_identical( replace_values(x, from = c(NA, NaN), to = c(2, 3)), c(1, 2, 3) ) }) test_that("`x` must be a vector", { x <- lm(1 ~ 1) expect_snapshot(error = TRUE, { recode_values(x, 1 ~ 1) }) expect_snapshot(error = TRUE, { replace_values(x, 1 ~ 1) }) }) test_that("respects `ptype`", { expect_identical( recode_values(1, from = 1, to = 0L, ptype = double()), 0 ) expect_identical( recode_values(1, from = 2, to = 3L, default = 0L, ptype = double()), 0 ) expect_snapshot(error = TRUE, { recode_values(1, 1 ~ 0L, ptype = character()) }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { recode_values(1, 1 ~ "x", NULL, 2 ~ 0L, ptype = character()) }) expect_snapshot(error = TRUE, { recode_values(1, from = 1, to = 0L, ptype = character()) }) expect_snapshot(error = TRUE, { recode_values(1, from = 1, to = "x", default = 0L, ptype = character()) }) }) test_that("`replace_values()` is type stable on `x`", { # Common type would be double, but we use type of `x` expect_identical( replace_values(1:2, from = 1L, to = 0), c(0L, 2L) ) x <- factor(c("a", "b")) # Common type would be character, but we use type of `x` expect_identical( replace_values(x, from = "a", to = "b"), factor(c("b", "b"), levels = c("a", "b")) ) expect_snapshot(error = TRUE, { replace_values(x, "c" ~ "b") }) expect_snapshot(error = TRUE, { replace_values(x, from = "c", to = "b") }) expect_snapshot(error = TRUE, { replace_values(x, "a" ~ "c") }) expect_snapshot(error = TRUE, { replace_values(x, from = "a", to = "c") }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { replace_values(x, "a" ~ "b", NULL, "b" ~ "c") }) }) test_that("respects `default`", { expect_identical( recode_values(1:3, 2 ~ 0, default = 1), c(1, 0, 1) ) expect_identical( recode_values(1:3, 2 ~ 0, default = 4:6), c(4, 0, 6) ) }) test_that("`default` is part of `ptype` determination", { # Common type of double expect_identical( recode_values(1, from = 1, to = 0L, default = 1), 0 ) expect_snapshot(error = TRUE, { recode_values(1, from = 1, to = 0L, default = "x") }) }) test_that("`default` has its size checked", { expect_snapshot(error = TRUE, { recode_values(1:3, 1 ~ 0, default = 1:5) }) }) test_that("treats list `from` and `to` as lists of vectors", { # To align with what the `...` interface allows. # Use the vctrs interface if you want `from` and `to` lists treated as vectors. x <- 1:4 a <- c(1L, 3L) b <- 4L expect_identical( recode_values(x, from = list(a, b), to = list(0L, 5L)), c(0L, NA, 0L, 5L) ) expect_identical( replace_values(x, from = list(a, b), to = list(0L, 5L)), c(0L, 2L, 0L, 5L) ) # Notice how `from` and `to` are "just as powerful" as the formula interface # because we treat lists this way. That's the invariant we are going for here. expect_identical( recode_values(x, from = list(a, b), to = list(0L, 5L)), recode_values(x, a ~ 0L, b ~ 5L) ) expect_identical( replace_values(x, from = list(a, b), to = list(0L, 5L)), replace_values(x, a ~ 0L, b ~ 5L) ) # To treat `from` and `to` lists as vectors, use vctrs x <- list(1, 2, 3:4, 5) from <- list(3:4, 2) to <- list(1L, 6:7) expect_snapshot(error = TRUE, { recode_values(x, from = from, to = to) }) expect_identical( vec_recode_values(x, from = from, to = to), list(NULL, 6:7, 1L, NULL) ) }) test_that("`...` must be unnamed", { # Better than `case_when()`! expect_snapshot(error = TRUE, { recode_values(1, foo = 1 ~ 2) }) expect_snapshot(error = TRUE, { replace_values(1, foo = 1 ~ 2) }) }) test_that("`...` must contain two sided formulas", { expect_snapshot(error = TRUE, { recode_values(1, 1 ~ 1, 2) }) expect_snapshot(error = TRUE, { replace_values(1, 1 ~ 1, 2) }) expect_snapshot(error = TRUE, { recode_values(1, 1 ~ 1, ~2) }) expect_snapshot(error = TRUE, { replace_values(1, 1 ~ 1, ~2) }) }) test_that("throws correct errors based on all combinations of `...` and `from` and `to`", { # None of `...` and `from` or `to` expect_snapshot(error = TRUE, recode_values(1)) # `replace_values()` is a no-op here like `replace_when()`, see other tests # expect_snapshot(error = TRUE, replace_values(1)) # Both `...` and `from` expect_snapshot(error = TRUE, recode_values(1, 1 ~ 2, from = 1)) expect_snapshot(error = TRUE, replace_values(1, 1 ~ 2, from = 1)) # `from` but not `to` expect_snapshot(error = TRUE, recode_values(1, from = 1)) expect_snapshot(error = TRUE, replace_values(1, from = 1)) # `to` but not `from` expect_snapshot(error = TRUE, recode_values(1, to = 1)) expect_snapshot(error = TRUE, replace_values(1, to = 1)) }) test_that("replace_values() is a no-op with no `...` or `from` and `to`", { # Like `replace_when()` expect_identical(replace_values(1), 1) }) test_that("recode_values() takes names from inputs", { expect_identical( recode_values( c(a = 1, b = 2), c(c = 1) ~ c(d = 0), default = c(e = 3) ), c(d = 0, e = 3) ) expect_identical( recode_values( c(a = 1, b = 2), from = c(c = 1), to = c(d = 0), default = c(e = 3) ), c(d = 0, e = 3) ) }) test_that("replace_values() takes names from `x`", { expect_identical( replace_values( c(a = 1, b = 2), c(c = 1) ~ c(d = 0) ), c(a = 0, b = 2) ) expect_identical( replace_values( c(a = 1, b = 2), from = c(c = 1), to = c(d = 0) ), c(a = 0, b = 2) ) }) dplyr/tests/testthat/helper-torture.R0000644000176200001440000000005413663216626017524 0ustar liggesuserswith_gctorture2 <- withr::with_(gctorture2) dplyr/tests/testthat/test-colwise-funs.R0000644000176200001440000000037213663216626020141 0ustar liggesuserstest_that("as_fun_list() uses rlang auto-naming", { nms <- names(as_fun_list(list(min, max), env())) # Just check they are labellised as literals enclosed in brackets to # insulate from upstream changes expect_true(all(grepl("^<", nms))) }) dplyr/tests/testthat/test-case-when.R0000644000176200001440000003317215137161765017402 0ustar liggesusers# `case_when()` ---------------------------------------------------------------- test_that("matches values in order", { x <- 1:3 expect_equal( case_when( x <= 1 ~ 1, x <= 2 ~ 2, x <= 3 ~ 3 ), c(1, 2, 3) ) }) test_that("unmatched gets missing value", { x <- 1:3 expect_equal( case_when( x <= 1 ~ 1, x <= 2 ~ 2 ), c(1, 2, NA) ) }) test_that("missing values can be replaced (#1999)", { x <- c(1:3, NA) expect_equal( case_when( x <= 1 ~ 1, x <= 2 ~ 2, is.na(x) ~ 0 ), c(1, 2, NA, 0) ) }) test_that("NA conditions (#2927)", { expect_equal( case_when( c(TRUE, FALSE, NA) ~ 1:3, TRUE ~ 4L ), c(1L, 4L, 4L) ) }) test_that("any `TRUE` overrides an `NA`", { x <- c(1, 2, NA, 3) expect <- c("one", "not_one", "missing", "not_one") # `TRUE` overriding before the `NA` expect_identical( case_when( is.na(x) ~ "missing", x == 1 ~ "one", .default = "not_one" ), expect ) # `TRUE` overriding after the `NA` expect_identical( case_when( x == 1 ~ "one", is.na(x) ~ "missing", .default = "not_one" ), expect ) }) test_that("case_when can be used in anonymous functions (#3422)", { res <- tibble(a = 1:3) |> mutate(b = (function(x) case_when(x < 2 ~ TRUE, .default = FALSE))(a)) |> pull() expect_equal(res, c(TRUE, FALSE, FALSE)) }) test_that("case_when() can be used inside mutate()", { out <- mtcars[1:4, ] |> mutate( out = case_when( cyl == 4 ~ 1, .data[["am"]] == 1 ~ 2, .default = 0 ) ) |> pull() expect_identical(out, c(2, 2, 1, 0)) }) test_that("case_when() conditions must be logical (and aren't cast to logical!)", { expect_snapshot(error = TRUE, { case_when(1 ~ 2) }) # Make sure input numbering is right in the error message! expect_snapshot(error = TRUE, { case_when(TRUE ~ 2, 3.5 ~ 4) }) }) test_that("case_when() accepts logical condition vectors with attributes (#6678)", { x <- structure(c(FALSE, TRUE), label = "foo") expect_identical(case_when(x ~ 1, .default = 2), c(2, 1)) }) test_that("case_when() does not accept classed logical conditions", { # From a vctrs perspective, these aren't "logical condition indices" x <- structure(c(FALSE, TRUE), class = "my_logical") expect_snapshot(error = TRUE, { case_when(x ~ 1) }) }) test_that("case_when() logical conditions can't be arrays (#6862)", { x <- array(TRUE, dim = c(3, 3)) y <- c("a", "b", "c") expect_snapshot(error = TRUE, { case_when(x ~ y) }) # Not even 1D arrays x <- array(TRUE, dim = 3) expect_snapshot(error = TRUE, { case_when(x ~ y) }) }) test_that("can pass quosures to case_when()", { fs <- local({ x <- 3:1 quos( x < 2 ~ TRUE, TRUE ~ FALSE ) }) expect_identical(case_when(!!!fs), c(FALSE, FALSE, TRUE)) }) test_that("can pass nested quosures to case_when()", { fs <- local({ foo <- mtcars$cyl[1:4] quos( !!quo(foo) == 4 ~ 1, TRUE ~ 0 ) }) expect_identical(case_when(!!!fs), c(0, 0, 1, 0)) }) test_that("can pass unevaluated formulas to case_when()", { x <- 6:8 fs <- exprs( x == 7L ~ TRUE, TRUE ~ FALSE ) expect_identical(case_when(!!!fs), c(FALSE, TRUE, FALSE)) out <- local({ x <- 7:9 case_when(!!!fs) }) expect_identical(out, c(TRUE, FALSE, FALSE)) }) test_that("unevaluated formulas can refer to data mask", { fs <- exprs( cyl == 4 ~ 1, am == 1 ~ 2, TRUE ~ 0 ) out <- mtcars[1:4, ] |> mutate(out = case_when(!!!fs)) |> pull() expect_identical(out, c(2, 2, 1, 0)) }) test_that("unevaluated formulas can contain quosures", { quo <- local({ n <- 4 quo(n) }) fs <- exprs( cyl == !!quo ~ 1, am == 1 ~ 2, TRUE ~ 0 ) out <- mtcars[1:4, ] |> mutate(out = case_when(!!!fs)) |> pull() expect_identical(out, c(2, 2, 1, 0)) }) test_that("NULL inputs are compacted", { x <- 1:3 bool <- FALSE out <- case_when( x == 2 ~ TRUE, if (bool) x == 3 ~ NA, .default = FALSE ) expect_identical(out, c(FALSE, TRUE, FALSE)) bool <- TRUE out <- case_when( x == 2 ~ TRUE, if (bool) x == 3 ~ NA, .default = FALSE ) expect_identical(out, c(FALSE, TRUE, NA)) }) test_that("passes through `.default` correctly", { expect_identical(case_when(FALSE ~ 1, .default = 2), 2) expect_identical( case_when(c(TRUE, FALSE, TRUE, FALSE, TRUE) ~ 1:5, .default = 2), c(1, 2, 3, 2, 5) ) }) test_that("`.default` isn't part of recycling", { # Because eventually we want to only take the output size from the LHS conditions, # so having `.default` participate in the common size is a step in the wrong # direction expect_snapshot(error = TRUE, { case_when(FALSE ~ 1L, .default = 2:5) }) }) test_that("`.default` is part of common type computation", { expect_identical(case_when(TRUE ~ 1L, .default = 2), 1) expect_snapshot(error = TRUE, { case_when(TRUE ~ 1L, .default = "x") }) }) test_that("passes through `.ptype` correctly", { expect_identical(case_when(TRUE ~ 1, .ptype = integer()), 1L) expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, FALSE ~ 1.5, .ptype = integer()) }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, NULL, FALSE ~ 1.5, .ptype = integer()) }) }) test_that("passes through `.size` correctly", { expect_identical(case_when(TRUE ~ 1, .size = 2), c(1, 1)) expect_snapshot(error = TRUE, { case_when(TRUE ~ 1:2, .size = 3) }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { case_when(TRUE ~ 1:3, NULL, TRUE ~ 1:2, .size = 3) }) }) test_that("can't supply `.default` and `.unmatched`", { # Probably overkill to add `unmatched_arg` just to get `.unmatched` instead # of `unmatched`. expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, .default = 1, .unmatched = "error") }) }) test_that("`.unmatched` is validated", { # Probably overkill to add `unmatched_arg` to `vec_case_when()` just to get # `.unmatched` instead of `unmatched` expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, .unmatched = "foo") }) expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, .unmatched = 1) }) }) test_that("`.unmatched` treats `FALSE` like an unmatched location", { expect_snapshot(error = TRUE, { case_when( c(TRUE, FALSE, TRUE) ~ 1, .unmatched = "error" ) }) }) test_that("`.unmatched` treats `NA` like an unmatched location", { expect_snapshot(error = TRUE, { case_when( c(TRUE, NA, TRUE) ~ 1, .unmatched = "error" ) }) }) test_that("`.unmatched` errors pluralize well", { # One location x <- letters[1:5] expect_snapshot(error = TRUE, { case_when( x == "a" ~ 1, x == "b" ~ 2, x == "c" ~ 3, x == "e" ~ 4, .unmatched = "error" ) }) # Two locations x <- letters[1:5] expect_snapshot(error = TRUE, { case_when( x == "a" ~ 1, x == "c" ~ 2, x == "e" ~ 3, .unmatched = "error" ) }) # Many locations x <- 1:100 expect_snapshot(error = TRUE, { case_when(x == 1 ~ "a", .unmatched = "error") }) }) # `case_when()` errors --------------------------------------------------------- test_that("invalid type errors are correct (#6261) (#6206)", { expect_snapshot(error = TRUE, { case_when(TRUE ~ 1, TRUE ~ "x") }) }) test_that("`NULL` formula element throws meaningful error (#7794)", { # "Must be a vector" errors expect_snapshot(error = TRUE, { case_when(NULL ~ NULL) case_when(TRUE ~ NULL) case_when(NULL ~ TRUE) case_when(c(TRUE, TRUE) ~ NULL) case_when(NULL ~ c(TRUE, TRUE)) case_when(TRUE ~ NULL, c(TRUE, TRUE) ~ NULL) case_when(NULL ~ TRUE, NULL ~ c(TRUE, TRUE)) }) # Recycling errors come first expect_snapshot(error = TRUE, { case_when(c(TRUE, TRUE) ~ NULL, c(TRUE, TRUE, TRUE) ~ NULL) case_when(NULL ~ c(TRUE, TRUE), NULL ~ c(TRUE, TRUE, TRUE)) }) }) test_that("throws chained errors when formula evaluation fails", { expect_snapshot(error = TRUE, { case_when(1 ~ 2, 3 ~ stop("oh no!")) }) expect_snapshot(error = TRUE, { case_when(1 ~ 2, stop("oh no!") ~ 4) }) }) test_that("case_when() give meaningful errors", { expect_snapshot({ (expect_error( case_when( c(TRUE, FALSE) ~ 1:3, c(FALSE, TRUE) ~ 1:2 ) )) (expect_error( case_when( c(TRUE, FALSE) ~ 1, c(FALSE, TRUE, FALSE) ~ 2, c(FALSE, TRUE, FALSE, NA) ~ 3 ) )) (expect_error( case_when(51:53 ~ 1:3) )) (expect_error( case_when(paste(50)) )) (expect_error( case_when(y ~ x, paste(50)) )) (expect_error( case_when() )) (expect_error( case_when(NULL) )) (expect_error( case_when(~ 1:2) )) }) }) # `case_when()` deprecated ----------------------------------------------------- test_that("Using scalar LHS with vector RHS is deprecated (#7082)", { # In many packages, people use `case_when()` when they should be using a # series of if statements. We try to warn when we detect this. expect_snapshot({ # Columns x <- 1:5 y <- 6:10 # Scalars code <- 1L sex <- "M" # This is really a series of if statements. # This is highly inefficient because each scalar LHS is recycled to size 5. expect_identical( case_when( code == 1L && sex == "M" ~ x, code == 1L && sex == "F" ~ y, code == 1L && sex == "M" ~ x + 1L, .default = 0L ), x ) }) # Motivating example of a silent bug that results from allowing this kind of # common size determination (#7082). We ideally want this case to fail. LHS # common size is 1 and RHS inputs ideally should be forced to recycle to this # size. Since both the LHS and RHS inputs are consulted to compute a common # size of 0, this incorrectly returns `character()`, but we at least warn the # user that something is fishy here, and hopefully they take a closer look and # catch their error. expect_snapshot({ x <- 1 case_when( x == 1 ~ "a", x == 2 ~ character(), .default = "other" ) }) # Now confirm that the other 3 possible combinations don't warn! # size 1 LHS, size 1 RHS expect_identical( expect_no_warning(case_when(TRUE ~ "a", FALSE ~ "b")), "a" ) # size >1 LHS, size 1 RHS expect_identical( expect_no_warning(case_when(c(TRUE, FALSE) ~ "a", c(FALSE, TRUE) ~ "b")), c("a", "b") ) # size >1 LHS, size >1 RHS expect_identical( expect_no_warning(case_when( c(TRUE, FALSE) ~ c("a", "b"), c(FALSE, TRUE) ~ c("c", "d") )), c("a", "d") ) }) # `replace_when()` ------------------------------------------------------------- test_that("replace_when() recycles scalar RHS", { x <- c(1, 2, 3, 1, 2, 3) expect_identical( replace_when(x, x == 1 ~ 0, x == 3 ~ 4), c(0, 2, 4, 0, 2, 4) ) }) test_that("replace_when() allows vector RHS of the same size as `x`", { x <- c(1, 2, 3, 1, 2, 3) y <- seq_along(x) expect_identical( replace_when(x, x == 1 ~ 0, x == 3 ~ y), c(0, 2, 3, 0, 2, 6) ) expect_snapshot(error = TRUE, { replace_when(x, x == 1 ~ 1:3) }) }) test_that("replace_when() does not recycle LHS values", { # Unlike `case_when()` we get to do this right! x <- c(1, 2, 3) expect_snapshot(error = TRUE, { replace_when(x, TRUE ~ 0) }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { replace_when(x, c(TRUE, TRUE, TRUE) ~ 0, NULL, TRUE ~ 0) }) }) test_that("replace_when() retains the type of `x`", { x <- c(1L, 2L) # Not going towards common type of double expect_identical( replace_when(x, x == 1L ~ 0), c(0L, 2L) ) x <- factor(c("a", "b", "c")) # Note common type would be character expect_identical( replace_when(x, x == "a" ~ "c"), factor(c("c", "b", "c"), levels = c("a", "b", "c")) ) # Can't cast to unknown level expect_snapshot(error = TRUE, { replace_when(x, x == "a" ~ "d") }) # Error index is right when `NULL` is involved expect_snapshot(error = TRUE, { replace_when(x, x == "a" ~ "b", NULL, x == "b" ~ "d") }) }) test_that("replace_when() retains names of `x`, consistent with `base::replace()`", { x <- c(a = 1, b = 2, c = 3) expect_identical( replace_when( x, x == 1 ~ 0, x == 3 ~ c(z = 4) ), c(a = 0, b = 2, c = 4) ) # `x` does not become named if RHS inputs are named x <- c(1, 2, 3) expect_identical( replace_when( x, x == 1 ~ c(a = 0), x == 3 ~ c(b = 4) ), c(0, 2, 4) ) }) test_that("replace_when() does not allow named `...`", { # Purposefully stricter than `case_when()` expect_snapshot(error = TRUE, { replace_when(1, foo = TRUE ~ 2) }) }) test_that("replace_when() compacts `NULL` inputs", { expect_identical( replace_when(1, NULL, TRUE ~ 2, NULL), 2 ) }) test_that("replace_when() is a no-op with zero conditions", { # Unlike `case_when()`, where when zero conditions are supplied # we don't know what kind of vector to build (and we refuse to # build an `unspecified` vector, unlike `vec_case_when()`) expect_identical(replace_when(1), 1) expect_identical(replace_when(1, NULL), 1) }) test_that("replace_when() works with data frames", { x <- tibble(a = c(1, 2, 3, 1), b = c(2, 3, 4, 2)) expect_identical( replace_when( x, vec_equal(x, tibble(a = 1, b = 2)) ~ NA ), vec_assign(x, c(1, 4), NA) ) expect_identical( replace_when( x, vec_equal(x, tibble(a = 1, b = 2)) ~ tibble(a = 0, b = -1) ), vec_assign(x, c(1, 4), tibble(a = 0, b = -1)) ) }) dplyr/tests/testthat/test-filter.R0000644000176200001440000005374115137161765017021 0ustar liggesuserstest_that("filter handles passing ...", { df <- data.frame(x = 1:4) f <- function(...) { x1 <- 4 f1 <- function(y) y filter(df, ..., f1(x1) > x) } g <- function(...) { x2 <- 2 f(x > x2, ...) } res <- g() expect_equal(res$x, 3L) df <- group_by(df, x) res <- g() expect_equal(res$x, 3L) }) test_that("filter handles simple symbols", { df <- data.frame(x = 1:4, test = rep(c(T, F), each = 2)) res <- filter(df, test) gdf <- group_by(df, x) res <- filter(gdf, test) h <- function(data) { test2 <- c(T, T, F, F) filter(data, test2) } expect_equal(h(df), df[1:2, ]) f <- function(data, ...) { one <- 1 filter(data, test, x > one, ...) } g <- function(data, ...) { four <- 4 f(data, x < four, ...) } res <- g(df) expect_equal(res$x, 2L) expect_equal(res$test, TRUE) res <- g(gdf) expect_equal(res$x, 2L) expect_equal(res$test, TRUE) }) test_that("filter handlers scalar results", { expect_equal(filter(mtcars, min(mpg) > 0), mtcars, ignore_attr = TRUE) expect_equal( filter(group_by(mtcars, cyl), min(mpg) > 0), group_by(mtcars, cyl) ) }) test_that("filter and filter_out propagate attributes", { date.start <- ISOdate(2010, 01, 01, 0) test <- data.frame(Date = ISOdate(2010, 01, 01, 1:10)) test2 <- test |> filter(Date < ISOdate(2010, 01, 01, 5)) expect_equal(test$Date[1:4], test2$Date) test2 <- test |> filter_out(Date < ISOdate(2010, 01, 01, 5)) expect_equal(test$Date[5:10], test2$Date) }) test_that("filter and filter_out discards NA", { temp <- data.frame( i = 1:5, x = c(NA, 1L, 1L, 0L, 0L) ) res <- filter(temp, x == 1) expect_identical( res, data.frame(i = c(2L, 3L), x = c(1L, 1L)) ) res <- filter_out(temp, x == 1) expect_identical( res, data.frame(i = c(1L, 4L, 5L), x = c(NA, 0L, 0L)) ) }) test_that("date class remains on filter (#273)", { x1 <- x2 <- data.frame( date = seq.Date(as.Date("2013-01-01"), by = "1 days", length.out = 2), var = c(5, 8) ) x1.filter <- x1 |> filter(as.Date(date) > as.Date("2013-01-01")) x2$date <- x2$date + 1 x2.filter <- x2 |> filter(as.Date(date) > as.Date("2013-01-01")) expect_equal(class(x1.filter$date), "Date") expect_equal(class(x2.filter$date), "Date") }) test_that("filter handles $ correctly (#278)", { d1 <- tibble( num1 = as.character(sample(1:10, 1000, T)), var1 = runif(1000), ) d2 <- data.frame(num1 = as.character(1:3), stringsAsFactors = FALSE) res1 <- d1 |> filter(num1 %in% c("1", "2", "3")) res2 <- d1 |> filter(num1 %in% d2$num1) expect_equal(res1, res2) }) test_that("filter() and filter_out() are still a union if no parameters are given", { # Justification is that `filter(df, ...)` performs a `pall(...)` style # operation over the `...` to determine which rows to keep. This defaults to # `TRUE` for each row when no inputs are provided, so in `filter()` all rows # are retained. Which implies that `filter_out()` retains no rows. expect_identical(filter(mtcars), mtcars) expect_identical(filter_out(mtcars), mtcars[0, ]) expect_identical(filter(mtcars, !!!list()), mtcars) expect_identical(filter_out(mtcars, !!!list()), mtcars[0, ]) }) test_that("$ does not end call traversing. #502", { # Suppose some analysis options are set much earlier in the script analysis_opts <- list(min_outcome = 0.25) # Generate some dummy data d <- expand.grid(Subject = 1:3, TrialNo = 1:2, Time = 1:3) |> as_tibble() |> arrange(Subject, TrialNo, Time) |> mutate(Outcome = (1:18 %% c(5, 7, 11)) / 10) # Do some aggregation trial_outcomes <- d |> group_by(Subject, TrialNo) |> summarise(MeanOutcome = mean(Outcome), .groups = "drop") left <- filter(trial_outcomes, MeanOutcome < analysis_opts$min_outcome) right <- filter(trial_outcomes, analysis_opts$min_outcome > MeanOutcome) expect_equal(left, right) }) test_that("filter handles POSIXlt", { datesDF <- read.csv( stringsAsFactors = FALSE, text = " X 2014-03-13 16:08:19 2014-03-13 16:16:23 2014-03-13 16:28:28 2014-03-13 16:28:54 " ) datesDF$X <- as.POSIXlt(datesDF$X) expect_equal( nrow(filter(datesDF, X > as.POSIXlt("2014-03-13"))), 4 ) }) test_that("filter handles complex vectors (#436)", { d <- data.frame(x = 1:10, y = 1:10 + 2i) expect_equal(filter(d, x < 4)$y, 1:3 + 2i) expect_equal(filter(d, Re(y) < 4)$y, 1:3 + 2i) }) test_that("%in% works as expected (#126)", { df <- tibble(a = c("a", "b", "ab"), g = c(1, 1, 2)) res <- df |> filter(a %in% letters) expect_equal(nrow(res), 2L) res <- df |> group_by(g) |> filter(a %in% letters) expect_equal(nrow(res), 2L) }) test_that("row_number does not segfault with example from #781", { z <- data.frame(a = c(1, 2, 3)) b <- "a" res <- z |> filter(row_number(b) == 2) expect_equal(nrow(res), 0L) }) test_that("row_number works on 0 length columns (#3454)", { expect_identical( mutate(tibble(), a = row_number()), tibble(a = integer()) ) }) test_that("filter does not alter expression (#971)", { my_filter <- ~ am == 1 expect_equal(my_filter[[2]][[2]], as.name("am")) }) test_that("hybrid evaluation handles $ correctly (#1134)", { df <- tibble(x = 1:10, g = rep(1:5, 2)) res <- df |> group_by(g) |> filter(x > min(df$x)) expect_equal(nrow(res), 9L) }) test_that("filter and filter_out correctly handle empty data frames (#782)", { expect_identical(filter(tibble(), TRUE), tibble()) expect_identical(filter(tibble(), FALSE), tibble()) expect_identical(filter_out(tibble(), TRUE), tibble()) expect_identical(filter_out(tibble(), FALSE), tibble()) }) test_that("filter(.,TRUE,TRUE) works (#1210)", { df <- data.frame(x = 1:5) expect_identical(filter(df, TRUE, TRUE), df) expect_identical(filter_out(df, TRUE, TRUE), df[0, , drop = FALSE]) }) test_that("filter, slice, and arrange preserves attributes (#1064)", { df <- structure( data.frame(x = 1:10, g1 = rep(1:2, each = 5), g2 = rep(1:5, 2)), meta = "this is important" ) res <- filter(df, x < 5) |> attr("meta") expect_equal(res, "this is important") res <- filter_out(df, x < 5) |> attr("meta") expect_equal(res, "this is important") res <- filter(df, x < 5, x > 4) |> attr("meta") expect_equal(res, "this is important") res <- filter_out(df, x < 5, x > 4) |> attr("meta") expect_equal(res, "this is important") res <- df |> slice(1:50) |> attr("meta") expect_equal(res, "this is important") res <- df |> arrange(x) |> attr("meta") expect_equal(res, "this is important") }) test_that("filter works with rowwise data (#1099)", { df <- tibble( First = c("string1", "string2"), Second = c("Sentence with string1", "something") ) res <- df |> rowwise() |> filter(grepl(First, Second, fixed = TRUE)) expect_equal(nrow(res), 1L) expect_equal(df[1, ], ungroup(res)) }) test_that("grouped filter handles indices (#880)", { res <- iris |> group_by(Species) |> filter(Sepal.Length > 5) res2 <- mutate(res, Petal = Petal.Width * Petal.Length) expect_equal(nrow(res), nrow(res2)) expect_equal(group_rows(res), group_rows(res2)) expect_equal(group_keys(res), group_keys(res2)) }) test_that("filter(FALSE) and filter_out(TRUE) handle indices", { indices <- list_of(integer(), integer(), integer(), .ptype = integer()) out <- mtcars |> group_by(cyl) |> filter(FALSE, .preserve = TRUE) |> group_rows() expect_identical(out, indices) out <- mtcars |> group_by(cyl) |> filter_out(TRUE, .preserve = TRUE) |> group_rows() expect_identical(out, indices) indices <- list_of(.ptype = integer()) out <- mtcars |> group_by(cyl) |> filter(FALSE, .preserve = FALSE) |> group_rows() expect_identical(out, indices) out <- mtcars |> group_by(cyl) |> filter_out(TRUE, .preserve = FALSE) |> group_rows() expect_identical(out, indices) }) test_that("filter handles S4 objects (#1366)", { env <- environment() Numbers <- suppressWarnings(setClass( "Numbers", slots = c(foo = "numeric"), contains = "integer", where = env )) setMethod("[", "Numbers", function(x, i, ...) { Numbers(unclass(x)[i, ...], foo = x@foo) }) on.exit(removeClass("Numbers", where = env)) df <- data.frame(x = Numbers(1:10, foo = 10)) res <- filter(df, x > 3) expect_s4_class(res$x, "Numbers") expect_equal(res$x@foo, 10) }) test_that("hybrid lag and default value for string columns work (#1403)", { res <- mtcars |> mutate(xx = LETTERS[gear]) |> filter(xx == lag(xx, default = "foo")) xx <- LETTERS[mtcars$gear] ok <- xx == lag(xx, default = "foo") expect_equal(xx[ok], res$xx) res <- mtcars |> mutate(xx = LETTERS[gear]) |> filter(xx == lead(xx, default = "foo")) xx <- LETTERS[mtcars$gear] ok <- xx == lead(xx, default = "foo") expect_equal(xx[ok], res$xx) }) # .data and .env tests now in test-hybrid-traverse.R test_that("filter handles raw vectors (#1803)", { df <- tibble(a = 1:3, b = as.raw(1:3)) expect_identical(filter(df, a == 1), tibble(a = 1L, b = as.raw(1))) expect_identical(filter(df, b == 1), tibble(a = 1L, b = as.raw(1))) }) test_that("`vars` attribute is not added if empty (#2772)", { expect_identical(tibble(x = 1:2) |> filter(x == 1), tibble(x = 1L)) }) test_that("filter handles list columns", { res <- tibble(a = 1:2, x = list(1:10, 1:5)) |> filter(a == 1) |> pull(x) expect_equal(res, list(1:10)) res <- tibble(a = 1:2, x = list(1:10, 1:5)) |> group_by(a) |> filter(a == 1) |> pull(x) expect_equal(res, list(1:10)) }) test_that("hybrid function row_number does not trigger warning in filter (#3750)", { out <- tryCatch( { mtcars |> filter(row_number() > 1, row_number() < 5) TRUE }, warning = function(w) FALSE ) expect_true(out) }) test_that("filter() and filter_out() preserve order across groups (#3989)", { df <- tibble( g = c(1, 2, 1, 2, 1), time = 5:1, x = 5:1 ) res1 <- df |> group_by(g) |> filter(x <= 4) |> arrange(time) res2 <- df |> group_by(g) |> arrange(time) |> filter(x <= 4) res3 <- df |> filter(x <= 4) |> arrange(time) |> group_by(g) expect_identical(res1$time, 1:4) expect_equal(res1, res2) expect_equal(res1, res3) res1 <- df |> group_by(g) |> filter_out(x <= 2) |> arrange(time) res2 <- df |> group_by(g) |> arrange(time) |> filter_out(x <= 2) res3 <- df |> filter_out(x <= 2) |> arrange(time) |> group_by(g) expect_identical(res1$time, 3:5) expect_equal(res1, res2) expect_equal(res1, res3) }) test_that("filter() with two conditions does not freeze (#4049)", { expect_identical( iris |> filter(Sepal.Length > 7, Petal.Length < 6), iris |> filter(Sepal.Length > 7 & Petal.Length < 6) ) }) test_that("filter() handles matrix and data frame columns (#3630)", { df <- tibble( x = 1:2, y = matrix(1:4, ncol = 2), z = data.frame(A = 1:2, B = 3:4) ) expect_equal(filter(df, x == 1), df[1, ]) expect_equal(filter(df, y[, 1] == 1), df[1, ]) expect_equal(filter(df, z$A == 1), df[1, ]) gdf <- group_by(df, x) expect_equal(filter(gdf, x == 1), gdf[1, ]) expect_equal(filter(gdf, y[, 1] == 1), gdf[1, ]) expect_equal(filter(gdf, z$A == 1), gdf[1, ]) gdf <- group_by(df, y) expect_equal(filter(gdf, x == 1), gdf[1, ]) expect_equal(filter(gdf, y[, 1] == 1), gdf[1, ]) expect_equal(filter(gdf, z$A == 1), gdf[1, ]) gdf <- group_by(df, z) expect_equal(filter(gdf, x == 1), gdf[1, ]) expect_equal(filter(gdf, y[, 1] == 1), gdf[1, ]) expect_equal(filter(gdf, z$A == 1), gdf[1, ]) }) test_that("filter() and filter_out() handle named logical (#4638)", { tbl <- tibble(a = c(a = TRUE)) expect_identical(filter(tbl, a), tbl) tbl <- tibble(a = c(a = FALSE)) expect_identical(filter_out(tbl, a), tbl) }) test_that("filter() allows named constants that resolve to logical vectors (#4612)", { filters <- mtcars |> transmute( cyl %in% 6:8, hp / drat > 50 ) expect_identical( mtcars |> filter(!!!filters), mtcars |> filter(!!!unname(filters)) ) }) test_that("filter() and filter_out() allow 1 dimension arrays", { df <- tibble(x = array(c(TRUE, FALSE, TRUE))) expect_identical(filter(df, x), df[c(1, 3), ]) expect_identical(filter_out(df, x), df[2, ]) }) test_that("filter() and filter_out() allow matrices with 1 column with a deprecation warning (#6091)", { df <- tibble(x = 1:2) expect_snapshot({ out <- filter(df, matrix(c(TRUE, FALSE), nrow = 2)) }) expect_identical(out, tibble(x = 1L)) expect_snapshot({ out <- filter_out(df, matrix(c(TRUE, FALSE), nrow = 2)) }) expect_identical(out, tibble(x = 2L)) # Only warns once when grouped df <- tibble(x = c(1, 1, 2, 2), y = c(1, 2, 3, 4)) gdf <- group_by(df, x) expect_snapshot({ out <- filter(gdf, matrix(c(TRUE, FALSE), nrow = 2)) }) expect_identical(out, group_by(tibble(x = c(1, 2), y = c(1, 3)), x)) expect_snapshot({ out <- filter_out(gdf, matrix(c(TRUE, FALSE), nrow = 2)) }) expect_identical(out, group_by(tibble(x = c(1, 2), y = c(2, 4)), x)) }) test_that("filter() and filter_out() disallow matrices with >1 column", { df <- tibble(x = 1:3) expect_snapshot(error = TRUE, { filter(df, matrix(TRUE, nrow = 3, ncol = 2)) }) expect_snapshot(error = TRUE, { filter_out(df, matrix(TRUE, nrow = 3, ncol = 2)) }) }) test_that("filter() and filter_out() disallow arrays with >2 dimensions", { df <- tibble(x = 1:3) expect_snapshot(error = TRUE, { filter(df, array(TRUE, dim = c(3, 1, 1))) }) expect_snapshot(error = TRUE, { filter_out(df, array(TRUE, dim = c(3, 1, 1))) }) }) test_that("filter() gives useful error messages", { expect_snapshot({ # wrong type (expect_error( iris |> group_by(Species) |> filter(1:n()) )) (expect_error( iris |> filter(1:n()) )) # matrix with > 1 columns (expect_error( filter( data.frame(x = 1:2), matrix(c(TRUE, FALSE, TRUE, FALSE), nrow = 2) ) )) # wrong size (expect_error( iris |> group_by(Species) |> filter(c(TRUE, FALSE)) )) (expect_error( iris |> rowwise(Species) |> filter(c(TRUE, FALSE)) )) (expect_error( iris |> filter(c(TRUE, FALSE)) )) # evaluation error (expect_error( mtcars |> filter(`_x`) )) (expect_error( mtcars |> group_by(cyl) |> filter(`_x`) )) # named inputs (expect_error( filter(mtcars, x = 1) )) (expect_error( filter(mtcars, y > 2, z = 3) )) (expect_error( filter(mtcars, TRUE, x = 1) )) # ts (expect_error( filter(ts(1:10)) )) # Error that contains { (expect_error( tibble() |> filter(stop("{")) )) }) }) test_that("Using data frames in `filter()` is defunct (#7758)", { df <- data.frame(x = 1, y = 1) gdf <- group_by(df, x) rdf <- rowwise(df, x) # Use `if_any()` or `if_all()`, not `across()` expect_snapshot(error = TRUE, { filter(df, across(everything(), ~ .x > 0)) }) expect_snapshot(error = TRUE, { filter(gdf, across(everything(), ~ .x > 0)) }) expect_snapshot(error = TRUE, { filter(rdf, across(everything(), ~ .x > 0)) }) # Can't filter with a data frame of logicals (same as the `across()` case) expect_snapshot(error = TRUE, { filter(df, tibble(x > 0, y > 0)) }) expect_snapshot(error = TRUE, { filter(gdf, tibble(x > 0, y > 0)) }) expect_snapshot(error = TRUE, { filter(rdf, tibble(x > 0, y > 0)) }) }) test_that("filter and filter_out preserve grouping", { gf <- group_by(tibble(g = c(1, 1, 1, 2, 2), x = 1:5), g) i <- count_regroups(out <- filter(gf, x %in% c(3, 4))) expect_equal(i, 0L) expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(1L, 2L)) i <- count_regroups(out <- filter_out(gf, x %in% c(3, 4))) expect_equal(i, 0L) expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(1:2, 3L)) i <- count_regroups(out <- filter(gf, x < 3)) expect_equal(i, 0L) expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(c(1L, 2L))) i <- count_regroups(out <- filter_out(gf, x < 3)) expect_equal(i, 0L) expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(1L, 2:3)) }) test_that("filter() and filter_out() with empty dots still calls dplyr_row_slice()", { tbl <- new_tibble(list(x = 1), nrow = 1L) foo <- structure(tbl, class = c("foo_df", class(tbl))) local_methods( # `foo_df` always loses class when row slicing dplyr_row_slice.foo_df = function(data, i, ...) { out <- NextMethod() new_tibble(out, nrow = nrow(out)) } ) expect_s3_class(filter(foo), class(tbl), exact = TRUE) expect_s3_class(filter_out(foo), class(tbl), exact = TRUE) expect_s3_class(filter(foo, x == 1), class(tbl), exact = TRUE) expect_s3_class(filter_out(foo, x == 1), class(tbl), exact = TRUE) }) test_that("can filter() with unruly class", { local_methods( `[.dplyr_foobar` = function(x, i, ...) new_dispatched_quux(vec_slice(x, i)), dplyr_row_slice.dplyr_foobar = function(x, i, ...) x[i, ] ) df <- foobar(data.frame(x = 1:3)) expect_identical( filter(df, x <= 2), quux(data.frame(x = 1:2, dispatched = TRUE)) ) }) test_that("filter() preserves the call stack on error (#5308)", { foobar <- function() stop("foo") stack <- NULL expect_error( withCallingHandlers( error = function(...) stack <<- sys.calls(), filter(mtcars, foobar()) ) ) expect_true(some(stack, is_call, "foobar")) }) test_that("if_any() and if_all() work", { df <- tibble(x1 = 1:10, x2 = c(1:5, 10:6)) expect_equal( filter(df, if_all(starts_with("x"), ~ . > 6)), filter(df, x1 > 6 & x2 > 6) ) expect_equal( filter_out(df, if_all(starts_with("x"), ~ . > 6)), filter_out(df, x1 > 6 & x2 > 6) ) expect_equal( filter(df, if_any(starts_with("x"), ~ . > 6)), filter(df, x1 > 6 | x2 > 6) ) expect_equal( filter_out(df, if_any(starts_with("x"), ~ . > 6)), filter_out(df, x1 > 6 | x2 > 6) ) }) test_that("filter and filter_out keep zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_identical(group_size(filter(df, f == 1)), c(2L, 0L, 0L)) expect_identical(group_size(filter_out(df, f == 1)), c(0L, 2L, 0L)) }) test_that("filtering retains labels for zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal( ungroup(count(filter(df, f == 1))), tibble( e = 1, f = factor(1:3), g = c(1, 2, NA), n = c(2L, 0L, 0L) ) ) expect_equal( ungroup(count(filter_out(df, f == 1))), tibble( e = 1, f = factor(1:3), g = c(1, 2, NA), n = c(0L, 2L, 0L) ) ) }) test_that("`filter()` doesn't allow data frames with missing or empty names (#6758)", { df1 <- new_data_frame(set_names(list(1), "")) df2 <- new_data_frame(set_names(list(1), NA_character_)) expect_snapshot(error = TRUE, { filter(df1) }) expect_snapshot(error = TRUE, { filter_out(df1) }) expect_snapshot(error = TRUE, { filter(df2) }) expect_snapshot(error = TRUE, { filter_out(df2) }) }) test_that("`filter()` and `filter_out()` are complements", { df <- tibble( x = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, NA, NA, NA), y = c(TRUE, FALSE, NA, TRUE, FALSE, NA, TRUE, FALSE, NA) ) # Important invariant is that these are equivalent up to row ordering # `union(filter(df, ...), filter_out(df, ...)) ~= df` expect_identical( union(filter(df, x, y), filter_out(df, x, y)) |> arrange(x, y), df |> arrange(x, y) ) }) # .by ------------------------------------------------------------------------- test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) out <- filter(df, x > mean(x), .by = g) expect_identical(out$g, c(1, 2)) expect_identical(out$x, c(10, 3)) expect_s3_class(out, class(df), exact = TRUE) out <- filter_out(df, x > mean(x), .by = g) expect_identical(out$g, c(1, 2, 1)) expect_identical(out$x, c(5, 1, 2)) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains bare data.frame class", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) out <- filter(df, x > mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) out <- filter_out(df, x > mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains data frame attributes", { # With data.frames or tibbles df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) tbl <- as_tibble(df) attr(df, "foo") <- "bar" attr(tbl, "foo") <- "bar" out <- filter(df, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") out <- filter_out(df, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") out <- filter(tbl, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") out <- filter_out(tbl, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") }) test_that("can't use `.by` with `.preserve`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { filter(df, .by = x, .preserve = TRUE) }) expect_snapshot(error = TRUE, { filter_out(df, .by = x, .preserve = TRUE) }) }) test_that("catches `.by` with grouped-df", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { filter(gdf, .by = x) }) expect_snapshot(error = TRUE, { filter_out(gdf, .by = x) }) }) test_that("catches `.by` with rowwise-df", { df <- tibble(x = 1) rdf <- rowwise(df) expect_snapshot(error = TRUE, { filter(rdf, .by = x) }) expect_snapshot(error = TRUE, { filter_out(rdf, .by = x) }) }) test_that("catches `by` typo (#6647)", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { filter(df, by = x) }) expect_snapshot(error = TRUE, { filter_out(df, by = x) }) }) dplyr/tests/testthat/test-desc.R0000644000176200001440000000013414366556340016436 0ustar liggesuserstest_that("errors cleanly on non-vectors", { expect_snapshot(desc(mean), error = TRUE) }) dplyr/tests/testthat/test-join-cols.R0000644000176200001440000001721015106134104017377 0ustar liggesuserstest_that("key vars are found", { vars <- join_cols(c("x", "y"), c("x", "z"), by = join_by(x)) expect_equal(vars$x$key, c(x = 1L)) expect_equal(vars$y$key, c(x = 1L)) vars <- join_cols(c("a", "x", "b"), c("x", "a"), by = join_by(x)) expect_equal(vars$x$key, c(x = 2L)) expect_equal(vars$y$key, c(x = 1L)) vars <- join_cols(c("x", "y"), c("a", "x", "z"), by = join_by(y == z)) expect_equal(vars$x$key, c(y = 2L)) expect_equal(vars$y$key, c(z = 3L)) vars <- join_cols(c("x", "y"), c("a", "x", "z"), by = join_by(y >= z)) expect_equal(vars$x$key, c(y = 2L)) expect_equal(vars$y$key, c(z = 3L)) }) test_that("y key matches order and names of x key", { vars <- join_cols( c("x", "y", "z"), c("c", "b", "a"), by = join_by(x == a, y == b) ) expect_equal(vars$x$key, c(x = 1L, y = 2L)) expect_equal(vars$y$key, c(a = 3L, b = 2L)) }) test_that("duplicate column names are given suffixes", { vars <- join_cols(c("x", "y"), c("x", "y"), by = join_by(x)) expect_equal(vars$x$out, c("x" = 1, "y.x" = 2)) expect_equal(vars$y$out, c("y.y" = 2)) # including join vars when keep = TRUE vars <- join_cols(c("x", "y"), c("x", "y"), by = join_by(x), keep = TRUE) expect_equal(vars$x$out, c("x.x" = 1, "y.x" = 2)) expect_equal(vars$y$out, c("x.y" = 1, "y.y" = 2)) vars <- join_cols(c("x", "y"), c("x", "y"), by = join_by(x < x), keep = TRUE) expect_equal(vars$x$out, c("x.x" = 1, "y.x" = 2)) expect_equal(vars$y$out, c("x.y" = 1, "y.y" = 2)) # suffixes don't create duplicates vars <- join_cols(c("x", "y", "y.x"), c("x", "y"), by = join_by(x)) expect_equal(vars$x$out, c("x" = 1, "y.x" = 2, "y.x.x" = 3)) expect_equal(vars$y$out, c("y.y" = 2)) # but not when they're the join vars vars <- join_cols(c("A", "A.x"), c("B", "A.x", "A"), by = join_by(A.x)) expect_named(vars$x$out, c("A.x.x", "A.x")) expect_named(vars$y$out, c("B", "A.y")) # or when no suffix is requested vars <- join_cols( c("x", "y"), c("x", "y"), by = join_by(x), suffix = c("", ".y") ) expect_equal(vars$x$out, c("x" = 1, "y" = 2)) expect_equal(vars$y$out, c("y.y" = 2)) }) test_that("duplicate non-equi key columns are given suffixes", { vars <- join_cols( c("a", "y", "z"), c("b", "y", "z"), by = join_by(y >= y, z <= z) ) expect_equal(vars$x$out, c("a" = 1, "y.x" = 2, "z.x" = 3)) expect_equal(vars$y$out, c("b" = 1, "y.y" = 2, "z.y" = 3)) }) test_that("NA names are preserved", { vars <- join_cols(c("x", NA), c("x", "z"), by = join_by(x)) expect_named(vars$x$out, c("x", NA)) vars <- join_cols(c("x", NA), c("x", NA), by = join_by(x)) expect_named(vars$x$out, c("x", "NA.x")) expect_named(vars$y$out, "NA.y") }) test_that("by default, `by` columns omitted from `y` with equi-conditions, but not non-equi conditions", { # equi keys always keep the LHS name, regardless of whether of not a duplicate exists in the RHS # non-equi keys will get a suffix if a duplicate exists vars <- join_cols( c("x", "y", "z"), c("x", "y", "z"), by = join_by(x == y, y > z) ) expect_equal(vars$x$out, c("x" = 1, "y" = 2, "z.x" = 3)) expect_equal(vars$y$out, c("x.y" = 1, "z.y" = 3)) # unless specifically requested with `keep = TRUE` vars <- join_cols( c("x", "y", "z"), c("x", "y", "z"), by = join_by(x == y, y > z), keep = TRUE ) expect_equal(vars$x$out, c("x.x" = 1, "y.x" = 2, "z.x" = 3)) expect_equal(vars$y$out, c("x.y" = 1, "y.y" = 2, "z.y" = 3)) }) test_that("can't mix non-equi conditions with `keep = FALSE` (#6499)", { expect_snapshot(error = TRUE, { join_cols(c("x", "y"), c("x", "z"), by = join_by(x, y > z), keep = FALSE) }) expect_snapshot(error = TRUE, { join_cols( c("xl", "xu"), c("yl", "yu"), by = join_by(xl >= yl, xu < yu), keep = FALSE ) }) # Doesn't make sense here. # With right/full joins we'd have to merge both `yl` and `yu` into `x` somehow. expect_snapshot(error = TRUE, { join_cols( "x", c("yl", "yu"), by = join_by(between(x, yl, yu)), keep = FALSE ) }) # Doesn't make sense here. # With right/full joins, based on how the binary conditions are generated # we'd merge: # - `yu` into `xl` # - `yl` into `xu` # Which can result in `xl` and `xu` columns that don't maintain a `xl <= xu` # invariant. expect_snapshot(error = TRUE, { join_cols( c("xl", "xu"), c("yl", "yu"), by = join_by(overlaps(xl, xu, yl, yu)), keep = FALSE ) }) }) test_that("can duplicate key between non-equi conditions", { vars <- join_cols("x", c("xl", "xu"), by = join_by(x > xl, x < xu)) expect_identical(vars$x$key, c(x = 1L, x = 1L)) expect_identical(vars$x$out, c(x = 1L)) expect_identical(vars$y$key, c(xl = 1L, xu = 2L)) expect_identical(vars$y$out, c(xl = 1L, xu = 2L)) expect_identical( join_cols("x", c("xl", "xu"), by = join_by(x > xl, x < xu), keep = NULL), join_cols("x", c("xl", "xu"), by = join_by(x > xl, x < xu), keep = TRUE) ) }) test_that("can't duplicate key between equi condition and non-equi condition", { expect_snapshot( error = TRUE, join_cols("x", c("xl", "xu"), by = join_by(x > xl, x == xu)) ) expect_snapshot( error = TRUE, join_cols(c("xl", "xu"), "x", by = join_by(xl < x, xu == x)) ) }) test_that("emits useful messages", { # names expect_snapshot(error = TRUE, join_cols(c("x", "y"), c("y", "y"), join_by(y))) expect_snapshot(error = TRUE, join_cols(c("y", "y"), c("x", "y"), join_by(y))) xy <- c("x", "y") xyz <- c("x", "y", "z") # join vars errors expect_snapshot( error = TRUE, join_cols(xy, xy, by = as_join_by(list("1", y = "2"))) ) expect_snapshot( error = TRUE, join_cols(xy, xy, by = as_join_by(list(x = "1", "2"))) ) expect_snapshot(error = TRUE, join_cols(xy, xy, by = as_join_by(c("x", NA)))) expect_snapshot( error = TRUE, join_cols(xy, xy, by = as_join_by(c("aaa", "bbb"))) ) # join vars uniqueness expect_snapshot( error = TRUE, join_cols(xy, xy, by = as_join_by(c("x", "x", "x"))) ) expect_snapshot(error = TRUE, join_cols(xyz, xyz, by = join_by(x, x > y, z))) # suffixes expect_snapshot( error = TRUE, join_cols(xy, xy, by = join_by(x), suffix = "x") ) expect_snapshot( error = TRUE, join_cols(xy, xy, by = join_by(x), suffix = c("", NA)) ) }) # ------------------------------------------------------------------------------ # join_cast_common() test_that("takes common type", { x <- tibble(a = 1, b = 2L) y <- tibble(a = 1L, b = 3) vars <- join_cols(names(x), names(y), by = join_by(a, b)) out <- join_cast_common(x, y, vars) expect_identical(out$x, tibble(a = 1, b = 2)) expect_identical(out$y, tibble(a = 1, b = 3)) }) test_that("finalizes unspecified columns (#6804)", { vars <- join_cols(x_names = "x", y_names = "x", by = join_by(x)) x <- tibble(x = NA) y <- tibble(x = NA) out <- join_cast_common(x, y, vars) expect_identical(out$x, tibble(x = NA)) expect_identical(out$y, tibble(x = NA)) x <- tibble(x = NA) y <- tibble(x = unspecified()) out <- join_cast_common(x, y, vars) expect_identical(out$x, tibble(x = NA)) expect_identical(out$y, tibble(x = logical())) x <- tibble(x = unspecified()) y <- tibble(x = unspecified()) out <- join_cast_common(x, y, vars) expect_identical(out$x, tibble(x = logical())) expect_identical(out$y, tibble(x = logical())) }) test_that("references original column in `y` when there are type errors (#6465)", { x <- tibble(a = 1) y <- tibble(b = "1") x_key <- x y_key <- set_names(y, names(x)) vars <- join_cols(names(x), names(y), by = join_by(a == b)) expect_snapshot({ (expect_error(join_cast_common(x_key, y_key, vars))) }) }) dplyr/tests/testthat/test-conditions.R0000644000176200001440000000766415106134104017667 0ustar liggesuserstest_that("can pass verb-level error call", { dplyr_local_error_call(call("foo")) expect_snapshot(error = TRUE, { mutate(mtcars, 1 + "") transmute(mtcars, 1 + "") summarise(mtcars, 1 + "") summarise(group_by(mtcars, cyl), 1 + "") filter(mtcars, 1 + "") arrange(mtcars, 1 + "") select(mtcars, 1 + "") slice(mtcars, 1 + "") }) }) test_that("can pass verb-level error call (example case)", { my_verb <- function(data, var1, var2) { dplyr_local_error_call() pull(transmute(data, .result = {{ var1 }} * {{ var2 }})) } expect_snapshot(error = TRUE, { my_verb(mtcars, 1 + "", am) my_verb(mtcars, cyl, c(am, vs)) }) }) test_that("`err_locs()` works as expected", { expect_snapshot(error = TRUE, err_locs(1.5)) expect_snapshot(error = TRUE, err_locs(integer())) expect_snapshot({ err_locs(1L) err_locs(1:5) err_locs(1:6) err_locs(1:7) }) }) test_that("errors during dots collection are not enriched (#6178)", { expect_snapshot(error = TRUE, { mutate(mtcars, !!foobarbaz()) transmute(mtcars, !!foobarbaz()) select(mtcars, !!foobarbaz()) arrange(mtcars, !!foobarbaz()) filter(mtcars, !!foobarbaz()) }) }) test_that("warnings are collected for `last_dplyr_warnings()`", { skip_if_not_installed("base", "3.6.0") local_options( rlang_trace_format_srcrefs = FALSE ) df <- tibble(id = 1:2) f <- function() { warning("msg") 1 } reset_dplyr_warnings() expect_snapshot({ "Ungrouped" df |> mutate(x = f()) |> invisible() last_dplyr_warnings() }) reset_dplyr_warnings() expect_snapshot({ "Grouped" df |> group_by(id) |> mutate(x = f()) |> invisible() last_dplyr_warnings() }) reset_dplyr_warnings() expect_snapshot({ "Rowwise" df |> rowwise() |> mutate(x = f()) |> invisible() last_dplyr_warnings() }) reset_dplyr_warnings() expect_snapshot({ "Multiple type of warnings within multiple verbs" df |> group_by(g = f():n()) |> rowwise() |> mutate(x = f()) |> group_by(id) |> mutate(x = f()) |> invisible() last_dplyr_warnings() }) reset_dplyr_warnings() expect_snapshot({ "Truncated (1 more)" df |> rowwise() |> mutate(x = f()) last_dplyr_warnings(n = 1) }) reset_dplyr_warnings() expect_snapshot({ "Truncated (several more)" df <- tibble(id = 1:5) df |> rowwise() |> mutate(x = f()) last_dplyr_warnings(n = 1) }) }) test_that("complex backtraces with base and rlang warnings", { skip_if_not_installed("base", "3.6.0") local_options( rlang_trace_format_srcrefs = FALSE ) reset_dplyr_warnings() df <- tibble(id = 1:3) f <- function(...) g(...) g <- function(...) h(...) h <- function(x, base = TRUE) { if (base) { warning("foo") } else { warn("foo") } x } foo <- function() bar() bar <- function() { df |> group_by(x = f(1):n()) |> mutate(x = f(1, base = FALSE)) } expect_snapshot({ foo() last_dplyr_warnings() }) }) test_that("`last_dplyr_warnings()` only records 5 backtraces", { reset_dplyr_warnings() f <- function() { warning("msg") 1 } df <- tibble(id = 1:10) expect_warning( df |> group_by(id) |> mutate(x = f()) ) warnings <- last_dplyr_warnings(Inf) traces <- map(warnings, `[[`, "trace") expect_equal( sum(map_lgl(traces, is_null)), 5 ) }) test_that("can collect warnings in main verbs", { reset_dplyr_warnings() f <- function() { warning("foo") TRUE } expect_snapshot({ invisible( mtcars |> rowwise() |> filter(f()) |> arrange(f()) |> mutate(a = f()) |> summarise(b = f()) ) warnings <- last_dplyr_warnings(Inf) warnings[[1]] # filter() warnings[[33]] # arrange() warnings[[65]] # mutate() warnings[[97]] # summarise() }) }) dplyr/tests/testthat/test-n-distinct.R0000644000176200001440000000262214366556340017600 0ustar liggesuserstest_that("n_distinct() counts empty inputs", { expect_equal(n_distinct(NULL), 0) expect_equal(n_distinct(data.frame()), 0) }) test_that("n_distinct() counts unique values in simple vectors", { expect_equal(n_distinct(c(TRUE, FALSE, NA)), 3) expect_equal(n_distinct(c(1, 2, NA)), 3) expect_equal(n_distinct(c(1L, 2L, NA)), 3) expect_equal(n_distinct(c("x", "y", NA)), 3) }) test_that("n_distinct() counts unique combinations", { expect_equal(n_distinct(c(1, 1, 1), c(2, 2, 2)), 1) expect_equal(n_distinct(c(1, 1, 2), c(1, 2, 2)), 3) }) test_that("n_distinct() handles data frames", { expect_equal(n_distinct(data.frame(c(1, 1, 1), c(2, 2, 2))), 1) expect_equal(n_distinct(data.frame(c(1, 1, 2), c(1, 2, 2))), 3) }) test_that("n_distinct() can drop missing values", { expect_equal(n_distinct(NA, na.rm = TRUE), 0) expect_equal(n_distinct(c(NA, 0), na.rm = TRUE), 1) expect_equal(n_distinct(c(NA, 0), c(0, NA), na.rm = TRUE), 0) expect_equal(n_distinct(c(NA, 0), c(0, 0), na.rm = TRUE), 1) # check tibbles unpacked correctly expect_equal(n_distinct(1, tibble(x = 2, y = NA), na.rm = TRUE), 0) }) test_that("n_distinct() follows recycling rules", { expect_equal(n_distinct(double(), 1), 0) expect_equal(n_distinct(1:2, 1), 2) }) test_that("n_distinct() generates useful errors", { expect_snapshot(error = TRUE, { n_distinct() n_distinct(x = 1:4) n_distinct(mean) }) }) dplyr/tests/testthat/test-by.R0000644000176200001440000000471514366556340016143 0ustar liggesuserstest_that("computes group data when `by` is set", { df <- tibble(x = c(1, 1, 2, 2, 1)) out <- compute_by(by = x, data = df) expect_identical(out$type, "grouped") expect_identical(out$names, "x") expect_identical( out$data, tibble(x = c(1, 2), ".rows" := list_of(c(1L, 2L, 5L), c(3L, 4L))) ) }) test_that("computes `by` group data in order of appearance", { df <- tibble( x = c(5, 4, 5, 5), y = c(2, 3, 1, 2) ) out <- compute_by(by = c(x, y), data = df) expect <- tibble( x = c(5, 4, 5), y = c(2, 3, 1), ".rows" := list_of(c(1L, 4L), 2L, 3L) ) expect_identical(out$data, expect) }) test_that("extracts existing data when `by = NULL`", { df <- data.frame(x = c(1, 1, 2, 2, 1)) out <- compute_by(by = NULL, data = df) expect_identical(out$type, "ungrouped") expect_identical(out$names, character()) # `compute_by()` is always type stable on `$data` and returns a bare tibble expect_identical(out$data, as_tibble(group_data(df))) df <- tibble(x = c(1, 1, 2, 2, 1)) out <- compute_by(by = NULL, data = df) expect_identical(out$type, "ungrouped") expect_identical(out$names, character()) expect_identical(out$data, group_data(df)) gdf <- group_by(df, x) out <- compute_by(by = NULL, data = gdf) expect_identical(out$type, "grouped") expect_identical(out$names, "x") expect_identical(out$data, group_data(gdf)) rdf <- rowwise(df) out <- compute_by(by = NULL, data = rdf) expect_identical(out$type, "rowwise") expect_identical(out$names, character()) expect_identical(out$data, group_data(rdf)) }) test_that("empty selection results in ungrouped group data", { df <- tibble(x = 1) out <- compute_by(by = c(), data = df) expect_identical(out$type, "ungrouped") expect_identical(out$names, character()) expect_identical(out$data, group_data(df)) }) test_that("throws tidyselect errors", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { compute_by(by = y, data = df) }) }) test_that("can't set `.by` with a grouped-df", { df <- tibble(x = 1:5) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { compute_by(x, gdf) }) }) test_that("can't set `.by` with a rowwise-df", { df <- tibble(x = 1:5) rdf <- rowwise(df) expect_snapshot(error = TRUE, { compute_by(x, rdf) }) }) test_that("can tweak the error args", { df <- tibble(x = 1:5) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { compute_by(x, gdf, by_arg = "x", data_arg = "dat") }) }) dplyr/tests/testthat/test-funs.R0000644000176200001440000001042615106134104016457 0ustar liggesusers# between ------------------------------------------------------------------- test_that("returns NA if any argument is NA", { na <- NA_real_ expect_equal(between(1, 1, na), NA) expect_equal(between(1, na, 1), NA) expect_equal(between(na, 1, 1), NA) }) test_that("can be vectorized along `left` and `right`", { expect_identical(between(1:2, c(0L, 4L), 5L), c(TRUE, FALSE)) expect_identical(between(1:2, 0L, c(0L, 3L)), c(FALSE, TRUE)) }) test_that("compatible with base R", { x <- runif(1e3) expect_equal(between(x, 0.25, 0.5), x >= 0.25 & x <= 0.5) }) test_that("works with S3 objects", { x <- new_vctr(c(1, 5), class = "foo") left <- new_vctr(0, class = "foo") right <- new_vctr(3, class = "foo") expect_identical(between(x, left, right), c(TRUE, FALSE)) }) test_that("works with date-time `x` and date `left/right` (#6183)", { jan2 <- as.POSIXct("2022-01-02", tz = "UTC") jan1 <- as.Date("2022-01-01") jan3 <- as.Date("2022-01-03") expect_true(between(jan2, jan1, jan3)) }) test_that("works with data frames", { x <- tibble(year = c(2020, 2020, 2021), month = c(1, 3, 6)) left <- tibble(year = c(2019, 2020, 2021), month = c(1, 4, 3)) right <- tibble(year = c(2020, 2020, 2022), month = c(1, 6, 3)) expect_identical(between(x, left, right), c(TRUE, FALSE, TRUE)) }) test_that("works with rcrds", { x <- new_rcrd(list(year = c(2020, 2020, 2021), month = c(1, 3, 6))) left <- new_rcrd(list(year = c(2019, 2020, 2021), month = c(1, 4, 3))) right <- new_rcrd(list(year = c(2020, 2020, 2022), month = c(1, 6, 3))) expect_identical(between(x, left, right), c(TRUE, FALSE, TRUE)) }) test_that("takes the common type between all inputs (#6478)", { expect_identical(between(1L, 1.5, 2L), FALSE) expect_identical(between(1L, 0.5, 2.5), TRUE) expect_snapshot(error = TRUE, { between("1", 2, 3) }) expect_snapshot(error = TRUE, { between(1, "2", 3) }) expect_snapshot(error = TRUE, { between(1, 2, "3") }) }) test_that("recycles `left` and `right` to the size of `x`", { expect_snapshot(error = TRUE, { between(1:3, 1:2, 1L) }) expect_snapshot(error = TRUE, { between(1:3, 1L, 1:2) }) }) test_that("ptype argument works as expected with non-alphabetical ordered factors", { # Create an ordered factor with non-alphabetical order x <- factor( c("b", "c", "a", "d"), levels = c("d", "c", "b", "a"), ordered = TRUE ) # Test with ptype specified (uses factor order) expect_identical( between(x, "c", "a", ptype = x), c(TRUE, TRUE, TRUE, FALSE) ) # Test without ptype (uses alphabetical order) expect_identical( between(x, "c", "a"), c(FALSE, FALSE, FALSE, FALSE) ) }) test_that("ptype argument affects type casting", { x <- 1:5 expect_identical( between(x, 1.5, 3.5), c(FALSE, TRUE, TRUE, FALSE, FALSE) ) expect_snapshot(error = TRUE, { between(x, 1.5, 3.5, ptype = integer()) }) }) # cum* -------------------------------------------------------------------- test_that("cum(sum,min,max) return expected results for simple cases", { expect_equal(cummean(numeric()), numeric()) x <- c(5, 10, 2, 4) expect_equal(cummean(x), cumsum(x) / seq_along(x)) expect_equal(cumany(logical()), logical()) expect_equal(cumany(FALSE), FALSE) expect_equal(cumany(TRUE), TRUE) expect_equal(cumany(c(FALSE, FALSE)), c(FALSE, FALSE)) expect_equal(cumany(c(TRUE, FALSE)), c(TRUE, TRUE)) expect_equal(cumany(c(FALSE, TRUE)), c(FALSE, TRUE)) expect_equal(cumany(c(TRUE, TRUE)), c(TRUE, TRUE)) expect_equal(cumall(logical()), logical()) expect_equal(cumall(FALSE), FALSE) expect_equal(cumall(TRUE), TRUE) expect_equal(cumall(c(FALSE, FALSE)), c(FALSE, FALSE)) expect_equal(cumall(c(TRUE, FALSE)), c(TRUE, FALSE)) expect_equal(cumall(c(FALSE, TRUE)), c(FALSE, FALSE)) expect_equal(cumall(c(TRUE, TRUE)), c(TRUE, TRUE)) }) test_that("cumany/cumall propagate NAs (#408, #3749, #4132)", { expect_equal(cumall(c(NA, NA)), c(NA, NA)) expect_equal(cumall(c(NA, TRUE)), c(NA, NA)) expect_equal(cumall(c(NA, FALSE)), c(NA, FALSE)) expect_equal(cumany(c(NA, NA)), c(NA, NA)) expect_equal(cumany(c(NA, TRUE)), c(NA, TRUE)) expect_equal(cumany(c(NA, FALSE)), c(NA, NA)) }) test_that("cummean is not confused by FP error (#1387)", { a <- rep(99, 9) expect_true(all(cummean(a) == a)) }) dplyr/tests/testthat/test-join.R0000644000176200001440000005645215106134104016454 0ustar liggesusers# Basic properties -------------------------------------------------------- test_that("mutating joins preserve row and column order of x", { df1 <- data.frame(a = 1:3) df2 <- data.frame(b = 1, c = 2, a = 4:1) out <- inner_join(df1, df2, by = "a") expect_named(out, c("a", "b", "c")) expect_equal(out$a, 1:3) out <- left_join(df1, df2, by = "a") expect_named(out, c("a", "b", "c")) expect_equal(out$a, 1:3) out <- right_join(df1, df2, by = "a") expect_named(out, c("a", "b", "c")) expect_equal(out$a, 1:4) out <- full_join(df1, df2, by = "a") expect_named(out, c("a", "b", "c")) expect_equal(out$a, 1:4) }) test_that("even when column names change", { df1 <- data.frame(x = c(1, 1, 2, 3), z = 1:4, a = 1) df2 <- data.frame(z = 1:3, b = 1, x = c(1, 2, 4)) out <- inner_join(df1, df2, by = "x") expect_named(out, c("x", "z.x", "a", "z.y", "b")) }) test_that("filtering joins preserve row and column order of x (#2964)", { df1 <- data.frame(a = 4:1, b = 1) df2 <- data.frame(b = 1, c = 2, a = 2:3) out <- semi_join(df1, df2, by = "a") expect_named(out, c("a", "b")) expect_equal(out$a, 3:2) out <- anti_join(df1, df2, by = "a") expect_named(out, c("a", "b")) expect_equal(out$a, c(4L, 1L)) }) test_that("keys are coerced to symmetric type", { foo <- tibble(id = 1:2, var1 = "foo") bar <- tibble(id = as.numeric(1:2), var2 = "bar") expect_type(inner_join(foo, bar, by = "id")$id, "double") expect_type(inner_join(bar, foo, by = "id")$id, "double") foo <- tibble(id = factor(c("a", "b")), var1 = "foo") bar <- tibble(id = c("a", "b"), var2 = "bar") expect_type(inner_join(foo, bar, by = "id")$id, "character") expect_type(inner_join(bar, foo, by = "id")$id, "character") }) test_that("factor keys are coerced to the union factor type", { df1 <- tibble(x = 1, y = factor("a")) df2 <- tibble(x = 2, y = factor("b")) out <- full_join(df1, df2, by = c("x", "y")) expect_equal(out$y, factor(c("a", "b"))) }) test_that("keys of non-equi conditions are not coerced if `keep = NULL`", { foo <- tibble(id = factor(c("a", "b")), col1 = c(1, 2), var1 = "foo") bar <- tibble(id = c("a", "b"), col2 = c(1L, 2L), var2 = "bar") out <- inner_join(foo, bar, by = join_by(id, col1 >= col2)) expect_type(out$id, "character") expect_type(out$col1, "double") expect_type(out$col2, "integer") out <- inner_join(bar, foo, by = join_by(id, col2 <= col1)) expect_type(out$id, "character") expect_type(out$col1, "double") expect_type(out$col2, "integer") }) test_that("when keep = TRUE, left_join() preserves both sets of keys", { # when keys have different names df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(x = c(3, 4), y = c(3, 4)) out <- left_join(df1, df2, by = c("a" = "x"), keep = TRUE) expect_equal(out$a, c(2, 3)) expect_equal(out$x, c(NA, 3)) # when keys have same name df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(a = c(3, 4), y = c(3, 4)) out <- left_join(df1, df2, by = c("a"), keep = TRUE) expect_equal(out$a.x, c(2, 3)) expect_equal(out$a.y, c(NA, 3)) }) test_that("when keep = TRUE, right_join() preserves both sets of keys", { # when keys have different names df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(x = c(3, 4), y = c(3, 4)) out <- right_join(df1, df2, by = c("a" = "x"), keep = TRUE) expect_equal(out$a, c(3, NA)) expect_equal(out$x, c(3, 4)) # when keys have same name df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(a = c(3, 4), y = c(3, 4)) out <- right_join(df1, df2, by = c("a"), keep = TRUE) expect_equal(out$a.x, c(3, NA)) expect_equal(out$a.y, c(3, 4)) }) test_that("when keep = TRUE, full_join() preserves both sets of keys", { # when keys have different names df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(x = c(3, 4), y = c(3, 4)) out <- full_join(df1, df2, by = c("a" = "x"), keep = TRUE) expect_equal(out$a, c(2, 3, NA)) expect_equal(out$x, c(NA, 3, 4)) # when keys have same name df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(a = c(3, 4), y = c(3, 4)) out <- full_join(df1, df2, by = c("a"), keep = TRUE) expect_equal(out$a.x, c(2, 3, NA)) expect_equal(out$a.y, c(NA, 3, 4)) }) test_that("when keep = TRUE, inner_join() preserves both sets of keys (#5581)", { # when keys have different names df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(x = c(3, 4), y = c(3, 4)) out <- inner_join(df1, df2, by = c("a" = "x"), keep = TRUE) expect_equal(out$a, c(3)) expect_equal(out$x, c(3)) # when keys have same name df1 <- tibble(a = c(2, 3), b = c(1, 2)) df2 <- tibble(a = c(3, 4), y = c(3, 4)) out <- inner_join(df1, df2, by = c("a"), keep = TRUE) expect_equal(out$a.x, c(3)) expect_equal(out$a.y, c(3)) }) test_that("can't use `keep = FALSE` with non-equi conditions (#6499)", { df1 <- tibble(xl = c(1, 3), xu = c(4, 7)) df2 <- tibble(yl = c(2, 5, 8), yu = c(6, 8, 9)) expect_snapshot(error = TRUE, { left_join(df1, df2, join_by(overlaps(xl, xu, yl, yu)), keep = FALSE) }) # Would never make sense here. # Based on how the binary conditions are generated we'd merge: # - `yu` into `xl` # - `yl` into `xu` # Which results in `xl` and `xu` columns that don't maintain `xl <= xu`. expect_snapshot(error = TRUE, { full_join(df1, df2, join_by(overlaps(xl, xu, yl, yu)), keep = FALSE) }) }) test_that("joins matches NAs by default (#892, #2033)", { df1 <- tibble(x = c(NA_character_, 1)) df2 <- tibble(x = c(NA_character_, 2)) expect_equal(nrow(inner_join(df1, df2, by = "x")), 1) expect_equal(nrow(semi_join(df1, df2, by = "x")), 1) }) test_that("joins don't match NA when na_matches = 'never' (#2033)", { df1 <- tibble(a = c(1, NA)) df2 <- tibble(a = c(1, NA), b = 1:2) out <- left_join(df1, df2, by = "a", na_matches = "never") expect_equal(out, tibble(a = c(1, NA), b = c(1, NA))) out <- inner_join(df1, df2, by = "a", na_matches = "never") expect_equal(out, tibble(a = 1, b = 1)) out <- semi_join(df1, df2, by = "a", na_matches = "never") expect_equal(out, tibble(a = 1)) out <- anti_join(df1, df2, by = "a", na_matches = "never") expect_equal(out, tibble(a = NA_integer_)) out <- nest_join(df1, df2, by = "a", na_matches = "never") expect <- tibble( a = c(1, NA), df2 = list(tibble(b = 1L), tibble(b = integer())) ) expect_equal(out, expect) dat1 <- tibble( name = c("a", "c"), var1 = c(1, 2) ) dat3 <- tibble( name = c("a", NA_character_), var3 = c(5, 6) ) expect_equal( full_join(dat1, dat3, by = "name", na_matches = "never"), tibble(name = c("a", "c", NA), var1 = c(1, 2, NA), var3 = c(5, NA, 6)) ) }) test_that("`left_join(by = join_by(closest(...)))` works as expected", { df1 <- tibble(x = 1:5) df2 <- tibble(y = c(1, 2, 4)) out <- left_join(df1, df2, by = join_by(closest(x <= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 4, 4, NA)) out <- left_join(df1, df2, by = join_by(closest(x < y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(2, 4, 4, NA, NA)) out <- left_join(df1, df2, by = join_by(closest(x >= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 2, 4, 4)) out <- left_join(df1, df2, by = join_by(closest(x > y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(NA, 1, 2, 2, 4)) }) test_that("`full_join(by = join_by(closest(...)))` works as expected", { df1 <- tibble(x = 1:5) df2 <- tibble(y = c(1, 2, 4)) out <- full_join(df1, df2, by = join_by(closest(x <= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 4, 4, NA)) out <- full_join(df1, df2, by = join_by(closest(x < y))) expect_identical(out$x, c(1:5, NA)) expect_identical(out$y, c(2, 4, 4, NA, NA, 1)) out <- full_join(df1, df2, by = join_by(closest(x >= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 2, 4, 4)) out <- full_join(df1, df2, by = join_by(closest(x > y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(NA, 1, 2, 2, 4)) }) test_that("`right_join(by = join_by(closest(...)))` works as expected", { df1 <- tibble(x = 1:5) df2 <- tibble(y = c(1, 2, 4)) out <- right_join(df1, df2, by = join_by(closest(x <= y))) expect_identical(out$x, 1:4) expect_identical(out$y, c(1, 2, 4, 4)) out <- right_join(df1, df2, by = join_by(closest(x < y))) expect_identical(out$x, c(1:3, NA)) expect_identical(out$y, c(2, 4, 4, 1)) out <- right_join(df1, df2, by = join_by(closest(x >= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 2, 4, 4)) out <- right_join(df1, df2, by = join_by(closest(x > y))) expect_identical(out$x, 2:5) expect_identical(out$y, c(1, 2, 2, 4)) }) test_that("`inner_join(by = join_by(closest(...)))` works as expected", { df1 <- tibble(x = 1:5) df2 <- tibble(y = c(1, 2, 4)) out <- inner_join(df1, df2, by = join_by(closest(x <= y))) expect_identical(out$x, 1:4) expect_identical(out$y, c(1, 2, 4, 4)) out <- inner_join(df1, df2, by = join_by(closest(x < y))) expect_identical(out$x, 1:3) expect_identical(out$y, c(2, 4, 4)) out <- inner_join(df1, df2, by = join_by(closest(x >= y))) expect_identical(out$x, 1:5) expect_identical(out$y, c(1, 2, 2, 4, 4)) out <- inner_join(df1, df2, by = join_by(closest(x > y))) expect_identical(out$x, 2:5) expect_identical(out$y, c(1, 2, 2, 4)) }) test_that("joins using `between(bounds =)` work as expected (#6488)", { df1 <- tibble(x = 1:5) df2 <- tibble(lower = 2, upper = 4) out <- full_join( df1, df2, by = join_by(between(x, lower, upper, bounds = "[]")) ) expect_identical(out$lower, c(NA, 2, 2, 2, NA)) expect_identical(out$upper, c(NA, 4, 4, 4, NA)) out <- full_join( df1, df2, by = join_by(between(x, lower, upper, bounds = "[)")) ) expect_identical(out$lower, c(NA, 2, 2, NA, NA)) expect_identical(out$upper, c(NA, 4, 4, NA, NA)) out <- full_join( df1, df2, by = join_by(between(x, lower, upper, bounds = "(]")) ) expect_identical(out$lower, c(NA, NA, 2, 2, NA)) expect_identical(out$upper, c(NA, NA, 4, 4, NA)) out <- full_join( df1, df2, by = join_by(between(x, lower, upper, bounds = "()")) ) expect_identical(out$lower, c(NA, NA, 2, NA, NA)) expect_identical(out$upper, c(NA, NA, 4, NA, NA)) }) test_that("joins using `overlaps(bounds =)` work as expected (#6488)", { df1 <- tibble(x_lower = c(1, 1, 3, 4), x_upper = c(2, 3, 4, 5)) df2 <- tibble(y_lower = 2, y_upper = 4) expect_closed <- vec_cbind(df1, vec_c(df2, df2, df2, df2)) out <- full_join( df1, df2, by = join_by(overlaps(x_lower, x_upper, y_lower, y_upper, bounds = "[]")) ) expect_identical(out, expect_closed) # `[)`, `(]`, and `()` all generate the same binary conditions but are useful # for consistency with `between(bounds =)` expect_open <- vec_cbind(df1, vec_c(NA, df2, df2, NA)) out <- full_join( df1, df2, by = join_by(overlaps(x_lower, x_upper, y_lower, y_upper, bounds = "[)")) ) expect_identical(out, expect_open) out <- full_join( df1, df2, by = join_by(overlaps(x_lower, x_upper, y_lower, y_upper, bounds = "(]")) ) expect_identical(out, expect_open) out <- full_join( df1, df2, by = join_by(overlaps(x_lower, x_upper, y_lower, y_upper, bounds = "()")) ) expect_identical(out, expect_open) }) test_that("join_mutate() validates arguments", { df <- tibble(x = 1) # Mutating joins expect_snapshot(error = TRUE, { join_mutate(df, df, by = 1, type = "left") join_mutate(df, df, by = "x", type = "left", suffix = 1) join_mutate(df, df, by = "x", type = "left", na_matches = "foo") join_mutate(df, df, by = "x", type = "left", keep = 1) }) }) test_that("join_filter() validates arguments", { df <- tibble(x = 1) # Filtering joins expect_snapshot(error = TRUE, { join_filter(df, df, by = 1, type = "semi") join_filter(df, df, by = "x", type = "semi", na_matches = "foo") }) }) test_that("mutating joins trigger many-to-many warning", { df <- tibble(x = c(1, 1)) expect_snapshot(out <- left_join(df, df, join_by(x))) }) test_that("mutating joins don't trigger many-to-many warning when called indirectly", { df <- tibble(x = c(1, 1)) fn <- function(df1, df2, relationship = NULL) { left_join(df1, df2, join_by(x), relationship = relationship) } # Directly calling `left_join()` from a function you control results in a warning expect_warning( fn(df, df), class = "dplyr_warning_join_relationship_many_to_many" ) # Now mimic calling an "rlang function" which you don't control that calls `left_join()` fn_env(fn) <- ns_env("rlang") # Indirectly calling `left_join()` through a function you don't control # doesn't warn expect_no_warning( fn(df, df), class = "dplyr_warning_join_relationship_many_to_many" ) }) test_that("mutating joins compute common columns", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) expect_snapshot(out <- left_join(df1, df2)) }) test_that("filtering joins compute common columns", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) expect_snapshot(out <- semi_join(df1, df2)) }) test_that("mutating joins finalize unspecified columns (#6804)", { df1 <- tibble(x = NA) df2 <- tibble(x = NA) expect_identical( inner_join(df1, df2, by = join_by(x)), tibble(x = NA) ) expect_identical( inner_join(df1, df2, by = join_by(x), na_matches = "never"), tibble(x = logical()) ) # Pre-existing `unspecified()` vectors get finalized, because they are # considered internal types and we took a "common type" between the keys df1 <- tibble(x = unspecified()) df2 <- tibble(x = unspecified()) expect_identical( inner_join(df1, df2, by = join_by(x)), tibble(x = logical()) ) }) test_that("filtering joins finalize unspecified columns (#6804)", { df1 <- tibble(x = NA) df2 <- tibble(x = NA) expect_identical( semi_join(df1, df2, by = join_by(x)), tibble(x = NA) ) expect_identical( semi_join(df1, df2, by = join_by(x), na_matches = "never"), tibble(x = logical()) ) # Pre-existing `unspecified()` vectors aren't finalized, # because we don't take the common type of the keys. # We retain the exact type of `x`. df1 <- tibble(x = unspecified()) df2 <- tibble(x = NA) expect_identical( semi_join(df1, df2, by = join_by(x)), tibble(x = unspecified()) ) }) test_that("mutating joins reference original column in `y` when there are type errors (#6465)", { x <- tibble(a = 1) y <- tibble(b = "1") expect_snapshot({ (expect_error(left_join(x, y, by = join_by(a == b)))) }) }) test_that("filtering joins reference original column in `y` when there are type errors (#6465)", { x <- tibble(a = 1) y <- tibble(b = "1") expect_snapshot({ (expect_error(semi_join(x, y, by = join_by(a == b)))) }) }) test_that("error if passed additional arguments", { df1 <- data.frame(a = 1:3) df2 <- data.frame(a = 1) expect_snapshot(error = TRUE, { inner_join(df1, df2, on = "a") left_join(df1, df2, on = "a") right_join(df1, df2, on = "a") full_join(df1, df2, on = "a") nest_join(df1, df2, on = "a") anti_join(df1, df2, on = "a") semi_join(df1, df2, on = "a") }) }) # nest_join --------------------------------------------------------------- test_that("nest_join returns list of tibbles (#3570)", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 1), z = c(2, 3)) out <- nest_join(df1, df2, by = "x") expect_named(out, c("x", "y", "df2")) expect_type(out$df2, "list") expect_s3_class(out$df2[[1]], "tbl_df") }) test_that("nest_join respects types of y (#6295)", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- rowwise(tibble(x = c(1, 1), z = c(2, 3))) out <- nest_join(df1, df2, by = "x") expect_s3_class(out$df2[[1]], "rowwise_df") }) test_that("nest_join preserves data frame attributes on `x` and `y` (#6295)", { df1 <- data.frame(x = c(1, 2), y = c(3, 4)) attr(df1, "foo") <- 1 df2 <- data.frame(x = c(1, 2), z = c(3, 4)) attr(df2, "foo") <- 2 out <- nest_join(df1, df2, by = "x") expect_identical(attr(out, "foo"), 1) expect_identical(attr(out$df2[[1]], "foo"), 2) }) test_that("nest_join computes common columns", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) expect_snapshot(out <- nest_join(df1, df2)) }) test_that("nest_join finalizes unspecified columns (#6804)", { df1 <- tibble(x = NA) df2 <- tibble(x = NA) expect_identical( nest_join(df1, df2, by = join_by(x)), tibble(x = NA, df2 = list(tibble(.rows = 1L))) ) expect_identical( nest_join(df1, df2, by = join_by(x), keep = TRUE), tibble(x = NA, df2 = list(tibble(x = NA))) ) expect_identical( nest_join(df1, df2, by = join_by(x), na_matches = "never"), tibble(x = NA, df2 = list(tibble())) ) # Pre-existing `unspecified()` vectors get finalized, because they are # considered internal types and we took a "common type" between the keys df1 <- tibble(x = unspecified()) df2 <- tibble(x = unspecified()) expect_identical( nest_join(df1, df2, by = join_by(x)), tibble(x = logical(), df2 = list()) ) }) test_that("nest_join references original column in `y` when there are type errors (#6465)", { x <- tibble(a = 1) y <- tibble(b = "1") expect_snapshot({ (expect_error(nest_join(x, y, by = join_by(a == b)))) }) }) test_that("nest_join handles multiple matches in x (#3642)", { df1 <- tibble(x = c(1, 1)) df2 <- tibble(x = 1, y = 1:2) out <- nest_join(df1, df2, by = "x") expect_equal(out$df2[[1]], out$df2[[2]]) }) test_that("nest_join forces `multiple = 'all'` internally (#6392)", { df1 <- tibble(x = 1) df2 <- tibble(x = 1, y = 1:2) expect_no_warning(out <- nest_join(df1, df2, by = "x")) expect_identical(nrow(out$df2[[1]]), 2L) }) test_that("y keys dropped by default for equi conditions", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) out <- nest_join(df1, df2, by = "x") expect_named(out, c("x", "y", "df2")) expect_named(out$df2[[1]], "z") out <- nest_join(df1, df2, by = "x", keep = TRUE) expect_named(out$df2[[1]], c("x", "z")) }) test_that("y keys kept by default for non-equi conditions", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) out <- nest_join(df1, df2, by = join_by(x >= x)) expect_named(out, c("x", "y", "df2")) expect_named(out$df2[[1]], c("x", "z")) }) test_that("validates inputs", { df1 <- tibble(x = c(1, 2), y = c(2, 3)) df2 <- tibble(x = c(1, 3), z = c(2, 3)) expect_snapshot(error = TRUE, { nest_join(df1, df2, by = 1) nest_join(df1, df2, keep = 1) nest_join(df1, df2, name = 1) nest_join(df1, df2, na_matches = 1) }) }) # output type --------------------------------------------------------------- test_that("joins x preserve type of x", { df1 <- data.frame(x = 1) df2 <- tibble(x = 2) expect_s3_class(inner_join(df1, df2, by = "x"), "data.frame", exact = TRUE) expect_s3_class(inner_join(df2, df1, by = "x"), "tbl_df") }) test_that("joins preserve groups", { gf1 <- tibble(a = 1:3) |> group_by(a) gf2 <- tibble(a = rep(1:4, 2), b = 1) |> group_by(b) i <- count_regroups(out <- inner_join(gf1, gf2, by = "a")) expect_equal(i, 1L) expect_equal(group_vars(out), "a") i <- count_regroups(out <- semi_join(gf1, gf2, by = "a")) expect_equal(i, 0L) expect_equal(group_vars(out), "a") # once for x + once for each row for y i <- count_regroups(out <- nest_join(gf1, gf2, by = "a")) expect_equal(i, 4L) expect_equal(group_vars(out), "a") expect_equal(group_vars(out$gf2[[1]]), "b") }) test_that("joins respect zero length groups", { df1 <- tibble(f = factor(c(1, 1, 2, 2), levels = 1:3), x = c(1, 2, 1, 4)) |> group_by(f) df2 <- tibble(f = factor(c(2, 2, 3, 3), levels = 1:3), y = c(1, 2, 3, 4)) |> group_by(f) expect_equal( group_size(left_join(df1, df2, by = "f", relationship = "many-to-many")), c(2, 4) ) expect_equal( group_size(right_join(df1, df2, by = "f", relationship = "many-to-many")), c(4, 2) ) expect_equal( group_size(full_join(df1, df2, by = "f", relationship = "many-to-many")), c(2, 4, 2) ) expect_equal(group_size(anti_join(df1, df2, by = "f")), c(2)) expect_equal( group_size(inner_join(df1, df2, by = "f", relationship = "many-to-many")), c(4) ) df1 <- tibble(f = factor(c(1, 1, 2, 2), levels = 1:3), x = c(1, 2, 1, 4)) |> group_by(f, .drop = FALSE) df2 <- tibble(f = factor(c(2, 2, 3, 3), levels = 1:3), y = c(1, 2, 3, 4)) |> group_by(f, .drop = FALSE) expect_equal( group_size(left_join(df1, df2, by = "f", relationship = "many-to-many")), c(2, 4, 0) ) expect_equal( group_size(right_join(df1, df2, by = "f", relationship = "many-to-many")), c(0, 4, 2) ) expect_equal( group_size(full_join(df1, df2, by = "f", relationship = "many-to-many")), c(2, 4, 2) ) expect_equal(group_size(anti_join(df1, df2, by = "f")), c(2, 0, 0)) expect_equal( group_size(inner_join(df1, df2, by = "f", relationship = "many-to-many")), c(0, 4, 0) ) }) test_that("group column names reflect renamed duplicate columns (#2330)", { df1 <- tibble(x = 1:5, y = 1:5) |> group_by(x, y) df2 <- tibble(x = 1:5, y = 1:5) out <- inner_join(df1, df2, by = "x") expect_equal(group_vars(out), "x") # TODO: fix this issue: https://github.com/tidyverse/dplyr/issues/4917 # expect_equal(group_vars(out), c("x", "y.x")) }) test_that("rowwise group structure is updated after a join (#5227)", { df1 <- rowwise(tibble(x = 1:2)) df2 <- tibble(x = c(1:2, 2L)) x <- left_join(df1, df2, by = "x") expect_identical(group_rows(x), list_of(1L, 2L, 3L)) }) # deprecated ---------------------------------------------------------------- test_that("by = character() generates cross (#4206)", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = 1:2) df2 <- tibble(y = 1:2) out <- left_join(df1, df2, by = character()) expect_equal(out$x, rep(1:2, each = 2)) expect_equal(out$y, rep(1:2, 2)) }) test_that("`by = character()` technically respects `unmatched`", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble() df2 <- tibble(x = 1) expect_snapshot(error = TRUE, { left_join(df1, df2, by = character(), unmatched = "error") }) }) test_that("`by = character()` technically respects `relationship`", { local_options(lifecycle_verbosity = "quiet") df <- tibble(x = 1:2) expect_snapshot(error = TRUE, { left_join(df, df, by = character(), relationship = "many-to-one") }) }) test_that("`by = character()` for a cross join is deprecated (#6604)", { df1 <- tibble(x = 1:2) df2 <- tibble(y = 1:2) # Mutating join expect_snapshot({ out <- left_join(df1, df2, by = character()) }) # Filtering join expect_snapshot({ out <- semi_join(df1, df2, by = character()) }) # Nest join expect_snapshot({ out <- nest_join(df1, df2, by = character()) }) }) test_that("`by = named character()` for a cross join works", { # Used by the sift package df1 <- tibble(x = 1:2) df2 <- tibble(y = 1:2) by <- set_names(character(), nm = character()) expect_snapshot({ out <- left_join(df1, df2, by = by) }) expect_identical( out, cross_join(df1, df2) ) }) test_that("`by = list(x = character(), y = character())` for a cross join is deprecated (#6604)", { df1 <- tibble(x = 1:2) df2 <- tibble(y = 1:2) expect_snapshot({ out <- left_join(df1, df2, by = list(x = character(), y = character())) }) }) dplyr/tests/testthat/test-arrange.R0000644000176200001440000003564515106134104017135 0ustar liggesusers# To turn on warnings from tibble::`names<-()` local_options(lifecycle_verbosity = "warning") test_that("empty arrange() returns input", { df <- tibble(x = 1:10, y = 1:10) gf <- group_by(df, x) expect_identical(arrange(df), df) expect_identical(arrange(gf), gf) expect_identical(arrange(df, !!!list()), df) expect_identical(arrange(gf, !!!list()), gf) }) test_that("can sort empty data frame", { df <- tibble(a = numeric(0)) expect_equal(arrange(df, a), df) }) test_that("local arrange sorts missing values to end", { df <- data.frame(x = c(2, 1, NA)) expect_equal(df |> arrange(x) |> pull(), c(1, 2, NA)) expect_equal(df |> arrange(desc(x)) |> pull(), c(2, 1, NA)) }) test_that("arrange() gives meaningful errors", { expect_snapshot({ # duplicated column name (expect_error( tibble(x = 1, x = 1, .name_repair = "minimal") |> arrange(x) )) # error in mutate() step (expect_error( tibble(x = 1) |> arrange(y) )) (expect_error( tibble(x = 1) |> arrange(rep(x, 2)) )) }) }) # column types ---------------------------------------------------------- test_that("arrange handles list columns (#282)", { # no intrinsic ordering df <- tibble(x = 1:3, y = list(3, 2, 1)) expect_equal(arrange(df, y), df) df <- tibble(x = 1:3, y = list(sum, mean, sd)) expect_equal(arrange(df, y), df) }) test_that("arrange handles raw columns (#1803)", { df <- tibble(x = 1:3, y = as.raw(3:1)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("arrange handles matrix columns", { df <- tibble(x = 1:3, y = matrix(6:1, ncol = 2)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("arrange handles data.frame columns (#3153)", { df <- tibble(x = 1:3, y = data.frame(z = 3:1)) expect_equal(arrange(df, y), tibble(x = 3:1, y = data.frame(z = 1:3))) }) test_that("arrange handles complex columns", { df <- tibble(x = 1:3, y = 3:1 + 2i) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("arrange handles S4 classes (#1105)", { TestS4 <- suppressWarnings(setClass("TestS4", contains = "integer")) setMethod('[', 'TestS4', function(x, i, ...) { TestS4(unclass(x)[i, ...]) }) on.exit(removeClass("TestS4"), add = TRUE) df <- tibble(x = 1:3, y = TestS4(3:1)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("arrange works with two columns when the first has a data frame proxy (#6268)", { # `id1` has a data frame proxy for `vec_proxy_order()` df <- tibble( id1 = new_rcrd(list(x = 1, y = 1)), id2 = c(1, 3, 2) ) out <- arrange(df, id1, id2) expect_identical(out$id2, c(1, 2, 3)) }) test_that("arrange ignores NULLs (#6193)", { df <- tibble(x = 1:2) y <- NULL out <- arrange(df, y, desc(x)) expect_equal(out$x, 2:1) out <- arrange(df, y, desc(x), y) expect_equal(out$x, 2:1) }) test_that("`arrange()` works with `numeric_version` (#6680)", { x <- numeric_version(c("1.11", "1.2.3", "1.2.2")) df <- tibble(x = x) expect <- df[c(3, 2, 1), ] expect_identical(arrange(df, x), expect) }) # locale -------------------------------------------------------------- test_that("arrange defaults to the C locale", { x <- c("A", "a", "b", "B") df <- tibble(x = x) res <- arrange(df, x) expect_identical(res$x, c("A", "B", "a", "b")) res <- arrange(df, desc(x)) expect_identical(res$x, rev(c("A", "B", "a", "b"))) }) test_that("locale can be set to an English locale", { skip_if_not_installed("stringi", "1.5.3") x <- c("A", "a", "b", "B") df <- tibble(x = x) res <- arrange(df, x, .locale = "en") expect_identical(res$x, c("a", "A", "b", "B")) }) test_that("non-English locales can be used", { skip_if_not_installed("stringi", "1.5.3") # Danish `o` with `/` through it sorts after `z` in Danish locale x <- c("o", "\u00F8", "p", "z") df <- tibble(x = x) # American English locale puts it right after `o` res <- arrange(df, x, .locale = "en") expect_identical(res$x, x) res <- arrange(df, x, .locale = "da") expect_identical(res$x, x[c(1, 3, 4, 2)]) }) test_that("arrange errors if stringi is not installed and a locale identifier is used", { expect_snapshot(error = TRUE, { locale_to_chr_proxy_collate("fr", has_stringi = FALSE) }) }) test_that("arrange validates `.locale`", { df <- tibble() expect_snapshot(error = TRUE, { arrange(df, .locale = 1) }) expect_snapshot(error = TRUE, { arrange(df, .locale = c("en_US", "fr_BF")) }) }) test_that("arrange validates that `.locale` must be one from stringi", { skip_if_not_installed("stringi", "1.5.3") df <- tibble() expect_snapshot(error = TRUE, { arrange(df, .locale = "x") }) }) # data ---------------------------------------------------------------- test_that("arrange preserves input class", { df1 <- data.frame(x = 1:3, y = 3:1) df2 <- tibble(x = 1:3, y = 3:1) df3 <- df1 |> group_by(x) expect_s3_class(arrange(df1, x), "data.frame", exact = TRUE) expect_s3_class(arrange(df2, x), "tbl_df") expect_s3_class(arrange(df3, x), "grouped_df") }) test_that("grouped arrange ignores group, unless requested with .by_group", { df <- data.frame(g = c(2, 1, 2, 1), x = 4:1) gf <- group_by(df, g) expect_equal(arrange(gf, x), gf[4:1, , ]) expect_equal(arrange(gf, x, .by_group = TRUE), gf[c(4, 2, 3, 1), , ]) }) test_that("arrange updates the grouping structure (#605)", { df <- tibble(g = c(2, 2, 1, 1), x = c(1, 3, 2, 4)) res <- df |> group_by(g) |> arrange(x) expect_s3_class(res, "grouped_df") expect_equal(group_rows(res), list_of(c(2L, 4L), c(1L, 3L))) }) test_that("arrange() supports across() and pick() (#4679)", { df <- tibble(x = c(1, 3, 2, 1), y = c(4, 3, 2, 1)) expect_identical( df |> arrange(pick(everything())), df |> arrange(x, y) ) expect_identical( df |> arrange(across(everything(), .fns = desc)), df |> arrange(desc(x), desc(y)) ) expect_identical( df |> arrange(pick(x)), df |> arrange(x) ) expect_identical( df |> arrange(across(y, .fns = identity)), df |> arrange(y) ) }) test_that("arrange() works with across() and pick() cols that return multiple columns (#6490)", { df <- tibble( a = c(1, 1, 1), b = c(2, 2, 2), c = c(4, 4, 3), d = c(5, 2, 7) ) expect_identical( arrange( df, across(c(a, b), .fns = identity), across(c(c, d), .fns = identity) ), df[c(3, 2, 1), ] ) expect_identical( arrange(df, pick(a, b), pick(c, d)), df[c(3, 2, 1), ] ) }) test_that("arrange() evaluates each pick() call on the original data (#6495)", { df <- tibble(x = 2:1) out <- arrange(df, TRUE, pick(everything())) expect_identical(out, df[c(2, 1), ]) out <- arrange(df, NULL, pick(everything())) expect_identical(out, df[c(2, 1), ]) }) test_that("arrange() with empty dots still calls dplyr_row_slice()", { tbl <- new_tibble(list(x = 1), nrow = 1L) foo <- structure(tbl, class = c("foo_df", class(tbl))) local_methods( # `foo_df` always loses class when row slicing dplyr_row_slice.foo_df = function(data, i, ...) { out <- NextMethod() new_tibble(out, nrow = nrow(out)) } ) expect_s3_class(arrange(foo), class(tbl), exact = TRUE) expect_s3_class(arrange(foo, x), class(tbl), exact = TRUE) }) test_that("can arrange() with unruly class", { local_methods( `[.dplyr_foobar` = function(x, i, ...) new_dispatched_quux(vec_slice(x, i)), dplyr_row_slice.dplyr_foobar = function(x, i, ...) x[i, ] ) df <- foobar(data.frame(x = 1:3)) expect_identical( arrange(df, desc(x)), quux(data.frame(x = 3:1, dispatched = TRUE)) ) }) test_that("arrange() preserves the call stack on error (#5308)", { foobar <- function() stop("foo") stack <- NULL expect_error( withCallingHandlers( error = function(...) stack <<- sys.calls(), arrange(mtcars, foobar()) ) ) expect_true(some(stack, is_call, "foobar")) }) test_that("desc() inside arrange() checks the number of arguments (#5921)", { expect_snapshot({ df <- data.frame(x = 1, y = 2) (expect_error(arrange(df, desc(x, y)))) }) }) test_that("arrange keeps zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal(group_size(arrange(df)), c(2, 2, 0)) expect_equal(group_size(arrange(df, x)), c(2, 2, 0)) }) # legacy -------------------------------------------------------------- test_that("legacy - using the deprecated global option `dplyr.legacy_locale` forces the system locale", { skip_if_not(has_collate_locale("en_US"), message = "Can't use 'en_US' locale") local_options(dplyr.legacy_locale = TRUE) withr::local_collate("en_US") df <- tibble(x = c("a", "A", "Z", "b")) # Capture `dplyr.legacy_locale` deprecation warning expect_snapshot({ out <- arrange(df, x)$x }) expect_identical(out, c("a", "A", "b", "Z")) }) test_that("legacy - usage of `.locale` overrides `dplyr.legacy_locale`", { skip_if_not_installed("stringi", "1.5.3") local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) # Danish `o` with `/` through it sorts after `z` in Danish locale x <- c("o", "\u00F8", "p", "z") df <- tibble(x = x) # American English locale puts it right after `o` res <- arrange(df, x, .locale = "en") expect_identical(res$x, x) res <- arrange(df, x, .locale = "da") expect_identical(res$x, x[c(1, 3, 4, 2)]) }) test_that("legacy - empty arrange() returns input", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = 1:10, y = 1:10) gf <- group_by(df, x) expect_identical(arrange(df), df) expect_identical(arrange(gf), gf) expect_identical(arrange(df, !!!list()), df) expect_identical(arrange(gf, !!!list()), gf) }) test_that("legacy - can sort empty data frame", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(a = numeric(0)) expect_equal(arrange(df, a), df) }) test_that("legacy - local arrange sorts missing values to end", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- data.frame(x = c(2, 1, NA)) expect_equal(df |> arrange(x) |> pull(), c(1, 2, NA)) expect_equal(df |> arrange(desc(x)) |> pull(), c(2, 1, NA)) }) test_that("legacy - arrange handles list columns (#282)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) # no intrinsic ordering df <- tibble(x = 1:3, y = list(3, 2, 1)) expect_equal(arrange(df, y), df) df <- tibble(x = 1:3, y = list(sum, mean, sd)) expect_equal(arrange(df, y), df) }) test_that("legacy - arrange handles raw columns (#1803)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = 1:3, y = as.raw(3:1)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("legacy - arrange handles matrix columns", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = 1:3, y = matrix(6:1, ncol = 2)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("legacy - arrange handles data.frame columns (#3153)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = 1:3, y = data.frame(z = 3:1)) expect_equal(arrange(df, y), tibble(x = 3:1, y = data.frame(z = 1:3))) }) test_that("legacy - arrange handles complex columns", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = 1:3, y = 3:1 + 2i) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("legacy - arrange handles S4 classes (#1105)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) TestS4 <- suppressWarnings(setClass("TestS4", contains = "integer")) setMethod('[', 'TestS4', function(x, i, ...) { TestS4(unclass(x)[i, ...]) }) on.exit(removeClass("TestS4"), add = TRUE) df <- tibble(x = 1:3, y = TestS4(3:1)) expect_equal(arrange(df, y), df[3:1, ]) }) test_that("legacy - `arrange()` works with `numeric_version` (#6680)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) x <- numeric_version(c("1.11", "1.2.3", "1.2.2")) df <- tibble(x = x) expect <- df[c(3, 2, 1), ] expect_identical(arrange(df, x), expect) }) test_that("legacy - arrange works with two columns when the first has a data frame proxy (#6268)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) # `id1` has a data frame proxy for `vec_proxy_order()` df <- tibble( id1 = new_rcrd(list(x = 1, y = 1)), id2 = c(1, 3, 2) ) out <- arrange(df, id1, id2) expect_identical(out$id2, c(1, 2, 3)) }) test_that("legacy - arrange() supports across() and pick() (#4679)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble(x = c(1, 3, 2, 1), y = c(4, 3, 2, 1)) expect_identical( df |> arrange(pick(everything())), df |> arrange(x, y) ) expect_identical( df |> arrange(across(everything(), .fns = desc)), df |> arrange(desc(x), desc(y)) ) expect_identical( df |> arrange(pick(x)), df |> arrange(x) ) expect_identical( df |> arrange(across(y, .fns = identity)), df |> arrange(y) ) }) test_that("legacy - arrange() works with across() and pick() cols that return multiple columns (#6490)", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) df <- tibble( a = c(1, 1, 1), b = c(2, 2, 2), c = c(4, 4, 3), d = c(5, 2, 7) ) expect_identical( arrange( df, across(c(a, b), .fns = identity), across(c(c, d), .fns = identity) ), df[c(3, 2, 1), ] ) expect_identical( arrange(df, pick(a, b), pick(c, d)), df[c(3, 2, 1), ] ) }) test_that("legacy - arrange sorts missings in df-cols correctly", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) col <- tibble(a = c(1, 1, 1), b = c(3, NA, 1)) df <- tibble(x = col) expect_identical(arrange(df, x), df[c(3, 1, 2), ]) expect_identical(arrange(df, desc(x)), df[c(1, 3, 2), ]) }) test_that("legacy - arrange with duplicates in a df-col uses a stable sort", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) col <- tibble(a = c(1, 1, 1, 1, 1), b = c(3, NA, 2, 3, NA)) df <- tibble(x = col, y = 1:5) expect_identical(arrange(df, x)$y, c(3L, 1L, 4L, 2L, 5L)) expect_identical(arrange(df, desc(x))$y, c(1L, 4L, 3L, 2L, 5L)) }) test_that("legacy - arrange with doubly nested df-col doesn't infloop", { local_options(lifecycle_verbosity = "quiet") local_options(dplyr.legacy_locale = TRUE) one <- tibble(a = c(1, 1, 1, 1, 1), b = c(1, 1, 2, 2, 2)) two <- tibble(a = c(1, 1, 1, 1, 1), b = c(2, 1, 1, 2, 2)) col <- tibble(one = one, two = two) df <- tibble(x = col, y = c(1, 1, 1, 1, 0)) expect_identical(arrange(df, x, y), df[c(2, 1, 3, 5, 4), ]) }) dplyr/tests/testthat/test-consecutive-id.R0000644000176200001440000000112714366556340020444 0ustar liggesuserstest_that("works with simple vectors", { expect_equal(consecutive_id(c(1, 1, 2, 1, 2)), c(1, 1, 2, 3, 4)) }) test_that("handles data frames", { df <- tibble(x = c(1, 1, 1, 1), y = c(1, 2, 2, 1)) expect_equal(consecutive_id(df), c(1, 2, 2, 3)) }) test_that("follows recycling rules", { expect_equal(consecutive_id(double(), 1), integer()) expect_equal(consecutive_id(1:2, 1), 1:2) expect_snapshot(error = TRUE, { consecutive_id(1:3, 1:4) }) }) test_that("generates useful errors", { expect_snapshot(error = TRUE, { consecutive_id(x = 1:4) consecutive_id(mean) }) }) dplyr/tests/testthat/test-join-cross.R0000644000176200001440000000244415106134104017573 0ustar liggesuserstest_that("cross join works", { df1 <- tibble(x = 1:2) df2 <- tibble(y = 1:3) expect_identical( cross_join(df1, df2), tibble( x = vec_rep_each(1:2, times = 3), y = vec_rep(1:3, times = 2) ) ) }) test_that("cross join results in 0 rows if either input has 0 rows", { df1 <- tibble(x = 1:2) df2 <- tibble(y = integer()) expect_identical( cross_join(df1, df2), tibble(x = integer(), y = integer()) ) expect_identical( cross_join(df2, df1), tibble(y = integer(), x = integer()) ) }) test_that("cross join works with 0 column, >0 row tibbles", { df1 <- new_tibble(list(), nrow = 3) df2 <- tibble(x = 1:2) expect_identical( cross_join(df1, df1), new_tibble(list(), nrow = 9) ) expect_identical( cross_join(df1, df2), vec_rep(df2, times = 3) ) }) test_that("cross join applies `suffix`", { df1 <- tibble(x = 1, y = 2) df2 <- tibble(x = 2, z = 3) expect_named(cross_join(df1, df2), c("x.x", "y", "x.y", "z")) expect_named( cross_join(df1, df2, suffix = c("", "_y")), c("x", "y", "x_y", "z") ) }) test_that("cross join checks for duplicate names", { df1 <- tibble(a = 1, b = 2, a = 3, .name_repair = "minimal") df2 <- tibble(a = 2, c = 3) expect_snapshot(error = TRUE, { cross_join(df1, df2) }) }) dplyr/tests/testthat/test-transmute.R0000644000176200001440000000613315106134104017526 0ustar liggesuserstest_that("non-syntactic grouping variable is preserved (#1138)", { df <- tibble(`a b` = 1L) |> group_by(`a b`) |> transmute() expect_named(df, "a b") }) test_that("transmute preserves grouping", { gf <- group_by(tibble(x = 1:2, y = 2), x) i <- count_regroups(out <- transmute(gf, x = 1)) expect_equal(i, 1L) expect_equal(group_vars(out), "x") expect_equal(nrow(group_data(out)), 1) i <- count_regroups(out <- transmute(gf, z = 1)) expect_equal(i, 0) expect_equal(group_data(out), group_data(gf)) }) # Empty transmutes ------------------------------------------------- test_that("transmute with no args returns grouping vars", { df <- tibble(x = 1, y = 2) gf <- group_by(df, x) expect_equal(df |> transmute(), df[integer()]) expect_equal(gf |> transmute(), gf[1L]) }) # transmute variables ----------------------------------------------- test_that("transmute succeeds in presence of raw columns (#1803)", { df <- tibble(a = 1:3, b = as.raw(1:3)) expect_identical(transmute(df, a), df["a"]) expect_identical(transmute(df, b), df["b"]) }) test_that("arguments to transmute() don't match vars_transmute() arguments", { df <- tibble(a = 1) expect_identical(transmute(df, var = a), tibble(var = 1)) expect_identical(transmute(df, exclude = a), tibble(exclude = 1)) expect_identical(transmute(df, include = a), tibble(include = 1)) }) test_that("arguments to rename() don't match vars_rename() arguments (#2861)", { df <- tibble(a = 1) expect_identical(rename(df, var = a), tibble(var = 1)) expect_identical( rename(group_by(df, a), var = a), group_by(tibble(var = 1), var) ) expect_identical(rename(df, strict = a), tibble(strict = 1)) expect_identical( rename(group_by(df, a), strict = a), group_by(tibble(strict = 1), strict) ) }) test_that("can transmute() with .data pronoun (#2715)", { expect_identical(transmute(mtcars, .data$cyl), transmute(mtcars, cyl)) }) test_that("transmute() does not warn when a variable is removed with = NULL (#4609)", { df <- data.frame(x = 1) expect_warning(transmute(df, y = x + 1, z = y * 2, y = NULL), NA) }) test_that("transmute() can handle auto splicing", { expect_equal( iris |> transmute(tibble(Sepal.Length, Sepal.Width)), iris |> select(Sepal.Length, Sepal.Width) ) }) test_that("transmute() retains ordering supplied in `...`, even for pre-existing columns (#6086)", { df <- tibble(x = 1:3, y = 4:6) out <- transmute(df, x, z = x + 1, y) expect_named(out, c("x", "z", "y")) }) test_that("transmute() retains ordering supplied in `...`, even for group columns (#6086)", { df <- tibble(x = 1:3, g1 = 1:3, g2 = 1:3, y = 4:6) df <- group_by(df, g1, g2) out <- transmute(df, x, z = x + 1, y, g1) # - Untouched group variables are first # - Following by ordering supplied through `...` expect_named(out, c("g2", "x", "z", "y", "g1")) }) test_that("transmute() error messages", { expect_snapshot({ (expect_error(transmute(mtcars, cyl2 = cyl, .keep = 'all'))) (expect_error(transmute(mtcars, cyl2 = cyl, .before = disp))) (expect_error(transmute(mtcars, cyl2 = cyl, .after = disp))) }) }) dplyr/tests/testthat/test-reframe.R0000644000176200001440000002633415137161765017153 0ustar liggesuserstest_that("`reframe()` allows summaries", { df <- tibble(g = c(1, 1, 1, 2, 2), x = 1:5) expect_identical( reframe(df, x = mean(x)), tibble(x = 3) ) expect_identical( reframe(df, x = mean(x), .by = g), tibble(g = c(1, 2), x = c(2, 4.5)) ) }) test_that("`reframe()` allows size 0 results", { df <- tibble(g = c(1, 1, 1, 2, 2), x = 1:5) gdf <- group_by(df, g) expect_identical( reframe(df, x = which(x > 5)), tibble(x = integer()) ) expect_identical( reframe(df, x = which(x > 5), .by = g), tibble(g = double(), x = integer()) ) expect_identical( reframe(gdf, x = which(x > 5)), tibble(g = double(), x = integer()) ) }) test_that("`reframe()` allows size >1 results", { df <- tibble(g = c(1, 1, 1, 2, 2), x = 1:5) gdf <- group_by(df, g) expect_identical( reframe(df, x = which(x > 2)), tibble(x = 3:5) ) expect_identical( reframe(df, x = which(x > 2), .by = g), tibble(g = c(1, 2, 2), x = c(3L, 1L, 2L)) ) expect_identical( reframe(gdf, x = which(x > 2)), tibble(g = c(1, 2, 2), x = c(3L, 1L, 2L)) ) }) test_that("`reframe()` recycles across columns", { df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) out <- reframe(df, a = 1:2, b = 1L, c = 2:3) expect_identical(out$a, 1:2) expect_identical(out$b, c(1L, 1L)) expect_identical(out$c, 2:3) out <- reframe(df, a = 1:2, b = 1L, c = 2:3, .by = g) expect_identical(out$g, c(1, 1, 2, 2)) expect_identical(out$a, c(1:2, 1:2)) expect_identical(out$b, c(1L, 1L, 1L, 1L)) expect_identical(out$c, c(2:3, 2:3)) }) test_that("`reframe()` can recycle across columns to size 0", { df <- tibble(g = 1:2, x = 1:2) gdf <- group_by(df, g) out <- reframe(df, y = mean(x), z = which(x > 3)) expect_identical(out$y, double()) expect_identical(out$z, integer()) out <- reframe(df, y = mean(x), z = which(x > 1), .by = g) expect_identical(out$g, 2L) expect_identical(out$y, 2) expect_identical(out$z, 1L) out <- reframe(gdf, y = mean(x), z = which(x > 1)) expect_identical(out$g, 2L) expect_identical(out$y, 2) expect_identical(out$z, 1L) }) test_that("`reframe()` throws intelligent recycling errors", { df <- tibble(g = 1:2, x = 1:2) gdf <- group_by(df, g) expect_snapshot(error = TRUE, { reframe(df, x = 1:2, y = 3:5) }) expect_snapshot(error = TRUE, { reframe(df, x = 1:2, y = 3:5, .by = g) }) expect_snapshot(error = TRUE, { reframe(gdf, x = 1:2, y = 3:5) }) }) test_that("`reframe()` and `summarise()` are consistent with zero expressions", { df <- tibble(x = c("a", "a", "b"), y = 1:3) gdf <- group_by(df, x) expect_identical(reframe(df), tibble(.rows = 1L)) expect_identical(reframe(df), summarise(df)) expect_identical(reframe(df, .by = x), tibble(x = c("a", "b"))) expect_identical(reframe(df, .by = x), summarise(df, .by = x)) expect_identical(reframe(gdf), tibble(x = c("a", "b"))) expect_identical(reframe(gdf), summarise(gdf)) }) test_that("`reframe()` and `summarise()` are consistent with zero expressions and zero rows", { # The grouped cases here are special. There are "zero groups" to evaluate on, # but we still always evaluate 1 time, and then effectively recycle the # results to size 0. df <- tibble(x = character(), y = integer()) gdf <- group_by(df, x) expect_identical(reframe(df), tibble(.rows = 1L)) expect_identical(reframe(df), summarise(df)) expect_identical(reframe(df, .by = x), tibble(x = character())) expect_identical(reframe(df, .by = x), summarise(df, .by = x)) expect_identical(reframe(gdf), tibble(x = character())) expect_identical(reframe(gdf), summarise(gdf)) }) test_that("`reframe()` and `summarise()` are consistent with data frame that flattens into zero expressions", { df <- tibble(x = c("a", "a", "b"), y = 1:3) gdf <- group_by(df, x) expect_identical( reframe(df, tibble(.rows = 1L)), tibble(.rows = 1L) ) expect_identical( reframe(df, tibble(.rows = 1L)), summarise(df, tibble(.rows = 1L)) ) expect_identical( reframe(df, tibble(.rows = 1L), .by = x), tibble(x = c("a", "b")) ) expect_identical( reframe(df, tibble(.rows = 1L), .by = x), summarise(df, tibble(.rows = 1L), .by = x) ) expect_identical( reframe(gdf, tibble(.rows = 1L)), tibble(x = c("a", "b")) ) expect_identical( reframe(gdf, tibble(.rows = 1L)), summarise(gdf, tibble(.rows = 1L)) ) }) test_that("`reframe()` and `summarise()` are consistent with data frame that flattens into zero expressions and zero rows", { # The grouped cases here are special. There are "zero groups" to evaluate on, # but we still always evaluate 1 time, and then effectively recycle the # results to size 0. df <- tibble(x = character(), y = integer()) gdf <- group_by(df, x) expect_identical( reframe(df, tibble(.rows = 1L)), tibble(.rows = 1L) ) expect_identical( reframe(df, tibble(.rows = 1L)), summarise(df, tibble(.rows = 1L)) ) expect_identical( reframe(df, tibble(.rows = 1L), .by = x), tibble(x = character()) ) expect_identical( reframe(df, tibble(.rows = 1L), .by = x), summarise(df, tibble(.rows = 1L), .by = x) ) expect_identical( reframe(gdf, tibble(.rows = 1L)), tibble(x = character()) ) expect_identical( reframe(gdf, tibble(.rows = 1L)), summarise(gdf, tibble(.rows = 1L)) ) }) test_that("`reframe()` can return more rows than the original data frame", { df <- tibble(x = 1:2) expect_identical( reframe(df, x = vec_rep_each(x, x)), tibble(x = c(1L, 2L, 2L)) ) }) test_that("`reframe()` doesn't message about regrouping when multiple group columns are supplied", { df <- tibble(a = c(1, 1, 2, 2, 2), b = c(1, 2, 1, 1, 2), x = 1:5) gdf <- group_by(df, a, b) # Silence expect_snapshot({ out <- reframe(df, x = mean(x), .by = c(a, b)) }) expect_snapshot({ out <- reframe(gdf, x = mean(x)) }) }) test_that("`reframe()` doesn't message about regrouping when >1 rows are returned per group", { df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) gdf <- group_by(df, g) # Silence expect_snapshot({ out <- reframe(df, x = vec_rep_each(x, x), .by = g) }) expect_snapshot({ out <- reframe(gdf, x = vec_rep_each(x, x)) }) }) test_that("`reframe()` allows sequential assignments", { df <- tibble(g = 1:2, x = 1:2) expect_identical( reframe(df, y = 3, z = mean(x) + y), tibble(y = 3, z = 4.5) ) expect_identical( reframe(df, y = 3, z = mean(x) + y, .by = g), tibble(g = 1:2, y = c(3, 3), z = c(4, 5)) ) }) test_that("`reframe()` allows for overwriting existing columns", { df <- tibble(g = c("a", "b"), x = 1:2) expect_identical( reframe(df, x = 3, z = x), tibble(x = 3, z = 3) ) expect_identical( reframe(df, x = cur_group_id(), z = x, .by = g), tibble(g = c("a", "b"), x = 1:2, z = 1:2) ) }) test_that("`reframe()` works with unquoted values", { df <- tibble(x = 1:5) expect_equal(reframe(df, out = !!1), tibble(out = 1)) expect_equal(reframe(df, out = !!quo(1)), tibble(out = 1)) expect_equal(reframe(df, out = !!(1:2)), tibble(out = 1:2)) }) test_that("`reframe()` with bare data frames always returns a bare data frame", { df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- reframe(df, x = mean(x)) expect_s3_class(out, class(df), exact = TRUE) out <- reframe(df, x = mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) }) test_that("`reframe()` drops data frame attributes", { # Because `reframe()` theoretically creates a "new" data frame # With data.frames df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) attr(df, "foo") <- "bar" out <- reframe(df, x = mean(x)) expect_null(attr(out, "foo")) out <- reframe(df, x = mean(x), .by = g) expect_null(attr(out, "foo")) # With tibbles tbl <- as_tibble(df) attr(tbl, "foo") <- "bar" out <- reframe(tbl, x = mean(x)) expect_null(attr(out, "foo")) out <- reframe(tbl, x = mean(x), .by = g) expect_null(attr(out, "foo")) # With grouped_df gdf <- group_by(df, g) attr(gdf, "foo") <- "bar" out <- reframe(gdf, x = mean(x)) expect_null(attr(out, "foo")) }) test_that("`reframe()` with `group_by()` sorts keys", { df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) df <- group_by(df, g) out <- reframe(df, x = mean(x)) expect_identical(out$g, c(0, 1, 2)) expect_identical(out$x, c(5, 2, 6)) }) test_that("`reframe()` with `group_by()` respects `.drop = FALSE`", { g <- factor(c("c", "a", "c"), levels = c("a", "b", "c")) df <- tibble(g = g, x = c(1, 4, 2)) gdf <- group_by(df, g, .drop = FALSE) out <- reframe(gdf, x = mean(x)) expect_identical(out$g, factor(c("a", "b", "c"))) expect_identical(out$x, c(4, NaN, 1.5)) }) test_that("`reframe()` with `group_by()` always returns an ungrouped tibble", { df <- tibble(a = c(1, 1, 2, 2, 2), b = c(1, 2, 1, 1, 2), x = 1:5) gdf <- group_by(df, a, b) out <- reframe(gdf, x = mean(x)) expect_identical(class(out), class(df)) }) test_that("`reframe()` with `rowwise()` respects list-col element access", { df <- tibble(x = list(1:2, 3:5, 6L)) rdf <- rowwise(df) expect_identical( reframe(rdf, x), tibble(x = 1:6) ) }) test_that("`reframe()` with `rowwise()` respects rowwise group columns", { df <- tibble(g = c(1, 1, 2), x = list(1:2, 3:5, 6L)) rdf <- rowwise(df, g) out <- reframe(rdf, x) expect_identical(out$g, c(rep(1, 5), 2)) expect_identical(out$x, 1:6) }) test_that("`reframe()` with `rowwise()` always returns an ungrouped tibble", { df <- tibble(g = c(1, 1, 2), x = list(1:2, 3:5, 6L)) rdf <- rowwise(df, g) expect_s3_class(reframe(rdf, x), class(df), exact = TRUE) }) test_that("named data frame results with 0 columns participate in recycling (#6509)", { df <- tibble(x = 1:3) gdf <- group_by(df, x) empty <- tibble() expect_identical(reframe(df, empty = empty), tibble(empty = empty)) expect_identical( reframe(df, x = sum(x), empty = empty), tibble(x = integer(), empty = empty) ) expect_identical( reframe(df, empty = empty, x = sum(x)), tibble(empty = empty, x = integer()) ) empty3 <- new_tibble(list(), nrow = 3L) expect_identical(reframe(df, empty = empty3), tibble(empty = empty3)) expect_identical( reframe(df, x = sum(x), empty = empty3), tibble(x = c(6L, 6L, 6L), empty = empty3) ) expect_identical( reframe(df, empty = empty3, x = sum(x)), tibble(empty = empty3, x = c(6L, 6L, 6L)) ) }) # .by ---------------------------------------------------------------------- test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- reframe(df, x = mean(x), .by = g) expect_identical(out$g, c(1, 2)) expect_identical(out$x, c(3, 2)) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping orders by first appearance", { df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) out <- reframe(df, x = mean(x), .by = g) expect_identical(out$g, c(2, 1, 0)) expect_identical(out$x, c(6, 2, 5)) }) test_that("catches `.by` with grouped-df", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { reframe(gdf, .by = x) }) }) test_that("catches `.by` with rowwise-df", { df <- tibble(x = 1) rdf <- rowwise(df) expect_snapshot(error = TRUE, { reframe(rdf, .by = x) }) }) dplyr/tests/testthat/test-join-rows.R0000644000176200001440000002662415106134104017442 0ustar liggesuserstest_that("`relationship` default behavior is correct", { # "warn-many-to-many" for equality joins expect_snapshot(out <- join_rows(c(1, 1), c(1, 1), condition = "==")) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) # "none" for rolling joins expect_warning( out <- join_rows(c(1, 2), c(1, 1), condition = ">=", filter = "max"), NA ) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) # If rolling joins warned on many-to-many relationships, it would be a little # hard to explain that the above example warns, but this wouldn't just because # we've removed `2` as a key from `x`: # `join_rows(1, c(1, 1), condition = ">=", filter = "max")` # "none" for inequality joins (and overlap joins) expect_warning(out <- join_rows(c(1, 2), c(0, 1), condition = ">="), NA) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) # "none" for deprecated cross joins expect_warning(out <- join_rows(c(1, 1), c(1, 1), cross = TRUE), NA) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) }) test_that("`multiple` first/last/any works correctly", { out <- join_rows(c(1, 1), c(1, 1), multiple = "first") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(1L, 1L)) out <- join_rows(c(1, 1), c(1, 1), multiple = "last") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(2L, 2L)) out <- join_rows(c(1, 1), c(1, 1), multiple = "any") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y %in% c(1L, 2L), c(TRUE, TRUE)) }) test_that("inner join only outputs matching keys", { out <- join_rows(c(2, 1), c(3, 4, 1), type = "inner") expect_equal(out$x, 2L) expect_equal(out$y, 3L) out <- join_rows(c(2, 1), c(3, 4, 1), type = "inner", condition = ">") expect_equal(out$x, 1L) expect_equal(out$y, 3L) }) test_that("left join contains all keys from x", { out <- join_rows(c(2, 1), c(3, 4, 1), type = "left") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(NA, 3L)) out <- join_rows(c(2, 1), c(3, 4, 1), type = "left", condition = ">") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(3L, NA)) }) test_that("right join contains all keys from y", { out <- join_rows(c(2, 1), c(3, 4, 1), type = "right") expect_equal(out$x, c(2L, NA, NA)) expect_equal(out$y, c(3L, 1L, 2L)) out <- join_rows(c(2, 1), c(3, 4, 1), type = "right", condition = ">=") expect_equal(out$x, c(1L, 2L, NA, NA)) expect_equal(out$y, c(3L, 3L, 1L, 2L)) }) test_that("full join contains all keys from both", { out <- join_rows(c(2, 1), c(3, 1), type = "full") expect_equal(out$x, c(1L, 2L, NA)) expect_equal(out$y, c(NA, 2L, 1L)) out <- join_rows(c(2, 1), c(3, 1), type = "full", condition = ">") expect_equal(out$x, c(1L, 2L, NA)) expect_equal(out$y, c(2L, NA, 1L)) }) test_that("nest join returns 0L for unmatched x keys", { out <- join_rows(c(2, 1), c(3, 4, 1), type = "nest") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(0L, 3L)) }) test_that("nest join returns 0L for missing x keys with `na_matches = 'never'`", { out <- join_rows(c(NA, 1), 1, type = "nest", na_matches = "never") expect_equal(out$x, c(1L, 2L)) expect_equal(out$y, c(0L, 1L)) }) test_that("matching rows can be filtered", { out <- join_rows(c(3, 5), c(2, 4, 1), condition = ">=", filter = "max") expect_equal(out$x, 1:2) expect_equal(out$y, 1:2) out <- join_rows(c(3, 5), c(2, 4, 1), condition = ">=", filter = "min") expect_equal(out$x, 1:2) expect_equal(out$y, c(3, 3)) }) test_that("missing values only match with `==`, `>=`, and `<=` conditions", { out <- join_rows(NA, NA, condition = "==") expect_identical(out$x, 1L) expect_identical(out$y, 1L) out <- join_rows(NA, NA, condition = ">=") expect_identical(out$x, 1L) expect_identical(out$y, 1L) out <- join_rows(NA, NA, condition = "<=") expect_identical(out$x, 1L) expect_identical(out$y, 1L) out <- join_rows(NA, NA, condition = ">") expect_identical(out$x, integer()) expect_identical(out$y, integer()) out <- join_rows(NA, NA, condition = "<") expect_identical(out$x, integer()) expect_identical(out$y, integer()) x <- tibble(x = c(1, 1), y = c(2, NA)) y <- tibble(x = c(1, 1), y = c(3, NA)) out <- join_rows(x, y, condition = c("==", "<=")) expect_identical(out$x, c(1L, 2L)) expect_identical(out$y, c(1L, 2L)) out <- join_rows(x, y, condition = c("==", "<")) expect_identical(out$x, 1L) expect_identical(out$y, 1L) }) test_that("join_rows() doesn't error on unmatched rows if they won't be dropped", { # 2 is unmatched, but a left join means we always retain that key out <- join_rows(c(1, 2), 1, type = "left", unmatched = "error") expect_identical(out$x, c(1L, 2L)) expect_identical(out$y, c(1L, NA)) out <- join_rows(c(1, 2), c(1, 3), type = "full", unmatched = "error") expect_identical(out$x, c(1L, 2L, NA)) expect_identical(out$y, c(1L, NA, 2L)) }) test_that("join_rows() allows `unmatched` to be specified independently for inner joins", { out <- join_rows(c(1, 2), 1, type = "inner", unmatched = c("drop", "error")) expect_identical(out$x, 1L) expect_identical(out$y, 1L) out <- join_rows(1, c(2, 1), type = "inner", unmatched = c("error", "drop")) expect_identical(out$x, 1L) expect_identical(out$y, 2L) # Both have dropped rows, only `y` is mentioned in the error expect_snapshot(error = TRUE, { join_rows(c(1, 3), c(1, 2), type = "inner", unmatched = c("drop", "error")) }) }) test_that("join_rows() expects incompatible type errors to have been handled by join_cast_common()", { expect_snapshot({ (expect_error( join_rows(data.frame(x = 1), data.frame(x = factor("a"))) )) }) }) test_that("join_rows() gives meaningful one-to-one errors", { expect_snapshot(error = TRUE, { join_rows(1, c(1, 1), relationship = "one-to-one") }) expect_snapshot(error = TRUE, { join_rows(c(1, 1), 1, relationship = "one-to-one") }) }) test_that("join_rows() gives meaningful one-to-many errors", { expect_snapshot(error = TRUE, { join_rows(c(1, 1), 1, relationship = "one-to-many") }) }) test_that("join_rows() gives meaningful many-to-one errors", { expect_snapshot(error = TRUE, { join_rows(1, c(1, 1), relationship = "many-to-one") }) }) test_that("join_rows() gives meaningful many-to-many warnings", { expect_snapshot({ join_rows(c(1, 1), c(1, 1)) }) # With proof that the defaults flow through user facing functions df <- data.frame(x = c(1, 1)) expect_snapshot({ left_join(df, df, by = join_by(x)) }) }) test_that("join_rows() gives meaningful error message on unmatched rows", { # Unmatched in the RHS expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "left", unmatched = "error" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "nest", unmatched = "error" ) ) # Unmatched in the LHS expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "right", unmatched = "error" ) ) # Unmatched in either side expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, 2)), data.frame(x = 1), type = "inner", unmatched = "error" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, 2)), data.frame(x = 1), type = "inner", unmatched = c("error", "drop") ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = c(1, 2)), type = "inner", unmatched = "error" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = c(1, 2)), type = "inner", unmatched = c("drop", "error") ) ) }) test_that("join_rows() always errors on unmatched missing values", { # Unmatched in the RHS expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = NA), type = "left", unmatched = "error", na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = NA), data.frame(x = NA), type = "left", unmatched = "error", na_matches = "never" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = NA), type = "nest", unmatched = "error", na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = NA), data.frame(x = NA), type = "nest", unmatched = "error", na_matches = "never" ) ) # Unmatched in the LHS expect_snapshot( error = TRUE, join_rows( data.frame(x = NA), data.frame(x = 1), type = "right", unmatched = "error", na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = NA), data.frame(x = NA), type = "right", unmatched = "error", na_matches = "never" ) ) # Unmatched in either side expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = c(1, NA)), type = "inner", unmatched = "error", na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = 1), data.frame(x = c(1, NA)), type = "inner", unmatched = c("drop", "error"), na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, NA)), data.frame(x = 1), type = "inner", unmatched = "error", na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = c(1, NA)), data.frame(x = 1), type = "inner", unmatched = c("error", "drop"), na_matches = "na" ) ) expect_snapshot( error = TRUE, join_rows( data.frame(x = NA), data.frame(x = NA), type = "inner", unmatched = "error", na_matches = "never" ) ) }) test_that("join_rows() validates `unmatched`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { join_rows(df, df, unmatched = 1) join_rows(df, df, unmatched = "foo") # One `unmatched` input is allowed for most joins join_rows(df, df, type = "left", unmatched = character()) join_rows(df, df, type = "left", unmatched = c("drop", "error")) # Two `unmatched` inputs are allowed for inner joins join_rows(df, df, type = "inner", unmatched = character()) join_rows(df, df, type = "inner", unmatched = c("drop", "error", "error")) join_rows(df, df, type = "inner", unmatched = c("drop", "dr")) }) }) test_that("join_rows() validates `relationship`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { join_rows(df, df, relationship = 1) }) # Notably can't use the vctrs options expect_snapshot(error = TRUE, { join_rows(df, df, relationship = "none") }) expect_snapshot(error = TRUE, { join_rows(df, df, relationship = "warn-many-to-many") }) }) test_that("join_rows() rethrows overflow error nicely (#6912)", { skip_on_cran() # Windows 32-bit doesn't support long vectors of this size, and the # intermediate `r_ssize` will be too large skip_if(.Machine$sizeof.pointer < 8L, message = "No long vector support") df <- tibble(x = 1:1e7) expect_snapshot(error = TRUE, { join_rows(df, df, condition = ">=") }) }) dplyr/tests/testthat/helper-pick.R0000644000176200001440000000013714366556340016751 0ustar liggesuserspick_wrapper <- function(...) { # Wrapping `pick()` forces evaluation fallback pick(...) } dplyr/tests/testthat/test-deprec-funs.R0000644000176200001440000000600615137161765017737 0ustar liggesuserstest_that("fun_list is merged with new args", { withr::local_options(lifecycle_verbosity = "quiet") funs <- funs(fn = bar) funs <- as_fun_list(funs, env(), baz = "baz") expect_identical(funs$fn, quo(bar(., baz = "baz"))) }) test_that("funs() works with namespaced calls", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical( summarise_all(mtcars, funs(base::mean(.))), summarise_all(mtcars, funs(mean(.))) ) expect_identical( summarise_all(mtcars, funs(base::mean)), summarise_all(mtcars, funs(mean(.))) ) }) test_that("funs() found in local environment", { withr::local_options(lifecycle_verbosity = "quiet") f <- function(x) 1 df <- data.frame(x = c(2:10, 1000)) out <- summarise_all(df, funs(f = f, mean = mean, median = median)) expect_equal(out, data.frame(f = 1, mean = 105.4, median = 6.5)) }) test_that("funs() accepts quoted functions", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical(funs(mean), funs("mean")) }) test_that("funs() accepts unquoted functions", { withr::local_options(lifecycle_verbosity = "quiet") funs <- funs(fn = !!mean) expect_identical(funs$fn, new_quosure(call2(base::mean, quote(.)))) }) test_that("funs() accepts quoted calls", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical(funs(mean), funs(mean(.))) }) test_that("funs() can be merged with new arguments", { withr::local_options(lifecycle_verbosity = "quiet") fns <- funs(foo(.)) expect_identical( as_fun_list(fns, current_env(), foo = 1L), funs(foo(., foo = 1L)) ) }) enfun <- function(.funs, ...) { as_fun_list(.funs, caller_env(), ...) } test_that("can enfun() literal functions", { res <- enfun(identity(mean)) expect_equal(length(res), 1L) expect_identical(res[[1L]], mean) }) test_that("can enfun() named functions by expression", { res <- enfun(mean) expect_equal(length(res), 1L) expect_identical(res[[1L]], mean) }) test_that("local objects are not treated as symbols", { withr::local_options(lifecycle_verbosity = "quiet") mean <- funs(my_mean(.)) expect_identical(enfun(mean), mean) }) test_that("can enfun() character vectors", { res <- enfun(c("min", "max")) expect_equal(length(res), 2L) expect_equal(res[[1]], min) expect_equal(res[[2]], max) }) test_that("can enfun() purrr-style lambdas", { my_mean <- as_function(~ mean(.x)) res <- enfun(~ mean(.x)) expect_equal(length(res), 1L) expect_type(res[[1]], "closure") }) test_that("as_fun_list() auto names chr vectors (4307)", { df <- data.frame(x = 1:10) expect_named( summarise_at(df, "x", c("mean", "sum")), c("mean", "sum") ) }) test_that("funs() is deprecated", { expect_snapshot(funs(fn = bar)) }) # Errors ------------------------------------------------------------------ test_that("funs() give meaningful error messages", { withr::local_options(lifecycle_verbosity = "quiet") expect_snapshot({ (expect_error(funs(function(si) { mp[si] }))) (expect_error(funs(~ mp[.]))) }) }) dplyr/tests/testthat/test-order-by.R0000644000176200001440000000215215106134104017224 0ustar liggesuserstest_that("order_by() gives useful error messages", { expect_snapshot({ (expect_error(order_by(mtcars, 10))) (expect_error(order_by(mtcars, cyl))) }) }) test_that("`with_order()` works with data frame `order_by` (#6334)", { x <- 1:3 order_by <- tibble(a = c(1, 1, 2), b = c(2, 1, 1)) expect_identical(with_order(order_by, lag, x), c(2L, NA, 1L)) }) test_that("`with_order()` requires `order_by` and `x` to be the same size", { expect_snapshot(error = TRUE, { with_order(1:2, identity, 1:3) }) }) test_that("order_by() returns correct value", { expected <- int(15, 14, 12, 9, 5) expect_identical(order_by(5:1, cumsum(1:5)), expected) x <- 5:1 y <- 1:5 expect_identical(order_by(x, cumsum(y)), expected) }) test_that("order_by() works in arbitrary envs (#2297)", { env <- child_env("base") expect_equal( with_env(env, dplyr::order_by(5:1, cumsum(1:5))), rev(cumsum(rev(1:5))) ) expect_equal( order_by(5:1, cumsum(1:5)), rev(cumsum(rev(1:5))) ) }) test_that("order_by() give meaningful errors", { expect_snapshot({ (expect_error(order_by(NULL, 1L))) }) }) dplyr/tests/testthat/test-rows.R0000644000176200001440000003412715106134104016502 0ustar liggesusers# ------------------------------------------------------------------------------ # rows_insert() test_that("rows_insert() works", { data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) expect_identical( rows_insert(data, tibble(a = 4L, b = "z"), by = "a"), tibble(a = 1:4, b = c("a", "b", NA, "z"), c = c(0.5, 1.5, 2.5, NA)) ) }) test_that("rows_insert() doesn't allow insertion of matched keys by default", { x <- tibble(a = 1, b = 2) y <- tibble(a = 1, b = 3) expect_snapshot( (expect_error(rows_insert(x, y, by = "a"))) ) y <- tibble(a = c(1, 1, 1), b = c(3, 4, 5)) expect_snapshot( (expect_error(rows_insert(x, y, by = "a"))) ) }) test_that("rows_insert() allows you to ignore matched keys with `conflict = 'ignore'`", { x <- tibble(a = 1, b = 2) y <- tibble(a = 1, b = 3) expect_identical(rows_insert(x, y, by = "a", conflict = "ignore"), x) y <- tibble(a = c(1, 2, 1), b = c(3, 4, 5)) expect_identical( rows_insert(x, y, by = "a", conflict = "ignore"), rows_insert(x, y[2, ], by = "a") ) }) test_that("rows_insert() allows `x` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 1), b = c(2, 3)) y <- tibble(a = 2, b = 4) expect_identical( rows_insert(x, y, by = "a"), tibble(a = c(1, 1, 2), b = c(2, 3, 4)) ) }) test_that("rows_insert() allows `y` keys to be duplicated (#5553)", { x <- tibble(a = 2, b = 4) y <- tibble(a = c(1, 1), b = c(2, 3)) expect_identical( rows_insert(x, y, by = "a"), tibble(a = c(2, 1, 1), b = c(4, 2, 3)) ) }) test_that("rows_insert() casts keys to the type of `x`", { x <- vctrs::data_frame(key = 1L, value = 2) y <- vctrs::data_frame(key = 1.5, value = 1.5) expect_snapshot({ (expect_error(rows_insert(x, y, "key"))) }) }) test_that("rows_insert() casts values to the type of `x`", { x <- vctrs::data_frame(key = 1, value = 2L) y <- vctrs::data_frame(key = 2, value = 1.5) expect_snapshot({ (expect_error(rows_insert(x, y, "key"))) }) }) test_that("rows_insert() checks that `x` and `y` contain `by` (#6652)", { x <- tibble(a = 1, b = 2) y <- tibble(a = 1) expect_snapshot({ (expect_error(rows_insert(x, y, by = "c"))) }) expect_snapshot({ (expect_error(rows_insert(x, y, by = c("a", "b")))) }) }) test_that("`conflict` is validated", { x <- tibble(a = 1) y <- tibble(a = 2) expect_snapshot({ (expect_error(rows_insert(x, y, by = "a", conflict = "foo"))) (expect_error(rows_insert(x, y, by = "a", conflict = 1))) }) }) # ------------------------------------------------------------------------------ # rows_append() test_that("rows_append() allows you to insert unconditionally", { x <- tibble(a = 1, b = 2) y <- tibble(a = 1, b = 3) expect_identical(rows_append(x, y), bind_rows(x, y)) y <- tibble(a = c(1, 2, 1), b = c(3, 4, 5)) expect_identical(rows_append(x, y), bind_rows(x, y)) }) test_that("rows_append() casts to the type of `x`", { x <- vctrs::data_frame(key = 1L, value = 2) y <- vctrs::data_frame(key = 1.5, value = 1.5) expect_snapshot({ (expect_error(rows_append(x, y))) }) y <- vctrs::data_frame(key = 2, value = 3L) out <- rows_append(x, y) expect_identical(out$key, c(1L, 2L)) expect_identical(out$value, c(2, 3)) }) test_that("rows_append() requires that `y` columns be a subset of `x`", { x <- tibble(a = 1, b = 2) y <- tibble(a = 1, b = 2, c = 3) expect_snapshot({ (expect_error(rows_append(x, y))) }) }) test_that("rows_append() doesn't require that `x` columns be a subset of `y`", { x <- tibble(a = 1, b = 2, c = 3) y <- tibble(a = 1, b = 2) out <- rows_append(x, y) expect_identical(out$c, c(3, NA)) }) # ------------------------------------------------------------------------------ test_that("rows_update() works", { data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) expect_identical( rows_update(data, tibble(a = 2:3, b = "z"), by = "a"), tibble(a = 1:3, b = c("a", "z", "z"), c = data$c) ) expect_silent( expect_identical( rows_update(data, tibble(b = "z", a = 2:3), by = "a"), tibble(a = 1:3, b = c("a", "z", "z"), c = data$c) ) ) }) test_that("rows_update() requires `y` keys to exist in `x` by default", { x <- tibble(a = 1, b = 2) y <- tibble(a = c(2, 1, 3), b = c(1, 1, 1)) expect_snapshot((expect_error(rows_update(x, y, "a")))) }) test_that("rows_update() allows `y` keys that don't exist in `x` to be ignored", { x <- tibble(a = 1, b = 2) y <- tibble(a = c(2, 1, 3), b = c(1, 1, 1)) expect_identical( rows_update(x, y, "a", unmatched = "ignore"), tibble(a = 1, b = 1) ) }) test_that("rows_update() allows `x` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 2, 1, 3), b = c(2, 3, 4, 5), c = letters[1:4]) y <- tibble(a = c(1, 3), b = c(99, 88)) expect_identical( rows_update(x, y, by = "a"), tibble(a = c(1, 2, 1, 3), b = c(99, 3, 99, 88), c = letters[1:4]) ) }) test_that("rows_update() doesn't allow `y` keys to be duplicated (#5553)", { x <- tibble(a = 2, b = 4) y <- tibble(a = c(1, 1), b = c(2, 3)) expect_snapshot((expect_error(rows_update(x, y, by = "a")))) }) test_that("rows_update() avoids bare data.frame `drop = FALSE` issues", { x <- vctrs::data_frame(x = c(1, 2, 3), y = I(list(1:2, 3:4, 5:6))) y <- vctrs::data_frame(x = c(1, 3), y = I(list(0L, 100:101))) out <- rows_update(x, y, "x") expect_identical(out$y, I(list(0L, 3:4, 100:101))) }) test_that("rows_update() casts keys to their common type for matching but retains `x` type", { x <- vctrs::data_frame(key = 1L, value = 2) y <- vctrs::data_frame(key = 1, value = 1.5) out <- rows_update(x, y, "key") expect_identical(out$key, x$key) expect_identical(out$value, y$value) y <- vctrs::data_frame(key = "x", value = 1.5) expect_snapshot({ (expect_error(rows_update(x, y, "key"))) }) }) test_that("rows_update() casts values to the type of `x`", { x <- vctrs::data_frame(key = 1, value = 2L) y <- vctrs::data_frame(key = 1, value = 1.5) expect_snapshot({ (expect_error(rows_update(x, y, "key"))) }) out <- rows_update(y, x, "key") expect_identical(out$value, 2) }) test_that("`unmatched` is validated", { x <- tibble(a = 1) y <- tibble(a = 1) expect_snapshot({ (expect_error(rows_update(x, y, by = "a", unmatched = "foo"))) (expect_error(rows_update(x, y, by = "a", unmatched = 1))) }) }) # ------------------------------------------------------------------------------ test_that("rows_patch() works", { data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) expect_identical( rows_patch(data, tibble(a = 2:3, b = "z"), by = "a"), tibble(a = 1:3, b = c("a", "b", "z"), c = data$c) ) expect_silent( expect_identical( rows_patch(data, tibble(b = "z", a = 2:3), by = "a"), tibble(a = 1:3, b = c("a", "b", "z"), c = data$c) ) ) }) test_that("rows_patch() requires `y` keys to exist in `x` by default", { x <- tibble(a = 1, b = 2) y <- tibble(a = c(2, 1, 3), b = c(1, 1, 1)) expect_snapshot((expect_error(rows_patch(x, y, "a")))) }) test_that("rows_patch() allows `y` keys that don't exist in `x` to be ignored", { x <- tibble(a = 1, b = NA_real_) y <- tibble(a = c(2, 1, 3), b = c(1, 1, 1)) expect_identical( rows_patch(x, y, "a", unmatched = "ignore"), tibble(a = 1, b = 1) ) }) test_that("rows_patch() allows `x` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 2, 1, 3), b = c(NA, 3, 4, NA), c = letters[1:4]) y <- tibble(a = c(1, 3), b = c(99, 88)) expect_identical( rows_patch(x, y, by = "a"), tibble(a = c(1, 2, 1, 3), b = c(99, 3, 4, 88), c = letters[1:4]) ) }) test_that("rows_patch() doesn't allow `y` keys to be duplicated (#5553)", { x <- tibble(a = 2, b = 4) y <- tibble(a = c(1, 1), b = c(2, 3)) expect_snapshot((expect_error(rows_patch(x, y, by = "a")))) }) test_that("rows_patch() avoids bare data.frame `drop = FALSE` issues", { x <- vctrs::data_frame(x = c(1, 2, 3, 3), y = c(NA, 5, NA, 6)) y <- vctrs::data_frame(x = c(1, 3), y = c(0, 100)) out <- rows_patch(x, y, "x") expect_identical(out$y, c(0, 5, 100, 6)) }) test_that("rows_patch() casts keys to their common type for matching but retains `x` type", { x <- vctrs::data_frame(key = 1L, value = NA_real_) y <- vctrs::data_frame(key = 1, value = 1.5) out <- rows_patch(x, y, "key") expect_identical(out$key, x$key) expect_identical(out$value, y$value) y <- vctrs::data_frame(key = "x", value = 1.5) expect_snapshot({ (expect_error(rows_patch(x, y, "key"))) }) }) test_that("rows_patch() casts values to the type of `x`", { x <- vctrs::data_frame(key = 1, value = 2L) y <- vctrs::data_frame(key = 1, value = 1.5) expect_snapshot({ (expect_error(rows_patch(x, y, "key"))) }) }) # ------------------------------------------------------------------------------ test_that("rows_upsert() works", { data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) expect_identical( rows_upsert(data, tibble(a = 2:4, b = "z"), by = "a"), tibble(a = 1:4, b = c("a", "z", "z", "z"), c = c(data$c, NA)) ) }) test_that("rows_upsert() allows `x` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 2, 1, 3), b = c(NA, 3, 4, NA), c = letters[1:4]) y <- tibble(a = c(1, 3, 4), b = c(99, 88, 100)) expect_identical( rows_upsert(x, y, by = "a"), tibble( a = c(1, 2, 1, 3, 4), b = c(99, 3, 99, 88, 100), c = c(letters[1:4], NA) ) ) }) test_that("rows_upsert() doesn't allow `y` keys to be duplicated (#5553)", { x <- tibble(a = 2, b = 4) y <- tibble(a = c(1, 1), b = c(2, 3)) expect_snapshot((expect_error(rows_upsert(x, y, by = "a")))) }) test_that("rows_upsert() avoids bare data.frame `drop = FALSE` issues", { x <- vctrs::data_frame(x = c(1, 2, 3), y = I(list(1:2, 3:4, 5:6))) y <- vctrs::data_frame(x = c(1, 3, 4), y = I(list(0L, 100:101, -1L))) out <- rows_upsert(x, y, "x") expect_identical(out$y, I(list(0L, 3:4, 100:101, -1L))) }) test_that("rows_upsert() casts keys to their common type for matching but retains `x` type", { x <- vctrs::data_frame(key = 1L, value = 2) y <- vctrs::data_frame(key = c(2, 1), value = c(1.5, 2.5)) out <- rows_upsert(x, y, "key") expect_identical(out$key, c(1L, 2L)) expect_identical(out$value, c(2.5, 1.5)) y <- vctrs::data_frame(key = "x", value = 1.5) expect_snapshot({ (expect_error(rows_upsert(x, y, "key"))) }) }) test_that("rows_upsert() casts keys to the type of `x`", { x <- vctrs::data_frame(key = 1L, value = 2) y <- vctrs::data_frame(key = 1.5, value = 1.5) expect_snapshot({ (expect_error(rows_upsert(x, y, "key"))) }) }) test_that("rows_upsert() casts values to the type of `x`", { x <- vctrs::data_frame(key = 1, value = 2L) y <- vctrs::data_frame(key = 1, value = 1.5) expect_snapshot({ (expect_error(rows_upsert(x, y, "key"))) }) }) # ------------------------------------------------------------------------------ test_that("rows_delete() works", { data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) expect_identical( rows_delete(data, tibble(a = 2:3), by = "a"), data[1, ] ) }) test_that("rows_delete() ignores extra `y` columns, with a message", { x <- tibble(a = 1) y <- tibble(a = 1, b = 2) expect_snapshot({ out <- rows_delete(x, y) }) expect_identical(out, x[0, ]) expect_snapshot({ out <- rows_delete(x, y, by = "a") }) expect_identical(out, x[0, ]) }) test_that("rows_delete() requires `y` keys to exist in `x` by default", { x <- tibble(a = 1, b = 2) y <- tibble(a = c(2, 1, 3), b = c(1, 1, 1)) expect_snapshot((expect_error(rows_delete(x, y, "a")))) }) test_that("rows_delete() allows `y` keys that don't exist in `x` to be ignored", { x <- tibble(a = 1, b = 2) y <- tibble(a = c(2, 1, 3)) expect_identical( rows_delete(x, y, "a", unmatched = "ignore"), tibble(a = double(), b = double()) ) }) test_that("rows_delete() allows `x` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 2, 1, 3), b = c(NA, 3, 4, NA), c = letters[1:4]) y <- tibble(a = c(1, 3)) expect_identical( rows_delete(x, y, by = "a"), x[2, ] ) }) test_that("rows_delete() allows `y` keys to be duplicated (#5553)", { x <- tibble(a = c(1, 2, 3), b = c(4, 5, 6)) y <- tibble(a = c(1, 1)) expect_identical( rows_delete(x, y, by = "a"), x[c(2, 3), ] ) }) test_that("rows_delete() casts keys to their common type for matching but retains `x` type", { x <- vctrs::data_frame(key = c(1L, 2L), value = c("x", "y")) y <- vctrs::data_frame(key = 2) out <- rows_delete(x, y, "key") expect_identical(out$key, 1L) y <- vctrs::data_frame(key = "x", value = 1.5) expect_snapshot({ (expect_error(rows_delete(x, y, "key"))) }) }) # ------------------------------------------------------------------------------ # Common errors test_that("rows_check_x_contains_y() checks that `y` columns are in `x`", { x <- tibble(a = 1) y <- tibble(a = 1, b = 2) expect_snapshot((expect_error(rows_check_x_contains_y(x, y)))) }) test_that("rows_check_by() checks that `y` has at least 1 column before using it (#6061)", { y <- tibble() expect_snapshot((expect_error(rows_check_by(by = NULL, y = y)))) }) test_that("rows_check_by() uses the first column from `y` by default, with a message", { y <- tibble(a = 1, b = 2) expect_snapshot( by <- rows_check_by(by = NULL, y = y) ) expect_identical(by, "a") }) test_that("rows_check_by() validates `by`", { y <- tibble(x = 1) expect_snapshot({ (expect_error(rows_check_by(by = 1, y = y))) (expect_error(rows_check_by(by = character(), y = y))) (expect_error(rows_check_by(by = c(x = "y"), y = y))) }) }) test_that("rows_check_contains_by() checks that all `by` columns are in `x`", { x <- tibble(x = 1) expect_snapshot({ (expect_error(rows_check_contains_by(x, "y", arg = "x"))) (expect_error(rows_check_contains_by(x, c("y", "x", "z"), arg = "y"))) }) }) test_that("rows_check_unique() requires uniqueness", { x <- tibble(x = c(1, 1, 1), y = c(2, 3, 2), z = c(1, 2, 3)) expect_silent(rows_check_unique(x, "x")) expect_snapshot({ (expect_error(rows_check_unique(x["x"], "x"))) (expect_error(rows_check_unique(x[c("x", "y")], "y"))) }) }) dplyr/tests/testthat/test-colwise-group-by.R0000644000176200001440000000204615106134104020712 0ustar liggesuserstest_that("group_by_ verbs take scoped inputs", { expect_identical(group_vars(group_by_all(mtcars)), names(mtcars)) expect_identical( group_vars(group_by_at(mtcars, vars(starts_with("d")))), c("disp", "drat") ) expect_identical(group_vars(group_by_if(iris, is.factor)), "Species") }) test_that("group_by_ verbs accept optional operations", { df <- tibble(x = 1:2, y = 2:3) gdf <- group_by(mutate_all(df, as.factor), x, y) expect_identical(group_by_all(df, as.factor), gdf) expect_identical(group_by_if(df, is_integer, as.factor), gdf) expect_identical(group_by_at(df, vars(x:y), as.factor), gdf) }) test_that("group_by variants can group by an already grouped by data (#3351)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(c(1, 2), each = 4), x = 1:8) |> group_by(gr1) expect_identical( group_by_at(tbl, vars(gr1, gr2)), group_by(tbl, gr1, gr2) ) expect_identical( group_by_all(tbl), group_by(tbl, gr1, gr2, x) ) expect_identical( group_by_if(tbl, is.integer), group_by(tbl, gr1, x) ) }) dplyr/tests/testthat/test-nth-value.R0000644000176200001440000001504415106134104017410 0ustar liggesusers# ------------------------------------------------------------------------------ # nth() test_that("nth works with lists and uses `vec_slice2()` to return elements (#6331)", { # We'd like to use `vec_slice()` everywhere, but it breaks too many revdeps # that rely on `nth()` returning list elements x <- list(1, 2, 3:5) expect_equal(nth(x, 1), 1) expect_equal(nth(x, 3), 3:5) }) test_that("nth `default` for lists defaults to `NULL` since it uses `vec_slice2()`", { expect_null(nth(list(1), 2)) expect_null(nth(list(), 1)) }) test_that("nth `default` for lists can be anything", { # Because list elements can be anything x <- list(1, 2) default <- environment() expect_identical(nth(x, 3, default = default), default) default <- 1:3 expect_identical(nth(x, 3, default = default), default) }) test_that("nth treats list-of like lists", { x <- list_of(1, 2, c(3, 4)) expect_identical(nth(x, 3), c(3, 4)) expect_identical(nth(x, 4), NULL) # Not particularly strict about `default` here, # even though `list_of()` elements are typed expect_identical(nth(x, 4, default = "x"), "x") }) test_that("nth works with data frames and always returns a single row", { x <- tibble(x = 1:3, y = 4:6) expect_identical(nth(x, 1), tibble(x = 1L, y = 4L)) expect_identical(nth(x, 4), tibble(x = NA_integer_, y = NA_integer_)) expect_identical( nth(x, 4, default = tibble(x = 0, y = 0)), tibble(x = 0L, y = 0L) ) }) test_that("nth works with rcrds", { x <- new_rcrd(list(x = 1:3, y = 4:6)) expect_identical(nth(x, 1), vec_slice(x, 1)) expect_identical(nth(x, 4), vec_init(x)) expect_identical(nth(x, 4, default = x[2]), x[2]) }) test_that("drops names, because it uses `vec_slice2()`", { x <- c(a = 1, b = 2) expect_named(nth(x, 2), NULL) }) test_that("negative values index from end", { x <- 1:5 expect_equal(nth(x, -1), 5L) expect_equal(nth(x, -3), 3L) }) test_that("indexing past ends returns default value", { expect_equal(nth(1:4, 5), NA_integer_) expect_equal(nth(1:4, -5), NA_integer_) expect_equal(nth(1:4, -10), NA_integer_) expect_equal(nth(1:4, -10, default = 6L), 6L) }) test_that("gets corner case indexing correct", { expect_identical(nth(1:4, -5), NA_integer_) expect_identical(nth(1:4, -4), 1L) expect_identical(nth(1:4, -3), 2L) expect_identical(nth(1:4, -1), 4L) expect_identical(nth(1:4, 0), NA_integer_) expect_identical(nth(1:4, 1), 1L) expect_identical(nth(1:4, 3), 3L) expect_identical(nth(1:4, 4), 4L) expect_identical(nth(1:4, 5), NA_integer_) }) test_that("`order_by` can be used to alter the order", { expect_identical(nth(1:5, n = 1L, order_by = 5:1), 5L) expect_identical(nth(as.list(1:5), n = 1L, order_by = 5:1), 5L) }) test_that("can use a data frame as `order_by`", { x <- 1:3 order_by <- tibble(a = c(1, 1, 2), b = c(2, 1, 0)) expect_identical(nth(x, 1, order_by = order_by), 2L) expect_identical(nth(x, 2, order_by = order_by), 1L) }) test_that("`na_rm` can be used to drop missings before selecting the value (#6242)", { x <- c(NA, 4, 10, NA, 5, NA) expect_identical(nth(x, 1, na_rm = TRUE), 4) expect_identical(nth(x, -1, na_rm = TRUE), 5) expect_identical(nth(x, 3, na_rm = TRUE), 5) }) test_that("`na_rm` removes `NULL` list elements", { x <- list(1:3, NULL, 4, integer(), NULL, NULL) expect_identical(nth(x, 2, na_rm = TRUE), 4) expect_identical(nth(x, -1, na_rm = TRUE), integer()) }) test_that("`na_rm` can generate OOB selections, resulting in `default`", { # Removes some values x <- c(NA, FALSE, NA) expect_identical(nth(x, 2, default = TRUE, na_rm = TRUE), TRUE) # Removes everything x <- c(NA, NA, NA) expect_identical(nth(x, 1, default = TRUE, na_rm = TRUE), TRUE) expect_identical(nth(x, -2, default = TRUE, na_rm = TRUE), TRUE) }) test_that("`na_rm` slices `order_by` as well", { x <- c(NA, 4, 10, NA, 5, NA) o <- c(2, 1, 3, 1, 1, 0) expect_identical(nth(x, 1, order_by = o, na_rm = TRUE), 4) expect_identical(nth(x, -1, order_by = o, na_rm = TRUE), 10) expect_identical(nth(x, 2, order_by = o, na_rm = TRUE), 5) expect_identical(nth(x, 3, order_by = o, na_rm = TRUE), 10) }) test_that("`na_rm` is validated", { expect_snapshot(error = TRUE, { nth(1, 1, na_rm = 1) }) expect_snapshot(error = TRUE, { nth(1, 1, na_rm = c(TRUE, FALSE)) }) }) test_that("`default` must be size 1 (when not used with lists)", { expect_snapshot(error = TRUE, { nth(1L, n = 2L, default = 1:2) }) }) test_that("`default` is cast to the type of `x` (when not used with lists)", { expect_snapshot(error = TRUE, { nth("x", 2, default = 2) }) }) test_that("`n` is validated (#5466)", { expect_snapshot(error = TRUE, { nth(1:10, n = "x") }) expect_snapshot(error = TRUE, { nth(1:10, n = 1:2) }) expect_snapshot(error = TRUE, { nth(1:10, n = NA_integer_) }) }) test_that("`x` must be a vector", { expect_snapshot(error = TRUE, { nth(environment(), 1L) }) }) test_that("`order_by` must be the same size as `x`", { expect_snapshot(error = TRUE, { nth(1:5, n = 1L, order_by = 1:2) }) # Ensure that this is checked before `default` is early returned expect_snapshot(error = TRUE, { nth(1:5, n = 6L, order_by = 1:2) }) }) # ------------------------------------------------------------------------------ # first() test_that("`first()` selects the first value", { expect_identical(first(1:5), 1L) }) test_that("`first()` uses default value for 0 length vectors", { expect_equal(first(logical()), NA) expect_equal(first(integer()), NA_integer_) expect_equal(first(numeric()), NA_real_) expect_equal(first(character()), NA_character_) }) test_that("`first()` uses `NULL` default for 0 length lists", { expect_identical(first(list()), NULL) }) test_that("`first()` uses default value for 0 length augmented vectors", { fc <- factor("a")[0] dt <- Sys.Date()[0] tm <- Sys.time()[0] expect_equal(first(fc), vec_init(fc)) expect_equal(first(dt), vec_init(dt)) expect_equal(first(tm), vec_init(tm)) }) test_that("`first()` returns list elements", { expect_identical(first(list(2:3, 4:5)), 2:3) }) test_that("`first()` respects `na_rm`", { x <- c(NA, NA, 2, 3) expect_identical(first(x, na_rm = TRUE), 2) }) # ------------------------------------------------------------------------------ # last() test_that("`last()` selects the last value", { expect_identical(last(1:5), 5L) }) test_that("`last()` returns list elements", { expect_identical(last(list(2:3, 4:5)), 4:5) }) test_that("`last()` respects `na_rm`", { x <- c(2, 3, NA, NA) expect_identical(last(x, na_rm = TRUE), 3) }) dplyr/tests/testthat/test-src-dbi.R0000644000176200001440000000064114366556340017046 0ustar liggesuserstest_that("can work directly with DBI connection", { skip_if_not_installed("RSQLite") skip_if_not_installed("dbplyr") con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") on.exit(DBI::dbDisconnect(con)) df <- tibble(x = 1:10, y = letters[1:10]) df1 <- copy_to(con, df) df2 <- tbl(con, "df") expect_equal(collect(df1), df, ignore_attr = TRUE) expect_equal(collect(df2), df, ignore_attr = TRUE) }) dplyr/tests/testthat/test-deprec-context.R0000644000176200001440000000377015106134104020434 0ustar liggesuserstest_that("cur_data() is deprecated", { df <- tibble(x = 1) expect_snapshot(mutate(df, y = cur_data())) }) test_that("cur_data_all() is deprecated", { df <- tibble(x = 1) expect_snapshot(mutate(df, y = cur_data_all())) }) test_that("cur_data() gives current data without groups, cur_data_all() includes groups", { options(lifecycle_verbosity = "quiet") df <- tibble(x = c("b", "a", "b"), y = 1:3) gf <- group_by(df, x) expect_equal( df |> summarise(x = list(cur_data())) |> pull(), list(df) ) expect_equal( gf |> summarise(x = list(cur_data())) |> pull(), list(tibble(y = 2L), tibble(y = c(1L, 3L))) ) expect_equal( gf |> summarise(x = list(cur_data_all())) |> pull(), list(tibble(x = "a", y = 2L), tibble(x = "b", y = c(1L, 3L))) ) }) test_that("cur_data()/cur_data_all() keeps list columns as lists in rowwise_df (#5901)", { options(lifecycle_verbosity = "quiet") df <- tibble(x = list(tibble(a = 1), tibble(a = 2))) |> rowwise() expect_true( all(summarise(df, test = obj_is_list(cur_data()$x))$test) ) expect_true( all(summarise(df, test = obj_is_list(cur_data_all()$x))$test) ) }) test_that("cur_data() and cur_data_all() work sequentially", { options(lifecycle_verbosity = "quiet") df <- tibble(a = 1) expect_equal( mutate(df, x = df_n_col(cur_data()), y = df_n_col(cur_data())), tibble(a = 1, x = 1, y = 2) ) gf <- tibble(a = 1, b = 2) |> group_by(a) expect_equal( mutate(gf, x = df_n_col(cur_data_all()), y = df_n_col(cur_data_all())), group_by(tibble(a = 1, b = 2, x = 2, y = 3), a) ) }) test_that("mutate(=NULL) preserves correct all_vars", { options(lifecycle_verbosity = "quiet") df <- data.frame(x = 1, y = 2) |> mutate(x = NULL, vars = cur_data_all()) |> pull() expect_equal(df, tibble(y = 2)) }) test_that("give useful error messages when not applicable", { options(lifecycle_verbosity = "quiet") expect_snapshot({ (expect_error(cur_data())) (expect_error(cur_data_all())) }) }) dplyr/tests/testthat/test-all-equal.R0000644000176200001440000001174615106134104017367 0ustar liggesuserstest_that("all_equal is deprecated", { expect_snapshot(all_equal(mtcars, mtcars)) }) # A data frame with all major types df_all <- data.frame( a = c(1, 2.5), b = 1:2, c = c(T, F), d = c("a", "b"), e = factor(c("a", "b")), f = Sys.Date() + 1:2, g = Sys.time() + 1:2, stringsAsFactors = FALSE ) test_that("data frames equal to themselves", { local_options(lifecycle_verbosity = "quiet") expect_true(all_equal(mtcars, mtcars)) expect_true(all_equal(iris, iris)) expect_true(all_equal(df_all, df_all)) }) test_that("data frames not equal if missing row", { local_options(lifecycle_verbosity = "quiet") expect_snapshot({ all_equal(mtcars, mtcars[-1, ]) all_equal(iris, iris[-1, ]) all_equal(df_all, df_all[-1, ]) }) }) test_that("data frames not equal if missing col", { local_options(lifecycle_verbosity = "quiet") expect_snapshot({ all_equal(mtcars, mtcars[, -1]) all_equal(iris, iris[, -1]) all_equal(df_all, df_all[, -1]) }) }) test_that("factors equal only if levels equal", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = factor(c("a", "b"))) df2 <- tibble(x = factor(c("a", "d"))) expect_snapshot({ all_equal(df1, df2) all_equal(df2, df1) }) }) test_that("factor comparison requires strict equality of levels (#2440)", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = factor("a")) df2 <- tibble(x = factor("a", levels = c("a", "b"))) expect_true(all_equal(df1, df2, convert = TRUE)) expect_true(all_equal(df2, df1, convert = TRUE)) expect_snapshot({ all_equal(df1, df2) all_equal(df2, df1) }) }) test_that("all.equal.data.frame handles data.frames with NULL names", { local_options(lifecycle_verbosity = "quiet") x <- data.frame(LETTERS[1:3], rnorm(3)) names(x) <- NULL suppressMessages( expect_true(all_equal(x, x)) ) }) test_that("data frame equality test with ignore_row_order=TRUE detects difference in number of rows. #1065", { local_options(lifecycle_verbosity = "quiet") DF1 <- tibble(a = 1:4, b = letters[1:4]) DF2 <- tibble(a = c(1:4, 4L), b = letters[c(1:4, 4L)]) expect_false(isTRUE(all_equal(DF1, DF2, ignore_row_order = TRUE))) DF1 <- tibble(a = c(1:4, 2L), b = letters[c(1:4, 2L)]) DF2 <- tibble(a = c(1:4, 4L), b = letters[c(1:4, 4L)]) expect_false(isTRUE(all_equal(DF1, DF2, ignore_row_order = TRUE))) }) test_that("all.equal handles NA_character_ correctly. #1095", { local_options(lifecycle_verbosity = "quiet") d1 <- tibble(x = c(NA_character_)) expect_true(all_equal(d1, d1)) d2 <- tibble(x = c(NA_character_, "foo", "bar")) expect_true(all_equal(d2, d2)) }) test_that("handle Date columns of different types, integer and numeric (#1204)", { local_options(lifecycle_verbosity = "quiet") a <- data.frame(date = as.Date("2015-06-07")) b <- data.frame(date = structure(as.integer(a$date), class = "Date")) expect_true(all_equal(a, b)) }) test_that("equality test fails when convert is FALSE and types don't match (#1484)", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = "a") df2 <- tibble(x = factor("a")) expect_true(all_equal(df1, df2, convert = TRUE)) expect_snapshot({ all_equal(df1, df2, convert = FALSE) }) }) test_that("equality handles data frames with 0 rows (#1506)", { df0 <- tibble(x = numeric(0), y = character(0)) expect_equal(df0, df0) }) test_that("equality handles data frames with 0 columns (#1506)", { df0 <- tibble(a = 1:10)[-1] expect_equal(df0, df0) }) test_that("equality handle raw columns", { local_options(lifecycle_verbosity = "quiet") df <- tibble(a = 1:3, b = as.raw(1:3)) expect_true(all_equal(df, df)) }) test_that("equality returns a message for convert = TRUE", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = 1:3) df2 <- tibble(x = as.character(1:3)) expect_snapshot({ all_equal(df1, df2) all_equal(df1, df2, convert = TRUE) }) }) test_that("numeric and integer can be compared if convert = TRUE", { local_options(lifecycle_verbosity = "quiet") df1 <- tibble(x = 1:3) df2 <- tibble(x = as.numeric(1:3)) expect_true(all_equal(df1, df2, convert = TRUE)) expect_snapshot({ all_equal(df1, df2) }) }) test_that("returns vector for more than one difference (#1819)", { local_options(lifecycle_verbosity = "quiet") expect_snapshot({ all_equal(tibble(a = 1, b = 2), tibble(a = 1L, b = 2L)) }) }) test_that("ignore column order", { local_options(lifecycle_verbosity = "quiet") expect_snapshot({ all_equal( tibble(a = 1, b = 2), tibble(b = 2, a = 1), ignore_col_order = FALSE ) all_equal(tibble(a = 1, b = 2), tibble(a = 1), ignore_col_order = FALSE) }) }) # Errors ------------------------------------------------------------------ test_that("count() give meaningful errors", { local_options(lifecycle_verbosity = "quiet") expect_snapshot({ (expect_error(union(tibble(a = 1), tibble(a = "1")))) (expect_error(union(tibble(a = 1, b = 2), tibble(a = "1", b = "2")))) }) }) dplyr/tests/testthat/test-tbl.R0000644000176200001440000000062214266276767016316 0ustar liggesuserstest_that("tbl_nongroup_vars() excludes group variables", { gdf <- group_by(mtcars, cyl) expect_identical(tbl_nongroup_vars(gdf), setdiff(tbl_vars(gdf), "cyl")) }) test_that("tbl_vars() records groups", { gdf <- group_by(mtcars, cyl, am) expect_s3_class(tbl_vars(gdf), "dplyr_sel_vars") expect_true(is_sel_vars(tbl_vars(gdf))) expect_identical(tbl_vars(gdf) %@% groups, c("cyl", "am")) }) dplyr/tests/testthat/test-defunct-lazyeval.R0000644000176200001440000000054315137161765021001 0ustar liggesuserstest_that("generate informative errors", { expect_snapshot(error = TRUE, { add_count_() add_tally_() arrange_() count_() distinct_() do_() filter_() funs_() group_by_() group_indices_() mutate_() tally_() transmute_() rename_() select_() slice_() summarise_() summarize_() }) }) dplyr/tests/testthat/test-near.R0000644000176200001440000000012214266276767016455 0ustar liggesuserstest_that("near accepts nearby fp values", { expect_true(near(sqrt(2)^2, 2)) }) dplyr/tests/testthat/test-data-mask.R0000644000176200001440000000031215106134104017337 0ustar liggesuserstest_that("Empty matrix can be coerced to a data frame (#7004)", { skip_if_not(getRversion() >= "4.4") expect_equal( slice(as.data.frame(matrix(nrow = 0, ncol = 0)), 1), data.frame() ) }) dplyr/tests/testthat/test-top-n.R0000644000176200001440000000154415106134104016542 0ustar liggesuserstest_that("top_n returns n rows", { test_df <- data.frame(x = 1:10, y = 11:20) top_four <- test_df |> top_n(4, y) expect_equal(dim(top_four), c(4, 2)) }) test_that("top_n() handles missing `wt`", { df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1)) expect_message( regexp = "Selecting by x", expect_identical(top_n(df, 2)$x, c(10, 6)) ) }) test_that("top_n() handles calls", { expect_identical(top_n(mtcars, 2, -disp), top_n(mtcars, -2, disp)) }) test_that("top_n() quotes n", { expect_snapshot(res1 <- top_n(mtcars, n() * .5)) expect_snapshot(res2 <- top_n(mtcars, 16)) expect_identical(res1, res2) }) test_that("top_frac() is a shorthand for top_n(n()*)", { expect_identical(top_n(mtcars, n() * .5, disp), top_frac(mtcars, .5, disp)) expect_snapshot(res1 <- top_n(mtcars, n() * .5)) expect_snapshot(res2 <- top_frac(mtcars, .5)) }) dplyr/tests/testthat/test-groups-with.R0000644000176200001440000000116115106134104017770 0ustar liggesuserstest_that("restores original class", { df <- data.frame(x = 1:2) gf <- group_by(df, x) expect_s3_class(with_groups(df, x, mutate), "data.frame", exact = TRUE) expect_s3_class(with_groups(gf, x, mutate), "grouped_df") }) test_that(".groups = NULL ungroups", { gf <- group_by(tibble(x = 1:2), x) out <- gf |> with_groups(NULL, mutate, y = mean(x)) expect_equal(out$y, c(1.5, 1.5)) }) test_that(".groups is defused with context", { local_fn <- identity expect_identical( with_groups(mtcars, local_fn(2), mutate, disp = disp / sd(disp)), with_groups(mtcars, 2, mutate, disp = disp / sd(disp)) ) }) dplyr/tests/testthat/test-summarise.R0000644000176200001440000003761515137161765017543 0ustar liggesuserstest_that("can use freshly create variables (#138)", { df <- tibble(x = 1:10) out <- summarise(df, y = mean(x), z = y + 1) expect_equal(out$y, 5.5) expect_equal(out$z, 6.5) }) test_that("works with empty data frames", { # 0 rows df <- tibble(x = integer()) expect_equal(summarise(df), tibble(.rows = 1)) expect_equal(summarise(df, n = n(), sum = sum(x)), tibble(n = 0, sum = 0)) # 0 cols df <- tibble(.rows = 10) expect_equal(summarise(df), tibble(.rows = 1)) expect_equal(summarise(df, n = n()), tibble(n = 10)) }) test_that("works with grouped empty data frames", { df <- tibble(x = integer()) expect_equal( df |> group_by(x) |> summarise(y = 1L), tibble(x = integer(), y = integer()) ) expect_equal( df |> rowwise(x) |> summarise(y = 1L), group_by(tibble(x = integer(), y = integer()), x) ) }) test_that("no expressions yields grouping data", { df <- tibble(x = 1:2, y = 1:2) gf <- group_by(df, x) expect_equal(summarise(df), tibble(.rows = 1)) expect_equal(summarise(gf), tibble(x = 1:2)) expect_equal(summarise(df, !!!list()), tibble(.rows = 1)) expect_equal(summarise(gf, !!!list()), tibble(x = 1:2)) }) test_that("doesn't preserve attributes", { df <- structure( data.frame(x = 1:10, g1 = rep(1:2, each = 5), g2 = rep(1:5, 2)), meta = "this is important" ) out <- df |> summarise(n = n()) expect_null(attr(out, "res")) out <- df |> group_by(g1) |> summarise(n = n()) expect_null(attr(out, "res")) }) test_that("strips off subclass", { # We consider the data frame returned by `summarise()` to be # "fundamentally a new data frame" df <- new_data_frame(list(a = 1), class = "myclass") out <- df |> summarise(n = n()) expect_s3_class(out, "data.frame", exact = TRUE) out <- df |> summarise(.by = a, n = n()) expect_s3_class(out, "data.frame", exact = TRUE) df <- new_tibble(list(a = 1), class = "myclass") out <- df |> summarise(n = n()) expect_s3_class(out, class(tibble()), exact = TRUE) out <- df |> summarise(.by = a, n = n()) expect_s3_class(out, class(tibble()), exact = TRUE) gdf <- group_by(tibble(a = 1), a) df <- gdf class(df) <- c("myclass", class(gdf)) out <- df |> summarise(n = n(), .groups = "drop") expect_s3_class(out, class(tibble()), exact = TRUE) out <- df |> summarise(n = n(), .groups = "keep") expect_s3_class(out, class(gdf), exact = TRUE) }) test_that("works with unquoted values", { df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) expect_equal(summarise(df, out = !!1), tibble(out = 1)) expect_equal(summarise(df, out = !!quo(1)), tibble(out = 1)) }) test_that("formulas are evaluated in the right environment (#3019)", { out <- mtcars |> summarise(fn = list(rlang::as_function(~ list(~foo, environment())))) out <- out$fn[[1]]() expect_identical(environment(out[[1]]), out[[2]]) }) test_that("unnamed data frame results with 0 columns are ignored (#5084)", { df1 <- tibble(x = 1:2) expect_equal(df1 |> group_by(x) |> summarise(data.frame()), df1) expect_equal( df1 |> group_by(x) |> summarise(data.frame(), y = 65), mutate(df1, y = 65) ) expect_equal( df1 |> group_by(x) |> summarise(y = 65, data.frame()), mutate(df1, y = 65) ) df2 <- tibble(x = 1:2, y = 3:4) expect_equal(df2 |> group_by(x) |> summarise(data.frame()), df1) expect_equal( df2 |> group_by(x) |> summarise(data.frame(), z = 98), mutate(df1, z = 98) ) expect_equal( df2 |> group_by(x) |> summarise(z = 98, data.frame()), mutate(df1, z = 98) ) # This includes unnamed data frames that have 0 columns but >0 rows. # Noted when working on (#6509). empty3 <- new_tibble(list(), nrow = 3L) expect_equal(df1 |> summarise(empty3), new_tibble(list(), nrow = 1L)) expect_equal( df1 |> summarise(empty3, y = mean(x)), df1 |> summarise(y = mean(x)) ) expect_equal(df1 |> group_by(x) |> summarise(empty3), df1) expect_equal( df1 |> group_by(x) |> summarise(empty3, y = x + 1), mutate(df1, y = x + 1) ) }) test_that("can't overwrite column active bindings (#6666)", { skip_if(getRversion() < "3.6.3", message = "Active binding error changed") df <- tibble(g = c(1, 1, 2, 2), x = 1:4) gdf <- group_by(df, g) # The error seen here comes from trying to `<-` to an active binding when # the active binding function has 0 arguments. expect_snapshot(error = TRUE, { summarise(df, y = { x <<- x + 2L mean(x) }) }) expect_snapshot(error = TRUE, { summarise(df, .by = g, y = { x <<- x + 2L mean(x) }) }) expect_snapshot(error = TRUE, { summarise(gdf, y = { x <<- x + 2L mean(x) }) }) }) test_that("assigning with `<-` doesn't affect the mask (#6666)", { df <- tibble(g = c(1, 1, 2, 2), x = 1:4) gdf <- group_by(df, g) out <- summarise(df, .by = g, y = { x <- x + 4L mean(x) }) expect_identical(out$y, c(5.5, 7.5)) out <- summarise(gdf, y = { x <- x + 4L mean(x) }) expect_identical(out$y, c(5.5, 7.5)) }) test_that("summarise() correctly auto-names expressions (#6741)", { df <- tibble(a = 1:3) expect_identical(summarise(df, min(-a)), tibble("min(-a)" = -3L)) }) # grouping ---------------------------------------------------------------- test_that("peels off a single layer of grouping", { df <- tibble(x = rep(1:4, each = 4), y = rep(1:2, each = 8), z = runif(16)) gf <- df |> group_by(x, y) expect_equal(group_vars(summarise(gf)), "x") expect_equal(group_vars(summarise(summarise(gf))), character()) }) test_that("correctly reconstructs groups", { d <- tibble(x = 1:4, g1 = rep(1:2, 2), g2 = 1:4) |> group_by(g1, g2) |> summarise(x = x + 1) expect_equal(group_rows(d), list_of(1:2, 3:4)) }) test_that("can modify grouping variables", { df <- tibble(a = c(1, 2, 1, 2), b = c(1, 1, 2, 2)) gf <- group_by(df, a, b) i <- count_regroups(out <- summarise(gf, a = a + 1)) expect_equal(i, 1) expect_equal(out$a, c(2, 2, 3, 3)) }) test_that("summarise returns a row for zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal(nrow(summarise(df, z = n())), 3L) }) test_that("summarise respects zero-length groups (#341)", { df <- tibble(x = factor(rep(1:3, each = 10), levels = 1:4)) out <- df |> group_by(x, .drop = FALSE) |> summarise(n = n()) expect_equal(out$n, c(10L, 10L, 10L, 0L)) }) # vector types ---------------------------------------------------------- test_that("summarise allows names (#2675)", { data <- tibble(a = 1:3) |> summarise(b = c("1" = a[[1]])) expect_equal(names(data$b), "1") data <- tibble(a = 1:3) |> rowwise() |> summarise(b = setNames(nm = a)) expect_equal(names(data$b), c("1", "2", "3")) data <- tibble(a = c(1, 1, 2)) |> group_by(a) |> summarise(b = setNames(nm = a[[1]])) expect_equal(names(data$b), c("1", "2")) res <- data.frame(x = c(1:3), y = letters[1:3]) |> group_by(y) |> summarise( a = length(x), b = quantile(x, 0.5) ) expect_equal(res$b, c("50%" = 1, "50%" = 2, "50%" = 3)) }) test_that("summarise handles list output columns (#832)", { df <- tibble(x = 1:10, g = rep(1:2, each = 5)) res <- df |> group_by(g) |> summarise(y = list(x)) expect_equal(res$y[[1]], 1:5) # preserving names d <- tibble(x = rep(1:3, 1:3), y = 1:6, names = letters[1:6]) res <- d |> group_by(x) |> summarise(y = list(setNames(y, names))) expect_equal(names(res$y[[1]]), letters[[1]]) }) test_that("summarise coerces types across groups", { gf <- group_by(tibble(g = 1:2), g) out <- summarise(gf, x = if (g == 1) NA else "x") expect_type(out$x, "character") out <- summarise(gf, x = if (g == 1L) NA else 2.5) expect_type(out$x, "double") }) test_that("unnamed tibbles are unpacked (#2326)", { df <- tibble(x = 2) out <- summarise(df, tibble(y = x * 2, z = 3)) expect_equal(out$y, 4) expect_equal(out$z, 3) }) test_that("named tibbles are packed (#2326)", { df <- tibble(x = 2) out <- summarise(df, df = tibble(y = x * 2, z = 3)) expect_equal(out$df, tibble(y = 4, z = 3)) }) test_that("summarise(.groups=) in global environment", { expect_message(eval_bare( expr(data.frame(x = 1, y = 2) |> group_by(x, y) |> summarise()), env(global_env()) )) expect_message(eval_bare( expr(data.frame(x = 1, y = 2) |> rowwise(x, y) |> summarise()), env(global_env()) )) }) test_that("summarise(.groups=)", { df <- data.frame(x = 1, y = 2) expect_equal( df |> summarise(z = 3, .groups = "rowwise"), rowwise(data.frame(z = 3)) ) gf <- df |> group_by(x, y) expect_equal(gf |> summarise() |> group_vars(), "x") expect_equal(gf |> summarise(.groups = "drop_last") |> group_vars(), "x") expect_equal(gf |> summarise(.groups = "drop") |> group_vars(), character()) expect_equal(gf |> summarise(.groups = "keep") |> group_vars(), c("x", "y")) rf <- df |> rowwise(x, y) expect_equal(rf |> summarise(.groups = "drop") |> group_vars(), character()) expect_equal(rf |> summarise(.groups = "keep") |> group_vars(), c("x", "y")) }) test_that("summarise() casts data frame results to common type (#5646)", { df <- data.frame(x = 1:2, g = 1:2) |> group_by(g) res <- df |> summarise( if (g == 1) data.frame(y = 1) else data.frame(y = 1, z = 2), .groups = "drop" ) expect_equal(res$z, c(NA, 2)) }) test_that("summarise() silently skips when all results are NULL (#5708)", { df <- data.frame(x = 1:2, g = 1:2) |> group_by(g) expect_equal(summarise(df, x = NULL), summarise(df)) expect_error(summarise(df, x = if (g == 1) 42)) }) # .by ---------------------------------------------------------------------- test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- summarise(df, x = mean(x), .by = g) expect_identical(out$g, c(1, 2)) expect_identical(out$x, c(3, 2)) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains bare data.frame class", { df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) out <- summarise(df, x = mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping drops data frame attributes", { # Because `summarise()` theoretically creates a "new" data frame # With data.frames or tibbles df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) tbl <- as_tibble(df) attr(df, "foo") <- "bar" attr(tbl, "foo") <- "bar" out <- summarise(df, x = mean(x), .by = g) expect_null(attr(out, "foo")) out <- summarise(tbl, x = mean(x), .by = g) expect_null(attr(out, "foo")) }) test_that("transient grouping orders by first appearance", { df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) out <- summarise(df, x = mean(x), .by = g) expect_identical(out$g, c(2, 1, 0)) expect_identical(out$x, c(6, 2, 5)) }) test_that("can't use `.by` with `.groups`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { summarise(df, .by = x, .groups = "drop") }) }) test_that("catches `.by` with grouped-df", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { summarise(gdf, .by = x) }) }) test_that("catches `.by` with rowwise-df", { df <- tibble(x = 1) rdf <- rowwise(df) expect_snapshot(error = TRUE, { summarise(rdf, .by = x) }) }) # errors ------------------------------------------------------------------- test_that("summarise() preserves the call stack on error (#5308)", { foobar <- function() stop("foo") stack <- NULL expect_error( withCallingHandlers( error = function(...) stack <<- sys.calls(), summarise(mtcars, foobar()) ) ) expect_true(some(stack, is_call, "foobar")) }) test_that("`summarise()` doesn't allow data frames with missing or empty names (#6758)", { df1 <- new_data_frame(set_names(list(1), "")) df2 <- new_data_frame(set_names(list(1), NA_character_)) expect_snapshot(error = TRUE, { summarise(df1) }) expect_snapshot(error = TRUE, { summarise(df2) }) }) test_that("summarise() messages about implicit `.groups` default", { # Otherwise it only informs when called from the global env local_options(dplyr.summarise.inform = TRUE) df <- tibble(x = 1, y = 2) # Nothing expect_snapshot({ df |> group_by(x) |> summarise() }) expect_snapshot({ df |> rowwise() |> summarise() }) # Implicit `"drop_last"` expect_snapshot({ df |> group_by(x, y) |> summarise() }) # Implicit `"keep"` expect_snapshot({ df |> rowwise(x, y) |> summarise() }) }) test_that("summarise() respects `dplyr.summarise.inform = FALSE`", { local_options(dplyr.summarise.inform = FALSE) # Force evaluation in the global env so we can be very sure we are # silencing the message. It only ever triggers in the global env. eval_global <- function(expr) eval(expr, envir = globalenv()) # Implicit `"drop_last"` expect_snapshot({ eval_global(tibble(x = 1, y = 2) |> group_by(x, y) |> summarise()) }) # Implicit `"keep"` expect_snapshot({ eval_global(tibble(x = 1, y = 2) |> rowwise(x, y) |> summarise()) }) }) test_that("summarise() gives meaningful errors", { eval( envir = global_env(), expr({ expect_snapshot({ # unsupported type (expect_error( tibble(x = 1, y = c(1, 2, 2), z = runif(3)) |> summarise(a = rlang::env(a = 1)) )) (expect_error( tibble(x = 1, y = c(1, 2, 2), z = runif(3)) |> group_by(x, y) |> summarise(a = rlang::env(a = 1)) )) (expect_error( tibble(x = 1, y = c(1, 2, 2), y2 = c(1, 2, 2), z = runif(3)) |> group_by(x, y, y2) |> summarise(a = rlang::env(a = 1)) )) (expect_error( tibble(x = 1, y = c(1, 2, 2), z = runif(3)) |> rowwise() |> summarise(a = lm(y ~ x)) )) # mixed types (expect_error( tibble(id = 1:2, a = list(1, "2")) |> group_by(id) |> summarise(a = a[[1]]) )) (expect_error( tibble(id = 1:2, a = list(1, "2")) |> rowwise() |> summarise(a = a[[1]]) )) # mixed nulls (expect_error( data.frame(x = 1:2, g = 1:2) |> group_by(g) |> summarise(x = if (g == 1) 42) )) (expect_error( data.frame(x = 1:2, g = 1:2) |> group_by(g) |> summarise(x = if (g == 2) 42) )) # .data pronoun (expect_error(summarise(tibble(a = 1), c = .data$b))) (expect_error(summarise(group_by(tibble(a = 1:3), a), c = .data$b))) # Duplicate column names (expect_error( tibble(x = 1, x = 1, .name_repair = "minimal") |> summarise(x) )) # Not glue()ing (expect_error(tibble() |> summarise(stop("{")))) (expect_error( tibble(a = 1, b = "{value:1, unit:a}") |> group_by(b) |> summarise(a = stop("!")) )) }) }) ) }) test_that("non-summary results are defunct in favor of `reframe()` (#6382, #7761)", { df <- tibble(g = c(1, 1, 2), x = 1:3) gdf <- group_by(df, g) rdf <- rowwise(df) expect_snapshot(error = TRUE, { out <- summarise(df, x = which(x < 3)) }) expect_snapshot(error = TRUE, { out <- summarise(df, x = which(x < 3), .by = g) }) # First group returns size 2 summary expect_snapshot(error = TRUE, { out <- summarise(gdf, x = which(x < 3)) }) # Last row returns size 0 summary expect_snapshot(error = TRUE, { out <- summarise(rdf, x = which(x < 3)) }) # A few additional tests from when we used to allow this, which are now errors expect_snapshot(error = TRUE, { tibble() |> summarise(x = 1, y = 1:3, z = 1) }) expect_snapshot(error = TRUE, { gf <- group_by(tibble(a = 1:2), a) gf |> summarise(x = 1, y = 1:3, z = 1) }) expect_snapshot(error = TRUE, { gf <- group_by(tibble(a = 1:2), a) gf |> summarise(x = seq_len(a), y = 1) }) }) dplyr/tests/testthat/test-sets.R0000644000176200001440000001027315106134104016462 0ustar liggesuserstest_that("x used as basis of output (#3839)", { df1 <- tibble(x = 1:4, y = 1) df2 <- tibble(y = 1, x = c(4, 2)) expect_equal(intersect(df1, df2), tibble(x = c(2, 4), y = 1)) expect_equal(union(df1, df2), tibble(x = 1:4, y = 1)) expect_equal(union_all(df1, df2), tibble(x = c(1:4, 4, 2), y = 1)) expect_equal(setdiff(df1, df2), tibble(x = c(1, 3), y = 1)) expect_equal(symdiff(df1, df2), tibble(x = c(1, 3), y = 1)) }) test_that("set operations (apart from union_all) remove duplicates", { df1 <- tibble(x = c(1, 1, 2)) df2 <- tibble(x = 2) expect_equal(intersect(df1, df2), tibble(x = 2)) expect_equal(union(df1, df2), tibble(x = c(1, 2))) expect_equal(union_all(df1, df2), tibble(x = c(1, 1, 2, 2))) expect_equal(setdiff(df1, df2), tibble(x = 1)) expect_equal(symdiff(df1, df2), tibble(x = 1)) }) test_that("standard coercion rules are used (#799)", { df1 <- tibble(x = 1:2, y = c(1, 1)) df2 <- tibble(x = 1:2, y = 1:2) expect_equal(nrow(intersect(df1, df2)), 1) expect_equal(nrow(union(df1, df2)), 3) expect_equal(nrow(union_all(df1, df2)), 4) expect_equal(nrow(setdiff(df1, df2)), 1) expect_equal(nrow(symdiff(df1, df2)), 2) }) test_that("grouping metadata is reconstructed (#3587)", { df1 <- tibble(x = 1:4, g = rep(1:2, each = 2)) |> group_by(g) df2 <- tibble(x = 3:6, g = rep(2:3, each = 2)) expect_equal(group_vars(intersect(df1, df2)), "g") expect_equal(group_vars(union(df1, df2)), "g") expect_equal(group_vars(union_all(df1, df2)), "g") expect_equal(group_vars(setdiff(df1, df2)), "g") expect_equal(group_vars(symdiff(df1, df2)), "g") }) test_that("also work with vectors", { expect_equal(intersect(1:3, 3:4), 3) expect_equal(union(1:3, 3:4), 1:4) expect_equal(union_all(1:3, 3:4), c(1:3, 3:4)) expect_equal(setdiff(1:3, 3:4), 1:2) expect_equal(symdiff(1:3, 3:4), c(1, 2, 4)) # removes duplicates expect_equal(symdiff(c(1, 1, 2), c(2, 2, 3)), c(1, 3)) }) test_that("extra arguments in ... error (#5891)", { df1 <- tibble(var = 1:3) df2 <- tibble(var = 2:4) expect_snapshot(error = TRUE, { intersect(df1, df2, z = 3) union(df1, df2, z = 3) union_all(df1, df2, z = 3) setdiff(df1, df2, z = 3) symdiff(df1, df2, z = 3) }) }) test_that("incompatible data frames error (#903)", { df1 <- tibble(x = 1) df2 <- tibble(x = 1, y = 1) expect_snapshot(error = TRUE, { intersect(df1, df2) union(df1, df2) union_all(df1, df2) setdiff(df1, df2) symdiff(df1, df2) }) }) test_that("is_compatible generates useful messages for different cases", { expect_snapshot({ cat(is_compatible(tibble(x = 1), 1)) cat(is_compatible(tibble(x = 1), tibble(x = 1, y = 2))) cat(is_compatible( tibble(x = 1, y = 1), tibble(y = 1, x = 1), ignore_col_order = FALSE )) cat(is_compatible(tibble(x = 1), tibble(y = 1))) cat(is_compatible(tibble(x = 1), tibble(x = 1L), convert = FALSE)) cat(is_compatible(tibble(x = 1), tibble(x = "a"))) }) }) # setequal ---------------------------------------------------------------- test_that("setequal ignores column and row order", { df1 <- tibble(x = 1:2, y = 3:4) df2 <- df1[2:1, 2:1] expect_true(setequal(df1, df2)) expect_true(setequal(df1, df2)) }) test_that("setequal ignores duplicated rows (#6057)", { df1 <- tibble(x = 1) df2 <- df1[c(1, 1, 1), ] expect_true(setequal(df1, df2)) expect_true(setequal(df2, df1)) }) test_that("setequal uses coercion rules (#6114)", { df1 <- tibble(x = 1) df2 <- tibble(x = 1L) expect_true(setequal(df1, df2)) expect_true(setequal(df2, df1)) }) test_that("setequal tibbles must have same rows and columns", { # Different rows are the definition of not equal expect_false(setequal(tibble(x = 1:2), tibble(x = 2:3))) # Different or incompatible columns are an error, like the other set ops (#6786) expect_snapshot(error = TRUE, { setequal(tibble(x = 1:2), tibble(y = 1:2)) }) expect_snapshot(error = TRUE, { setequal(tibble(x = 1:2), tibble(x = c("a", "b"))) }) }) test_that("setequal checks y is a data frame", { expect_snapshot(setequal(mtcars, 1), error = TRUE) }) test_that("setequal checks for extra arguments", { expect_snapshot(setequal(mtcars, mtcars, z = 2), error = TRUE) }) dplyr/tests/testthat/test-colwise.R0000644000176200001440000000204115106134104017143 0ustar liggesuserstest_that("tbl_at_vars() treats `NULL` as empty inputs", { expect_identical(tbl_at_vars(mtcars, vars(NULL)), tbl_at_vars(mtcars, vars())) expect_identical( tibble::remove_rownames(mutate_at(mtcars, vars(NULL), `*`, 100)), tibble::remove_rownames(mtcars) ) }) test_that("lists of formulas are auto-named", { df <- tibble(x = 1:3, y = 4:6) out <- df |> summarise_all(list(~ mean(.), ~ sd(.x, na.rm = TRUE))) expect_named(out, c("x_mean", "y_mean", "x_sd", "y_sd")) out <- df |> summarise_all(list(foobar = ~ mean(.), ~ sd(.x, na.rm = TRUE))) expect_named(out, c("x_foobar", "y_foobar", "x_sd", "y_sd")) }) # Errors -------------------------------------------- test_that("colwise utils gives meaningful error messages", { expect_snapshot({ (expect_error( tbl_at_vars(iris, raw(3)) )) (expect_error( tbl_if_vars(iris, list(identity, force), environment()) )) .funs <- as_fun_list(list(identity, force), caller_env()) (expect_error( tbl_if_vars(iris, .funs, environment()) )) }) }) dplyr/tests/testthat/test-rowwise.R0000644000176200001440000001072615137161765017227 0ustar liggesuserstest_that("rowwise status preserved by major verbs", { rf <- rowwise(tibble(x = 1:5, y = 5:1), "x") out <- arrange(rf, y) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out <- filter(rf, x < 3) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out <- filter_out(rf, x < 3) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out <- mutate(rf, x = x + 1) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out <- rename(rf, X = x) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "X") out <- select(rf, "x") expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out <- slice(rf, c(1, 1)) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") # Except for summarise out <- summarise(rf, z = mean(x, y)) expect_s3_class(out, "grouped_df") expect_equal(group_vars(out), "x") }) test_that("rowwise nature preserved by subsetting ops", { rf <- rowwise(tibble(x = 1:5, y = 1:5), "x") out <- rf[1] expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") out[, "z"] <- 5:1 expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") names(out) <- toupper(names(out)) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "X") }) test_that("except when it should be removed", { rf <- rowwise(tibble(x = 1:5, y = 1:5), "x") expect_equal(out <- rf[, 1, drop = TRUE], rf$x) }) test_that("rowwise has decent print method", { rf <- rowwise(tibble(x = 1:5), "x") expect_snapshot(rf) }) test_that("rowwise captures group_vars", { df <- group_by(tibble(g = 1:2, x = 1:2), g) rw <- rowwise(df) expect_equal(group_vars(rw), "g") # but can't regroup expect_error(rowwise(df, x), "Can't re-group") }) test_that("can re-rowwise", { rf1 <- rowwise(tibble(x = 1:5, y = 1:5), "x") rf2 <- rowwise(rf1, y) expect_equal(group_vars(rf2), "y") }) test_that("new_rowwise_df() does not require `group_data=`", { df <- new_rowwise_df(data.frame(x = 1:2)) expect_s3_class(df, "rowwise_df") expect_equal(attr(df, "groups"), tibble(".rows" := vctrs::list_of(1L, 2L))) }) test_that("new_rowwise_df() can add class and attributes (#5918)", { df <- new_rowwise_df( tibble(x = 1:4), tibble(), class = "custom_rowwise_df", a = "b" ) expect_s3_class(df, "custom_rowwise_df") expect_equal(attr(df, "a"), "b") }) test_that("rbind() works with rowwise data frames by calling bind_rows() (r-lib/vctrs#1935)", { x <- rowwise(tibble(a = 1:2)) y <- rowwise(tibble(a = 3:4)) out <- rbind(x, y) expect_identical(out, rowwise(tibble(a = c(1:2, 3:4)))) # Important that `.rows` is recreated, not copied over from `x` (r-lib/vctrs#1935) expect_identical( group_data(out), new_tibble(list(.rows = list_of(1L, 2L, 3L, 4L))) ) # `bind_rows()` returns an object with the class of the first input, # which is roughly how `rbind()` also works # With bare tibble y <- tibble(a = 5:6) out <- rbind(x, y) expect_identical(out, rowwise(tibble(a = c(1:2, 5:6)))) # With grouped_df y <- group_by(tibble(a = 5:6), a) out <- rbind(x, y) expect_identical(out, rowwise(tibble(a = c(1:2, 5:6)))) }) test_that("validate_rowwise_df() gives useful errors", { df1 <- rowwise(tibble(x = 1:4, g = rep(1:2, each = 2)), g) groups <- attr(df1, "groups") groups[[2]] <- 4:1 attr(df1, "groups") <- groups df2 <- rowwise(tibble(x = 1:4, g = rep(1:2, each = 2)), g) groups <- attr(df2, "groups") names(groups) <- c("g", "not.rows") attr(df2, "groups") <- groups df3 <- df2 attr(df3, "groups") <- tibble() df4 <- df3 attr(df4, "groups") <- NA df7 <- rowwise(tibble(x = 1:10)) attr(df7, "groups")$.rows <- 11:20 df8 <- rowwise(tibble(x = 1:10)) df10 <- df7 attr(df10, "groups") <- tibble() df11 <- df7 attr(df11, "groups") <- NULL expect_snapshot({ (expect_error(validate_rowwise_df(df1))) (expect_error(validate_rowwise_df(df2))) (expect_error(validate_rowwise_df(df3))) (expect_error(validate_rowwise_df(df4))) (expect_error(validate_rowwise_df(df7))) (expect_error(attr(df8, "groups")$.rows <- 1:8)) (expect_error(validate_rowwise_df(df10))) (expect_error(validate_rowwise_df(df11))) (expect_error( new_rowwise_df( tibble(x = 1:10), tibble(".rows" := list(1:5, -1L)) ) )) (expect_error( new_rowwise_df( tibble(x = 1:10), 1:10 ) )) }) }) dplyr/tests/testthat/test-when.R0000644000176200001440000000760615137161765016474 0ustar liggesuserstest_that("all possible variations of each combination are right", { N <- NA expect_identical(when_all(T, T, na_rm = FALSE), T) expect_identical(when_all(T, F, na_rm = FALSE), F) expect_identical(when_all(T, N, na_rm = FALSE), N) expect_identical(when_all(F, T, na_rm = FALSE), F) expect_identical(when_all(F, F, na_rm = FALSE), F) expect_identical(when_all(F, N, na_rm = FALSE), F) expect_identical(when_all(N, T, na_rm = FALSE), N) expect_identical(when_all(N, F, na_rm = FALSE), F) expect_identical(when_all(N, N, na_rm = FALSE), N) expect_identical(when_all(T, T, na_rm = TRUE), T) expect_identical(when_all(T, F, na_rm = TRUE), F) expect_identical(when_all(T, N, na_rm = TRUE), T) expect_identical(when_all(F, T, na_rm = TRUE), F) expect_identical(when_all(F, F, na_rm = TRUE), F) expect_identical(when_all(F, N, na_rm = TRUE), F) expect_identical(when_all(N, T, na_rm = TRUE), T) expect_identical(when_all(N, F, na_rm = TRUE), F) expect_identical(when_all(N, N, na_rm = TRUE), T) expect_identical(when_any(T, T, na_rm = FALSE), T) expect_identical(when_any(T, F, na_rm = FALSE), T) expect_identical(when_any(T, N, na_rm = FALSE), T) expect_identical(when_any(F, T, na_rm = FALSE), T) expect_identical(when_any(F, F, na_rm = FALSE), F) expect_identical(when_any(F, N, na_rm = FALSE), N) expect_identical(when_any(N, T, na_rm = FALSE), T) expect_identical(when_any(N, F, na_rm = FALSE), N) expect_identical(when_any(N, N, na_rm = FALSE), N) expect_identical(when_any(T, T, na_rm = TRUE), T) expect_identical(when_any(T, F, na_rm = TRUE), T) expect_identical(when_any(T, N, na_rm = TRUE), T) expect_identical(when_any(F, T, na_rm = TRUE), T) expect_identical(when_any(F, F, na_rm = TRUE), F) expect_identical(when_any(F, N, na_rm = TRUE), F) expect_identical(when_any(N, T, na_rm = TRUE), T) expect_identical(when_any(N, F, na_rm = TRUE), F) expect_identical(when_any(N, N, na_rm = TRUE), F) }) test_that("empty case works", { expect_identical(when_any(), logical()) expect_identical(when_all(), logical()) }) test_that("`size` influences the empty case", { expect_identical(when_any(size = 1), FALSE) expect_identical(when_all(size = 1), TRUE) }) test_that("no recycling is performed!", { # On the vctrs side we decided recycling doesn't # make much sense in these functions expect_snapshot(error = TRUE, { when_any(TRUE, c(TRUE, FALSE)) }) expect_snapshot(error = TRUE, { when_all(TRUE, c(TRUE, FALSE)) }) expect_snapshot(error = TRUE, { when_any(TRUE, size = 2) }) expect_snapshot(error = TRUE, { when_all(TRUE, size = 2) }) }) test_that("inputs must be strictly logical vectors", { # Not cast to logical expect_snapshot(error = TRUE, { when_any(1) }) expect_snapshot(error = TRUE, { when_all(1) }) # Not a 1D array of logical expect_snapshot(error = TRUE, { when_any(array(TRUE)) }) expect_snapshot(error = TRUE, { when_all(array(TRUE)) }) # Not a classed logical expect_snapshot(error = TRUE, { when_any(structure(TRUE, class = "foo")) }) expect_snapshot(error = TRUE, { when_all(structure(TRUE, class = "foo")) }) # Extraneous attributes are fine expect_identical(when_any(structure(TRUE, foo = "bar")), TRUE) expect_identical(when_all(structure(TRUE, foo = "bar")), TRUE) }) test_that("`...` can't be named", { # This is why we can have non `.` prefixed arguments expect_snapshot(error = TRUE, { when_any(x = TRUE) }) expect_snapshot(error = TRUE, { when_all(x = TRUE) }) }) test_that("`na_rm` is validated", { expect_snapshot(error = TRUE, { when_any(na_rm = "x") }) expect_snapshot(error = TRUE, { when_all(na_rm = "x") }) }) test_that("`size` is validated", { # Good enough, just don't want to crash expect_snapshot(error = TRUE, { when_any(size = "x") }) expect_snapshot(error = TRUE, { when_all(size = "x") }) }) dplyr/tests/testthat/test-copy-to.R0000644000176200001440000000052714406402754017112 0ustar liggesuserstest_that("`auto_copy()` is a no-op when they share the same source", { df1 <- tibble(x = 1) df2 <- tibble(x = 2) expect_identical(auto_copy(df1, df2), df2) }) test_that("`auto_copy()` throws an informative error on different sources (#6798)", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { auto_copy(df, NULL) }) }) dplyr/tests/testthat/helper-dplyr.R0000644000176200001440000000025214366556340017153 0ustar liggesusersexpect_no_error <- function(object, ...) { expect_error({{ object }}, NA, ...) } expect_no_warning <- function(object, ...) { expect_warning({{ object }}, NA, ...) } dplyr/tests/testthat/helper-encoding.R0000644000176200001440000000364715106134104017601 0ustar liggesusersget_lang_strings <- function() { lang_strings <- c( de = "Gl\u00fcck", cn = "\u5e78\u798f", ru = "\u0441\u0447\u0430\u0441\u0442\u044c\u0435", ko = "\ud589\ubcf5" ) native_lang_strings <- enc2native(lang_strings) same <- (lang_strings == native_lang_strings) list( same = lang_strings[same], different = lang_strings[!same] ) } get_native_lang_string <- function() { lang_strings <- get_lang_strings() if (length(lang_strings$same) == 0) { testthat::skip("No native language string available") } lang_strings$same[[1L]] } get_alien_lang_string <- function() { lang_strings <- get_lang_strings() if (length(lang_strings$different) == 0) { testthat::skip("No alien language string available") } lang_strings$different[[1L]] } has_locale <- function(locale, category) { original <- Sys.getlocale(category = category) on.exit(Sys.setlocale(category = category, locale = original), add = TRUE) tryCatch( expr = { Sys.setlocale(category = category, locale = locale) TRUE }, warning = function(w) FALSE, error = function(e) FALSE ) } has_collate_locale <- function(locale) { has_locale(locale = locale, category = "LC_COLLATE") } has_ctype_locale <- function(enc) { has_locale(locale = enc, category = "LC_CTYPE") } non_utf8_encoding <- function(enc = NULL) { if (!l10n_info()$`UTF-8`) { return(Sys.getlocale("LC_CTYPE")) } enc <- enc %||% c( "en_US.ISO8859-1", "en_US.ISO8859-15", "fr_CH.ISO8859-1", "fr_CH.ISO8859-15" ) available <- vapply(enc, has_ctype_locale, logical(1)) if (any(available)) { enc[available][1] } else { NULL } } local_non_utf8_encoding <- function(enc = NULL, env = parent.frame()) { non_utf8 <- non_utf8_encoding(enc) if (is.null(non_utf8)) { skip("Can't set a non-UTF-8 encoding") } else { withr::local_locale(c(LC_CTYPE = non_utf8), .local_envir = env) } } dplyr/tests/testthat/test-group-map.R0000644000176200001440000000725115106134104017415 0ustar liggesuserstest_that("group_map() respects empty groups", { res <- group_by(mtcars, cyl) |> group_map(~ head(.x, 2L)) expect_equal(length(res), 3L) res <- iris |> group_by(Species) |> filter(Species == "setosa") |> group_map(~ tally(.x)) expect_equal(length(res), 1L) res <- iris |> group_by(Species, .drop = FALSE) |> filter(Species == "setosa") |> group_map(~ tally(.x)) expect_equal(length(res), 3L) }) test_that("group_map() can return arbitrary objects", { expect_equal( group_by(mtcars, cyl) |> group_map(~10), rep(list(10), 3) ) }) test_that("group_map() works on ungrouped data frames (#4067)", { expect_identical( group_map(mtcars, ~ head(.x, 2L)), list(head(as_tibble(mtcars), 2L)) ) }) test_that("group_modify() makes a grouped_df", { res <- group_by(mtcars, cyl) |> group_modify(~ head(.x, 2L)) expect_equal(nrow(res), 6L) expect_equal(group_rows(res), list_of(1:2, 3:4, 5:6)) res <- iris |> group_by(Species) |> filter(Species == "setosa") |> group_modify(~ tally(.x)) expect_equal(nrow(res), 1L) expect_equal(group_rows(res), list_of(1L)) res <- iris |> group_by(Species, .drop = FALSE) |> filter(Species == "setosa") |> group_modify(~ tally(.x)) expect_equal(nrow(res), 3L) expect_equal(as.list(group_rows(res)), list(1L, 2L, 3L)) }) test_that("group_modify() and group_map() want functions with at least 2 arguments, or ... (#3996)", { head1 <- function(d, ...) head(d, 1) g <- iris |> group_by(Species) expect_equal(nrow(group_modify(g, head1)), 3L) expect_equal(length(group_map(g, head1)), 3L) }) test_that("group_modify() works on ungrouped data frames (#4067)", { expect_identical( group_modify(mtcars, ~ head(.x, 2L)), head(mtcars, 2L) ) }) test_that("group_map() uses ptype on empty splits (#4421)", { res <- mtcars |> group_by(cyl) |> filter(hp > 1000) |> group_map(~.x) expect_equal(res, list(), ignore_attr = TRUE) ptype <- attr(res, "ptype") expect_equal(names(ptype), setdiff(names(mtcars), "cyl")) expect_equal(nrow(ptype), 0L) expect_s3_class(ptype, "data.frame") }) test_that("group_modify() uses ptype on empty splits (#4421)", { res <- mtcars |> group_by(cyl) |> filter(hp > 1000) |> group_modify(~.x) expect_equal(res, group_by(mtcars[integer(0L), names(res)], cyl)) }) test_that("group_modify() works with additional arguments (#4509)", { myfun <- function(.x, .y, foo) { .x[[foo]] <- 1 .x } srcdata <- data.frame( A = rep(1:2, each = 3) ) |> group_by(A) targetdata <- srcdata targetdata$bar <- 1 expect_equal( group_modify(.data = srcdata, .f = myfun, foo = "bar"), targetdata ) }) test_that("group_map() does not warn about .keep= for rowwise_df", { expect_warning( data.frame(x = 1) |> rowwise() |> group_walk( ~ {} ), NA ) }) test_that("group_map() give meaningful errors", { head1 <- function(d) head(d, 1) expect_snapshot({ # group_modify() (expect_error( mtcars |> group_by(cyl) |> group_modify(~ data.frame(cyl = 19)) )) (expect_error(mtcars |> group_by(cyl) |> group_modify(~10))) (expect_error(iris |> group_by(Species) |> group_modify(head1))) # group_map() (expect_error(iris |> group_by(Species) |> group_map(head1))) }) }) test_that("`keep =` is defunct", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { group_map(df, keep = TRUE) }) expect_snapshot(error = TRUE, { group_map(gdf, keep = TRUE) }) expect_snapshot(error = TRUE, { group_modify(df, keep = TRUE) }) expect_snapshot(error = TRUE, { group_modify(gdf, keep = TRUE) }) }) dplyr/tests/testthat/test-context.R0000644000176200001440000000321115106134104017162 0ustar liggesuserstest_that("cur_group() works", { df <- tibble(g = 1, x = 1) gf <- group_by(df, g) expect_equal( df |> summarise(key = list(cur_group())) |> pull(key), list(tibble(.rows = 1L)) ) expect_equal( gf |> summarise(key = list(cur_group())) |> pull(key), list(tibble(g = 1)) ) }) test_that("cur_group() works with empty grouped data frame (#6304)", { df <- tibble(x = integer()) gdf <- group_by(df, x) out <- mutate(df, y = cur_group()) expect_identical(out$y, tibble()) out <- mutate(gdf, y = cur_group()) expect_identical(out$y, tibble(x = integer())) }) test_that("cur_group_idx() gives unique id", { df <- tibble(x = c("b", "a", "b")) gf <- group_by(df, x) expect_equal( summarise(gf, id = cur_group_id()), tibble(x = c("a", "b"), id = 1:2) ) expect_equal( mutate(gf, id = cur_group_id()), group_by(tibble(x = df$x, id = c(2, 1, 2)), x) ) }) test_that("cur_group_rows() retrieves row position in original data", { df <- tibble(x = c("b", "a", "b"), y = 1:3) gf <- group_by(df, x) expect_equal( df |> summarise(x = list(cur_group_rows())) |> pull(), list(1:3) ) expect_equal( gf |> summarise(x = list(cur_group_rows())) |> pull(), list(2L, c(1L, 3L)) ) }) test_that("give useful error messages when not applicable", { expect_snapshot({ (expect_error(n())) (expect_error(cur_column())) (expect_error(cur_group())) (expect_error(cur_group_id())) (expect_error(cur_group_rows())) }) }) test_that("group labels are correctly formatted", { expect_snapshot({ group_labels_details(c("a" = 1)) group_labels_details(c("a" = 1, "b" = 2)) }) }) dplyr/tests/testthat/utf-8.txt0000644000176200001440000000156715106134104016116 0ustar liggesusers# UTF-8 tests that can't be run on Windows CRAN # R CMD check will try to parse the file anyway, # we use a different file extension to avoid this. df <- data.frame(中文1 = 1:10, 中文2 = 1:10, eng = 1:10) df2 <- df |> mutate(中文1 = 中文1 + 1) gdf2 <- df |> group_by(eng) |> mutate(中文1 = 中文1 + 1) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(df2))) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(gdf2))) df3 <- filter(df2, eng > 5) gdf3 <- filter(gdf2, eng > 5) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(df3))) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(gdf3))) df4 <- filter(df2, 中文1 > 5) gdf4 <- filter(gdf2, 中文1 > 5) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(df4))) expect_equal(lobstr::obj_addrs(names(df)), lobstr::obj_addrs(names(gdf4))) dplyr/tests/testthat/test-nest-by.R0000644000176200001440000000165115106134104017065 0ustar liggesuserstest_that("returns expected type/data", { df <- data.frame(g = 1:2, x = 1:2, y = 1:2) out <- nest_by(df, g) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "g") expect_named(out, c("g", "data")) }) test_that("can control key col", { df <- data.frame(g = 1:2, x = 1:2, y = 1:2) out <- nest_by(df, g, .key = "key") expect_named(out, c("g", "key")) }) test_that("nest_by() inherits grouping", { df <- data.frame(g1 = 1:2, g2 = 1:2, x = 1:2, y = 1:2) expect_equal( df |> group_by(g1) |> nest_by() |> group_vars(), "g1" ) # And you can't have it both ways expect_error(df |> group_by(g1) |> nest_by("g2"), "re-group") }) test_that("can control whether grouping data in list-col", { df <- data.frame(g = 1:2, x = 1:2, y = 1:2) out <- nest_by(df, g) expect_named(out$data[[1]], c("x", "y")) out <- nest_by(df, g, .keep = TRUE) expect_named(out$data[[1]], c("g", "x", "y")) }) dplyr/tests/testthat/test-case-match.R0000644000176200001440000002120115137161765017523 0ustar liggesusers# ------------------------------------------------------------------------------ # case_match() test_that("`case_match()` is soft deprecated", { expect_snapshot({ case_match(1, 1 ~ "x") }) }) test_that("LHS can match multiple values", { local_options(lifecycle_verbosity = "quiet") expect_equal(case_match(1, 1:2 ~ "x"), "x") }) test_that("LHS can match special values", { local_options(lifecycle_verbosity = "quiet") expect_equal(case_match(NA, NA ~ "x"), "x") expect_equal(case_match(NaN, NaN ~ "x"), "x") }) test_that("RHS is recycled to match x", { local_options(lifecycle_verbosity = "quiet") x <- 1:3 expect_equal(case_match(x, c(1, 3) ~ x * 2), c(2, NA, 6)) }) test_that("`NULL` values in `...` are dropped", { local_options(lifecycle_verbosity = "quiet") expect_identical( case_match(1:2, 1 ~ "a", NULL, 2 ~ "b", NULL), c("a", "b") ) }) test_that("requires at least one condition", { local_options(lifecycle_verbosity = "quiet") expect_snapshot(error = TRUE, { case_match(1) }) expect_snapshot(error = TRUE, { case_match(1, NULL) }) }) test_that("passes through `.default` correctly", { local_options(lifecycle_verbosity = "quiet") expect_identical(case_match(1, 3 ~ 1, .default = 2), 2) expect_identical(case_match(1:5, 6 ~ 1, .default = 2), rep(2, 5)) expect_identical(case_match(1:5, 6 ~ 1:5, .default = 2:6), 2:6) }) test_that("`.default` is part of common type computation", { local_options(lifecycle_verbosity = "quiet") expect_identical(case_match(1, 1 ~ 1L, .default = 2), 1) expect_snapshot(error = TRUE, { case_match(1, 1 ~ 1L, .default = "x") }) }) test_that("passes through `.ptype` correctly", { local_options(lifecycle_verbosity = "quiet") expect_identical(case_match(1, 1 ~ 1, .ptype = integer()), 1L) }) test_that("`NULL` formula element throws meaningful error", { local_options(lifecycle_verbosity = "quiet") expect_snapshot(error = TRUE, { case_match(1, 1 ~ NULL) }) expect_snapshot(error = TRUE, { case_match(1, NULL ~ 1) }) }) test_that("throws chained errors when formula evaluation fails", { local_options(lifecycle_verbosity = "quiet") expect_snapshot(error = TRUE, { case_match(1, 1 ~ 2, 3 ~ stop("oh no!")) }) expect_snapshot(error = TRUE, { case_match(1, 1 ~ 2, stop("oh no!") ~ 4) }) }) # ------------------------------------------------------------------------------ # vec_case_match() test_that("works like a vectorized switch", { out <- vec_case_match( needles = c(1, 4, 2, 1), haystacks = list(1, 2, 4), values = list("a", "b", "d") ) expect_identical(out, c("a", "d", "b", "a")) }) test_that("the first match in `haystacks` is always used", { out <- vec_case_match( needles = c(1, 4, 2, 1), haystacks = list(1, 2, 1, 4, 2), values = list("a", "b", "c", "d", "e") ) expect_identical(out, c("a", "d", "b", "a")) }) test_that("`haystacks` can contain multiple values", { out <- vec_case_match( needles = c(1, 4, 2, 1), haystacks = list(c(1, 2), c(4, 5)), values = list("a", "b") ) expect_identical(out, c("a", "b", "a", "a")) }) test_that("`values` can be vectorized on the size of `needles`", { out <- vec_case_match( needles = c(1, 4, 2, 1), haystacks = list(c(1, 2), c(4, 5)), values = list(1:4, 5:8) ) expect_identical(out, c(1L, 6L, 3L, 4L)) }) test_that("unmatched value falls through to `default`", { out <- vec_case_match( needles = c(1, 4, 2, 1, 5), haystacks = list(1, 2), values = list("a", "b") ) expect_identical(out, c("a", NA, "b", "a", NA)) out <- vec_case_match( needles = c(1, 4, 2, 1, 5), haystacks = list(1, 2), values = list("a", "b"), default = "na" ) expect_identical(out, c("a", "na", "b", "a", "na")) }) test_that("`default` can be vectorized on the size of `needles`", { out <- vec_case_match( needles = c(1, 4, 2, 1, 5), haystacks = list(1, 2), values = list("a", "b"), default = c("one", "two", "three", "four", "five") ) expect_identical(out, c("a", "two", "b", "a", "five")) }) test_that("unmatched missing values get `default`", { out <- vec_case_match( needles = c(1, 4, 2, NA, NA), haystacks = list(1, 2), values = list("a", "b") ) expect_identical(out, c("a", NA, "b", NA, NA)) out <- vec_case_match( needles = c(1, 4, 2, NA, NA), haystacks = list(1, 2), values = list("a", "b"), default = "na" ) expect_identical(out, c("a", "na", "b", "na", "na")) }) test_that("can exactly match on missing values", { out <- vec_case_match( needles = c(NA, NaN, NA), haystacks = list(NA, NaN), values = list("na", "nan") ) expect_identical(out, c("na", "nan", "na")) }) test_that("`haystacks` must be castable to `needles`", { expect_snapshot(error = TRUE, { vec_case_match(1L, haystacks = list(1.5), values = list(2)) }) }) test_that("`ptype` overrides `values` common type", { expect_identical( vec_case_match( 1:2, haystacks = list(1), values = list(0), ptype = integer() ), c(0L, NA) ) expect_snapshot(error = TRUE, { vec_case_match( 1:2, haystacks = list(1), values = list(1.5), ptype = integer() ) }) }) test_that("`default` is considered in the common type computation", { expect_identical( vec_case_match(1, haystacks = list(1), values = list(2L), default = 1.5), 2 ) }) test_that("`default` respects `ptype`", { expect_identical( vec_case_match( needles = 1, haystacks = list(1), values = list(2L), default = 1, ptype = integer() ), 2L ) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(1), values = list(2L), default = 1.5, ptype = integer() ) }) }) test_that("`NULL` values in `haystacks` and `values` are not dropped", { expect_snapshot(error = TRUE, { vec_case_match(1:2, list(1, NULL, 2), list("a", NULL, "b")) }) expect_snapshot(error = TRUE, { vec_case_match(1:2, list(1, NULL, 2), list("a", "a", "b")) }) expect_snapshot(error = TRUE, { vec_case_match(1:2, list(1, 1, 2), list("a", NULL, "b")) }) }) test_that("size of `needles` is maintained", { expect_snapshot(error = TRUE, { vec_case_match(1, haystacks = list(1), values = list(1:2)) }) }) test_that("input must be a vector", { expect_snapshot(error = TRUE, { vec_case_match( environment(), haystacks = list(environment()), values = list(1) ) }) }) test_that("`haystacks` must be a list", { expect_snapshot(error = TRUE, { vec_case_match(1, haystacks = 1, values = list(2)) }) }) test_that("`values` must be a list", { expect_snapshot(error = TRUE, { vec_case_match(1, haystacks = list(1), values = 2) }) }) test_that("`needles_arg` is respected", { expect_snapshot(error = TRUE, { vec_case_match( needles = environment(), haystacks = list(environment()), values = list(1), needles_arg = "foo" ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = environment(), haystacks = list(environment()), values = list(1), needles_arg = "" ) }) }) test_that("`haystacks_arg` is respected", { expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = 1, values = list(1), haystacks_arg = "foo" ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = 1, values = list(1), haystacks_arg = "" ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(a = "x"), values = list(1), haystacks_arg = "foo" ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list("x"), values = list(1), haystacks_arg = "" ) }) }) test_that("`values_arg` is respected", { expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(1, 2), values = list("x", b = 1), values_arg = "foo" ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(1, 2), values = list("x", b = 1), values_arg = "" ) }) }) test_that("`default_arg` is respected", { expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(1), values = list(2L), default = 1.5, default_arg = "foo", ptype = integer() ) }) expect_snapshot(error = TRUE, { vec_case_match( needles = 1, haystacks = list(1), values = list(2L), default = 1.5, default_arg = "", ptype = integer() ) }) }) dplyr/tests/testthat/test-lead-lag.R0000644000176200001440000001055415106134104017154 0ustar liggesuserstest_that("`lead()` / `lag()` get the direction right", { expect_identical(lead(1:5), c(2:5, NA)) expect_identical(lag(1:5), c(NA, 1:4)) }) test_that("If n = 0, lead and lag return x", { x <- c(10L, 8L, 1L, 3L, 6L, 9L, 4L, 2L, 5L, 7L) expect_equal(lead(x, 0), x) expect_equal(lag(x, 0), x) }) test_that("If n = length(x), returns all missing", { x <- c(10L, 8L, 1L, 3L, 6L, 9L, 4L, 2L, 5L, 7L) expect_equal(lead(x, length(x)), rep(NA_integer_, length(x))) expect_equal(lag(x, length(x)), rep(NA_integer_, length(x))) }) test_that("`lag()` gives informative error for objects", { expect_snapshot(error = TRUE, { lag(ts(1:10)) }) }) test_that("lead() and lag() work for matrices (#5028)", { m <- matrix(1:6, ncol = 2) expect_equal( lag(m, 1), matrix(c(NA_integer_, 1L, 2L, NA_integer_, 4L, 5L), ncol = 2) ) expect_equal( lag(m, 1, default = NA), matrix(c(NA_integer_, 1L, 2L, NA_integer_, 4L, 5L), ncol = 2) ) expect_equal( lead(m, 1), matrix(c(2L, 3L, NA_integer_, 5L, 6L, NA_integer_), ncol = 2) ) expect_equal( lead(m, 1, default = NA), matrix(c(2L, 3L, NA_integer_, 5L, 6L, NA_integer_), ncol = 2) ) }) test_that("lead and lag preserve factors", { x <- factor(c("a", "b", "c")) expect_equal(levels(lead(x)), c("a", "b", "c")) expect_equal(levels(lag(x)), c("a", "b", "c")) }) test_that("lead and lag preserves dates and times", { x <- as.Date("2013-01-01") + 1:3 y <- as.POSIXct(x) expect_s3_class(lead(x), "Date") expect_s3_class(lag(x), "Date") expect_s3_class(lead(y), "POSIXct") expect_s3_class(lag(y), "POSIXct") }) test_that("`lead()` / `lag()` validate `n`", { expect_snapshot(error = TRUE, { lead(1:5, n = 1:2) lead(1:5, -1) }) expect_snapshot(error = TRUE, { lag(1:5, n = 1:2) lag(1:5, -1) }) }) test_that("`lead()` / `lag()` check for empty dots", { expect_snapshot(error = TRUE, { lead(1:5, deault = 1) }) expect_snapshot(error = TRUE, { lag(1:5, deault = 1) }) }) test_that("`lead()` / `lag()` require that `x` is a vector", { expect_snapshot(error = TRUE, { lead(environment()) }) expect_snapshot(error = TRUE, { lag(environment()) }) }) # ------------------------------------------------------------------------------ # shift() test_that("works with all 4 combinations of with/without `default` and lag/lead", { x <- 1:5 expect_identical(shift(x, n = 2L), c(NA, NA, 1L, 2L, 3L)) expect_identical(shift(x, n = 2L, default = 0L), c(0L, 0L, 1L, 2L, 3L)) expect_identical(shift(x, n = -2L), c(3L, 4L, 5L, NA, NA)) expect_identical(shift(x, n = -2L, default = 0L), c(3L, 4L, 5L, 0L, 0L)) }) test_that("works with size 0 input", { x <- integer() expect_identical(shift(x, n = 2L), x) expect_identical(shift(x, n = 2L, default = 3L), x) expect_identical(shift(x, n = -2L), x) expect_identical(shift(x, n = -2L, default = 3L), x) }) test_that("works with `n = 0` with and without `default`", { x <- 1:5 expect_identical(shift(x, n = 0L), x) expect_identical(shift(x, n = 0L, default = -1L), x) x <- integer() expect_identical(shift(x, n = 0L), x) expect_identical(shift(x, n = 0L, default = -1L), x) }) test_that("works with data frames", { df <- tibble(a = 1:3, b = letters[1:3]) expect_identical(shift(df, n = 1), vec_slice(df, c(NA, 1, 2))) expect_identical(shift(df, n = -1), vec_slice(df, c(2, 3, NA))) default <- tibble(a = 0L, b = "") expect_identical( shift(df, n = 2, default = default), vec_c(default, default, vec_slice(df, 1)) ) }) test_that("is affected by `order_by`", { x <- 1:5 order_by <- c(2, 3, 2, 1, 5) expect_identical( shift(x, n = 1, order_by = order_by), c(4L, 3L, 1L, NA, 2L) ) expect_identical( shift(x, n = -2, order_by = order_by), c(2L, NA, 5L, 3L, NA) ) }) test_that("`default` is cast to the type of `x` (#6330)", { expect_identical(shift(1L, default = 2), 2L) expect_snapshot(error = TRUE, { shift(1L, default = 1.5) }) }) test_that("`default` must be size 1 (#5641)", { expect_snapshot(error = TRUE, { shift(1:5, default = 1:2) }) expect_snapshot(error = TRUE, { shift(1:5, default = integer()) }) }) test_that("`n` is validated", { expect_snapshot(error = TRUE, { shift(1, n = 1:2) }) }) test_that("`order_by` must be the same size as `x`", { expect_snapshot(error = TRUE, { shift(1:5, order_by = 1:4) }) }) dplyr/tests/testthat/_snaps/0000755000176200001440000000000015137161765015705 5ustar liggesusersdplyr/tests/testthat/_snaps/all-equal.md0000644000176200001440000000635314416000506020073 0ustar liggesusers# all_equal is deprecated Code all_equal(mtcars, mtcars) Condition Warning: `all_equal()` was deprecated in dplyr 1.1.0. i Please use `all.equal()` instead. i And manually order the rows/cols as needed Output [1] TRUE # data frames not equal if missing row Code all_equal(mtcars, mtcars[-1, ]) Output [1] "Different number of rows." Code all_equal(iris, iris[-1, ]) Output [1] "Different number of rows." Code all_equal(df_all, df_all[-1, ]) Output [1] "Different number of rows." # data frames not equal if missing col Code all_equal(mtcars, mtcars[, -1]) Output Different number of columns: 11 vs 10. Code all_equal(iris, iris[, -1]) Output Different number of columns: 5 vs 4. Code all_equal(df_all, df_all[, -1]) Output Different number of columns: 7 vs 6. # factors equal only if levels equal Code all_equal(df1, df2) Output Different types for column `x`: factor<38051> vs factor. Code all_equal(df2, df1) Output Different types for column `x`: factor vs factor<38051>. # factor comparison requires strict equality of levels (#2440) Code all_equal(df1, df2) Output Different types for column `x`: factor<4d52a> vs factor<38051>. Code all_equal(df2, df1) Output Different types for column `x`: factor<38051> vs factor<4d52a>. # equality test fails when convert is FALSE and types don't match (#1484) Code all_equal(df1, df2, convert = FALSE) Output Different types for column `x`: character vs factor<4d52a>. # equality returns a message for convert = TRUE Code all_equal(df1, df2) Output Different types for column `x`: integer vs character. Code all_equal(df1, df2, convert = TRUE) Output Incompatible types for column `x`: integer vs character. # numeric and integer can be compared if convert = TRUE Code all_equal(df1, df2) Output Different types for column `x`: integer vs double. # returns vector for more than one difference (#1819) Code all_equal(tibble(a = 1, b = 2), tibble(a = 1L, b = 2L)) Output Different types for column `a`: double vs integer. Different types for column `b`: double vs integer. # ignore column order Code all_equal(tibble(a = 1, b = 2), tibble(b = 2, a = 1), ignore_col_order = FALSE) Output Same column names, but different order. Code all_equal(tibble(a = 1, b = 2), tibble(a = 1), ignore_col_order = FALSE) Output Different number of columns: 2 vs 1. # count() give meaningful errors Code (expect_error(union(tibble(a = 1), tibble(a = "1")))) Output Error in `union()`: ! `x` and `y` are not compatible. x Incompatible types for column `a`: double vs character. Code (expect_error(union(tibble(a = 1, b = 2), tibble(a = "1", b = "2")))) Output Error in `union()`: ! `x` and `y` are not compatible. x Incompatible types for column `a`: double vs character. x Incompatible types for column `b`: double vs character. dplyr/tests/testthat/_snaps/top-n.md0000644000176200001440000000062514416000544017251 0ustar liggesusers# top_n() quotes n Code res1 <- top_n(mtcars, n() * 0.5) Message Selecting by carb --- Code res2 <- top_n(mtcars, 16) Message Selecting by carb # top_frac() is a shorthand for top_n(n()*) Code res1 <- top_n(mtcars, n() * 0.5) Message Selecting by carb --- Code res2 <- top_frac(mtcars, 0.5) Message Selecting by carb dplyr/tests/testthat/_snaps/group-by.md0000644000176200001440000000453515106134104017761 0ustar liggesusers# can't rename while partially `ungroup()`-ing (#6606) Code ungroup(gdf, g2 = g) Condition Error in `ungroup()`: ! Can't rename variables in this context. # select(group_by(.)) implicitly adds grouping variables (#170) Code res <- select(group_by(mtcars, vs), mpg) Message Adding missing grouping variables: `vs` # group_by works with zero-row data frames (#486) Code x <- select(dfg, a) Message Adding missing grouping variables: `g` # group_by() and ungroup() give meaningful error messages Code df <- tibble(x = 1, y = 2) (expect_error(group_by(df, unknown))) Output Error in `group_by()`: ! Must group by variables found in `.data`. x Column `unknown` is not found. Code (expect_error(ungroup(df, x))) Output Error in `ungroup()`: ! `...` must be empty. x Problematic argument: * ..1 = x i Did you forget to name an argument? Code (expect_error(ungroup(group_by(df, x, y), z))) Output Error in `ungroup()`: ! Can't select columns that don't exist. x Column `z` doesn't exist. Code (expect_error(group_by(df, z = a + 1))) Output Error in `group_by()`: i In argument: `z = a + 1`. Caused by error: ! object 'a' not found # group_by(add =) is defunct Code group_by(df, x, add = TRUE) Condition Error: ! The `add` argument of `group_by()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.add` argument instead. # group_by_prepare(add =) is defunct Code group_by_prepare(df, x, add = TRUE) Condition Error: ! The `add` argument of `group_by()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.add` argument instead. # group_by(.dots =) is defunct Code group_by(df, .dots = "x") Condition Error: ! The `.dots` argument of `group_by()` was deprecated in dplyr 1.0.0 and is now defunct. # group_by_prepare(.dots =) is defunct Code group_by_prepare(df, .dots = "x") Condition Error: ! The `.dots` argument of `group_by()` was deprecated in dplyr 1.0.0 and is now defunct. dplyr/tests/testthat/_snaps/group-data.md0000644000176200001440000000145615106134104020257 0ustar liggesusers# group_keys(...) is defunct Code group_keys(df, x) Condition Error: ! The `...` argument of `group_keys()` was deprecated in dplyr 1.0.0 and is now defunct. i Please `group_by()` first # no arg group_indices() is deprecated Code out <- summarise(df, id = group_indices()) Condition Warning: There was 1 warning in `summarise()`. i In argument: `id = group_indices()`. Caused by warning: ! `group_indices()` with no arguments was deprecated in dplyr 1.0.0. i Please use `cur_group_id()` instead. # group_indices(...) is deprecated Code group_indices(df, x) Condition Error: ! The `...` argument of `group_indices()` was deprecated in dplyr 1.0.0 and is now defunct. i Please `group_by()` first dplyr/tests/testthat/_snaps/order-by.md0000644000176200001440000000153114416000533017732 0ustar liggesusers# order_by() gives useful error messages Code (expect_error(order_by(mtcars, 10))) Output Error in `order_by()`: ! `call` must be a function call, not the number 10. Code (expect_error(order_by(mtcars, cyl))) Output Error in `order_by()`: ! `call` must be a function call, not a symbol. i Did you mean `arrange(mtcars, cyl)`? # `with_order()` requires `order_by` and `x` to be the same size Code with_order(1:2, identity, 1:3) Condition Error in `with_order()`: ! `order_by` must have size 3, not size 2. # order_by() give meaningful errors Code (expect_error(order_by(NULL, 1L))) Output Error in `order_by()`: ! `call` must be a function call, not the number 1. dplyr/tests/testthat/_snaps/defunct-each.md0000644000176200001440000000206415137161765020557 0ustar liggesusers# generate informative errors Code summarise_each() Condition Error: ! `summarise_each()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. Code summarise_each_() Condition Error: ! `summarise_each_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. Code mutate_each() Condition Error: ! `mutate_each()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. Code mutate_each_() Condition Error: ! `mutate_each_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. Code summarize_each() Condition Error: ! `summarise_each()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. Code summarize_each_() Condition Error: ! `summarise_each_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `across()` instead. dplyr/tests/testthat/_snaps/rowwise.md0000644000176200001440000000477414416000537017726 0ustar liggesusers# rowwise has decent print method Code rf Output # A tibble: 5 x 1 # Rowwise: x x 1 1 2 2 3 3 4 4 5 5 # validate_rowwise_df() gives useful errors Code (expect_error(validate_rowwise_df(df1))) Output Error in `validate_rowwise_df()`: ! The `.rows` column must be a list of size 1, one-based integer vectors with the right value. Code (expect_error(validate_rowwise_df(df2))) Output Error in `validate_rowwise_df()`: ! The last column of the `groups` attribute must be called `.rows`. Code (expect_error(validate_rowwise_df(df3))) Output Error in `validate_rowwise_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_rowwise_df(df4))) Output Error in `validate_rowwise_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_rowwise_df(df7))) Output Error in `validate_rowwise_df()`: ! The `.rows` column must be a list of size 1, one-based integer vectors with the right value. Code (expect_error(attr(df8, "groups")$.rows <- 1:8)) Output Error in `$<-`: ! Assigned data `1:8` must be compatible with existing data. x Existing data has 10 rows. x Assigned data has 8 rows. i Only vectors of size 1 are recycled. Caused by error in `vectbl_recycle_rhs_rows()`: ! Can't recycle input of size 8 to size 10. Code (expect_error(validate_rowwise_df(df10))) Output Error in `validate_rowwise_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_rowwise_df(df11))) Output Error in `validate_rowwise_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(new_rowwise_df(tibble(x = 1:10), tibble(".rows" := list(1:5, -1L)))) ) Output Error in `new_rowwise_df()`: ! `group_data` must be a tibble without a `.rows` column. Code (expect_error(new_rowwise_df(tibble(x = 1:10), 1:10))) Output Error in `new_rowwise_df()`: ! `group_data` must be a tibble without a `.rows` column. dplyr/tests/testthat/_snaps/arrange.md0000644000176200001440000000451115106134104017626 0ustar liggesusers# arrange() gives meaningful errors Code (expect_error(arrange(tibble(x = 1, x = 1, .name_repair = "minimal"), x))) Output Error in `arrange()`: ! Can't transform a data frame with duplicate names. Code (expect_error(arrange(tibble(x = 1), y))) Output Error in `arrange()`: i In argument: `..1 = y`. Caused by error: ! object 'y' not found Code (expect_error(arrange(tibble(x = 1), rep(x, 2)))) Output Error in `arrange()`: i In argument: `..1 = rep(x, 2)`. Caused by error: ! `..1` must be size 1, not 2. # arrange errors if stringi is not installed and a locale identifier is used Code locale_to_chr_proxy_collate("fr", has_stringi = FALSE) Condition Error: ! stringi >=1.5.3 is required to arrange in a different locale. # arrange validates `.locale` Code arrange(df, .locale = 1) Condition Error in `arrange()`: ! `.locale` must be a string or `NULL`. --- Code arrange(df, .locale = c("en_US", "fr_BF")) Condition Error in `arrange()`: ! If `.locale` is a character vector, it must be a single string. # arrange validates that `.locale` must be one from stringi Code arrange(df, .locale = "x") Condition Error in `arrange()`: ! `.locale` must be one of the locales within `stringi::stri_locale_list()`. # desc() inside arrange() checks the number of arguments (#5921) Code df <- data.frame(x = 1, y = 2) (expect_error(arrange(df, desc(x, y)))) Output Error in `arrange()`: ! `desc()` must be called with exactly one argument. # legacy - using the deprecated global option `dplyr.legacy_locale` forces the system locale Code out <- arrange(df, x)$x Condition Warning: `options(dplyr.legacy_locale =)` was deprecated in dplyr 1.2.0. i If needed for `arrange()`, use `arrange(.locale =)` instead. i If needed for `group_by() |> summarise()`, follow up with an additional `arrange(.locale =)` call. i Use `Sys.getlocale("LC_COLLATE")` to determine your system locale, and compare against `stringi::stri_locale_list()` to determine the `.locale` value to use. dplyr/tests/testthat/_snaps/colwise-filter.md0000644000176200001440000000073514416000510021137 0ustar liggesusers# colwise filter() give meaningful errors Code (expect_error(filter_if(mtcars, is_character, all_vars(. > 0)))) Output Error in `filter_if()`: ! `.predicate` must match at least one column. Code (expect_error(filter_all(mtcars, list(~ . > 0)))) Output Error in `filter_all()`: ! `.vars_predicate` must be a function or a call to `all_vars()` or `any_vars()`, not a list. dplyr/tests/testthat/_snaps/reframe.md0000644000176200001440000000304614416000535017635 0ustar liggesusers# `reframe()` throws intelligent recycling errors Code reframe(df, x = 1:2, y = 3:5) Condition Error in `reframe()`: ! Can't recycle `y = 3:5`. Caused by error: ! `y` must be size 2 or 1, not 3. i An earlier column had size 2. --- Code reframe(df, x = 1:2, y = 3:5, .by = g) Condition Error in `reframe()`: ! Can't recycle `y = 3:5`. i In group 1: `g = 1`. Caused by error: ! `y` must be size 2 or 1, not 3. i An earlier column had size 2. --- Code reframe(gdf, x = 1:2, y = 3:5) Condition Error in `reframe()`: ! Can't recycle `y = 3:5`. i In group 1: `g = 1`. Caused by error: ! `y` must be size 2 or 1, not 3. i An earlier column had size 2. # `reframe()` doesn't message about regrouping when multiple group columns are supplied Code out <- reframe(df, x = mean(x), .by = c(a, b)) --- Code out <- reframe(gdf, x = mean(x)) # `reframe()` doesn't message about regrouping when >1 rows are returned per group Code out <- reframe(df, x = vec_rep_each(x, x), .by = g) --- Code out <- reframe(gdf, x = vec_rep_each(x, x)) # catches `.by` with grouped-df Code reframe(gdf, .by = x) Condition Error in `reframe()`: ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df Code reframe(rdf, .by = x) Condition Error in `reframe()`: ! Can't supply `.by` when `.data` is a rowwise data frame. dplyr/tests/testthat/_snaps/group-map.md0000644000176200001440000000352615106134104020123 0ustar liggesusers# group_map() give meaningful errors Code (expect_error(group_modify(group_by(mtcars, cyl), ~ data.frame(cyl = 19)))) Output Error in `group_modify()`: ! The returned data frame cannot contain the original grouping variables: cyl. Code (expect_error(group_modify(group_by(mtcars, cyl), ~10))) Output Error in `group_modify()`: ! The result of `.f` must be a data frame. Code (expect_error(group_modify(group_by(iris, Species), head1))) Output Error in `group_modify()`: ! `.f` must accept at least two arguments. i You can use `...` to absorb unused components. Code (expect_error(group_map(group_by(iris, Species), head1))) Output Error in `group_map()`: ! `.f` must accept at least two arguments. i You can use `...` to absorb unused components. # `keep =` is defunct Code group_map(df, keep = TRUE) Condition Error: ! The `keep` argument of `group_map()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. --- Code group_map(gdf, keep = TRUE) Condition Error: ! The `keep` argument of `group_map()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. --- Code group_modify(df, keep = TRUE) Condition Error: ! The `keep` argument of `group_modify()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. --- Code group_modify(gdf, keep = TRUE) Condition Error: ! The `keep` argument of `group_modify()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. dplyr/tests/testthat/_snaps/conditions.md0000644000176200001440000003651615106134104020372 0ustar liggesusers# can pass verb-level error call Code mutate(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code transmute(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code summarise(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code summarise(group_by(mtcars, cyl), 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. i In group 1: `cyl = 4`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code filter(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code arrange(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `..1 = 1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code select(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code slice(mtcars, 1 + "") Condition Error in `foo()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # can pass verb-level error call (example case) Code my_verb(mtcars, 1 + "", am) Condition Error in `my_verb()`: i In argument: `.result = (1 + "") * am`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code my_verb(mtcars, cyl, c(am, vs)) Condition Error in `my_verb()`: i In argument: `.result = cyl * c(am, vs)`. Caused by error: ! `.result` must be size 32 or 1, not 64. # `err_locs()` works as expected Code err_locs(1.5) Condition Error in `err_locs()`: ! `x` must be an integer vector of locations. i This is an internal error that was detected in the dplyr package. Please report it at with a reprex () and the full backtrace. --- Code err_locs(integer()) Condition Error in `err_locs()`: ! `x` must have at least 1 location. i This is an internal error that was detected in the dplyr package. Please report it at with a reprex () and the full backtrace. --- Code err_locs(1L) Output `c(1)` Code err_locs(1:5) Output `c(1, 2, 3, 4, 5)` Code err_locs(1:6) Output `c(1, 2, 3, 4, 5)` and 1 more Code err_locs(1:7) Output `c(1, 2, 3, 4, 5)` and 2 more # errors during dots collection are not enriched (#6178) Code mutate(mtcars, !!foobarbaz()) Condition Error in `foobarbaz()`: ! could not find function "foobarbaz" Code transmute(mtcars, !!foobarbaz()) Condition Error in `foobarbaz()`: ! could not find function "foobarbaz" Code select(mtcars, !!foobarbaz()) Condition Error in `foobarbaz()`: ! could not find function "foobarbaz" Code arrange(mtcars, !!foobarbaz()) Condition Error in `foobarbaz()`: ! could not find function "foobarbaz" Code filter(mtcars, !!foobarbaz()) Condition Error in `foobarbaz()`: ! could not find function "foobarbaz" # warnings are collected for `last_dplyr_warnings()` Code # Ungrouped invisible(mutate(df, x = f())) Condition Warning: There was 1 warning in `mutate()`. i In argument: `x = f()`. Caused by warning in `f()`: ! msg Code last_dplyr_warnings() Output [[1]] Warning in `mutate()`: i In argument: `x = f()`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(df, x = f()) 2. \-dplyr:::mutate.data.frame(df, x = f()) --- Code # Grouped invisible(mutate(group_by(df, id), x = f())) Condition Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In group 1: `id = 1`. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. Code last_dplyr_warnings() Output [[1]] Warning in `mutate()`: i In argument: `x = f()`. i In group 1: `id = 1`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(group_by(df, id), x = f()) 2. \-dplyr:::mutate.data.frame(group_by(df, id), x = f()) [[2]] Warning in `mutate()`: i In argument: `x = f()`. i In group 2: `id = 2`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(group_by(df, id), x = f()) 2. \-dplyr:::mutate.data.frame(group_by(df, id), x = f()) --- Code # Rowwise invisible(mutate(rowwise(df), x = f())) Condition Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. Code last_dplyr_warnings() Output [[1]] Warning in `mutate()`: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(rowwise(df), x = f()) 2. \-dplyr:::mutate.data.frame(rowwise(df), x = f()) [[2]] Warning in `mutate()`: i In argument: `x = f()`. i In row 2. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(rowwise(df), x = f()) 2. \-dplyr:::mutate.data.frame(rowwise(df), x = f()) --- Code # Multiple type of warnings within multiple verbs invisible(mutate(group_by(mutate(rowwise(group_by(df, g = f():n())), x = f()), id), x = f())) Condition Warning: There was 1 warning in `group_by()`. i In argument: `g = f():n()`. Caused by warning in `f()`: ! msg Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In group 1: `id = 1`. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. Code last_dplyr_warnings() Output [[1]] Warning in `group_by()`: i In argument: `g = f():n()`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(...) 2. +-dplyr::group_by(...) 3. +-dplyr::mutate(rowwise(group_by(df, g = f():n())), x = f()) 4. +-dplyr::rowwise(group_by(df, g = f():n())) 5. +-dplyr::group_by(df, g = f():n()) 6. \-dplyr:::group_by.data.frame(df, g = f():n()) [[2]] Warning in `mutate()`: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(...) 2. +-dplyr::group_by(...) 3. +-dplyr::mutate(rowwise(group_by(df, g = f():n())), x = f()) 4. \-dplyr:::mutate.data.frame(...) [[3]] Warning in `mutate()`: i In argument: `x = f()`. i In row 2. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(...) 2. +-dplyr::group_by(...) 3. +-dplyr::mutate(rowwise(group_by(df, g = f():n())), x = f()) 4. \-dplyr:::mutate.data.frame(...) [[4]] Warning in `mutate()`: i In argument: `x = f()`. i In group 1: `id = 1`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(...) 2. \-dplyr:::mutate.data.frame(...) [[5]] Warning in `mutate()`: i In argument: `x = f()`. i In group 2: `id = 2`. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(...) 2. \-dplyr:::mutate.data.frame(...) --- Code # Truncated (1 more) mutate(rowwise(df), x = f()) Condition Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. Output # A tibble: 2 x 2 # Rowwise: id x 1 1 1 2 2 1 Code last_dplyr_warnings(n = 1) Output [[1]] Warning in `mutate()`: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(rowwise(df), x = f()) 2. \-dplyr:::mutate.data.frame(rowwise(df), x = f()) Message ... with 1 more warning. i Run `dplyr::last_dplyr_warnings(n = 2)` to show more. --- Code # Truncated (several more) df <- tibble(id = 1:5) mutate(rowwise(df), x = f()) Condition Warning: There were 5 warnings in `mutate()`. The first warning was: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg i Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings. Output # A tibble: 5 x 2 # Rowwise: id x 1 1 1 2 2 1 3 3 1 4 4 1 5 5 1 Code last_dplyr_warnings(n = 1) Output [[1]] Warning in `mutate()`: i In argument: `x = f()`. i In row 1. Caused by warning in `f()`: ! msg --- Backtrace: x 1. +-dplyr::mutate(rowwise(df), x = f()) 2. \-dplyr:::mutate.data.frame(rowwise(df), x = f()) Message ... with 4 more warnings. i Run `dplyr::last_dplyr_warnings(n = 2)` to show more. # complex backtraces with base and rlang warnings Code foo() Condition Warning: There was 1 warning in `group_by()`. i In argument: `x = f(1):n()`. Caused by warning in `h()`: ! foo Warning: There were 3 warnings in `mutate()`. The first warning was: i In argument: `x = f(1, base = FALSE)`. i In group 1: `x = 1`. Caused by warning: ! foo i Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings. Output # A tibble: 3 x 2 # Groups: x [1] id x 1 1 1 2 2 1 3 3 1 Code last_dplyr_warnings() Output [[1]] Warning in `group_by()`: i In argument: `x = f(1):n()`. Caused by warning in `h()`: ! foo --- Backtrace: x 1. \-dplyr (local) foo() 2. \-dplyr (local) bar() 3. +-dplyr::mutate(group_by(df, x = f(1):n()), x = f(1, base = FALSE)) 4. +-dplyr::group_by(df, x = f(1):n()) 5. \-dplyr:::group_by.data.frame(df, x = f(1):n()) [[2]] Warning in `mutate()`: i In argument: `x = f(1, base = FALSE)`. i In group 1: `x = 1`. Caused by warning: ! foo --- Backtrace: x 1. \-dplyr (local) foo() 2. \-dplyr (local) bar() 3. +-dplyr::mutate(group_by(df, x = f(1):n()), x = f(1, base = FALSE)) 4. \-dplyr:::mutate.data.frame(...) [[3]] Warning in `mutate()`: i In argument: `x = f(1, base = FALSE)`. i In group 2: `x = 2`. Caused by warning: ! foo --- Backtrace: x 1. \-dplyr (local) foo() 2. \-dplyr (local) bar() 3. +-dplyr::mutate(group_by(df, x = f(1):n()), x = f(1, base = FALSE)) 4. \-dplyr:::mutate.data.frame(...) [[4]] Warning in `mutate()`: i In argument: `x = f(1, base = FALSE)`. i In group 3: `x = 3`. Caused by warning: ! foo --- Backtrace: x 1. \-dplyr (local) foo() 2. \-dplyr (local) bar() 3. +-dplyr::mutate(group_by(df, x = f(1):n()), x = f(1, base = FALSE)) 4. \-dplyr:::mutate.data.frame(...) # can collect warnings in main verbs Code invisible(summarise(mutate(arrange(filter(rowwise(mtcars), f()), f()), a = f()), b = f())) Condition Warning: There were 32 warnings in `filter()`. The first warning was: i In argument: `f()`. i In row 1. Caused by warning in `f()`: ! foo i Run `dplyr::last_dplyr_warnings()` to see the 31 remaining warnings. Warning: There was 1 warning in `arrange()`. i In argument: `..1 = f()`. Caused by warning in `f()`: ! foo Warning: There were 32 warnings in `mutate()`. The first warning was: i In argument: `a = f()`. i In row 1. Caused by warning in `f()`: ! foo i Run `dplyr::last_dplyr_warnings()` to see the 31 remaining warnings. Warning: There were 32 warnings in `summarise()`. The first warning was: i In argument: `b = f()`. i In row 1. Caused by warning in `f()`: ! foo i Run `dplyr::last_dplyr_warnings()` to see the 31 remaining warnings. Code warnings <- last_dplyr_warnings(Inf) warnings[[1]] Output Warning in `filter()`: i In argument: `f()`. i In row 1. Caused by warning in `f()`: ! foo Code warnings[[33]] Output Warning in `arrange()`: i In argument: `..1 = f()`. Caused by warning in `f()`: ! foo Code warnings[[65]] Output Warning in `mutate()`: i In argument: `a = f()`. i In row 32. Caused by warning in `f()`: ! foo Code warnings[[97]] Output Warning in `summarise()`: i In argument: `b = f()`. i In row 32. Caused by warning in `f()`: ! foo dplyr/tests/testthat/_snaps/colwise-mutate.md0000644000176200001440000000405215106134104021151 0ustar liggesusers# selection works with grouped data frames (#2624) Code out <- mutate_if(gdf, is.factor, as.character) Message `mutate_if()` ignored the following grouping variables: * Column `Species` # colwise verbs deprecate quosures (#4330) Code (expect_warning(mutate_at(mtcars, vars(mpg), quo(mean(.))))) Output Warning: The `...` argument of `mutate_at()` can't contain quosures as of dplyr 0.8.3. i Please use a one-sided formula, a function, or a function name. Code (expect_warning(summarise_at(mtcars, vars(mpg), quo(mean(.))))) Output Warning: The `...` argument of `summarise_at()` can't contain quosures as of dplyr 0.8.3. i Please use a one-sided formula, a function, or a function name. # colwise mutate gives meaningful error messages Code (expect_error(mutate_at(tibble(), "test", ~1))) Output Error in `tbl_at_vars()`: ! Can't select columns that don't exist. x Column `test` doesn't exist. Code tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) tbl <- group_by(tbl, gr1) (expect_error(summarise_at(tbl, vars(gr1), mean))) Output Error in `tbl_at_vars()`: ! Can't select columns that don't exist. x Column `gr1` doesn't exist. Code (expect_error(mutate_all(mtcars, length, 0, 0))) Output Error in `mutate()`: i In argument: `mpg = .Primitive("length")(mpg, 0, 0)`. Caused by error: ! 3 arguments passed to 'length' which requires 1 Code (expect_error(mutate_all(mtcars, mean, na.rm = TRUE, na.rm = TRUE))) Output Error in `mutate()`: i In argument: `mpg = (function (x, ...) ...`. Caused by error in `mean.default()`: ! formal argument "na.rm" matched by multiple actual arguments dplyr/tests/testthat/_snaps/case-match.md0000644000176200001440000001554415137161765020245 0ustar liggesusers# `case_match()` is soft deprecated Code case_match(1, 1 ~ "x") Condition Warning: `case_match()` was deprecated in dplyr 1.2.0. i Please use `recode_values()` instead. Output [1] "x" # requires at least one condition Code case_match(1) Condition Error in `case_match()`: ! `...` can't be empty. --- Code case_match(1, NULL) Condition Error in `case_match()`: ! `...` can't be empty. # `.default` is part of common type computation Code case_match(1, 1 ~ 1L, .default = "x") Condition Error in `case_match()`: ! Can't combine and `.default` . # `NULL` formula element throws meaningful error Code case_match(1, 1 ~ NULL) Condition Error in `case_match()`: ! `..1 (right)` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code case_match(1, NULL ~ 1) Condition Error in `case_match()`: ! `..1 (left)` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # throws chained errors when formula evaluation fails Code case_match(1, 1 ~ 2, 3 ~ stop("oh no!")) Condition Error in `case_match()`: ! Failed to evaluate the right-hand side of formula 2. Caused by error: ! oh no! --- Code case_match(1, 1 ~ 2, stop("oh no!") ~ 4) Condition Error in `case_match()`: ! Failed to evaluate the left-hand side of formula 2. Caused by error: ! oh no! # `haystacks` must be castable to `needles` Code vec_case_match(1L, haystacks = list(1.5), values = list(2)) Condition Error in `vec_case_match()`: ! Can't convert from `haystacks[[1]]` to due to loss of precision. * Locations: 1 # `ptype` overrides `values` common type Code vec_case_match(1:2, haystacks = list(1), values = list(1.5), ptype = integer()) Condition Error in `vec_case_match()`: ! Can't convert from `values[[1]]` to due to loss of precision. * Locations: 1 # `default` respects `ptype` Code vec_case_match(needles = 1, haystacks = list(1), values = list(2L), default = 1.5, ptype = integer()) Condition Error in `vec_case_match()`: ! Can't convert from `default` to due to loss of precision. * Locations: 1 # `NULL` values in `haystacks` and `values` are not dropped Code vec_case_match(1:2, list(1, NULL, 2), list("a", NULL, "b")) Condition Error in `vec_case_match()`: ! `haystacks[[2]]` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code vec_case_match(1:2, list(1, NULL, 2), list("a", "a", "b")) Condition Error in `vec_case_match()`: ! `haystacks[[2]]` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code vec_case_match(1:2, list(1, 1, 2), list("a", NULL, "b")) Condition Error in `vec_case_match()`: ! `values[[2]]` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # size of `needles` is maintained Code vec_case_match(1, haystacks = list(1), values = list(1:2)) Condition Error in `vec_case_match()`: ! Can't recycle `values[[1]]` (size 2) to size 1. # input must be a vector Code vec_case_match(environment(), haystacks = list(environment()), values = list(1)) Condition Error in `vec_case_match()`: ! `needles` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # `haystacks` must be a list Code vec_case_match(1, haystacks = 1, values = list(2)) Condition Error in `vec_case_match()`: ! `haystacks` must be a list, not the number 1. # `values` must be a list Code vec_case_match(1, haystacks = list(1), values = 2) Condition Error in `vec_case_match()`: ! `values` must be a list, not the number 2. # `needles_arg` is respected Code vec_case_match(needles = environment(), haystacks = list(environment()), values = list(1), needles_arg = "foo") Condition Error in `vec_case_match()`: ! `foo` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code vec_case_match(needles = environment(), haystacks = list(environment()), values = list(1), needles_arg = "") Condition Error in `vec_case_match()`: ! Input must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # `haystacks_arg` is respected Code vec_case_match(needles = 1, haystacks = 1, values = list(1), haystacks_arg = "foo") Condition Error in `vec_case_match()`: ! `foo` must be a list, not the number 1. --- Code vec_case_match(needles = 1, haystacks = 1, values = list(1), haystacks_arg = "") Condition Error in `vec_case_match()`: ! Input must be a list, not the number 1. --- Code vec_case_match(needles = 1, haystacks = list(a = "x"), values = list(1), haystacks_arg = "foo") Condition Error in `vec_case_match()`: ! Can't convert `foo$a` to . --- Code vec_case_match(needles = 1, haystacks = list("x"), values = list(1), haystacks_arg = "") Condition Error in `vec_case_match()`: ! Can't convert `..1` to . # `values_arg` is respected Code vec_case_match(needles = 1, haystacks = list(1, 2), values = list("x", b = 1), values_arg = "foo") Condition Error in `vec_case_match()`: ! Can't combine `foo[[1]]` and `foo$b` . --- Code vec_case_match(needles = 1, haystacks = list(1, 2), values = list("x", b = 1), values_arg = "") Condition Error in `vec_case_match()`: ! Can't combine `..1` and `b` . # `default_arg` is respected Code vec_case_match(needles = 1, haystacks = list(1), values = list(2L), default = 1.5, default_arg = "foo", ptype = integer()) Condition Error in `vec_case_match()`: ! Can't convert from `foo` to due to loss of precision. * Locations: 1 --- Code vec_case_match(needles = 1, haystacks = list(1), values = list(2L), default = 1.5, default_arg = "", ptype = integer()) Condition Error in `vec_case_match()`: ! Can't convert from to due to loss of precision. * Locations: 1 dplyr/tests/testthat/_snaps/locale.md0000644000176200001440000000141215106134104017443 0ustar liggesusers# `dplyr_legacy_locale()` respects `dplyr.legacy_locale` Code dplyr_legacy_locale() Condition Error: ! Global option `dplyr.legacy_locale` must be a single `TRUE` or `FALSE`. # `dplyr_legacy_locale()` treats `dplyr.legacy_locale` as deprecated Code dplyr_legacy_locale() Condition Warning: `options(dplyr.legacy_locale =)` was deprecated in dplyr 1.2.0. i If needed for `arrange()`, use `arrange(.locale =)` instead. i If needed for `group_by() |> summarise()`, follow up with an additional `arrange(.locale =)` call. i Use `Sys.getlocale("LC_COLLATE")` to determine your system locale, and compare against `stringi::stri_locale_list()` to determine the `.locale` value to use. Output [1] TRUE dplyr/tests/testthat/_snaps/slice.md0000644000176200001440000002627315016155021017320 0ustar liggesusers# slice errors if positive and negative indices mixed Code slice(tibble(), 1, -1) Condition Error in `slice()`: ! Can't compute indices. Caused by error: ! Can't subset elements with `1`. x Negative and positive locations can't be mixed. i Subscript `1` has a positive value at location 1. # slicing with one-column matrix is deprecated Code out <- slice(df, matrix(c(1, 3))) Condition Warning: Slicing with a 1-column matrix was deprecated in dplyr 1.1.0. # slice errors if index is not numeric Code slice(tibble(), "a") Condition Error in `slice()`: i In argument: `"a"`. Caused by error: ! Can't subset elements with `"a"`. x `"a"` must be numeric, not the string "a". # user errors are correctly labelled Code slice(df, 1 + "") Condition Error in `slice()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code slice(group_by(df, x), 1 + "") Condition Error in `slice()`: i In argument: `1 + ""`. i In group 1: `x = 1`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # `...` can't be named (#6554) Code slice(df, 1, foo = g) Condition Error in `slice()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * foo = g # can't use `.by` with `.preserve` Code slice(df, .by = x, .preserve = TRUE) Condition Error in `slice()`: ! Can't supply both `.by` and `.preserve`. # catches `.by` with grouped-df Code slice(gdf, .by = x) Condition Error in `slice()`: ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df Code slice(rdf, .by = x) Condition Error in `slice()`: ! Can't supply `.by` when `.data` is a rowwise data frame. # catches `by` typo (#6647) Code slice(df, by = x) Condition Error in `slice()`: ! Can't specify an argument named `by` in this verb. i Did you mean to use `.by` instead? # slice_helpers() call get_slice_size() Code slice_head(df, n = "a") Condition Error in `slice_head()`: ! `n` must be a round number, not the string "a". Code slice_tail(df, n = "a") Condition Error in `slice_tail()`: ! `n` must be a round number, not the string "a". Code slice_min(df, x, n = "a") Condition Error in `slice_min()`: ! `n` must be a round number, not the string "a". Code slice_max(df, x, n = "a") Condition Error in `slice_max()`: ! `n` must be a round number, not the string "a". Code slice_sample(df, n = "a") Condition Error in `slice_sample()`: ! `n` must be a round number, not the string "a". # get_slice_size() validates its inputs Code get_slice_size(n = 1, prop = 1) Condition Error: ! Must supply `n` or `prop`, but not both. Code get_slice_size(n = "a") Condition Error: ! `n` must be a round number, not the string "a". Code get_slice_size(prop = "a") Condition Error: ! `prop` must be a number, not the string "a". # get_slice_size() snapshots Code body(get_slice_size(prop = 0)) Output clamp(0, floor(0 * n), n) Code body(get_slice_size(prop = 0.4)) Output clamp(0, floor(0.4 * n), n) Code body(get_slice_size(prop = 2)) Output clamp(0, floor(2 * n), n) Code body(get_slice_size(prop = 2, allow_outsize = TRUE)) Output floor(2 * n) Code body(get_slice_size(prop = -0.4)) Output clamp(0, ceiling(n + -0.4 * n), n) Code body(get_slice_size(prop = -2)) Output clamp(0, ceiling(n + -2 * n), n) Code body(get_slice_size(n = 0)) Output clamp(0, 0, n) Code body(get_slice_size(n = 4)) Output clamp(0, 4, n) Code body(get_slice_size(n = 20)) Output clamp(0, 20, n) Code body(get_slice_size(n = 20, allow_outsize = TRUE)) Output [1] 20 Code body(get_slice_size(n = -4)) Output clamp(0, ceiling(n + -4), n) Code body(get_slice_size(n = -20)) Output clamp(0, ceiling(n + -20), n) # n must be an integer Code slice_head(df, n = 1.1) Condition Error in `slice_head()`: ! `n` must be a round number, not the number 1.1. # slice_*() checks that `n=` is explicitly named and ... is empty Code slice_head(df, 5) Condition Error in `slice_head()`: ! `n` must be explicitly named. i Did you mean `slice_head(n = 5)`? Code slice_tail(df, 5) Condition Error in `slice_tail()`: ! `n` must be explicitly named. i Did you mean `slice_tail(n = 5)`? Code slice_min(df, x, 5) Condition Error in `slice_min()`: ! `n` must be explicitly named. i Did you mean `slice_min(n = 5)`? Code slice_max(df, x, 5) Condition Error in `slice_max()`: ! `n` must be explicitly named. i Did you mean `slice_max(n = 5)`? Code slice_sample(df, 5) Condition Error in `slice_sample()`: ! `n` must be explicitly named. i Did you mean `slice_sample(n = 5)`? --- Code dplyr::slice_head(df, 5) Condition Error in `dplyr::slice_head()`: ! `n` must be explicitly named. i Did you mean `dplyr::slice_head(n = 5)`? Code dplyr::slice_tail(df, 5) Condition Error in `dplyr::slice_tail()`: ! `n` must be explicitly named. i Did you mean `dplyr::slice_tail(n = 5)`? Code dplyr::slice_min(df, x, 5) Condition Error in `dplyr::slice_min()`: ! `n` must be explicitly named. i Did you mean `dplyr::slice_min(n = 5)`? Code dplyr::slice_max(df, x, 5) Condition Error in `dplyr::slice_max()`: ! `n` must be explicitly named. i Did you mean `dplyr::slice_max(n = 5)`? Code dplyr::slice_sample(df, 5) Condition Error in `dplyr::slice_sample()`: ! `n` must be explicitly named. i Did you mean `dplyr::slice_sample(n = 5)`? --- Code slice_head(df, 5, 2) Condition Error in `slice_head()`: ! `...` must be empty. x Problematic arguments: * ..1 = 5 * ..2 = 2 i Did you forget to name an argument? Code slice_tail(df, 5, 2) Condition Error in `slice_tail()`: ! `...` must be empty. x Problematic arguments: * ..1 = 5 * ..2 = 2 i Did you forget to name an argument? Code slice_min(df, x, 5, 2) Condition Error in `slice_min()`: ! `...` must be empty. x Problematic arguments: * ..1 = 5 * ..2 = 2 i Did you forget to name an argument? Code slice_max(df, x, 5, 2) Condition Error in `slice_max()`: ! `...` must be empty. x Problematic arguments: * ..1 = 5 * ..2 = 2 i Did you forget to name an argument? Code slice_sample(df, 5, 2) Condition Error in `slice_sample()`: ! `...` must be empty. x Problematic arguments: * ..1 = 5 * ..2 = 2 i Did you forget to name an argument? # slice_helper `by` errors use correct error context and correct `by_arg` Code slice_head(gdf, n = 1, by = x) Condition Error in `slice_head()`: ! Can't supply `by` when `.data` is a grouped data frame. Code slice_tail(gdf, n = 1, by = x) Condition Error in `slice_tail()`: ! Can't supply `by` when `.data` is a grouped data frame. Code slice_min(gdf, order_by = x, by = x) Condition Error in `slice_min()`: ! Can't supply `by` when `.data` is a grouped data frame. Code slice_max(gdf, order_by = x, by = x) Condition Error in `slice_max()`: ! Can't supply `by` when `.data` is a grouped data frame. Code slice_sample(gdf, n = 1, by = x) Condition Error in `slice_sample()`: ! Can't supply `by` when `.data` is a grouped data frame. # slice_helper catches `.by` typo (#6647) Code slice_head(df, n = 1, .by = x) Condition Error in `slice_head()`: ! Can't specify an argument named `.by` in this verb. i Did you mean to use `by` instead? Code slice_tail(df, n = 1, .by = x) Condition Error in `slice_tail()`: ! Can't specify an argument named `.by` in this verb. i Did you mean to use `by` instead? Code slice_min(df, order_by = x, .by = x) Condition Error in `slice_min()`: ! Can't specify an argument named `.by` in this verb. i Did you mean to use `by` instead? Code slice_max(df, order_by = x, .by = x) Condition Error in `slice_max()`: ! Can't specify an argument named `.by` in this verb. i Did you mean to use `by` instead? Code slice_sample(df, n = 1, .by = x) Condition Error in `slice_sample()`: ! Can't specify an argument named `.by` in this verb. i Did you mean to use `by` instead? # slice_min/max() check size of `order_by=` (#5922) Code slice_min(data.frame(x = 1:10), 1:6) Condition Error in `slice_min()`: ! Can't compute indices. Caused by error: ! `order_by` must have size 10, not size 6. Code slice_max(data.frame(x = 1:10), 1:6) Condition Error in `slice_max()`: ! Can't compute indices. Caused by error: ! `order_by` must have size 10, not size 6. # slice_min/max() validate simple arguments Code slice_min(data.frame(x = 1:10)) Condition Error in `slice_min()`: ! `order_by` is absent but must be supplied. Code slice_max(data.frame(x = 1:10)) Condition Error in `slice_max()`: ! `order_by` is absent but must be supplied. Code slice_min(data.frame(x = 1:10), x, with_ties = 1) Condition Error in `slice_min()`: ! `with_ties` must be `TRUE` or `FALSE`, not the number 1. Code slice_max(data.frame(x = 1:10), x, with_ties = 1) Condition Error in `slice_max()`: ! `with_ties` must be `TRUE` or `FALSE`, not the number 1. Code slice_min(data.frame(x = 1:10), x, na_rm = 1) Condition Error in `slice_min()`: ! `na_rm` must be `TRUE` or `FALSE`, not the number 1. Code slice_max(data.frame(x = 1:10), x, na_rm = 1) Condition Error in `slice_max()`: ! `na_rm` must be `TRUE` or `FALSE`, not the number 1. # slice_sample() checks size of `weight_by=` (#5922) Code slice_sample(df, n = 2, weight_by = 1:6) Condition Error in `slice_sample()`: ! Can't compute indices. Caused by error: ! `weight_by` must have size 10, not size 6. # `slice_sample()` validates `replace` Code slice_sample(df, replace = 1) Condition Error in `slice_sample()`: ! `replace` must be `TRUE` or `FALSE`, not the number 1. Code slice_sample(df, replace = NA) Condition Error in `slice_sample()`: ! `replace` must be `TRUE` or `FALSE`, not `NA`. dplyr/tests/testthat/_snaps/select.md0000644000176200001440000000446615106134104017477 0ustar liggesusers# grouping variables preserved with a message, unless already selected (#1511, #5841) Code res <- select(df, x) Message Adding missing grouping variables: `g` --- Code expect_equal(select(df, a = c), group_by(tibble(b = 2, a = 3), b)) Message Adding missing grouping variables: `b` Code expect_equal(select(df, b = c), group_by(tibble(a = 1, b = 3), a)) Message Adding missing grouping variables: `a` # non-syntactic grouping variable is preserved (#1138) Code df <- select(group_by(tibble(`a b` = 1L), `a b`)) Message Adding missing grouping variables: `a b` # select() provides informative errors Code (expect_error(select(mtcars, 1 + ""))) Output Error in `select()`: i In argument: `1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # dplyr_col_select() aborts when `[` implementation is broken Code (expect_error(select(df1, 1:2))) Output Error in `select()`: ! Can't select columns past the end. i Location 2 doesn't exist. i There is only 1 column. Code (expect_error(select(df1, 0))) Output Error in `select()`: ! Can't reconstruct data frame. x The `[` method for class must return a data frame. i It returned a . --- Code (expect_error(select(df1, 2))) Output Error in `select()`: ! Can't select columns past the end. i Location 2 doesn't exist. i There is only 1 column. Code (expect_error(select(df1, 1))) Output Error in `select()`: ! Can't reconstruct data frame. x The `[` method for class must return a data frame. i It returned a . Code (expect_error(select(df2, 1))) Output Error in `select()`: ! Can't reconstruct data frame. x The `[` method for class must return a data frame with 1 column. i It returned a of 0 columns. dplyr/tests/testthat/_snaps/defunct.md0000644000176200001440000000301715106134104017637 0ustar liggesusers# generate informative errors Code combine() Condition Error: ! `combine()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `vctrs::vec_c()` instead. Code src_mysql() Condition Error: ! `src_mysql()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tbl()` directly with a database connection Code src_postgres() Condition Error: ! `src_postgres()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tbl()` directly with a database connection Code src_sqlite() Condition Error: ! `src_sqlite()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tbl()` directly with a database connection Code src_local() Condition Error: ! `src_local()` was deprecated in dplyr 1.0.0 and is now defunct. Code src_df() Condition Error: ! `src_df()` was deprecated in dplyr 1.0.0 and is now defunct. Code tbl_df() Condition Error: ! `tbl_df()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tibble::as_tibble()` instead. Code as.tbl() Condition Error: ! `as.tbl()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tibble::as_tibble()` instead. Code add_rownames() Condition Error: ! `add_rownames()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use `tibble::rownames_to_column()` instead. dplyr/tests/testthat/_snaps/case-when.md0000644000176200001440000002531515137161765020107 0ustar liggesusers# case_when() conditions must be logical (and aren't cast to logical!) Code case_when(1 ~ 2) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not the number 1. --- Code case_when(TRUE ~ 2, 3.5 ~ 4) Condition Error in `case_when()`: ! `..2 (left)` must be a logical vector, not the number 3.5. # case_when() does not accept classed logical conditions Code case_when(x ~ 1) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not a object. # case_when() logical conditions can't be arrays (#6862) Code case_when(x ~ y) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not a logical matrix. --- Code case_when(x ~ y) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not a logical 1D array. # `.default` isn't part of recycling Code case_when(FALSE ~ 1L, .default = 2:5) Condition Error in `case_when()`: ! Can't recycle `.default` (size 4) to size 1. # `.default` is part of common type computation Code case_when(TRUE ~ 1L, .default = "x") Condition Error in `case_when()`: ! Can't combine and `.default` . # passes through `.ptype` correctly Code case_when(TRUE ~ 1, FALSE ~ 1.5, .ptype = integer()) Condition Error in `case_when()`: ! Can't convert from `..2 (right)` to due to loss of precision. * Locations: 1 --- Code case_when(TRUE ~ 1, NULL, FALSE ~ 1.5, .ptype = integer()) Condition Error in `case_when()`: ! Can't convert from `..3 (right)` to due to loss of precision. * Locations: 1 # passes through `.size` correctly Code case_when(TRUE ~ 1:2, .size = 3) Condition Error in `case_when()`: ! Can't recycle `..1 (right)` (size 2) to size 3. --- Code case_when(TRUE ~ 1:3, NULL, TRUE ~ 1:2, .size = 3) Condition Error in `case_when()`: ! Can't recycle `..3 (right)` (size 2) to size 3. # can't supply `.default` and `.unmatched` Code case_when(TRUE ~ 1, .default = 1, .unmatched = "error") Condition Error in `case_when()`: ! Can't set `.default` when `unmatched = "error"`. # `.unmatched` is validated Code case_when(TRUE ~ 1, .unmatched = "foo") Condition Error in `case_when()`: ! `unmatched` must be either "default" or "error", not "foo". --- Code case_when(TRUE ~ 1, .unmatched = 1) Condition Error in `case_when()`: ! `unmatched` must be a string, not the number 1. # `.unmatched` treats `FALSE` like an unmatched location Code case_when(c(TRUE, FALSE, TRUE) ~ 1, .unmatched = "error") Condition Error in `case_when()`: ! Each location must be matched. x Location 2 is unmatched. # `.unmatched` treats `NA` like an unmatched location Code case_when(c(TRUE, NA, TRUE) ~ 1, .unmatched = "error") Condition Error in `case_when()`: ! Each location must be matched. x Location 2 is unmatched. # `.unmatched` errors pluralize well Code case_when(x == "a" ~ 1, x == "b" ~ 2, x == "c" ~ 3, x == "e" ~ 4, .unmatched = "error") Condition Error in `case_when()`: ! Each location must be matched. x Location 4 is unmatched. --- Code case_when(x == "a" ~ 1, x == "c" ~ 2, x == "e" ~ 3, .unmatched = "error") Condition Error in `case_when()`: ! Each location must be matched. x Locations 2 and 4 are unmatched. --- Code case_when(x == 1 ~ "a", .unmatched = "error") Condition Error in `case_when()`: ! Each location must be matched. x Locations 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ..., 99, and 100 are unmatched. # invalid type errors are correct (#6261) (#6206) Code case_when(TRUE ~ 1, TRUE ~ "x") Condition Error in `case_when()`: ! Can't combine `..1 (right)` and `..2 (right)` . # `NULL` formula element throws meaningful error (#7794) Code case_when(NULL ~ NULL) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not `NULL`. Code case_when(TRUE ~ NULL) Condition Error in `case_when()`: ! `..1 (right)` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. Code case_when(NULL ~ TRUE) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not `NULL`. Code case_when(c(TRUE, TRUE) ~ NULL) Condition Error in `case_when()`: ! `..1 (right)` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. Code case_when(NULL ~ c(TRUE, TRUE)) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not `NULL`. Code case_when(TRUE ~ NULL, c(TRUE, TRUE) ~ NULL) Condition Error in `case_when()`: ! `..1 (right)` must be a vector, not `NULL`. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. Code case_when(NULL ~ TRUE, NULL ~ c(TRUE, TRUE)) Condition Error in `case_when()`: ! `..1 (left)` must be a logical vector, not `NULL`. --- Code case_when(c(TRUE, TRUE) ~ NULL, c(TRUE, TRUE, TRUE) ~ NULL) Condition Error in `case_when()`: ! Can't recycle `..1 (left)` (size 2) to match `..2 (left)` (size 3). Code case_when(NULL ~ c(TRUE, TRUE), NULL ~ c(TRUE, TRUE, TRUE)) Condition Error in `case_when()`: ! Can't recycle `..1 (right)` (size 2) to match `..2 (right)` (size 3). # throws chained errors when formula evaluation fails Code case_when(1 ~ 2, 3 ~ stop("oh no!")) Condition Error in `case_when()`: ! Failed to evaluate the right-hand side of formula 2. Caused by error: ! oh no! --- Code case_when(1 ~ 2, stop("oh no!") ~ 4) Condition Error in `case_when()`: ! Failed to evaluate the left-hand side of formula 2. Caused by error: ! oh no! # case_when() give meaningful errors Code (expect_error(case_when(c(TRUE, FALSE) ~ 1:3, c(FALSE, TRUE) ~ 1:2))) Output Error in `case_when()`: ! Can't recycle `..1 (right)` (size 3) to match `..2 (right)` (size 2). Code (expect_error(case_when(c(TRUE, FALSE) ~ 1, c(FALSE, TRUE, FALSE) ~ 2, c(FALSE, TRUE, FALSE, NA) ~ 3))) Output Error in `case_when()`: ! Can't recycle `..1 (left)` (size 2) to match `..2 (left)` (size 3). Code (expect_error(case_when(51:53 ~ 1:3))) Output Error in `case_when()`: ! `..1 (left)` must be a logical vector, not an integer vector. Code (expect_error(case_when(paste(50)))) Output Error in `case_when()`: ! Case 1 (`paste(50)`) must be a two-sided formula, not the string "50". Code (expect_error(case_when(y ~ x, paste(50)))) Output Error in `case_when()`: ! Case 2 (`paste(50)`) must be a two-sided formula, not the string "50". Code (expect_error(case_when())) Output Error in `case_when()`: ! `...` can't be empty. Code (expect_error(case_when(NULL))) Output Error in `case_when()`: ! `...` can't be empty. Code (expect_error(case_when(~ 1:2))) Output Error in `case_when()`: ! Case 1 (`~1:2`) must be a two-sided formula, not a one-sided formula. # Using scalar LHS with vector RHS is deprecated (#7082) Code x <- 1:5 y <- 6:10 code <- 1L sex <- "M" expect_identical(case_when(code == 1L && sex == "M" ~ x, code == 1L && sex == "F" ~ y, code == 1L && sex == "M" ~ x + 1L, .default = 0L), x) Condition Warning: Calling `case_when()` with size 1 LHS inputs and size >1 RHS inputs was deprecated in dplyr 1.2.0. i This `case_when()` statement can result in subtle silent bugs and is very inefficient. Please use a series of if statements instead: ``` # Previously case_when(scalar_lhs1 ~ rhs1, scalar_lhs2 ~ rhs2, .default = default) # Now if (scalar_lhs1) { rhs1 } else if (scalar_lhs2) { rhs2 } else { default } ``` --- Code x <- 1 case_when(x == 1 ~ "a", x == 2 ~ character(), .default = "other") Condition Warning: Calling `case_when()` with size 1 LHS inputs and size >1 RHS inputs was deprecated in dplyr 1.2.0. i This `case_when()` statement can result in subtle silent bugs and is very inefficient. Please use a series of if statements instead: ``` # Previously case_when(scalar_lhs1 ~ rhs1, scalar_lhs2 ~ rhs2, .default = default) # Now if (scalar_lhs1) { rhs1 } else if (scalar_lhs2) { rhs2 } else { default } ``` Output character(0) # replace_when() allows vector RHS of the same size as `x` Code replace_when(x, x == 1 ~ 1:3) Condition Error in `replace_when()`: ! Can't recycle `..1 (right)` (size 3) to size 6. # replace_when() does not recycle LHS values Code replace_when(x, TRUE ~ 0) Condition Error in `replace_when()`: ! `..1 (left)` must have size 3, not size 1. --- Code replace_when(x, c(TRUE, TRUE, TRUE) ~ 0, NULL, TRUE ~ 0) Condition Error in `replace_when()`: ! `..3 (left)` must have size 3, not size 1. # replace_when() retains the type of `x` Code replace_when(x, x == "a" ~ "d") Condition Error in `replace_when()`: ! Can't convert from `..1 (right)` to > due to loss of generality. * Locations: 1 --- Code replace_when(x, x == "a" ~ "b", NULL, x == "b" ~ "d") Condition Error in `replace_when()`: ! Can't convert from `..3 (right)` to > due to loss of generality. * Locations: 1 # replace_when() does not allow named `...` Code replace_when(1, foo = TRUE ~ 2) Condition Error in `replace_when()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * foo = TRUE ~ 2 dplyr/tests/testthat/_snaps/relocate.md0000644000176200001440000000030014416000535020000 0ustar liggesusers# can only supply one of .before and .after Code relocate(df, .before = 1, .after = 1) Condition Error in `relocate()`: ! Can't supply both `.before` and `.after`. dplyr/tests/testthat/_snaps/nth-value.md0000644000176200001440000000321615106134104020113 0ustar liggesusers# `na_rm` is validated Code nth(1, 1, na_rm = 1) Condition Error in `nth()`: ! `na_rm` must be `TRUE` or `FALSE`, not the number 1. --- Code nth(1, 1, na_rm = c(TRUE, FALSE)) Condition Error in `nth()`: ! `na_rm` must be `TRUE` or `FALSE`, not a logical vector. # `default` must be size 1 (when not used with lists) Code nth(1L, n = 2L, default = 1:2) Condition Error in `nth()`: ! `default` must have size 1, not size 2. # `default` is cast to the type of `x` (when not used with lists) Code nth("x", 2, default = 2) Condition Error in `nth()`: ! Can't convert `default` to match type of `x` . # `n` is validated (#5466) Code nth(1:10, n = "x") Condition Error in `nth()`: ! Can't convert `n` to . --- Code nth(1:10, n = 1:2) Condition Error in `nth()`: ! `n` must have size 1, not size 2. --- Code nth(1:10, n = NA_integer_) Condition Error in `nth()`: ! `n` can't be `NA`. # `x` must be a vector Code nth(environment(), 1L) Condition Error in `vec_size()`: ! `x` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # `order_by` must be the same size as `x` Code nth(1:5, n = 1L, order_by = 1:2) Condition Error in `nth()`: ! `order_by` must have size 5, not size 2. --- Code nth(1:5, n = 6L, order_by = 1:2) Condition Error in `nth()`: ! `order_by` must have size 5, not size 2. dplyr/tests/testthat/_snaps/mutate.md0000644000176200001440000002361615106134104017515 0ustar liggesusers# mutate() supports constants (#6056, #6305) Code (expect_error(mutate(df, z = !!z))) Output Error in `mutate()`: i In argument: `z = `. Caused by error: ! Inlined constant `z` must be size 10 or 1, not 5. Code (expect_error(mutate(group_by(df, g), z = !!z))) Output Error in `mutate()`: i In argument: `z = `. Caused by error: ! Inlined constant `z` must be size 10 or 1, not 5. Code (expect_error(mutate(rowwise(df), z = !!z))) Output Error in `mutate()`: i In argument: `z = `. Caused by error: ! Inlined constant `z` must be size 10 or 1, not 5. --- Code (expect_error(mutate(group_by(df, g), y = .env$y))) Output Error in `mutate()`: i In argument: `y = .env$y`. i In group 1: `g = 1`. Caused by error: ! `y` must be size 5 or 1, not 10. Code (expect_error(mutate(rowwise(df), y = .env$y))) Output Error in `mutate()`: i In argument: `y = .env$y`. i In row 1. Caused by error: ! `y` must be size 1, not 10. i Did you mean: `y = list(.env$y)` ? # can't overwrite column active bindings (#6666) Code mutate(df, y = { x <<- 2 x }) Condition Error in `mutate()`: i In argument: `y = { ... }`. Caused by error: ! unused argument (base::quote(2)) --- Code mutate(df, .by = g, y = { x <<- 2 x }) Condition Error in `mutate()`: i In argument: `y = { ... }`. i In group 1: `g = 1`. Caused by error: ! unused argument (base::quote(2)) --- Code mutate(gdf, y = { x <<- 2 x }) Condition Error in `mutate()`: i In argument: `y = { ... }`. i In group 1: `g = 1`. Caused by error: ! unused argument (base::quote(2)) # can't share local variables across expressions (#6666) Code mutate(df, x2 = { foo <- x x }, y2 = { foo }) Condition Error in `mutate()`: i In argument: `y2 = { ... }`. Caused by error: ! object 'foo' not found # rowwise mutate un-lists existing size-1 list-columns (#6302) Code mutate(df, y = x) Condition Error in `mutate()`: i In argument: `y = x`. i In row 2. Caused by error: ! `y` must be size 1, not 2. i Did you mean: `y = list(x)` ? # catches `.by` with grouped-df Code mutate(gdf, .by = x) Condition Error in `mutate()`: ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df Code mutate(rdf, .by = x) Condition Error in `mutate()`: ! Can't supply `.by` when `.data` is a rowwise data frame. # mutate() deals with 0 groups (#5534) Code mutate(df, y = max(x)) Condition Warning: There was 1 warning in `mutate()`. i In argument: `y = max(x)`. Caused by warning in `max()`: ! no non-missing arguments to max; returning -Inf Output # A tibble: 0 x 2 # Groups: x [0] # i 2 variables: x , y # mutate() give meaningful errors Code tbl <- tibble(x = 1:2, y = 1:2) (expect_error(mutate(tbl, y = NULL, a = sum(y)))) Output Error in `mutate()`: i In argument: `a = sum(y)`. Caused by error: ! object 'y' not found Code (expect_error(mutate(group_by(tbl, x), y = NULL, a = sum(y)))) Output Error in `mutate()`: i In argument: `a = sum(y)`. i In group 1: `x = 1`. Caused by error: ! object 'y' not found Code (expect_error(mutate(tibble(x = 1), y = mean))) Output Error in `mutate()`: i In argument: `y = mean`. Caused by error: ! `y` must be a vector, not a function. Code df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5) (expect_error(mutate(df, out = env(a = 1)))) Output Error in `mutate()`: i In argument: `out = env(a = 1)`. Caused by error: ! `out` must be a vector, not an environment. Code (expect_error(mutate(group_by(df, g), out = env(a = 1)))) Output Error in `mutate()`: i In argument: `out = env(a = 1)`. i In group 1: `g = 1`. Caused by error: ! `out` must be a vector, not an environment. Code (expect_error(mutate(rowwise(df), out = rnorm))) Output Error in `mutate()`: i In argument: `out = rnorm`. i In row 1. Caused by error: ! `out` must be a vector, not a function. i Did you mean: `out = list(rnorm)` ? Code (expect_error(mutate(group_by(data.frame(x = rep(1:5, each = 3)), x), val = ifelse( x < 3, "foo", 2)))) Output Error in `mutate()`: i In argument: `val = ifelse(x < 3, "foo", 2)`. Caused by error: ! `val` must return compatible vectors across groups. i Result of type for group 1: `x = 1`. i Result of type for group 3: `x = 3`. Code (expect_error(mutate(group_by(tibble(a = 1:3, b = 4:6), a), if (a == 1) NULL else "foo"))) Output Error in `mutate()`: i In argument: `if (a == 1) NULL else "foo"`. i In group 1: `a = 1`. Caused by error: ! `if (a == 1) NULL else "foo"` must return compatible vectors across groups. x Can't combine NULL and non NULL results. Code (expect_error(mutate(group_by(tibble(a = 1:3, b = 4:6), a), if (a == 2) NULL else "foo"))) Output Error in `mutate()`: i In argument: `if (a == 2) NULL else "foo"`. i In group 2: `a = 2`. Caused by error: ! `if (a == 2) NULL else "foo"` must return compatible vectors across groups. x Can't combine NULL and non NULL results. Code (expect_error(mutate(data.frame(x = c(2, 2, 3, 3)), int = 1:5))) Output Error in `mutate()`: i In argument: `int = 1:5`. Caused by error: ! `int` must be size 4 or 1, not 5. Code (expect_error(mutate(group_by(data.frame(x = c(2, 2, 3, 3)), x), int = 1:5))) Output Error in `mutate()`: i In argument: `int = 1:5`. i In group 1: `x = 2`. Caused by error: ! `int` must be size 2 or 1, not 5. Code (expect_error(mutate(group_by(data.frame(x = c(2, 3, 3)), x), int = 1:5))) Output Error in `mutate()`: i In argument: `int = 1:5`. i In group 1: `x = 2`. Caused by error: ! `int` must be size 1, not 5. Code (expect_error(mutate(rowwise(data.frame(x = c(2, 2, 3, 3))), int = 1:5))) Output Error in `mutate()`: i In argument: `int = 1:5`. i In row 1. Caused by error: ! `int` must be size 1, not 5. i Did you mean: `int = list(1:5)` ? Code (expect_error(mutate(rowwise(tibble(y = list(1:3, "a"))), y2 = y))) Output Error in `mutate()`: i In argument: `y2 = y`. i In row 1. Caused by error: ! `y2` must be size 1, not 3. i Did you mean: `y2 = list(y)` ? Code (expect_error(mutate(data.frame(x = 1:10), y = 11:20, y = 1:2))) Output Error in `mutate()`: i In argument: `y = 1:2`. Caused by error: ! `y` must be size 10 or 1, not 2. Code (expect_error(mutate(tibble(a = 1), c = .data$b))) Output Error in `mutate()`: i In argument: `c = .data$b`. Caused by error in `.data$b`: ! Column `b` not found in `.data`. Code (expect_error(mutate(group_by(tibble(a = 1:3), a), c = .data$b))) Output Error in `mutate()`: i In argument: `c = .data$b`. i In group 1: `a = 1`. Caused by error in `.data$b`: ! Column `b` not found in `.data`. Code lazy <- (function(x) list(enquo(x))) res <- mutate(rowwise(tbl), z = lazy(x), .keep = "unused") (expect_error(eval_tidy(res$z[[1]]))) Output Error: ! Obsolete data mask. x Too late to resolve `x` after the end of `dplyr::mutate()`. i Did you save an object that uses `x` lazily in a column in the `dplyr::mutate()` expression ? Code (expect_error(mutate(tibble(), stop("{")))) Output Error in `mutate()`: i In argument: `stop("{")`. Caused by error: ! { # mutate() errors refer to expressions if not named Code (expect_error(mutate(mtcars, 1:3))) Output Error in `mutate()`: i In argument: `1:3`. Caused by error: ! `1:3` must be size 32 or 1, not 3. Code (expect_error(mutate(group_by(mtcars, cyl), 1:3))) Output Error in `mutate()`: i In argument: `1:3`. i In group 1: `cyl = 4`. Caused by error: ! `1:3` must be size 11 or 1, not 3. # `mutate()` doesn't allow data frames with missing or empty names (#6758) Code mutate(df1) Condition Error in `mutate()`: ! Can't transform a data frame with `NA` or `""` names. --- Code mutate(df2) Condition Error in `mutate()`: ! Can't transform a data frame with missing names. dplyr/tests/testthat/_snaps/group-split.md0000644000176200001440000000224515106134104020476 0ustar liggesusers# group_split.grouped_df() warns about `...` Code out <- group_split(group_by(mtcars, cyl), cyl) Condition Warning: Calling `group_split()` on a ignores `...`. Please use `group_by(..., .add = TRUE) |> group_split()`. # group_split.rowwise_df() warns about `...` Code out <- group_split(rowwise(mtcars), cyl) Condition Warning: Calling `group_split()` on a ignores `...`. Please use `as_tibble() |> group_split(...)`. # `keep =` is defunct Code group_split(df, keep = TRUE) Condition Error: ! The `keep` argument of `group_split()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. --- Code group_split(gdf, keep = TRUE) Condition Error: ! The `keep` argument of `group_split()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. --- Code group_split(rdf, keep = TRUE) Condition Error: ! The `keep` argument of `group_split()` was deprecated in dplyr 1.0.0 and is now defunct. i Please use the `.keep` argument instead. dplyr/tests/testthat/_snaps/rename.md0000644000176200001440000000103514416000535017457 0ustar liggesusers# `.fn` result type is checked (#6561) Code rename_with(df, fn) Condition Error in `rename_with()`: ! `.fn` must return a character vector, not an integer. # `.fn` result size is checked (#6561) Code rename_with(df, fn) Condition Error in `rename_with()`: ! `.fn` must return a vector of length 2, not 3. # can't rename in `.cols` Code rename_with(df, toupper, .cols = c(y = x)) Condition Error in `rename_with()`: ! Can't rename variables in this context. dplyr/tests/testthat/_snaps/rank.md0000644000176200001440000000103614416000534017143 0ustar liggesusers# ntile() validates `n` Code ntile(1, n = 1.5) Condition Error in `ntile()`: ! `n` must be a whole number, not the number 1.5. --- Code ntile(1, n = c(1, 2)) Condition Error in `ntile()`: ! `n` must be a whole number, not a double vector. --- Code ntile(1, n = NA_real_) Condition Error in `ntile()`: ! `n` must be a whole number, not a numeric `NA`. --- Code ntile(1, n = 0) Condition Error in `ntile()`: ! `n` must be positive. dplyr/tests/testthat/_snaps/join.md0000644000176200001440000001623214416000530017147 0ustar liggesusers# can't use `keep = FALSE` with non-equi conditions (#6499) Code left_join(df1, df2, join_by(overlaps(xl, xu, yl, yu)), keep = FALSE) Condition Error in `left_join()`: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. --- Code full_join(df1, df2, join_by(overlaps(xl, xu, yl, yu)), keep = FALSE) Condition Error in `full_join()`: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. # join_mutate() validates arguments Code join_mutate(df, df, by = 1, type = "left") Condition Error: ! `by` must be a (named) character vector, list, `join_by()` result, or NULL, not the number 1. Code join_mutate(df, df, by = "x", type = "left", suffix = 1) Condition Error: ! `suffix` must be a character vector of length 2, not the number 1 of length 1. Code join_mutate(df, df, by = "x", type = "left", na_matches = "foo") Condition Error: ! `na_matches` must be one of "na" or "never", not "foo". Code join_mutate(df, df, by = "x", type = "left", keep = 1) Condition Error: ! `keep` must be `TRUE`, `FALSE`, or `NULL`, not the number 1. # join_filter() validates arguments Code join_filter(df, df, by = 1, type = "semi") Condition Error: ! `by` must be a (named) character vector, list, `join_by()` result, or NULL, not the number 1. Code join_filter(df, df, by = "x", type = "semi", na_matches = "foo") Condition Error: ! `na_matches` must be one of "na" or "never", not "foo". # mutating joins trigger many-to-many warning Code out <- left_join(df, df, join_by(x)) Condition Warning in `left_join()`: Detected an unexpected many-to-many relationship between `x` and `y`. i Row 1 of `x` matches multiple rows in `y`. i Row 1 of `y` matches multiple rows in `x`. i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. # mutating joins compute common columns Code out <- left_join(df1, df2) Message Joining with `by = join_by(x)` # filtering joins compute common columns Code out <- semi_join(df1, df2) Message Joining with `by = join_by(x)` # mutating joins reference original column in `y` when there are type errors (#6465) Code (expect_error(left_join(x, y, by = join_by(a == b)))) Output Error in `left_join()`: ! Can't join `x$a` with `y$b` due to incompatible types. i `x$a` is a . i `y$b` is a . # filtering joins reference original column in `y` when there are type errors (#6465) Code (expect_error(semi_join(x, y, by = join_by(a == b)))) Output Error in `semi_join()`: ! Can't join `x$a` with `y$b` due to incompatible types. i `x$a` is a . i `y$b` is a . # error if passed additional arguments Code inner_join(df1, df2, on = "a") Condition Error in `inner_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code left_join(df1, df2, on = "a") Condition Error in `left_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code right_join(df1, df2, on = "a") Condition Error in `right_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code full_join(df1, df2, on = "a") Condition Error in `full_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code nest_join(df1, df2, on = "a") Condition Error in `nest_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code anti_join(df1, df2, on = "a") Condition Error in `anti_join()`: ! `...` must be empty. x Problematic argument: * on = "a" Code semi_join(df1, df2, on = "a") Condition Error in `semi_join()`: ! `...` must be empty. x Problematic argument: * on = "a" # nest_join computes common columns Code out <- nest_join(df1, df2) Message Joining with `by = join_by(x)` # nest_join references original column in `y` when there are type errors (#6465) Code (expect_error(nest_join(x, y, by = join_by(a == b)))) Output Error in `nest_join()`: ! Can't join `x$a` with `y$b` due to incompatible types. i `x$a` is a . i `y$b` is a . # validates inputs Code nest_join(df1, df2, by = 1) Condition Error in `nest_join()`: ! `by` must be a (named) character vector, list, `join_by()` result, or NULL, not the number 1. Code nest_join(df1, df2, keep = 1) Condition Error in `nest_join()`: ! `keep` must be `TRUE`, `FALSE`, or `NULL`, not the number 1. Code nest_join(df1, df2, name = 1) Condition Error in `nest_join()`: ! `name` must be a single string, not the number 1. Code nest_join(df1, df2, na_matches = 1) Condition Error in `nest_join()`: ! `na_matches` must be a string or character vector. # `by = character()` technically respects `unmatched` Code left_join(df1, df2, by = character(), unmatched = "error") Condition Error in `left_join()`: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. # `by = character()` technically respects `relationship` Code left_join(df, df, by = character(), relationship = "many-to-one") Condition Error in `left_join()`: ! Each row in `x` must match at most 1 row in `y`. i Row 1 of `x` matches multiple rows in `y`. # `by = character()` for a cross join is deprecated (#6604) Code out <- left_join(df1, df2, by = character()) Condition Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0. i Please use `cross_join()` instead. --- Code out <- semi_join(df1, df2, by = character()) Condition Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0. i Please use `cross_join()` instead. --- Code out <- nest_join(df1, df2, by = character()) Condition Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0. i Please use `cross_join()` instead. # `by = named character()` for a cross join works Code out <- left_join(df1, df2, by = by) Condition Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0. i Please use `cross_join()` instead. # `by = list(x = character(), y = character())` for a cross join is deprecated (#6604) Code out <- left_join(df1, df2, by = list(x = character(), y = character())) Condition Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0. i Please use `cross_join()` instead. dplyr/tests/testthat/_snaps/recode.md0000644000176200001440000000172015106134104017447 0ustar liggesusers# recode() gives meaningful error messages Code (expect_error(recode(factor("a"), a = 5, .missing = 10))) Output Error in `recode()`: ! `.missing` is not supported for factors. Code (expect_error(recode("a", b = 5, "c"))) Output Error in `recode()`: ! Argument 3 must be named. Code (expect_error(recode(factor("a"), b = 5, "c"))) Output Error in `recode()`: ! Argument 3 must be named. Code (expect_error(recode(1:5))) Output Error in `recode()`: ! No replacements provided. Code (expect_error(recode("a"))) Output Error in `recode()`: ! No replacements provided. Code (expect_error(recode(factor("a")))) Output Error in `recode()`: ! No replacements provided. dplyr/tests/testthat/_snaps/bind-rows.md0000644000176200001440000000263014416000507020115 0ustar liggesusers# bind_rows() only flattens S3 lists that inherit from list (#3924) Code bind_rows(lst1) Condition Error in `bind_rows()`: ! Argument 1 must be a data frame or a named atomic vector. # bind_rows() validates lists (#5417) Code bind_rows(list(x = 1), list(x = 1:3, y = 1:2)) Condition Error in `vctrs::data_frame()`: ! Can't recycle `x` (size 3) to match `y` (size 2). # bind_rows() give informative errors Code # invalid .id df1 <- tibble(x = 1:3) df2 <- tibble(x = 4:6) (expect_error(bind_rows(df1, df2, .id = 5))) Output Error in `bind_rows()`: ! `.id` must be a single string, not the number 5. Code # invalid type ll <- list(tibble(a = 1:5), env(a = 1)) (expect_error(bind_rows(ll))) Output Error in `bind_rows()`: ! Argument 2 must be a data frame or a named atomic vector. Code df1 <- tibble(a = factor("a")) df2 <- tibble(a = 1L) (expect_error(bind_rows(df1, df2))) Output Error in `bind_rows()`: ! Can't combine `..1$a` > and `..2$a` . Code # unnamed vectors (expect_error(bind_rows(1:2))) Output Error in `bind_rows()`: ! Argument 1 must be a data frame or a named atomic vector. dplyr/tests/testthat/_snaps/filter.md0000644000176200001440000002321115137161765017513 0ustar liggesusers# filter() and filter_out() allow matrices with 1 column with a deprecation warning (#6091) Code out <- filter(df, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. --- Code out <- filter_out(df, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. --- Code out <- filter(gdf, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. --- Code out <- filter_out(gdf, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. # filter() and filter_out() disallow matrices with >1 column Code filter(df, matrix(TRUE, nrow = 3, ncol = 2)) Condition Error in `filter()`: i In argument: `matrix(TRUE, nrow = 3, ncol = 2)`. Caused by error: ! `..1` must be a logical vector, not a logical matrix. --- Code filter_out(df, matrix(TRUE, nrow = 3, ncol = 2)) Condition Error in `filter_out()`: i In argument: `matrix(TRUE, nrow = 3, ncol = 2)`. Caused by error: ! `..1` must be a logical vector, not a logical matrix. # filter() and filter_out() disallow arrays with >2 dimensions Code filter(df, array(TRUE, dim = c(3, 1, 1))) Condition Error in `filter()`: i In argument: `array(TRUE, dim = c(3, 1, 1))`. Caused by error: ! `..1` must be a logical vector, not a logical array. --- Code filter_out(df, array(TRUE, dim = c(3, 1, 1))) Condition Error in `filter_out()`: i In argument: `array(TRUE, dim = c(3, 1, 1))`. Caused by error: ! `..1` must be a logical vector, not a logical array. # filter() gives useful error messages Code (expect_error(filter(group_by(iris, Species), 1:n()))) Output Error in `filter()`: i In argument: `1:n()`. i In group 1: `Species = setosa`. Caused by error: ! `..1` must be a logical vector, not an integer vector. Code (expect_error(filter(iris, 1:n()))) Output Error in `filter()`: i In argument: `1:n()`. Caused by error: ! `..1` must be a logical vector, not an integer vector. Code (expect_error(filter(data.frame(x = 1:2), matrix(c(TRUE, FALSE, TRUE, FALSE), nrow = 2)))) Output Error in `filter()`: i In argument: `matrix(c(TRUE, FALSE, TRUE, FALSE), nrow = 2)`. Caused by error: ! `..1` must be a logical vector, not a logical matrix. Code (expect_error(filter(group_by(iris, Species), c(TRUE, FALSE)))) Output Error in `filter()`: i In argument: `c(TRUE, FALSE)`. i In group 1: `Species = setosa`. Caused by error: ! `..1` must be of size 50 or 1, not size 2. Code (expect_error(filter(rowwise(iris, Species), c(TRUE, FALSE)))) Output Error in `filter()`: i In argument: `c(TRUE, FALSE)`. i In row 1. Caused by error: ! `..1` must be of size 1, not size 2. Code (expect_error(filter(iris, c(TRUE, FALSE)))) Output Error in `filter()`: i In argument: `c(TRUE, FALSE)`. Caused by error: ! `..1` must be of size 150 or 1, not size 2. Code (expect_error(filter(mtcars, `_x`))) Output Error in `filter()`: i In argument: `_x`. Caused by error: ! object '_x' not found Code (expect_error(filter(group_by(mtcars, cyl), `_x`))) Output Error in `filter()`: i In argument: `_x`. i In group 1: `cyl = 4`. Caused by error: ! object '_x' not found Code (expect_error(filter(mtcars, x = 1))) Output Error in `filter()`: ! We detected a named input. i This usually means that you've used `=` instead of `==`. i Did you mean `x == 1`? Code (expect_error(filter(mtcars, y > 2, z = 3))) Output Error in `filter()`: ! We detected a named input. i This usually means that you've used `=` instead of `==`. i Did you mean `z == 3`? Code (expect_error(filter(mtcars, TRUE, x = 1))) Output Error in `filter()`: ! We detected a named input. i This usually means that you've used `=` instead of `==`. i Did you mean `x == 1`? Code (expect_error(filter(ts(1:10)))) Output Error in `filter()`: ! Incompatible data source. x `.data` is a object, not a data source. i Did you want to use `stats::filter()`? Code (expect_error(filter(tibble(), stop("{")))) Output Error in `filter()`: i In argument: `stop("{")`. Caused by error: ! { # Using data frames in `filter()` is defunct (#7758) Code filter(df, across(everything(), ~ .x > 0)) Condition Error in `filter()`: i In argument: `across(everything(), ~.x > 0)`. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. --- Code filter(gdf, across(everything(), ~ .x > 0)) Condition Error in `filter()`: i In argument: `across(everything(), ~.x > 0)`. i In group 1: `x = 1`. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. --- Code filter(rdf, across(everything(), ~ .x > 0)) Condition Error in `filter()`: i In argument: `across(everything(), ~.x > 0)`. i In row 1. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. --- Code filter(df, tibble(x > 0, y > 0)) Condition Error in `filter()`: i In argument: `tibble(x > 0, y > 0)`. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. --- Code filter(gdf, tibble(x > 0, y > 0)) Condition Error in `filter()`: i In argument: `tibble(x > 0, y > 0)`. i In group 1: `x = 1`. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. --- Code filter(rdf, tibble(x > 0, y > 0)) Condition Error in `filter()`: i In argument: `tibble(x > 0, y > 0)`. i In row 1. Caused by error: ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. # `filter()` doesn't allow data frames with missing or empty names (#6758) Code filter(df1) Condition Error in `filter()`: ! Can't transform a data frame with `NA` or `""` names. --- Code filter_out(df1) Condition Error in `filter_out()`: ! Can't transform a data frame with `NA` or `""` names. --- Code filter(df2) Condition Error in `filter()`: ! Can't transform a data frame with missing names. --- Code filter_out(df2) Condition Error in `filter_out()`: ! Can't transform a data frame with missing names. # can't use `.by` with `.preserve` Code filter(df, .by = x, .preserve = TRUE) Condition Error in `filter()`: ! Can't supply both `.by` and `.preserve`. --- Code filter_out(df, .by = x, .preserve = TRUE) Condition Error in `filter_out()`: ! Can't supply both `.by` and `.preserve`. # catches `.by` with grouped-df Code filter(gdf, .by = x) Condition Error in `filter()`: ! Can't supply `.by` when `.data` is a grouped data frame. --- Code filter_out(gdf, .by = x) Condition Error in `filter_out()`: ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df Code filter(rdf, .by = x) Condition Error in `filter()`: ! Can't supply `.by` when `.data` is a rowwise data frame. --- Code filter_out(rdf, .by = x) Condition Error in `filter_out()`: ! Can't supply `.by` when `.data` is a rowwise data frame. # catches `by` typo (#6647) Code filter(df, by = x) Condition Error in `filter()`: ! Can't specify an argument named `by` in this verb. i Did you mean to use `.by` instead? --- Code filter_out(df, by = x) Condition Error in `filter_out()`: ! Can't specify an argument named `by` in this verb. i Did you mean to use `.by` instead? dplyr/tests/testthat/_snaps/funs.md0000644000176200001440000000200115106134104017152 0ustar liggesusers# takes the common type between all inputs (#6478) Code between("1", 2, 3) Condition Error in `between()`: ! Can't combine `x` and `left` . --- Code between(1, "2", 3) Condition Error in `between()`: ! Can't combine `x` and `left` . --- Code between(1, 2, "3") Condition Error in `between()`: ! Can't combine `x` and `right` . # recycles `left` and `right` to the size of `x` Code between(1:3, 1:2, 1L) Condition Error in `between()`: ! Can't recycle `left` (size 2) to size 3. --- Code between(1:3, 1L, 1:2) Condition Error in `between()`: ! Can't recycle `right` (size 2) to size 3. # ptype argument affects type casting Code between(x, 1.5, 3.5, ptype = integer()) Condition Error in `between()`: ! Can't convert from `left` to due to loss of precision. * Locations: 1 dplyr/tests/testthat/_snaps/join-by.md0000644000176200001440000002351715016155021017566 0ustar liggesusers# joining by nothing is an error Code join_by() Condition Error in `join_by()`: ! Must supply at least one expression. i If you want a cross join, use `cross_join()`. # nicely catches required missing arguments when wrapped Code fn(a) Condition Error: ! Expressions using `==` can't contain missing arguments. x Argument `y` is missing. # allows for namespaced helpers (#6838) Code join_by(dplyr::between(x, left, right)) Output Join By: - dplyr::between(x, left, right) --- Code join_by(dplyr::within(xl, xu, yl, yu)) Output Join By: - dplyr::within(xl, xu, yl, yu) --- Code join_by(dplyr::overlaps(xl, xu, yl, yu)) Output Join By: - dplyr::overlaps(xl, xu, yl, yu) --- Code join_by(dplyr::closest(x < y)) Output Join By: - dplyr::closest(x < y) # has an informative print method Code join_by(a, b) Output Join By: - a - b --- Code join_by("a", "b") Output Join By: - "a" - "b" --- Code join_by(a == a, b >= c) Output Join By: - a == a - b >= c --- Code join_by(a == a, b >= "c") Output Join By: - a == a - b >= "c" --- Code join_by(a == a, closest(b >= c), closest(d < e)) Output Join By: - a == a - closest(b >= c) - closest(d < e) # has informative error messages Code join_by(a = b) Condition Error in `join_by()`: ! Can't name join expressions. i Did you use `=` instead of `==`? --- Code join_by(NULL) Condition Error in `join_by()`: ! Expressions can't be empty. x Expression 1 is empty. --- Code join_by(foo(x > y)) Condition Error in `join_by()`: ! Expressions must use one of: `==`, `>=`, `>`, `<=`, `<`, `closest()`, `between()`, `overlaps()`, or `within()`. i Expression 1 is `foo(x > y)`. --- Code join_by(x == y, x^y) Condition Error in `join_by()`: ! Expressions must use one of: `==`, `>=`, `>`, `<=`, `<`, `closest()`, `between()`, `overlaps()`, or `within()`. i Expression 2 is `x^y`. --- Code join_by(x + 1 == y) Condition Error in `join_by()`: ! Expressions can't contain computed columns, and can only reference columns by name or by explicitly specifying a side, like `x$col` or `y$col`. i Expression 1 contains `x + 1`. --- Code join_by(x == y + 1) Condition Error in `join_by()`: ! Expressions can't contain computed columns, and can only reference columns by name or by explicitly specifying a side, like `x$col` or `y$col`. i Expression 1 contains `y + 1`. --- Code join_by(1) Condition Error in `join_by()`: ! Each element of `...` must be a single column name or a join by expression. x Element 1 is not a name and not an expression. --- Code join_by(1()) Condition Error in `join_by()`: ! Expressions must use one of: `==`, `>=`, `>`, `<=`, `<`, `closest()`, `between()`, `overlaps()`, or `within()`. i Expression 1 is `1()`. --- Code join_by(dplyrr::between(x, left, right)) Condition Error in `join_by()`: ! Expressions can only be namespace prefixed with `dplyr::`. i Expression 1 is `dplyrr::between(x, left, right)`. --- Code join_by(x$a) Condition Error in `join_by()`: ! Can't use `$` when specifying a single column name. i Expression 1 is `x$a`. --- Code join_by(z$a == y$b) Condition Error in `join_by()`: ! The left-hand side of a `$` expression must be either `x$` or `y$`. i Expression 1 contains `z$a`. --- Code join_by(x$a == z$b) Condition Error in `join_by()`: ! The left-hand side of a `$` expression must be either `x$` or `y$`. i Expression 1 contains `z$b`. --- Code join_by((x + 1)$y == b) Condition Error in `join_by()`: ! The left-hand side of a `$` expression must be a symbol or string. i Expression 1 contains `(x + 1)$y`. --- Code join_by(x$a == x$b) Condition Error in `join_by()`: ! The left and right-hand sides of a binary expression must reference different tables. i Expression 1 contains `x$a == x$b`. --- Code join_by(y$a == b) Condition Error in `join_by()`: ! The left and right-hand sides of a binary expression must reference different tables. i Expression 1 contains `y$a == b`. --- Code join_by(between(x$a, x$a, x$b)) Condition Error in `join_by()`: ! Expressions containing `between()` can't all reference the same table. i Expression 1 is `between(x$a, x$a, x$b)`. --- Code join_by(within(x$a, x$b, x$a, x$b)) Condition Error in `join_by()`: ! Expressions containing `within()` can't all reference the same table. i Expression 1 is `within(x$a, x$b, x$a, x$b)`. --- Code join_by(overlaps(a, b, x$a, x$b)) Condition Error in `join_by()`: ! Expressions containing `overlaps()` can't all reference the same table. i Expression 1 is `overlaps(a, b, x$a, x$b)`. --- Code join_by(closest(x$a >= x$b)) Condition Error in `join_by()`: ! The left and right-hand sides of a binary expression must reference different tables. i Expression 1 contains `x$a >= x$b`. --- Code join_by(between(a, x$a, y$b)) Condition Error in `join_by()`: ! Expressions containing `between()` must reference the same table for the lower and upper bounds. i Expression 1 is `between(a, x$a, y$b)`. --- Code join_by(within(x$a, y$b, y$a, y$b)) Condition Error in `join_by()`: ! Expressions containing `within()` must reference the same table for the left-hand side lower and upper bounds. i Expression 1 is `within(x$a, y$b, y$a, y$b)`. --- Code join_by(overlaps(x$a, x$b, y$a, x$b)) Condition Error in `join_by()`: ! Expressions containing `overlaps()` must reference the same table for the right-hand side lower and upper bounds. i Expression 1 is `overlaps(x$a, x$b, y$a, x$b)`. --- Code join_by(`>`(x)) Condition Error: ! Expressions using `>` can't contain missing arguments. x Argument `y` is missing. --- Code join_by(between(x)) Condition Error: ! Expressions using `between()` can't contain missing arguments. x Argument `y_lower` is missing. --- Code join_by(within(x)) Condition Error: ! Expressions using `within()` can't contain missing arguments. x Argument `x_upper` is missing. --- Code join_by(overlaps(x)) Condition Error: ! Expressions using `overlaps()` can't contain missing arguments. x Argument `x_upper` is missing. --- Code join_by(closest()) Condition Error: ! Expressions using `closest()` can't contain missing arguments. x Argument `expr` is missing. --- Code join_by(`$`(x) > y) Condition Error: ! Expressions using `$` can't contain missing arguments. x Argument `name` is missing. --- Code join_by(closest(a >= b, 1)) Condition Error in `closest()`: ! unused argument (1) --- Code join_by(closest(a == b)) Condition Error in `join_by()`: ! The expression used in `closest()` can't use `==`. i Expression 1 is `closest(a == b)`. --- Code join_by(closest(x)) Condition Error in `join_by()`: ! The first argument of `closest()` must be an expression. i Expression 1 is `closest(x)`. --- Code join_by(closest(1)) Condition Error in `join_by()`: ! The first argument of `closest()` must be an expression. i Expression 1 is `closest(1)`. --- Code join_by(closest(x + y)) Condition Error in `join_by()`: ! The expression used in `closest()` must use one of: `>=`, `>`, `<=`, or `<`. i Expression 1 is `closest(x + y)`. --- Code join_by(between(x, lower, upper, bounds = 1)) Condition Error: ! `bounds` must be a string or character vector. --- Code join_by(between(x, lower, upper, bounds = "a")) Condition Error: ! `bounds` must be one of "[]", "[)", "(]", or "()", not "a". --- Code join_by(overlaps(x, y, lower, upper, bounds = 1)) Condition Error: ! `bounds` must be a string or character vector. --- Code join_by(overlaps(x, y, lower, upper, bounds = "a")) Condition Error: ! `bounds` must be one of "[]", "[)", "(]", or "()", not "a". --- Code join_by(between(x, lower, upper, foo = 1)) Condition Error: ! `...` must be empty. i Non-empty dots were detected inside `between()`. --- Code join_by(overlaps(x, y, lower, upper, foo = 1)) Condition Error: ! `...` must be empty. i Non-empty dots were detected inside `overlaps()`. # as_join_by() emits useful errors Code as_join_by(FALSE) Condition Error: ! `by` must be a (named) character vector, list, `join_by()` result, or NULL, not `FALSE`. # join_by_common() emits useful information Code by <- join_by_common(c("x", "y"), c("x", "y")) Message Joining with `by = join_by(x, y)` --- Code by <- join_by_common(c("_x", "foo bar"), c("_x", "foo bar")) Message Joining with `by = join_by(`_x`, `foo bar`)` --- Code join_by_common(c("x", "y"), c("w", "z")) Condition Error: ! `by` must be supplied when `x` and `y` have no common variables. i Use `cross_join()` to perform a cross-join. dplyr/tests/testthat/_snaps/join-cols.md0000644000176200001440000000652014416000525020110 0ustar liggesusers# can't mix non-equi conditions with `keep = FALSE` (#6499) Code join_cols(c("x", "y"), c("x", "z"), by = join_by(x, y > z), keep = FALSE) Condition Error: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. --- Code join_cols(c("xl", "xu"), c("yl", "yu"), by = join_by(xl >= yl, xu < yu), keep = FALSE) Condition Error: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. --- Code join_cols("x", c("yl", "yu"), by = join_by(between(x, yl, yu)), keep = FALSE) Condition Error: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. --- Code join_cols(c("xl", "xu"), c("yl", "yu"), by = join_by(overlaps(xl, xu, yl, yu)), keep = FALSE) Condition Error: ! Can't set `keep = FALSE` when using an inequality, rolling, or overlap join. # can't duplicate key between equi condition and non-equi condition Code join_cols("x", c("xl", "xu"), by = join_by(x > xl, x == xu)) Condition Error: ! Join columns in `x` must be unique. x Problem with `x`. --- Code join_cols(c("xl", "xu"), "x", by = join_by(xl < x, xu == x)) Condition Error: ! Join columns in `y` must be unique. x Problem with `x`. # emits useful messages Code join_cols(c("x", "y"), c("y", "y"), join_by(y)) Condition Error: ! Input columns in `y` must be unique. x Problem with `y`. --- Code join_cols(c("y", "y"), c("x", "y"), join_by(y)) Condition Error: ! Input columns in `x` must be unique. x Problem with `y`. --- Code join_cols(xy, xy, by = as_join_by(list("1", y = "2"))) Condition Error in `as_join_by()`: ! `by$x` must evaluate to a character vector. --- Code join_cols(xy, xy, by = as_join_by(list(x = "1", "2"))) Condition Error in `as_join_by()`: ! `by$y` must evaluate to a character vector. --- Code join_cols(xy, xy, by = as_join_by(c("x", NA))) Condition Error: ! Join columns in `x` can't be `NA`. x Problem at position 2. --- Code join_cols(xy, xy, by = as_join_by(c("aaa", "bbb"))) Condition Error: ! Join columns in `x` must be present in the data. x Problem with `aaa` and `bbb`. --- Code join_cols(xy, xy, by = as_join_by(c("x", "x", "x"))) Condition Error: ! Join columns in `x` must be unique. x Problem with `x`. --- Code join_cols(xyz, xyz, by = join_by(x, x > y, z)) Condition Error: ! Join columns in `x` must be unique. x Problem with `x`. --- Code join_cols(xy, xy, by = join_by(x), suffix = "x") Condition Error: ! `suffix` must be a character vector of length 2, not the string "x" of length 1. --- Code join_cols(xy, xy, by = join_by(x), suffix = c("", NA)) Condition Error: ! `suffix` can't be `NA`. # references original column in `y` when there are type errors (#6465) Code (expect_error(join_cast_common(x_key, y_key, vars))) Output Error: ! Can't join `x$a` with `y$b` due to incompatible types. i `x$a` is a . i `y$b` is a . dplyr/tests/testthat/_snaps/na-if.md0000644000176200001440000000210215106134104017173 0ustar liggesusers# is type stable on `x` Code na_if(0L, 1.5) Condition Error in `na_if()`: ! Can't convert from `y` to `x` due to loss of precision. * Locations: 1 # is size stable on `x` Code na_if(1, integer()) Condition Error in `na_if()`: ! Can't recycle `y` (size 0) to size 1. --- Code na_if(1, c(1, 2)) Condition Error in `na_if()`: ! Can't recycle `y` (size 2) to size 1. --- Code na_if(c(1, 2, 3), c(1, 2)) Condition Error in `na_if()`: ! Can't recycle `y` (size 2) to size 3. # requires vector types for `x` and `y` Code na_if(environment(), 1) Condition Error in `na_if()`: ! `x` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code na_if(1, environment()) Condition Error in `na_if()`: ! `y` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. dplyr/tests/testthat/_snaps/select-helpers.md0000644000176200001440000000035515106134104021130 0ustar liggesusers# group_cols(vars =) is defunct Code group_cols("a") Condition Error: ! The `vars` argument of `group_cols()` was deprecated in dplyr 1.0.0 and is now defunct. i Use `data` with entire dataframe instead dplyr/tests/testthat/_snaps/bind-cols.md0000644000176200001440000000151614416000507020065 0ustar liggesusers# bind_cols() repairs names Code bound <- bind_cols(df, df) Message New names: * `a` -> `a...1` * `b` -> `b...2` * `a` -> `a...3` * `b` -> `b...4` # bind_cols() handles unnamed list with name repair (#3402) Code df <- bind_cols(list(1, 2)) Message New names: * `` -> `...1` * `` -> `...2` # bind_cols() gives informative errors Code # # incompatible size (expect_error(bind_cols(a = 1:2, mtcars))) Output Error in `bind_cols()`: ! Can't recycle `a` (size 2) to match `..2` (size 32). Code (expect_error(bind_cols(mtcars, a = 1:3))) Output Error in `bind_cols()`: ! Can't recycle `..1` (size 32) to match `a` (size 3). dplyr/tests/testthat/_snaps/group-nest.md0000644000176200001440000000070515106134104020313 0ustar liggesusers# group_nest.grouped_df() warns about `...` Code group_nest(group_by(mtcars, cyl), cyl) Condition Warning: Calling `group_nest()` on a ignores `...`. Please use `group_by(..., .add = TRUE) |> group_nest()`. Output # A tibble: 3 x 2 cyl data > 1 4 [11 x 10] 2 6 [7 x 10] 3 8 [14 x 10] dplyr/tests/testthat/_snaps/deprec-context.md0000644000176200001440000000236315106134104021136 0ustar liggesusers# cur_data() is deprecated Code mutate(df, y = cur_data()) Condition Warning: There was 1 warning in `mutate()`. i In argument: `y = cur_data()`. Caused by warning: ! `cur_data()` was deprecated in dplyr 1.1.0. i Please use `pick()` instead. Output # A tibble: 1 x 2 x y$x 1 1 1 # cur_data_all() is deprecated Code mutate(df, y = cur_data_all()) Condition Warning: There was 1 warning in `mutate()`. i In argument: `y = cur_data_all()`. Caused by warning: ! `cur_data_all()` was deprecated in dplyr 1.1.0. i Please use `pick()` instead. Output # A tibble: 1 x 2 x y$x 1 1 1 # give useful error messages when not applicable Code (expect_error(cur_data())) Output Error in `cur_data()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code (expect_error(cur_data_all())) Output Error in `cur_data_all()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. dplyr/tests/testthat/_snaps/recode-values.md0000644000176200001440000001425315106134104020751 0ustar liggesusers# when `from` is a list, `to` must recycle to the same size as that list Code recode_values(1, from = list(1, 2, 3), to = c(1, 2)) Condition Error in `recode_values()`: ! Can't recycle `to` (size 2) to size 3. # `NA` is considered unmatched unless handled explicitly Code recode_values(x, from = table$from, to = table$to, unmatched = "error") Condition Error in `recode_values()`: ! Each location must be matched. x Location 4 is unmatched. # `x` must be a vector Code recode_values(x, 1 ~ 1) Condition Error in `recode_values()`: ! `x` must be a vector, not a object. x Detected incompatible scalar S3 list. To be treated as a vector, the object must explicitly inherit from or should implement a `vec_proxy()` method. Class: . i If this object comes from a package, please report this error to the package author. i Read our FAQ about creating vector types (`?vctrs::howto_faq_fix_scalar_type_error`) to learn more. --- Code replace_values(x, 1 ~ 1) Condition Error in `replace_values()`: ! `x` must be a vector, not a object. x Detected incompatible scalar S3 list. To be treated as a vector, the object must explicitly inherit from or should implement a `vec_proxy()` method. Class: . i If this object comes from a package, please report this error to the package author. i Read our FAQ about creating vector types (`?vctrs::howto_faq_fix_scalar_type_error`) to learn more. # respects `ptype` Code recode_values(1, 1 ~ 0L, ptype = character()) Condition Error in `recode_values()`: ! Can't convert `..1 (right)` to . --- Code recode_values(1, 1 ~ "x", NULL, 2 ~ 0L, ptype = character()) Condition Error in `recode_values()`: ! Can't convert `..3 (right)` to . --- Code recode_values(1, from = 1, to = 0L, ptype = character()) Condition Error in `recode_values()`: ! Can't convert `to` to . --- Code recode_values(1, from = 1, to = "x", default = 0L, ptype = character()) Condition Error in `recode_values()`: ! Can't convert `default` to . # `replace_values()` is type stable on `x` Code replace_values(x, "c" ~ "b") Condition Error in `replace_values()`: ! Can't convert from `..1 (left)` to > due to loss of generality. * Locations: 1 --- Code replace_values(x, from = "c", to = "b") Condition Error in `replace_values()`: ! Can't convert from `from` to `x` > due to loss of generality. * Locations: 1 --- Code replace_values(x, "a" ~ "c") Condition Error in `replace_values()`: ! Can't convert from `..1 (right)` to > due to loss of generality. * Locations: 1 --- Code replace_values(x, from = "a", to = "c") Condition Error in `replace_values()`: ! Can't convert from `to` to > due to loss of generality. * Locations: 1 --- Code replace_values(x, "a" ~ "b", NULL, "b" ~ "c") Condition Error in `replace_values()`: ! Can't convert from `..3 (right)` to > due to loss of generality. * Locations: 1 # `default` is part of `ptype` determination Code recode_values(1, from = 1, to = 0L, default = "x") Condition Error in `recode_values()`: ! Can't combine `to` and `default` . # `default` has its size checked Code recode_values(1:3, 1 ~ 0, default = 1:5) Condition Error in `recode_values()`: ! Can't recycle `default` (size 5) to size 3. # treats list `from` and `to` as lists of vectors Code recode_values(x, from = from, to = to) Condition Error in `recode_values()`: ! Can't convert `from[[1]]` to . # `...` must be unnamed Code recode_values(1, foo = 1 ~ 2) Condition Error in `recode_values()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * foo = 1 ~ 2 --- Code replace_values(1, foo = 1 ~ 2) Condition Error in `replace_values()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * foo = 1 ~ 2 # `...` must contain two sided formulas Code recode_values(1, 1 ~ 1, 2) Condition Error in `recode_values()`: ! Case 2 (`2`) must be a two-sided formula, not the number 2. --- Code replace_values(1, 1 ~ 1, 2) Condition Error in `replace_values()`: ! Case 2 (`2`) must be a two-sided formula, not the number 2. --- Code recode_values(1, 1 ~ 1, ~2) Condition Error in `recode_values()`: ! Case 2 (`~2`) must be a two-sided formula, not a one-sided formula. --- Code replace_values(1, 1 ~ 1, ~2) Condition Error in `replace_values()`: ! Case 2 (`~2`) must be a two-sided formula, not a one-sided formula. # throws correct errors based on all combinations of `...` and `from` and `to` Code recode_values(1) Condition Error in `recode_values()`: ! `...` can't be empty. --- Code recode_values(1, 1 ~ 2, from = 1) Condition Error in `recode_values()`: ! Can't supply both `from` and `...`. --- Code replace_values(1, 1 ~ 2, from = 1) Condition Error in `replace_values()`: ! Can't supply both `from` and `...`. --- Code recode_values(1, from = 1) Condition Error in `recode_values()`: ! Must supply both `from` and `to`. --- Code replace_values(1, from = 1) Condition Error in `replace_values()`: ! Must supply both `from` and `to`. --- Code recode_values(1, to = 1) Condition Error in `recode_values()`: ! Must supply both `from` and `to`. --- Code replace_values(1, to = 1) Condition Error in `replace_values()`: ! Must supply both `from` and `to`. dplyr/tests/testthat/_snaps/pick.md0000644000176200001440000001530015137161765017154 0ustar liggesusers# with `rowwise()` data, leaves list-cols unwrapped (#5951, #6264) Code mutate(rdf, z = pick(x, y)) Condition Error in `mutate()`: i In argument: `z = pick(x, y)`. i In row 2. Caused by error: ! `z` must be size 1, not 2. i Did you mean: `z = list(pick(x, y))` ? --- Code mutate(rdf, z = pick_wrapper(x, y)) Condition Error in `mutate()`: i In argument: `z = pick_wrapper(x, y)`. i In row 2. Caused by error: ! `z` must be size 1, not 2. i Did you mean: `z = list(pick_wrapper(x, y))` ? # can't explicitly select grouping columns (#5460) Code mutate(gdf, y = pick(g)) Condition Error in `mutate()`: i In argument: `y = pick(g)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `g` doesn't exist. --- Code mutate(gdf, y = pick_wrapper(g)) Condition Error in `mutate()`: i In argument: `y = pick_wrapper(g)`. i In group 1: `g = 1`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `g` doesn't exist. # must supply at least one selector to `pick()` Code mutate(df, y = pick()) Condition Error in `mutate()`: i In argument: `y = pick()`. Caused by error in `pick()`: ! Must supply at least one input to `pick()`. --- Code mutate(df, y = pick_wrapper()) Condition Error in `mutate()`: i In argument: `y = pick_wrapper()`. Caused by error in `pick()`: ! Must supply at least one input to `pick()`. # the tidyselection and column extraction are evaluated on the current data Code mutate(gdf, x = NULL, y = pick(x)) Condition Error in `mutate()`: i In argument: `y = pick(x)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `x` doesn't exist. --- Code mutate(gdf, x = NULL, y = pick_wrapper(x)) Condition Error in `mutate()`: i In argument: `y = pick_wrapper(x)`. i In group 1: `g = 1`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `x` doesn't exist. # can call `pick()` from a user defined function Code mutate(gdf, d = my_pick()) Condition Error in `mutate()`: i In argument: `d = my_pick()`. i In group 1: `a = 1`. Caused by error in `pick()`: i In argument: `all_of(x)`. Caused by error in `all_of()`: ! Can't subset elements that don't exist. x Element `a` doesn't exist. --- Code mutate(gdf, d = my_pick(y)) Condition Error in `mutate()`: i In argument: `d = my_pick(y)`. i In group 1: `a = 1`. Caused by error in `pick()`: i In argument: `all_of(x)`. Caused by error in `all_of()`: ! Can't subset elements that don't exist. x Element `a` doesn't exist. # errors correctly outside mutate context Code pick() Condition Error in `pick()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. --- Code pick(a, b) Condition Error in `pick()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. # when expansion occurs, error labels use the pre-expansion quosure Code mutate(df, if (cur_group_id() == 1L) pick(x) else "x", .by = g) Condition Error in `mutate()`: i In argument: `if (cur_group_id() == 1L) pick(x) else "x"`. Caused by error: ! `if (cur_group_id() == 1L) pick(x) else "x"` must return compatible vectors across groups. i Result of type > for group 1: `g = 1`. i Result of type for group 2: `g = 2`. # doesn't allow renaming Code mutate(data.frame(x = 1), pick(y = x)) Condition Error in `mutate()`: i In argument: `pick(y = x)`. Caused by error in `pick()`: ! Can't rename variables in this context. --- Code mutate(data.frame(x = 1), pick_wrapper(y = x)) Condition Error in `mutate()`: i In argument: `pick_wrapper(y = x)`. Caused by error in `pick()`: ! Can't rename variables in this context. # `pick()` errors in `arrange()` are useful Code arrange(df, pick(y)) Condition Error in `arrange()`: i In argument: `..1 = pick(y)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `y` doesn't exist. --- Code arrange(df, foo(pick(x))) Condition Error in `arrange()`: i In argument: `..1 = foo(pick(x))`. Caused by error in `foo()`: ! could not find function "foo" # `filter()` / `filter_out()` with `pick()` that uses invalid tidy-selection errors Code filter(df, pick(x, a)) Condition Error in `filter()`: i In argument: `pick(x, a)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `a` doesn't exist. --- Code filter(df, pick_wrapper(x, a)) Condition Error in `filter()`: i In argument: `pick_wrapper(x, a)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `a` doesn't exist. --- Code filter_out(df, pick(x, a)) Condition Error in `filter_out()`: i In argument: `pick(x, a)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `a` doesn't exist. --- Code filter_out(df, pick_wrapper(x, a)) Condition Error in `filter_out()`: i In argument: `pick_wrapper(x, a)`. Caused by error in `pick()`: ! Can't select columns that don't exist. x Column `a` doesn't exist. # `filter()` / `filter_out()` that doesn't use `pick()` result correctly errors Code filter(df, pick(x, y)$x) Condition Error in `filter()`: i In argument: `pick(x, y)$x`. Caused by error: ! `..1` must be a logical vector, not a double vector. --- Code filter(df, pick_wrapper(x, y)$x) Condition Error in `filter()`: i In argument: `pick_wrapper(x, y)$x`. Caused by error: ! `..1` must be a logical vector, not a double vector. --- Code filter_out(df, pick(x, y)$x) Condition Error in `filter_out()`: i In argument: `pick(x, y)$x`. Caused by error: ! `..1` must be a logical vector, not a double vector. --- Code filter_out(df, pick_wrapper(x, y)$x) Condition Error in `filter_out()`: i In argument: `pick_wrapper(x, y)$x`. Caused by error: ! `..1` must be a logical vector, not a double vector. dplyr/tests/testthat/_snaps/desc.md0000644000176200001440000000035215106134104017124 0ustar liggesusers# errors cleanly on non-vectors Code desc(mean) Condition Error in `desc()`: ! `x` must be a vector, not a function. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. dplyr/tests/testthat/_snaps/across.md0000644000176200001440000006447215137161765017536 0ustar liggesusers# across(.unpack =) errors if the unpacked data frame has non-unique names Code mutate(df, across(x:y, fn, .unpack = "{outer}")) Condition Error in `mutate()`: i In argument: `across(x:y, fn, .unpack = "{outer}")`. Caused by error in `across()`: ! Names must be unique. x These names are duplicated: * "x" at locations 1 and 2. * "y" at locations 3 and 4. # `.unpack` is validated Code summarise(df, across(x, mean, .unpack = 1)) Condition Error in `summarise()`: i In argument: `across(x, mean, .unpack = 1)`. Caused by error in `across()`: ! `.unpack` must be `TRUE`, `FALSE`, or a single string, not the number 1. --- Code summarise(df, across(x, mean, .unpack = c("x", "y"))) Condition Error in `summarise()`: i In argument: `across(x, mean, .unpack = c("x", "y"))`. Caused by error in `across()`: ! `.unpack` must be `TRUE`, `FALSE`, or a single string, not a character vector. --- Code summarise(df, across(x, mean, .unpack = NA)) Condition Error in `summarise()`: i In argument: `across(x, mean, .unpack = NA)`. Caused by error in `across()`: ! `.unpack` must be `TRUE`, `FALSE`, or a single string, not `NA`. # across() throws meaningful error with failure during expansion (#6534) Code summarise(df, across(everything(), fn())) Condition Error in `summarise()`: i In argument: `across(everything(), fn())`. Caused by error in `fn()`: ! oh no! --- Code summarise(df, across(everything(), fn()), .by = g) Condition Error in `summarise()`: i In argument: `across(everything(), fn())`. Caused by error in `fn()`: ! oh no! --- Code summarise(gdf, across(everything(), fn())) Condition Error in `summarise()`: i In argument: `across(everything(), fn())`. Caused by error in `fn()`: ! oh no! # across() gives meaningful messages Code (expect_error(summarise(tibble(x = 1), across(where(is.numeric), 42)))) Output Error in `summarise()`: i In argument: `across(where(is.numeric), 42)`. Caused by error in `across()`: ! `.fns` must be a function, a formula, or a list of functions/formulas. Code (expect_error(summarise(tibble(x = 1), across(y, mean)))) Output Error in `summarise()`: i In argument: `across(y, mean)`. Caused by error in `across()`: ! Can't select columns that don't exist. x Column `y` doesn't exist. Code (expect_error(summarise(tibble(x = 1), res = across(where(is.numeric), 42)))) Output Error in `summarise()`: i In argument: `res = across(where(is.numeric), 42)`. Caused by error in `across()`: ! `.fns` must be a function, a formula, or a list of functions/formulas. Code (expect_error(summarise(tibble(x = 1), z = across(y, mean)))) Output Error in `summarise()`: i In argument: `z = across(y, mean)`. Caused by error in `across()`: ! Can't select columns that don't exist. x Column `y` doesn't exist. Code (expect_error(summarise(tibble(x = 1), res = sum(if_any(where(is.numeric), 42)))) ) Output Error in `summarise()`: i In argument: `res = sum(if_any(where(is.numeric), 42))`. Caused by error in `if_any()`: ! `.fns` must be a function, a formula, or a list of functions/formulas. Code (expect_error(summarise(tibble(x = 1), res = sum(if_all(~ mean(.x)))))) Output Error in `summarise()`: i In argument: `res = sum(if_all(~mean(.x)))`. Caused by error in `if_all()`: ! Must supply a column selection. i You most likely meant: `if_all(everything(), ~mean(.x))`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. Code (expect_error(summarise(tibble(x = 1), res = sum(if_any(~ mean(.x)))))) Output Error in `summarise()`: i In argument: `res = sum(if_any(~mean(.x)))`. Caused by error in `if_any()`: ! Must supply a column selection. i You most likely meant: `if_any(everything(), ~mean(.x))`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. Code (expect_error(across())) Output Error in `across()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code (expect_error(c_across())) Output Error in `c_across()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code error_fn <- (function(.) { if (all(. > 10)) { rlang::abort("too small", call = call("error_fn")) } else { 42 } }) (expect_error(summarise(tibble(x = 1:10, y = 11:20), across(everything(), error_fn)))) Output Error in `summarise()`: i In argument: `across(everything(), error_fn)`. Caused by error in `across()`: ! Can't compute column `y`. Caused by error in `error_fn()`: ! too small Code (expect_error(mutate(tibble(x = 1:10, y = 11:20), across(everything(), error_fn))) ) Output Error in `mutate()`: i In argument: `across(everything(), error_fn)`. Caused by error in `across()`: ! Can't compute column `y`. Caused by error in `error_fn()`: ! too small Code (expect_error(summarise(tibble(x = 1:10, y = 11:20), force(across(everything(), error_fn))))) Output Error in `summarise()`: i In argument: `force(across(everything(), error_fn))`. Caused by error in `across()`: ! Can't compute column `y`. Caused by error in `error_fn()`: ! too small Code (expect_error(mutate(tibble(x = 1:10, y = 11:20), force(across(everything(), error_fn))))) Output Error in `mutate()`: i In argument: `force(across(everything(), error_fn))`. Caused by error in `across()`: ! Can't compute column `y`. Caused by error in `error_fn()`: ! too small Code (expect_error(summarise(tibble(x = 1), across(everything(), list(f = mean, f = mean)))) ) Output Error in `summarise()`: i In argument: `across(everything(), list(f = mean, f = mean))`. Caused by error in `across()`: ! Names must be unique. x These names are duplicated: * "x_f" at locations 1 and 2. # if_any() and if_all() aborts when predicate mistakingly used in .cols= (#5732) Code (expect_error(filter(df, if_any(~ .x > 5)))) Output Error in `filter()`: i In argument: `if_any(~.x > 5)`. Caused by error in `if_any()`: ! Must supply a column selection. i You most likely meant: `if_any(everything(), ~.x > 5)`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. Code (expect_error(filter(df, if_all(~ .x > 5)))) Output Error in `filter()`: i In argument: `if_all(~.x > 5)`. Caused by error in `if_all()`: ! Must supply a column selection. i You most likely meant: `if_all(everything(), ~.x > 5)`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. Code (expect_error(filter(df, !if_any(~ .x > 5)))) Output Error in `filter()`: i In argument: `!if_any(~.x > 5)`. Caused by error in `if_any()`: ! Must supply a column selection. i You most likely meant: `if_any(everything(), ~.x > 5)`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. Code (expect_error(filter(df, !if_all(~ .x > 5)))) Output Error in `filter()`: i In argument: `!if_all(~.x > 5)`. Caused by error in `if_all()`: ! Must supply a column selection. i You most likely meant: `if_all(everything(), ~.x > 5)`. i The first argument `.cols` selects a set of columns. i The second argument `.fns` operates on each selected columns. # inlined and non inlined lambdas work Code (expect_error(mutate(df, across(1:2, ~ .y + mean(bar))))) Output Error in `mutate()`: i In argument: `across(1:2, ~.y + mean(bar))`. Caused by error in `across()`: ! Can't compute column `foo`. Caused by error: ! the ... list contains fewer than 2 elements Code (expect_error(mutate(df, (across(1:2, ~ .y + mean(bar)))))) Output Error in `mutate()`: i In argument: `(across(1:2, ~.y + mean(bar)))`. Caused by error in `across()`: ! Can't compute column `foo`. Caused by error in `fn()`: ! the ... list contains fewer than 2 elements # anonymous function `.fns` can access the `.data` pronoun even when not inlined Code mutate(df, across(y, fn)) Condition Error in `mutate()`: i In argument: `across(y, fn)`. Caused by error in `across()`: ! Can't compute column `y`. Caused by error: ! Can't subset `.data` outside of a data mask context. # `across()` recycle `.fns` results to common size Code mutate(df, across(c(x, y), fn)) Condition Error in `mutate()`: i In argument: `across(c(x, y), fn)`. Caused by error in `across()`: ! Can't compute column `x`. Caused by error in `dplyr_internal_error()`: --- Code mutate(df, (across(c(x, y), fn))) Condition Error in `mutate()`: i In argument: `(across(c(x, y), fn))`. Caused by error: ! `(across(c(x, y), fn))` must be size 3 or 1, not 2. # `if_any()` and `if_all()` have consistent behavior across `filter()` and `mutate()` Code filter(df, if_any(y)) Condition Error in `filter()`: i In argument: `if_any(y)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(y))) Condition Error in `filter()`: i In argument: `(if_any(y))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(y)) Condition Error in `mutate()`: i In argument: `a = if_any(y)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_any(y, identity)) Condition Error in `filter()`: i In argument: `if_any(y, identity)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(y, identity))) Condition Error in `filter()`: i In argument: `(if_any(y, identity))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(y, identity)) Condition Error in `mutate()`: i In argument: `a = if_any(y, identity)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(y)) Condition Error in `filter()`: i In argument: `if_all(y)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(y))) Condition Error in `filter()`: i In argument: `(if_all(y))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(y)) Condition Error in `mutate()`: i In argument: `a = if_all(y)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(y, identity)) Condition Error in `filter()`: i In argument: `if_all(y, identity)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(y, identity))) Condition Error in `filter()`: i In argument: `(if_all(y, identity))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(y, identity)) Condition Error in `mutate()`: i In argument: `a = if_all(y, identity)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_any(c(y, z))) Condition Error in `filter()`: i In argument: `if_any(c(y, z))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(c(y, z)))) Condition Error in `filter()`: i In argument: `(if_any(c(y, z)))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(c(y, z))) Condition Error in `mutate()`: i In argument: `a = if_any(c(y, z))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_any(c(y, z), identity)) Condition Error in `filter()`: i In argument: `if_any(c(y, z), identity)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(c(y, z), identity))) Condition Error in `filter()`: i In argument: `(if_any(c(y, z), identity))`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(c(y, z), identity)) Condition Error in `mutate()`: i In argument: `a = if_any(c(y, z), identity)`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(c(y, z))) Condition Error in `filter()`: i In argument: `if_all(c(y, z))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(c(y, z)))) Condition Error in `filter()`: i In argument: `(if_all(c(y, z)))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(c(y, z))) Condition Error in `mutate()`: i In argument: `a = if_all(c(y, z))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(c(y, z), identity)) Condition Error in `filter()`: i In argument: `if_all(c(y, z), identity)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(c(y, z), identity))) Condition Error in `filter()`: i In argument: `(if_all(c(y, z), identity))`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(c(y, z), identity)) Condition Error in `mutate()`: i In argument: `a = if_all(c(y, z), identity)`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_any(c(y, z)), .by = g) Condition Error in `filter()`: i In argument: `if_any(c(y, z))`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(c(y, z))), .by = g) Condition Error in `filter()`: i In argument: `(if_any(c(y, z)))`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(c(y, z)), .by = g) Condition Error in `mutate()`: i In argument: `a = if_any(c(y, z))`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_any(c(y, z), identity), .by = g) Condition Error in `filter()`: i In argument: `if_any(c(y, z), identity)`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_any(c(y, z), identity)), .by = g) Condition Error in `filter()`: i In argument: `(if_any(c(y, z), identity))`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_any(c(y, z), identity), .by = g) Condition Error in `mutate()`: i In argument: `a = if_any(c(y, z), identity)`. i In group 1: `g = "a"`. Caused by error in `if_any()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(c(y, z)), .by = g) Condition Error in `filter()`: i In argument: `if_all(c(y, z))`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(c(y, z))), .by = g) Condition Error in `filter()`: i In argument: `(if_all(c(y, z)))`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(c(y, z)), .by = g) Condition Error in `mutate()`: i In argument: `a = if_all(c(y, z))`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, if_all(c(y, z), identity), .by = g) Condition Error in `filter()`: i In argument: `if_all(c(y, z), identity)`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code filter(df, (if_all(c(y, z), identity)), .by = g) Condition Error in `filter()`: i In argument: `(if_all(c(y, z), identity))`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. --- Code mutate(df, a = if_all(c(y, z), identity), .by = g) Condition Error in `mutate()`: i In argument: `a = if_all(c(y, z), identity)`. i In group 1: `g = "a"`. Caused by error in `if_all()`: ! `y` must be a logical vector, not an integer vector. # `if_any()` and `if_all()` recycle `.fns` results to common size Code filter(df, if_any(c(x, y), fn)) Condition Error in `filter()`: i In argument: `if_any(c(x, y), fn)`. Caused by error: ! `..1` must be of size 3 or 1, not size 2. --- Code filter(df, (if_any(c(x, y), fn))) Condition Error in `filter()`: i In argument: `(if_any(c(x, y), fn))`. Caused by error: ! `..1` must be of size 3 or 1, not size 2. --- Code mutate(df, a = if_any(c(x, y), fn)) Condition Error in `mutate()`: i In argument: `a = if_any(c(x, y), fn)`. Caused by error: ! `a` must be size 3 or 1, not 2. --- Code filter(df, if_all(c(x, y), fn)) Condition Error in `filter()`: i In argument: `if_all(c(x, y), fn)`. Caused by error: ! `..1` must be of size 3 or 1, not size 2. --- Code filter(df, (if_all(c(x, y), fn))) Condition Error in `filter()`: i In argument: `(if_all(c(x, y), fn))`. Caused by error: ! `..1` must be of size 3 or 1, not size 2. --- Code mutate(df, a = if_all(c(x, y), fn)) Condition Error in `mutate()`: i In argument: `a = if_all(c(x, y), fn)`. Caused by error: ! `a` must be size 3 or 1, not 2. # can't rename during selection (#6522) Code mutate(df, z = c_across(c(y = x))) Condition Error in `mutate()`: i In argument: `z = c_across(c(y = x))`. Caused by error in `c_across()`: ! Can't rename variables in this context. # can't explicitly select grouping columns (#6522) Code mutate(gdf, y = c_across(g)) Condition Error in `mutate()`: i In argument: `y = c_across(g)`. i In group 1: `g = 1`. Caused by error in `c_across()`: ! Can't select columns that don't exist. x Column `g` doesn't exist. # across() applies old `.cols = everything()` default with a warning Code out <- mutate(df, across(.fns = times_two)) Condition Warning: There was 1 warning in `mutate()`. i In argument: `across(.fns = times_two)`. Caused by warning: ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- mutate(gdf, across(.fns = times_two)) Condition Warning: There was 1 warning in `mutate()`. i In argument: `across(.fns = times_two)`. Caused by warning: ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- mutate(df, (across(.fns = times_two))) Condition Warning: There was 1 warning in `mutate()`. i In argument: `(across(.fns = times_two))`. Caused by warning: ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- mutate(gdf, (across(.fns = times_two))) Condition Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `(across(.fns = times_two))`. i In group 1: `g = 1`. Caused by warning: ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. # if_any() and if_all() apply old `.cols = everything()` default with a warning Code out <- filter(df, if_any()) Condition Warning: Using `if_any()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(gdf, if_any()) Condition Warning: Using `if_any()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(df, if_all()) Condition Warning: Using `if_all()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(gdf, if_all()) Condition Warning: Using `if_all()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(df, (if_any())) Condition Warning: There was 1 warning in `filter()`. i In argument: `(if_any())`. Caused by warning: ! Using `if_any()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(gdf, (if_any())) Condition Warning: There were 2 warnings in `filter()`. The first warning was: i In argument: `(if_any())`. i In group 1: `g = 1`. Caused by warning: ! Using `if_any()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. --- Code out <- filter(df, (if_all())) Condition Warning: There was 1 warning in `filter()`. i In argument: `(if_all())`. Caused by warning: ! Using `if_all()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. --- Code out <- filter(gdf, (if_all())) Condition Warning: There were 2 warnings in `filter()`. The first warning was: i In argument: `(if_all())`. i In group 1: `g = 1`. Caused by warning: ! Using `if_all()` without supplying `.cols` was deprecated in dplyr 1.1.0. i Please supply `.cols` instead. i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. # c_across() applies old `cols = everything()` default with a warning Code out <- mutate(df, z = sum(c_across())) Condition Warning: There were 2 warnings in `mutate()`. The first warning was: i In argument: `z = sum(c_across())`. i In row 1. Caused by warning: ! Using `c_across()` without supplying `cols` was deprecated in dplyr 1.1.0. i Please supply `cols` instead. i Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning. # across errors with non-empty dots and no `.fns` supplied (#6638) Code mutate(df, across(x, .funs = ~ . * 1000)) Condition Error in `mutate()`: i In argument: `across(x, .funs = ~. * 1000)`. Caused by error in `across()`: ! `...` must be empty. x Problematic argument: * .funs = ~. * 1000 # across(...) is deprecated Code summarise(df, across(everything(), mean, na.rm = TRUE)) Condition Warning: There was 1 warning in `summarise()`. i In argument: `across(everything(), mean, na.rm = TRUE)`. Caused by warning: ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0. Supply arguments directly to `.fns` through an anonymous function instead. # Previously across(a:b, mean, na.rm = TRUE) # Now across(a:b, \(x) mean(x, na.rm = TRUE)) Output # A tibble: 1 x 1 x 1 1 dplyr/tests/testthat/_snaps/deprec-do.md0000644000176200001440000000152515106134104020053 0ustar liggesusers# do() gives meaningful error messages Code (expect_error(do(df, head, tail))) Output Error in `do()`: ! Can only supply one unnamed argument, not 2. Code (expect_error(do(ungroup(df), 1))) Output Error in `do()`: ! Result must be a data frame, not numeric. Code (expect_error(do(df, 1))) Output Error in `do()`: ! Results 1, 2, 3 must be data frames, not numeric. Code (expect_error(do(df, "a"))) Output Error in `do()`: ! Results 1, 2, 3 must be data frames, not character. Code (expect_error(do(df, x = 1, 2))) Output Error in `do()`: ! Arguments must either be all named or all unnamed. dplyr/tests/testthat/_snaps/sample.md0000644000176200001440000000504415106134104017472 0ustar liggesusers# sample_*() gives meaningful error messages Code df2 <- tibble(x = rep(1:2, 100), y = rep(c(0, 1), 100), g = rep(1:2, each = 100)) grp <- group_by(df2, g) (expect_error(sample_n(grp, nrow(df2) / 2, weight = y))) Output Error in `sample_n()`: ! Can't compute indices. i In group 1: `g = 1`. Caused by error in `sample.int()`: ! too few positive probabilities Code (expect_error(sample_frac(grp, 1, weight = y))) Output Error in `sample_frac()`: ! Can't compute indices. i In group 1: `g = 1`. Caused by error in `sample.int()`: ! too few positive probabilities Code (expect_error(sample_n(group_by(mtcars, cyl), 10))) Output Error in `sample_n()`: ! Can't compute indices. i In group 2: `cyl = 6`. Caused by error: ! `size` must be less than or equal to 7 (size of data). i set `replace = TRUE` to use sampling with replacement. Code (expect_error(sample_n(list()))) Output Error in `sample_n()`: ! `tbl` must be a data frame, not an empty list. Code (expect_error(sample_frac(list()))) Output Error in `sample_frac()`: ! `tbl` must be a data frame, not an empty list. Code # # respects weight df <- data.frame(x = 1:2, y = c(0, 1)) (expect_error(sample_n(df, 2, weight = y))) Output Error in `sample_n()`: ! Can't compute indices. Caused by error in `sample.int()`: ! too few positive probabilities Code (expect_error(sample_frac(df, 2))) Output Error in `sample_frac()`: ! Can't compute indices. Caused by error: ! `size` of sampled fraction must be less or equal to one. i set `replace = TRUE` to use sampling with replacement. Code (expect_error(sample_frac(group_by(df, y), 2))) Output Error in `sample_frac()`: ! Can't compute indices. i In group 1: `y = 0`. Caused by error: ! `size` of sampled fraction must be less or equal to one. i set `replace = TRUE` to use sampling with replacement. Code (expect_error(sample_frac(df, 1, weight = y))) Output Error in `sample_frac()`: ! Can't compute indices. Caused by error in `sample.int()`: ! too few positive probabilities dplyr/tests/testthat/_snaps/by.md0000644000176200001440000000126115106134104016620 0ustar liggesusers# throws tidyselect errors Code compute_by(by = y, data = df) Condition Error: ! Can't select columns that don't exist. x Column `y` doesn't exist. # can't set `.by` with a grouped-df Code compute_by(x, gdf) Condition Error: ! Can't supply `by` when `data` is a grouped data frame. # can't set `.by` with a rowwise-df Code compute_by(x, rdf) Condition Error: ! Can't supply `by` when `data` is a rowwise data frame. # can tweak the error args Code compute_by(x, gdf, by_arg = "x", data_arg = "dat") Condition Error: ! Can't supply `x` when `dat` is a grouped data frame. dplyr/tests/testthat/_snaps/consecutive-id.md0000644000176200001440000000116415106134104021131 0ustar liggesusers# follows recycling rules Code consecutive_id(1:3, 1:4) Condition Error in `consecutive_id()`: ! Can't recycle `..1` (size 3) to match `..2` (size 4). # generates useful errors Code consecutive_id(x = 1:4) Condition Error in `consecutive_id()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * x = 1:4 Code consecutive_id(mean) Condition Error in `consecutive_id()`: ! `..1` must be a vector, not a function. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. dplyr/tests/testthat/_snaps/sets.md0000644000176200001440000000653614472225345017213 0ustar liggesusers# extra arguments in ... error (#5891) Code intersect(df1, df2, z = 3) Condition Error in `intersect()`: ! `...` must be empty. x Problematic argument: * z = 3 Code union(df1, df2, z = 3) Condition Error in `union()`: ! `...` must be empty. x Problematic argument: * z = 3 Code union_all(df1, df2, z = 3) Condition Error in `union_all()`: ! `...` must be empty. x Problematic argument: * z = 3 Code setdiff(df1, df2, z = 3) Condition Error in `setdiff()`: ! `...` must be empty. x Problematic argument: * z = 3 Code symdiff(df1, df2, z = 3) Condition Error in `symdiff()`: ! `...` must be empty. x Problematic argument: * z = 3 # incompatible data frames error (#903) Code intersect(df1, df2) Condition Error in `intersect()`: ! `x` and `y` are not compatible. x Different number of columns: 1 vs 2. Code union(df1, df2) Condition Error in `union()`: ! `x` and `y` are not compatible. x Different number of columns: 1 vs 2. Code union_all(df1, df2) Condition Error in `union_all()`: ! `x` and `y` are not compatible. x Different number of columns: 1 vs 2. Code setdiff(df1, df2) Condition Error in `setdiff()`: ! `x` and `y` are not compatible. x Different number of columns: 1 vs 2. Code symdiff(df1, df2) Condition Error in `symdiff()`: ! `x` and `y` are not compatible. x Different number of columns: 1 vs 2. # is_compatible generates useful messages for different cases Code cat(is_compatible(tibble(x = 1), 1)) Output `y` must be a data frame. Code cat(is_compatible(tibble(x = 1), tibble(x = 1, y = 2))) Output Different number of columns: 1 vs 2. Code cat(is_compatible(tibble(x = 1, y = 1), tibble(y = 1, x = 1), ignore_col_order = FALSE)) Output Same column names, but different order. Code cat(is_compatible(tibble(x = 1), tibble(y = 1))) Output Cols in `y` but not `x`: `y`. Cols in `x` but not `y`: `x`. Code cat(is_compatible(tibble(x = 1), tibble(x = 1L), convert = FALSE)) Output Different types for column `x`: double vs integer. Code cat(is_compatible(tibble(x = 1), tibble(x = "a"))) Output Incompatible types for column `x`: double vs character. # setequal tibbles must have same rows and columns Code setequal(tibble(x = 1:2), tibble(y = 1:2)) Condition Error in `setequal()`: ! `x` and `y` are not compatible. x Cols in `y` but not `x`: `y`. x Cols in `x` but not `y`: `x`. --- Code setequal(tibble(x = 1:2), tibble(x = c("a", "b"))) Condition Error in `setequal()`: ! `x` and `y` are not compatible. x Incompatible types for column `x`: integer vs character. # setequal checks y is a data frame Code setequal(mtcars, 1) Condition Error in `setequal()`: ! `x` and `y` are not compatible. `y` must be a data frame. # setequal checks for extra arguments Code setequal(mtcars, mtcars, z = 2) Condition Error in `setequal()`: ! `...` must be empty. x Problematic argument: * z = 2 dplyr/tests/testthat/_snaps/lead-lag.md0000644000176200001440000000447615106134104017667 0ustar liggesusers# `lag()` gives informative error for objects Code lag(ts(1:10)) Condition Error in `lag()`: ! `x` must be a vector, not a , do you want `stats::lag()`? # `lead()` / `lag()` validate `n` Code lead(1:5, n = 1:2) Condition Error in `lead()`: ! `n` must be a whole number, not an integer vector. Code lead(1:5, -1) Condition Error in `lead()`: ! `n` must be positive. --- Code lag(1:5, n = 1:2) Condition Error in `lag()`: ! `n` must be a whole number, not an integer vector. Code lag(1:5, -1) Condition Error in `lag()`: ! `n` must be positive. # `lead()` / `lag()` check for empty dots Code lead(1:5, deault = 1) Condition Error in `lead()`: ! `...` must be empty. x Problematic argument: * deault = 1 --- Code lag(1:5, deault = 1) Condition Error in `lag()`: ! `...` must be empty. x Problematic argument: * deault = 1 # `lead()` / `lag()` require that `x` is a vector Code lead(environment()) Condition Error in `lead()`: ! `x` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. --- Code lag(environment()) Condition Error in `lag()`: ! `x` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # `default` is cast to the type of `x` (#6330) Code shift(1L, default = 1.5) Condition Error: ! Can't convert from `default` to `x` due to loss of precision. * Locations: 1 # `default` must be size 1 (#5641) Code shift(1:5, default = 1:2) Condition Error: ! `default` must have size 1, not size 2. --- Code shift(1:5, default = integer()) Condition Error: ! `default` must have size 1, not size 0. # `n` is validated Code shift(1, n = 1:2) Condition Error in `shift()`: ! `n` must be a whole number, not an integer vector. # `order_by` must be the same size as `x` Code shift(1:5, order_by = 1:4) Condition Error in `with_order()`: ! `order_by` must have size 5, not size 4. dplyr/tests/testthat/_snaps/join-rows.md0000644000176200001440000002331615106134104020142 0ustar liggesusers# `relationship` default behavior is correct Code out <- join_rows(c(1, 1), c(1, 1), condition = "==") Condition Warning: Detected an unexpected many-to-many relationship between `x` and `y`. i Row 1 of `x` matches multiple rows in `y`. i Row 1 of `y` matches multiple rows in `x`. i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. # join_rows() allows `unmatched` to be specified independently for inner joins Code join_rows(c(1, 3), c(1, 2), type = "inner", unmatched = c("drop", "error")) Condition Error: ! Each row of `y` must be matched by `x`. i Row 2 of `y` was not matched. # join_rows() expects incompatible type errors to have been handled by join_cast_common() Code (expect_error(join_rows(data.frame(x = 1), data.frame(x = factor("a"))))) Output Error: ! `join_cast_common()` should have handled this. i This is an internal error that was detected in the dplyr package. Please report it at with a reprex () and the full backtrace. # join_rows() gives meaningful one-to-one errors Code join_rows(1, c(1, 1), relationship = "one-to-one") Condition Error: ! Each row in `x` must match at most 1 row in `y`. i Row 1 of `x` matches multiple rows in `y`. --- Code join_rows(c(1, 1), 1, relationship = "one-to-one") Condition Error: ! Each row in `y` must match at most 1 row in `x`. i Row 1 of `y` matches multiple rows in `x`. # join_rows() gives meaningful one-to-many errors Code join_rows(c(1, 1), 1, relationship = "one-to-many") Condition Error: ! Each row in `y` must match at most 1 row in `x`. i Row 1 of `y` matches multiple rows in `x`. # join_rows() gives meaningful many-to-one errors Code join_rows(1, c(1, 1), relationship = "many-to-one") Condition Error: ! Each row in `x` must match at most 1 row in `y`. i Row 1 of `x` matches multiple rows in `y`. # join_rows() gives meaningful many-to-many warnings Code join_rows(c(1, 1), c(1, 1)) Condition Warning: Detected an unexpected many-to-many relationship between `x` and `y`. i Row 1 of `x` matches multiple rows in `y`. i Row 1 of `y` matches multiple rows in `x`. i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. Output $x [1] 1 1 2 2 $y [1] 1 2 1 2 --- Code left_join(df, df, by = join_by(x)) Condition Warning in `left_join()`: Detected an unexpected many-to-many relationship between `x` and `y`. i Row 1 of `x` matches multiple rows in `y`. i Row 1 of `y` matches multiple rows in `x`. i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. Output x 1 1 2 1 3 1 4 1 # join_rows() gives meaningful error message on unmatched rows Code join_rows(data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "left", unmatched = "error") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "nest", unmatched = "error") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = c(1, 2)), data.frame(x = c(3, 1)), type = "right", unmatched = "error") Condition Error: ! Each row of `x` must have a match in `y`. i Row 2 of `x` does not have a match. --- Code join_rows(data.frame(x = c(1, 2)), data.frame(x = 1), type = "inner", unmatched = "error") Condition Error: ! Each row of `x` must have a match in `y`. i Row 2 of `x` does not have a match. --- Code join_rows(data.frame(x = c(1, 2)), data.frame(x = 1), type = "inner", unmatched = c("error", "drop")) Condition Error: ! Each row of `x` must have a match in `y`. i Row 2 of `x` does not have a match. --- Code join_rows(data.frame(x = 1), data.frame(x = c(1, 2)), type = "inner", unmatched = "error") Condition Error: ! Each row of `y` must be matched by `x`. i Row 2 of `y` was not matched. --- Code join_rows(data.frame(x = 1), data.frame(x = c(1, 2)), type = "inner", unmatched = c("drop", "error")) Condition Error: ! Each row of `y` must be matched by `x`. i Row 2 of `y` was not matched. # join_rows() always errors on unmatched missing values Code join_rows(data.frame(x = 1), data.frame(x = NA), type = "left", unmatched = "error", na_matches = "na") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = NA), data.frame(x = NA), type = "left", unmatched = "error", na_matches = "never") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = 1), data.frame(x = NA), type = "nest", unmatched = "error", na_matches = "na") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = NA), data.frame(x = NA), type = "nest", unmatched = "error", na_matches = "never") Condition Error: ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. --- Code join_rows(data.frame(x = NA), data.frame(x = 1), type = "right", unmatched = "error", na_matches = "na") Condition Error: ! Each row of `x` must have a match in `y`. i Row 1 of `x` does not have a match. --- Code join_rows(data.frame(x = NA), data.frame(x = NA), type = "right", unmatched = "error", na_matches = "never") Condition Error: ! Each row of `x` must have a match in `y`. i Row 1 of `x` does not have a match. --- Code join_rows(data.frame(x = 1), data.frame(x = c(1, NA)), type = "inner", unmatched = "error", na_matches = "na") Condition Error: ! Each row of `y` must be matched by `x`. i Row 2 of `y` was not matched. --- Code join_rows(data.frame(x = 1), data.frame(x = c(1, NA)), type = "inner", unmatched = c("drop", "error"), na_matches = "na") Condition Error: ! Each row of `y` must be matched by `x`. i Row 2 of `y` was not matched. --- Code join_rows(data.frame(x = c(1, NA)), data.frame(x = 1), type = "inner", unmatched = "error", na_matches = "na") Condition Error: ! Each row of `x` must have a match in `y`. i Row 2 of `x` does not have a match. --- Code join_rows(data.frame(x = c(1, NA)), data.frame(x = 1), type = "inner", unmatched = c("error", "drop"), na_matches = "na") Condition Error: ! Each row of `x` must have a match in `y`. i Row 2 of `x` does not have a match. --- Code join_rows(data.frame(x = NA), data.frame(x = NA), type = "inner", unmatched = "error", na_matches = "never") Condition Error: ! Each row of `x` must have a match in `y`. i Row 1 of `x` does not have a match. # join_rows() validates `unmatched` Code join_rows(df, df, unmatched = 1) Condition Error: ! `unmatched` must be a character vector, not the number 1. Code join_rows(df, df, unmatched = "foo") Condition Error: ! `unmatched` must be one of "drop" or "error", not "foo". Code join_rows(df, df, type = "left", unmatched = character()) Condition Error: ! `unmatched` must be length 1, not 0. Code join_rows(df, df, type = "left", unmatched = c("drop", "error")) Condition Error: ! `unmatched` must be length 1, not 2. Code join_rows(df, df, type = "inner", unmatched = character()) Condition Error: ! `unmatched` must be length 1 or 2, not 0. Code join_rows(df, df, type = "inner", unmatched = c("drop", "error", "error")) Condition Error: ! `unmatched` must be length 1 or 2, not 3. Code join_rows(df, df, type = "inner", unmatched = c("drop", "dr")) Condition Error: ! `unmatched` must be one of "drop" or "error", not "dr". i Did you mean "drop"? # join_rows() validates `relationship` Code join_rows(df, df, relationship = 1) Condition Error: ! `relationship` must be a string or character vector. --- Code join_rows(df, df, relationship = "none") Condition Error: ! `relationship` must be one of "one-to-one", "one-to-many", "many-to-one", or "many-to-many", not "none". --- Code join_rows(df, df, relationship = "warn-many-to-many") Condition Error: ! `relationship` must be one of "one-to-one", "one-to-many", "many-to-one", or "many-to-many", not "warn-many-to-many". i Did you mean "many-to-many"? # join_rows() rethrows overflow error nicely (#6912) Code join_rows(df, df, condition = ">=") Condition Error: ! This join would result in more rows than dplyr can handle. i 50000005000000 rows would be returned. 2147483647 rows is the maximum number allowed. i Double check your join keys. This error commonly occurs due to a missing join key, or an improperly specified join condition. dplyr/tests/testthat/_snaps/when.md0000644000176200001440000000473315137161765017177 0ustar liggesusers# no recycling is performed! Code when_any(TRUE, c(TRUE, FALSE)) Condition Error in `when_any()`: ! `..2` must have size 1, not size 2. --- Code when_all(TRUE, c(TRUE, FALSE)) Condition Error in `when_all()`: ! `..2` must have size 1, not size 2. --- Code when_any(TRUE, size = 2) Condition Error in `when_any()`: ! `..1` must have size 2, not size 1. --- Code when_all(TRUE, size = 2) Condition Error in `when_all()`: ! `..1` must have size 2, not size 1. # inputs must be strictly logical vectors Code when_any(1) Condition Error in `when_any()`: ! `..1` must be a logical vector, not the number 1. --- Code when_all(1) Condition Error in `when_all()`: ! `..1` must be a logical vector, not the number 1. --- Code when_any(array(TRUE)) Condition Error in `when_any()`: ! `..1` must be a logical vector, not a logical 1D array. --- Code when_all(array(TRUE)) Condition Error in `when_all()`: ! `..1` must be a logical vector, not a logical 1D array. --- Code when_any(structure(TRUE, class = "foo")) Condition Error in `when_any()`: ! `..1` must be a logical vector, not a object. --- Code when_all(structure(TRUE, class = "foo")) Condition Error in `when_all()`: ! `..1` must be a logical vector, not a object. # `...` can't be named Code when_any(x = TRUE) Condition Error in `when_any()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * x = TRUE --- Code when_all(x = TRUE) Condition Error in `when_all()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * x = TRUE # `na_rm` is validated Code when_any(na_rm = "x") Condition Error in `when_any()`: ! `na_rm` must be `TRUE` or `FALSE`, not the string "x". --- Code when_all(na_rm = "x") Condition Error in `when_all()`: ! `na_rm` must be `TRUE` or `FALSE`, not the string "x". # `size` is validated Code when_any(size = "x") Condition Error in `vec_pany()`: ! `.size` must be a scalar integer or double. --- Code when_all(size = "x") Condition Error in `vec_pall()`: ! `.size` must be a scalar integer or double. dplyr/tests/testthat/_snaps/transmute.md0000644000176200001440000000113514416000544020233 0ustar liggesusers# transmute() error messages Code (expect_error(transmute(mtcars, cyl2 = cyl, .keep = "all"))) Output Error in `transmute()`: ! The `.keep` argument is not supported. Code (expect_error(transmute(mtcars, cyl2 = cyl, .before = disp))) Output Error in `transmute()`: ! The `.before` argument is not supported. Code (expect_error(transmute(mtcars, cyl2 = cyl, .after = disp))) Output Error in `transmute()`: ! The `.after` argument is not supported. dplyr/tests/testthat/_snaps/if-else.md0000644000176200001440000000367115106134104017541 0ustar liggesusers# takes the common type of `true` and `false` (#6243) Code if_else(TRUE, 1, "x") Condition Error in `if_else()`: ! Can't combine `true` and `false` . # includes `missing` in the common type computation if used Code if_else(TRUE, 1, 2, missing = "x") Condition Error in `if_else()`: ! Can't combine `true` and `missing` . # `condition` must be logical (and isn't cast to logical!) Code if_else(1:10, 1, 2) Condition Error in `if_else()`: ! `condition` must be a logical vector, not an integer vector. # `condition` can't be an array (#7723) Code if_else(array(TRUE), 1, 2) Condition Error in `if_else()`: ! `condition` must be a logical vector, not a logical 1D array. # `true`, `false`, and `missing` must recycle to the size of `condition` Code if_else(x < 2, bad, x) Condition Error in `if_else()`: ! Can't recycle `true` (size 2) to size 3. --- Code if_else(x < 2, x, bad) Condition Error in `if_else()`: ! Can't recycle `false` (size 2) to size 3. --- Code if_else(x < 2, x, x, missing = bad) Condition Error in `if_else()`: ! Can't recycle `missing` (size 2) to size 3. # must have empty dots Code if_else(TRUE, 1, 2, missing = 3, 4) Condition Error in `if_else()`: ! `...` must be empty. x Problematic argument: * ..1 = 4 i Did you forget to name an argument? # `ptype` overrides the common type Code if_else(TRUE, 1L, 2.5, ptype = integer()) Condition Error in `if_else()`: ! Can't convert from `false` to due to loss of precision. * Locations: 1 # `size` is deprecated Code x <- if_else(c(TRUE, FALSE), 1, 2, size = 2) Condition Warning: The `size` argument of `if_else()` is deprecated as of dplyr 1.2.0. dplyr/tests/testthat/_snaps/n-distinct.md0000644000176200001440000000106615106134104020265 0ustar liggesusers# n_distinct() generates useful errors Code n_distinct() Condition Error in `n_distinct()`: ! `...` is absent, but must be supplied. Code n_distinct(x = 1:4) Condition Error in `n_distinct()`: ! Arguments in `...` must be passed by position, not name. x Problematic argument: * x = 1:4 Code n_distinct(mean) Condition Error in `n_distinct()`: ! `..1` must be a vector, not a function. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. dplyr/tests/testthat/_snaps/colwise-select.md0000644000176200001440000000330315106134104021127 0ustar liggesusers# colwise select() / rename() give meaningful errors Code df <- tibble(x = 0L, y = 0.5, z = 1) (expect_error(rename_all(df))) Output Error in `rename_all()`: ! `.funs` must specify a renaming function. Code (expect_error(rename_if(df, is_integerish))) Output Error in `rename_if()`: ! `.funs` must specify a renaming function. Code (expect_error(rename_at(df, vars(x:y)))) Output Error in `rename_at()`: ! `.funs` must specify a renaming function. Code (expect_error(rename_all(df, list(tolower, toupper)))) Output Error in `rename_all()`: ! `.funs` must contain one renaming function, not 2. Code (expect_error(select_all(df, list(tolower, toupper)))) Output Error in `select_all()`: ! `.funs` must contain one renaming function, not 2. Code (expect_error(select_if(df, function(.x) 1))) Output Error in `select_if()`: ! `.p` is invalid. x `.p` should return a single logical. i `.p` returns a for column `x`. Code (expect_error(select_if(df, function(.x) c(TRUE, TRUE)))) Output Error in `select_if()`: ! `.p` is invalid. x `.p` should return a single logical. i `.p` returns a size 2 for column `x`. Code (expect_error(select_all(data.frame(), .funs = 42))) Output Error in `select_all()`: ! `.funs` must be a one sided formula, a function, or a function name. dplyr/tests/testthat/_snaps/join-cross.md0000644000176200001440000000030214416000525020271 0ustar liggesusers# cross join checks for duplicate names Code cross_join(df1, df2) Condition Error in `cross_join()`: ! Input columns in `x` must be unique. x Problem with `a`. dplyr/tests/testthat/_snaps/summarise.md0000644000176200001440000002271315137161765020241 0ustar liggesusers# can't overwrite column active bindings (#6666) Code summarise(df, y = { x <<- x + 2L mean(x) }) Condition Error in `summarise()`: i In argument: `y = { ... }`. Caused by error: ! unused argument (base::quote(3:6)) --- Code summarise(df, .by = g, y = { x <<- x + 2L mean(x) }) Condition Error in `summarise()`: i In argument: `y = { ... }`. i In group 1: `g = 1`. Caused by error: ! unused argument (base::quote(3:4)) --- Code summarise(gdf, y = { x <<- x + 2L mean(x) }) Condition Error in `summarise()`: i In argument: `y = { ... }`. i In group 1: `g = 1`. Caused by error: ! unused argument (base::quote(3:4)) # can't use `.by` with `.groups` Code summarise(df, .by = x, .groups = "drop") Condition Error in `summarise()`: ! Can't supply both `.by` and `.groups`. # catches `.by` with grouped-df Code summarise(gdf, .by = x) Condition Error in `summarise()`: ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df Code summarise(rdf, .by = x) Condition Error in `summarise()`: ! Can't supply `.by` when `.data` is a rowwise data frame. # `summarise()` doesn't allow data frames with missing or empty names (#6758) Code summarise(df1) Condition Error in `summarise()`: ! Can't transform a data frame with `NA` or `""` names. --- Code summarise(df2) Condition Error in `summarise()`: ! Can't transform a data frame with missing names. # summarise() messages about implicit `.groups` default Code summarise(group_by(df, x)) Output # A tibble: 1 x 1 x 1 1 --- Code summarise(rowwise(df)) Output # A tibble: 1 x 0 --- Code summarise(group_by(df, x, y)) Message `summarise()` has regrouped the output. i Summaries were computed grouped by x and y. i Output is grouped by x. i Use `summarise(.groups = "drop_last")` to silence this message. i Use `summarise(.by = c(x, y))` for per-operation grouping (`?dplyr::dplyr_by`) instead. Output # A tibble: 1 x 2 # Groups: x [1] x y 1 1 2 --- Code summarise(rowwise(df, x, y)) Message `summarise()` has converted the output from a rowwise data frame to a grouped data frame. i Summaries were computed rowwise. i Output is grouped by x and y. i Use `summarise(.groups = "keep")` to silence this message. Output # A tibble: 1 x 2 # Groups: x, y [1] x y 1 1 2 # summarise() respects `dplyr.summarise.inform = FALSE` Code eval_global(summarise(group_by(tibble(x = 1, y = 2), x, y))) Output # A tibble: 1 x 2 # Groups: x [1] x y 1 1 2 --- Code eval_global(summarise(rowwise(tibble(x = 1, y = 2), x, y))) Output # A tibble: 1 x 2 # Groups: x, y [1] x y 1 1 2 # summarise() gives meaningful errors Code (expect_error(summarise(tibble(x = 1, y = c(1, 2, 2), z = runif(3)), a = rlang::env( a = 1)))) Output Error in `summarise()`: i In argument: `a = rlang::env(a = 1)`. Caused by error: ! `a` must be a vector, not an environment. Code (expect_error(summarise(group_by(tibble(x = 1, y = c(1, 2, 2), z = runif(3)), x, y), a = rlang::env(a = 1)))) Output Error in `summarise()`: i In argument: `a = rlang::env(a = 1)`. i In group 1: `x = 1`, `y = 1`. Caused by error: ! `a` must be a vector, not an environment. Code (expect_error(summarise(group_by(tibble(x = 1, y = c(1, 2, 2), y2 = c(1, 2, 2), z = runif(3)), x, y, y2), a = rlang::env(a = 1)))) Output Error in `summarise()`: i In argument: `a = rlang::env(a = 1)`. i In group 1: `x = 1`, `y = 1`, `y2 = 1`. Caused by error: ! `a` must be a vector, not an environment. Code (expect_error(summarise(rowwise(tibble(x = 1, y = c(1, 2, 2), z = runif(3))), a = lm(y ~ x)))) Output Error in `summarise()`: i In argument: `a = lm(y ~ x)`. i In row 1. Caused by error: ! `a` must be a vector, not a object. i Did you mean: `a = list(lm(y ~ x))` ? Code (expect_error(summarise(group_by(tibble(id = 1:2, a = list(1, "2")), id), a = a[[ 1]]))) Output Error in `summarise()`: i In argument: `a = a[[1]]`. Caused by error: ! `a` must return compatible vectors across groups. i Result of type for group 1: `id = 1`. i Result of type for group 2: `id = 2`. Code (expect_error(summarise(rowwise(tibble(id = 1:2, a = list(1, "2"))), a = a[[1]])) ) Output Error in `summarise()`: i In argument: `a = a[[1]]`. Caused by error: ! `a` must return compatible vectors across groups. Code (expect_error(summarise(group_by(data.frame(x = 1:2, g = 1:2), g), x = if (g == 1) 42))) Output Error in `summarise()`: i In argument: `x = if (g == 1) 42`. i In group 2: `g = 2`. Caused by error: ! `x` must return compatible vectors across groups. x Can't combine NULL and non NULL results. Code (expect_error(summarise(group_by(data.frame(x = 1:2, g = 1:2), g), x = if (g == 2) 42))) Output Error in `summarise()`: i In argument: `x = if (g == 2) 42`. i In group 1: `g = 1`. Caused by error: ! `x` must return compatible vectors across groups. x Can't combine NULL and non NULL results. Code (expect_error(summarise(tibble(a = 1), c = .data$b))) Output Error in `summarise()`: i In argument: `c = .data$b`. Caused by error in `.data$b`: ! Column `b` not found in `.data`. Code (expect_error(summarise(group_by(tibble(a = 1:3), a), c = .data$b))) Output Error in `summarise()`: i In argument: `c = .data$b`. i In group 1: `a = 1`. Caused by error in `.data$b`: ! Column `b` not found in `.data`. Code (expect_error(summarise(tibble(x = 1, x = 1, .name_repair = "minimal"), x))) Output Error in `summarise()`: ! Can't transform a data frame with duplicate names. Code (expect_error(summarise(tibble(), stop("{")))) Output Error in `summarise()`: i In argument: `stop("{")`. Caused by error: ! { Code (expect_error(summarise(group_by(tibble(a = 1, b = "{value:1, unit:a}"), b), a = stop( "!")))) Output Error in `summarise()`: i In argument: `a = stop("!")`. i In group 1: `b = "{value:1, unit:a}"`. Caused by error: ! ! # non-summary results are defunct in favor of `reframe()` (#6382, #7761) Code out <- summarise(df, x = which(x < 3)) Condition Error in `summarise()`: i In argument: `x = which(x < 3)`. Caused by error: ! `x` must be size 1, not 2. i To return more or less than 1 row per group, use `reframe()`. --- Code out <- summarise(df, x = which(x < 3), .by = g) Condition Error in `summarise()`: i In argument: `x = which(x < 3)`. i In group 1: `g = 1`. Caused by error: ! `x` must be size 1, not 2. i To return more or less than 1 row per group, use `reframe()`. --- Code out <- summarise(gdf, x = which(x < 3)) Condition Error in `summarise()`: i In argument: `x = which(x < 3)`. i In group 1: `g = 1`. Caused by error: ! `x` must be size 1, not 2. i To return more or less than 1 row per group, use `reframe()`. --- Code out <- summarise(rdf, x = which(x < 3)) Condition Error in `summarise()`: i In argument: `x = which(x < 3)`. i In row 3. Caused by error: ! `x` must be size 1, not 0. i To return more or less than 1 row per group, use `reframe()`. --- Code summarise(tibble(), x = 1, y = 1:3, z = 1) Condition Error in `summarise()`: i In argument: `y = 1:3`. Caused by error: ! `y` must be size 1, not 3. i To return more or less than 1 row per group, use `reframe()`. --- Code gf <- group_by(tibble(a = 1:2), a) summarise(gf, x = 1, y = 1:3, z = 1) Condition Error in `summarise()`: i In argument: `y = 1:3`. i In group 1: `a = 1`. Caused by error: ! `y` must be size 1, not 3. i To return more or less than 1 row per group, use `reframe()`. --- Code gf <- group_by(tibble(a = 1:2), a) summarise(gf, x = seq_len(a), y = 1) Condition Error in `summarise()`: i In argument: `x = seq_len(a)`. i In group 2: `a = 2`. Caused by error: ! `x` must be size 1, not 2. i To return more or less than 1 row per group, use `reframe()`. dplyr/tests/testthat/_snaps/grouped-df.md0000644000176200001440000001036215106134104020244 0ustar liggesusers# validate_grouped_df() gives useful errors Code (expect_error(validate_grouped_df(df1))) Output Error in `validate_grouped_df()`: ! The `.rows` column must be list of one-based integer vectors. Code (expect_error(group_data(df1))) Output Error in `group_data()`: ! `.data` must be a valid object. Caused by error in `validate_grouped_df()`: ! The `.rows` column must be list of one-based integer vectors. Code (expect_error(validate_grouped_df(df2))) Output Error in `validate_grouped_df()`: ! The last column of the `groups` attribute must be called `.rows`. Code (expect_error(validate_grouped_df(df2))) Output Error in `validate_grouped_df()`: ! The last column of the `groups` attribute must be called `.rows`. Code (expect_error(validate_grouped_df(df3))) Output Error in `validate_grouped_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_grouped_df(df4))) Output Error in `validate_grouped_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_grouped_df(df5))) Output Error in `validate_grouped_df()`: ! Corrupt `grouped_df` using old (< 0.8.0) format. i Strip off old grouping with `ungroup()`. Code (expect_error(validate_grouped_df(df6, check_bounds = TRUE))) Output Error in `validate_grouped_df()`: ! out of bounds indices. Code (expect_error(validate_grouped_df(df7, check_bounds = TRUE))) Output Error in `validate_grouped_df()`: ! out of bounds indices. Code (expect_error(validate_grouped_df(df8, check_bounds = TRUE))) Output Error in `validate_grouped_df()`: ! out of bounds indices. Code (expect_error(validate_grouped_df(df10))) Output Error in `validate_grouped_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(validate_grouped_df(df11))) Output Error in `validate_grouped_df()`: ! The `groups` attribute must be a data frame. Code (expect_error(new_grouped_df(tibble(x = 1:10), tibble(other = list(1:2))))) Output Error in `new_grouped_df()`: ! The last column of `groups` must be called ".rows". Code (expect_error(new_grouped_df(10))) Output Error in `new_grouped_df()`: ! `x` must be a data frame. # helper gives meaningful error messages Code (expect_error(grouped_df(data.frame(x = 1), "y", FALSE))) Output Error in `compute_groups()`: ! `vars` missing from `data`: `y`. Code (expect_error(grouped_df(data.frame(x = 1), 1))) Output Error in `grouped_df()`: ! `vars` must be a character vector. # using the deprecated global option `dplyr.legacy_locale` forces the system locale Code result <- compute_groups(df, "x") Condition Warning: `options(dplyr.legacy_locale =)` was deprecated in dplyr 1.2.0. i If needed for `arrange()`, use `arrange(.locale =)` instead. i If needed for `group_by() |> summarise()`, follow up with an additional `arrange(.locale =)` call. i Use `Sys.getlocale("LC_COLLATE")` to determine your system locale, and compare against `stringi::stri_locale_list()` to determine the `.locale` value to use. --- Code result <- group_by(df, x) Condition Warning: `options(dplyr.legacy_locale =)` was deprecated in dplyr 1.2.0. i If needed for `arrange()`, use `arrange(.locale =)` instead. i If needed for `group_by() |> summarise()`, follow up with an additional `arrange(.locale =)` call. i Use `Sys.getlocale("LC_COLLATE")` to determine your system locale, and compare against `stringi::stri_locale_list()` to determine the `.locale` value to use. dplyr/tests/testthat/_snaps/copy-to.md0000644000176200001440000000054714416000514017606 0ustar liggesusers# `auto_copy()` throws an informative error on different sources (#6798) Code auto_copy(df, NULL) Condition Error in `auto_copy()`: ! `x` and `y` must share the same src. i `x` is a object. i `y` is `NULL`. i Set `copy = TRUE` if `y` can be copied to the same source as `x` (may be slow). dplyr/tests/testthat/_snaps/colwise.md0000644000176200001440000000123214416000512017647 0ustar liggesusers# colwise utils gives meaningful error messages Code (expect_error(tbl_at_vars(iris, raw(3)))) Output Error: ! `.vars` must be a character/numeric vector or a `vars()` object, not a raw vector. Code (expect_error(tbl_if_vars(iris, list(identity, force), environment()))) Output Error: ! `.predicate` must have length 1, not 2. Code .funs <- as_fun_list(list(identity, force), caller_env()) (expect_error(tbl_if_vars(iris, .funs, environment()))) Output Error: ! `.predicate` must have length 1, not 2. dplyr/tests/testthat/_snaps/rows.md0000644000176200001440000002413714416000537017214 0ustar liggesusers# rows_insert() doesn't allow insertion of matched keys by default Code (expect_error(rows_insert(x, y, by = "a"))) Output Error in `rows_insert()`: ! `y` can't contain keys that already exist in `x`. i The following rows in `y` have keys that already exist in `x`: `c(1)`. i Use `conflict = "ignore"` if you want to ignore these `y` rows. --- Code (expect_error(rows_insert(x, y, by = "a"))) Output Error in `rows_insert()`: ! `y` can't contain keys that already exist in `x`. i The following rows in `y` have keys that already exist in `x`: `c(1, 2, 3)`. i Use `conflict = "ignore"` if you want to ignore these `y` rows. # rows_insert() casts keys to the type of `x` Code (expect_error(rows_insert(x, y, "key"))) Output Error in `rows_insert()`: ! Can't convert from `y$key` to `x$key` due to loss of precision. * Locations: 1 # rows_insert() casts values to the type of `x` Code (expect_error(rows_insert(x, y, "key"))) Output Error in `rows_insert()`: ! Can't convert from `y$value` to `x$value` due to loss of precision. * Locations: 1 # rows_insert() checks that `x` and `y` contain `by` (#6652) Code (expect_error(rows_insert(x, y, by = "c"))) Output Error in `rows_insert()`: ! All columns specified through `by` must exist in `x` and `y`. i The following columns are missing from `x`: `c`. --- Code (expect_error(rows_insert(x, y, by = c("a", "b")))) Output Error in `rows_insert()`: ! All columns specified through `by` must exist in `x` and `y`. i The following columns are missing from `y`: `b`. # `conflict` is validated Code (expect_error(rows_insert(x, y, by = "a", conflict = "foo"))) Output Error in `rows_insert()`: ! `conflict` must be one of "error" or "ignore", not "foo". Code (expect_error(rows_insert(x, y, by = "a", conflict = 1))) Output Error in `rows_insert()`: ! `conflict` must be a string or character vector. # rows_append() casts to the type of `x` Code (expect_error(rows_append(x, y))) Output Error in `rows_append()`: ! Can't convert from `y$key` to `x$key` due to loss of precision. * Locations: 1 # rows_append() requires that `y` columns be a subset of `x` Code (expect_error(rows_append(x, y))) Output Error in `rows_append()`: ! All columns in `y` must exist in `x`. i The following columns only exist in `y`: `c`. # rows_update() requires `y` keys to exist in `x` by default Code (expect_error(rows_update(x, y, "a"))) Output Error in `rows_update()`: ! `y` must contain keys that already exist in `x`. i The following rows in `y` have keys that don't exist in `x`: `c(1, 3)`. i Use `unmatched = "ignore"` if you want to ignore these `y` rows. # rows_update() doesn't allow `y` keys to be duplicated (#5553) Code (expect_error(rows_update(x, y, by = "a"))) Output Error in `rows_update()`: ! `y` key values must be unique. i The following rows contain duplicate key values: `c(1, 2)`. # rows_update() casts keys to their common type for matching but retains `x` type Code (expect_error(rows_update(x, y, "key"))) Output Error in `rows_update()`: ! Can't combine `x$key` and `y$key` . # rows_update() casts values to the type of `x` Code (expect_error(rows_update(x, y, "key"))) Output Error in `rows_update()`: ! Can't convert from `y$value` to `x$value` due to loss of precision. * Locations: 1 # `unmatched` is validated Code (expect_error(rows_update(x, y, by = "a", unmatched = "foo"))) Output Error in `rows_update()`: ! `unmatched` must be one of "error" or "ignore", not "foo". Code (expect_error(rows_update(x, y, by = "a", unmatched = 1))) Output Error in `rows_update()`: ! `unmatched` must be a string or character vector. # rows_patch() requires `y` keys to exist in `x` by default Code (expect_error(rows_patch(x, y, "a"))) Output Error in `rows_patch()`: ! `y` must contain keys that already exist in `x`. i The following rows in `y` have keys that don't exist in `x`: `c(1, 3)`. i Use `unmatched = "ignore"` if you want to ignore these `y` rows. # rows_patch() doesn't allow `y` keys to be duplicated (#5553) Code (expect_error(rows_patch(x, y, by = "a"))) Output Error in `rows_patch()`: ! `y` key values must be unique. i The following rows contain duplicate key values: `c(1, 2)`. # rows_patch() casts keys to their common type for matching but retains `x` type Code (expect_error(rows_patch(x, y, "key"))) Output Error in `rows_patch()`: ! Can't combine `x$key` and `y$key` . # rows_patch() casts values to the type of `x` Code (expect_error(rows_patch(x, y, "key"))) Output Error in `rows_patch()`: ! Can't convert from `y$value` to `x$value` due to loss of precision. * Locations: 1 # rows_upsert() doesn't allow `y` keys to be duplicated (#5553) Code (expect_error(rows_upsert(x, y, by = "a"))) Output Error in `rows_upsert()`: ! `y` key values must be unique. i The following rows contain duplicate key values: `c(1, 2)`. # rows_upsert() casts keys to their common type for matching but retains `x` type Code (expect_error(rows_upsert(x, y, "key"))) Output Error in `rows_upsert()`: ! Can't combine `x$key` and `y$key` . # rows_upsert() casts keys to the type of `x` Code (expect_error(rows_upsert(x, y, "key"))) Output Error in `rows_upsert()`: ! Can't convert from `y$key` to `x$key` due to loss of precision. * Locations: 1 # rows_upsert() casts values to the type of `x` Code (expect_error(rows_upsert(x, y, "key"))) Output Error in `rows_upsert()`: ! Can't convert from `y$value` to `x$value` due to loss of precision. * Locations: 1 # rows_delete() ignores extra `y` columns, with a message Code out <- rows_delete(x, y) Message Matching, by = "a" Ignoring extra `y` columns: b --- Code out <- rows_delete(x, y, by = "a") Message Ignoring extra `y` columns: b # rows_delete() requires `y` keys to exist in `x` by default Code (expect_error(rows_delete(x, y, "a"))) Output Error in `rows_delete()`: ! `y` must contain keys that already exist in `x`. i The following rows in `y` have keys that don't exist in `x`: `c(1, 3)`. i Use `unmatched = "ignore"` if you want to ignore these `y` rows. # rows_delete() casts keys to their common type for matching but retains `x` type Code (expect_error(rows_delete(x, y, "key"))) Output Error in `rows_delete()`: ! Can't combine `x$key` and `y$key` . # rows_check_x_contains_y() checks that `y` columns are in `x` Code (expect_error(rows_check_x_contains_y(x, y))) Output Error: ! All columns in `y` must exist in `x`. i The following columns only exist in `y`: `b`. # rows_check_by() checks that `y` has at least 1 column before using it (#6061) Code (expect_error(rows_check_by(by = NULL, y = y))) Output Error: ! `y` must have at least one column. # rows_check_by() uses the first column from `y` by default, with a message Code by <- rows_check_by(by = NULL, y = y) Message Matching, by = "a" # rows_check_by() validates `by` Code (expect_error(rows_check_by(by = 1, y = y))) Output Error: ! `by` must be a character vector. Code (expect_error(rows_check_by(by = character(), y = y))) Output Error: ! `by` must specify at least 1 column. Code (expect_error(rows_check_by(by = c(x = "y"), y = y))) Output Error: ! `by` must be unnamed. # rows_check_contains_by() checks that all `by` columns are in `x` Code (expect_error(rows_check_contains_by(x, "y", arg = "x"))) Output Error: ! All columns specified through `by` must exist in `x` and `y`. i The following columns are missing from `x`: `y`. Code (expect_error(rows_check_contains_by(x, c("y", "x", "z"), arg = "y"))) Output Error: ! All columns specified through `by` must exist in `x` and `y`. i The following columns are missing from `y`: `y` and `z`. # rows_check_unique() requires uniqueness Code (expect_error(rows_check_unique(x["x"], "x"))) Output Error: ! `x` key values must be unique. i The following rows contain duplicate key values: `c(1, 2, 3)`. Code (expect_error(rows_check_unique(x[c("x", "y")], "y"))) Output Error: ! `y` key values must be unique. i The following rows contain duplicate key values: `c(1, 3)`. dplyr/tests/testthat/_snaps/deprec-funs.md0000644000176200001440000000167414416000516020433 0ustar liggesusers# funs() is deprecated Code funs(fn = bar) Condition Warning: `funs()` was deprecated in dplyr 0.8.0. i Please use a list of either functions or lambdas: # Simple named list: list(mean = mean, median = median) # Auto named with `tibble::lst()`: tibble::lst(mean, median) # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)) Output $ fn: bar(.) # funs() give meaningful error messages Code (expect_error(funs(function(si) { mp[si] }))) Output Error in `funs()`: ! `function(si) { mp[si] }` must be a function name (quoted or unquoted) or an unquoted call, not `function`. Code (expect_error(funs(~ mp[.]))) Output Error in `funs()`: ! `~mp[.]` must be a function name (quoted or unquoted) or an unquoted call, not `~`. dplyr/tests/testthat/_snaps/defunct-lazyeval.md0000644000176200001440000000700015137161765021501 0ustar liggesusers# generate informative errors Code add_count_() Condition Error: ! `add_count_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `add_count()` instead. i See vignette('programming') for more help Code add_tally_() Condition Error: ! `add_tally_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `add_tally()` instead. i See vignette('programming') for more help Code arrange_() Condition Error: ! `arrange_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `arrange()` instead. i See vignette('programming') for more help Code count_() Condition Error: ! `count_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `count()` instead. i See vignette('programming') for more help Code distinct_() Condition Error: ! `distinct_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `distinct()` instead. i See vignette('programming') for more help Code do_() Condition Error: ! `do_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `do()` instead. i See vignette('programming') for more help Code filter_() Condition Error: ! `filter_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `filter()` instead. i See vignette('programming') for more help Code funs_() Condition Error: ! `funs_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `funs()` instead. i See vignette('programming') for more help Code group_by_() Condition Error: ! `group_by_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `group_by()` instead. i See vignette('programming') for more help Code group_indices_() Condition Error: ! `group_indices_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `group_indices()` instead. Code mutate_() Condition Error: ! `mutate_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `mutate()` instead. i See vignette('programming') for more help Code tally_() Condition Error: ! `tally_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `tally()` instead. i See vignette('programming') for more help Code transmute_() Condition Error: ! `transmute_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `transmute()` instead. i See vignette('programming') for more help Code rename_() Condition Error: ! `rename_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `rename()` instead. Code select_() Condition Error: ! `select_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `select()` instead. Code slice_() Condition Error: ! `slice_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `slice()` instead. Code summarise_() Condition Error: ! `summarise_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `summarise()` instead. Code summarize_() Condition Error: ! `summarise_()` was deprecated in dplyr 0.7.0 and is now defunct. i Please use `summarise()` instead. dplyr/tests/testthat/_snaps/context.md0000644000176200001440000000234714416000514017700 0ustar liggesusers# give useful error messages when not applicable Code (expect_error(n())) Output Error in `n()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code (expect_error(cur_column())) Output Error in `cur_column()`: ! Must only be used inside `across()`. Code (expect_error(cur_group())) Output Error in `cur_group()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code (expect_error(cur_group_id())) Output Error in `cur_group_id()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. Code (expect_error(cur_group_rows())) Output Error in `cur_group_rows()`: ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`. # group labels are correctly formatted Code group_labels_details(c(a = 1)) Output [1] "`a = 1`" Code group_labels_details(c(a = 1, b = 2)) Output [1] "`a = 1`, `b = 2`" dplyr/tests/testthat/_snaps/coalesce.md0000644000176200001440000000436415106134104017773 0ustar liggesusers# coalesce() gives meaningful error messages Code coalesce(1:2, 1:3) Condition Error in `coalesce()`: ! Can't recycle `..1` (size 2) to match `..2` (size 3). --- Code coalesce(1:2, letters[1:2]) Condition Error in `coalesce()`: ! Can't combine `..1` and `..2` . # `.size` overrides the common size Code coalesce(x, 1:2, .size = vec_size(x)) Condition Error in `coalesce()`: ! Can't recycle `..2` (size 2) to size 1. # can't be empty Code coalesce() Condition Error in `coalesce()`: ! `...` can't be empty. # must have at least one non-`NULL` vector Code coalesce(NULL, NULL) Condition Error in `coalesce()`: ! `...` must contain at least 1 non-`NULL` value. # inputs must be vectors Code coalesce(1, environment()) Condition Error in `coalesce()`: ! `..2` must be a vector, not an environment. i Read our FAQ about scalar types (`?vctrs::faq_error_scalar_type`) to learn more. # names in error messages are indexed correctly Code coalesce(1, "x") Condition Error in `coalesce()`: ! Can't combine `..1` and `..2` . --- Code coalesce(1, y = "x") Condition Error in `coalesce()`: ! Can't combine `..1` and `y` . --- Code coalesce(1:2, 1:3) Condition Error in `coalesce()`: ! Can't recycle `..1` (size 2) to match `..2` (size 3). --- Code coalesce(1:2, y = 1:3) Condition Error in `coalesce()`: ! Can't recycle `..1` (size 2) to match `y` (size 3). --- Code coalesce(1, NULL, "x") Condition Error in `coalesce()`: ! Can't combine `..1` and `..3` . --- Code coalesce(1, NULL, y = "x") Condition Error in `coalesce()`: ! Can't combine `..1` and `y` . --- Code coalesce(1:2, NULL, 1:3) Condition Error in `coalesce()`: ! Can't recycle `..1` (size 2) to match `..3` (size 3). --- Code coalesce(1:2, NULL, y = 1:3) Condition Error in `coalesce()`: ! Can't recycle `..1` (size 2) to match `y` (size 3). dplyr/tests/testthat/_snaps/distinct.md0000644000176200001440000000164015106134104020030 0ustar liggesusers# distinct errors when selecting an unknown column (#3140) Code df <- tibble(g = c(1, 2), x = c(1, 2)) (expect_error(distinct(df, aa, x))) Output Error in `distinct()`: ! Must use existing variables. x `aa` not found in `.data`. Code (expect_error(distinct(df, aa, bb))) Output Error in `distinct()`: ! Must use existing variables. x `aa` not found in `.data`. x `bb` not found in `.data`. Code (expect_error(distinct(df, .data$aa))) Output Error in `distinct()`: ! Must use existing variables. x `aa` not found in `.data`. Code (expect_error(distinct(df, y = a + 1))) Output Error in `distinct()`: i In argument: `y = a + 1`. Caused by error: ! object 'a' not found dplyr/tests/testthat/_snaps/count-tally.md0000644000176200001440000001005715137161765020505 0ustar liggesusers# name must be string Code count(df, x, name = 1) Condition Error in `tally()`: ! `name` must be a single string, not the number 1. --- Code count(df, x, name = letters) Condition Error in `tally()`: ! `name` must be a single string, not a character vector. # can only explicitly chain together multiple tallies Code df <- data.frame(g = c(1, 1, 2, 2), n = 1:4) count(df, g, wt = n) Output g n 1 1 3 2 2 7 Code count(count(df, g, wt = n), wt = n) Output n 1 10 Code count(df, n) Message Storing counts in `nn`, as `n` already present in input i Use `name = "new_name"` to pick a new name. Output n nn 1 1 1 2 2 1 3 3 1 4 4 1 # count() owns errors (#6139) Code (expect_error(count(mtcars, new = 1 + ""))) Output Error in `count()`: i In argument: `new = 1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code (expect_error(count(mtcars, wt = 1 + ""))) Output Error in `count()`: i In argument: `n = base::sum(1 + "", na.rm = TRUE)`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # count() `wt = n()` is deprecated Code count(df, a, wt = n()) Condition Warning: `wt = n()` was deprecated in dplyr 1.0.1. i You can now omit the `wt` argument. Output # A tibble: 5 x 2 a n 1 1 1 2 2 1 3 3 1 4 4 1 5 5 1 # tally() owns errors (#6139) Code (expect_error(tally(mtcars, wt = 1 + ""))) Output Error in `tally()`: i In argument: `n = base::sum(1 + "", na.rm = TRUE)`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # tally() `wt = n()` is deprecated Code tally(df, wt = n()) Condition Warning: `wt = n()` was deprecated in dplyr 1.0.1. i You can now omit the `wt` argument. Output # A tibble: 1 x 1 n 1 5 # `.drop` is defunct Code add_count(df, f, .drop = FALSE) Condition Error: ! The `.drop` argument of `add_count()` was deprecated in dplyr 1.0.0 and is now defunct. # add_count() `wt = n()` is deprecated Code add_count(df, a, wt = n()) Condition Warning: `wt = n()` was deprecated in dplyr 1.0.1. i You can now omit the `wt` argument. Output # A tibble: 5 x 2 a n 1 1 1 2 2 1 3 3 1 4 4 1 5 5 1 # add_count() owns errors (#6139) Code (expect_error(add_count(mtcars, new = 1 + ""))) Output Error in `add_count()`: i In argument: `new = 1 + ""`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator Code (expect_error(add_count(mtcars, wt = 1 + ""))) Output Error in `add_count()`: i In argument: `n = base::sum(1 + "", na.rm = TRUE)`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # add_tally() owns errors (#6139) Code (expect_error(add_tally(mtcars, wt = 1 + ""))) Output Error in `add_tally()`: i In argument: `n = base::sum(1 + "", na.rm = TRUE)`. Caused by error in `1 + ""`: ! non-numeric argument to binary operator # add_tally() `wt = n()` is deprecated Code add_tally(df, wt = n()) Condition Warning: `wt = n()` was deprecated in dplyr 1.0.1. i You can now omit the `wt` argument. Output # A tibble: 5 x 2 a n 1 1 5 2 2 5 3 3 5 4 4 5 5 5 5 dplyr/tests/testthat/test-bind-cols.R0000644000176200001440000000752315106134104017362 0ustar liggesuserstest_that("bind_cols() uses shallow copies", { skip_if_not_installed("lobstr") df1 <- data.frame( int = 1:10, num = rnorm(10), cha = letters[1:10], stringsAsFactors = FALSE ) df2 <- data.frame( log = sample(c(T, F), 10, replace = TRUE), dat = seq.Date(Sys.Date(), length.out = 10, by = "day"), tim = seq(Sys.time(), length.out = 10, by = "1 hour") ) df <- bind_cols(df1, df2) expect_equal(lobstr::obj_addrs(df1), lobstr::obj_addrs(df[names(df1)])) expect_equal(lobstr::obj_addrs(df2), lobstr::obj_addrs(df[names(df2)])) }) test_that("bind_cols() handles lists (#1104)", { exp <- tibble(x = 1, y = "a", z = 2) l1 <- list(x = 1, y = "a") l2 <- list(z = 2) expect_identical(bind_cols(l1, l2), exp) expect_identical(bind_cols(list(l1, l2)), exp) }) test_that("bind_cols() handles empty argument list (#1963)", { expect_equal(bind_cols(), tibble()) }) test_that("bind_cols() handles all-NULL values (#2303)", { expect_identical(bind_cols(list(a = NULL, b = NULL)), tibble()) expect_identical(bind_cols(NULL), tibble()) }) test_that("bind_cols() repairs names", { df <- tibble(a = 1, b = 2) expect_snapshot(bound <- bind_cols(df, df)) expect_message( repaired <- as_tibble( data.frame(a = 1, b = 2, a = 1, b = 2, check.names = FALSE), .name_repair = "unique" ), "New names" ) expect_identical(bound, repaired) }) test_that("bind_cols() unpacks tibbles", { expect_equal( bind_cols(list(y = tibble(x = 1:2))), tibble(x = 1:2) ) expect_equal( bind_cols(list(y = tibble(x = 1:2), z = tibble(y = 1:2))), tibble(x = 1:2, y = 1:2) ) }) test_that("bind_cols() honours .name_repair=", { expect_message( res <- bind_cols( data.frame(a = 1), data.frame(a = 2) ) ) expect_equal(res, data.frame(a...1 = 1, a...2 = 2)) expect_error(bind_cols( .name_repair = "check_unique", data.frame(a = 1), data.frame(a = 2) )) }) test_that("bind_cols() accepts NULL (#1148)", { df1 <- tibble(a = 1:10, b = 1:10) df2 <- tibble(c = 1:10, d = 1:10) res1 <- bind_cols(df1, df2) res2 <- bind_cols(NULL, df1, df2) res3 <- bind_cols(df1, NULL, df2) res4 <- bind_cols(df1, df2, NULL) expect_identical(res1, res2) expect_identical(res1, res3) expect_identical(res1, res4) }) test_that("bind_cols() infers classes from first result (#1692)", { d1 <- data.frame(a = 1:10, b = rep(1:2, each = 5)) d2 <- tibble(c = 1:10, d = rep(1:2, each = 5)) d3 <- group_by(d2, d) d4 <- rowwise(d2) d5 <- list(c = 1:10, d = rep(1:2, each = 5)) suppressMessages({ expect_equal(class(bind_cols(d1, d1)), "data.frame") expect_equal(class(bind_cols(d2, d1)), c("tbl_df", "tbl", "data.frame")) }) res3 <- bind_cols(d3, d1) expect_equal(class(res3), c("grouped_df", "tbl_df", "tbl", "data.frame")) expect_equal(map_int(group_rows(res3), length), c(5, 5)) expect_equal( class(bind_cols(d4, d1)), c("rowwise_df", "tbl_df", "tbl", "data.frame") ) expect_equal(class(bind_cols(d5, d1)), "data.frame") }) test_that("accepts named columns", { expect_identical(bind_cols(a = 1:2, b = 3:4), tibble(a = 1:2, b = 3:4)) }) test_that("ignores NULL values", { expect_identical(bind_cols(a = 1, NULL, b = 2, NULL), tibble(a = 1, b = 2)) }) test_that("bind_cols() handles unnamed list with name repair (#3402)", { expect_snapshot(df <- bind_cols(list(1, 2))) expect_identical(df, bind_cols(list(...1 = 1, ...2 = 2))) }) test_that("bind_cols() doesn't squash record types", { df <- data.frame(x = 1) posixlt <- as.POSIXlt(as.Date("1970-01-01")) expect_identical( bind_cols(df, y = posixlt), new_data_frame(list(x = 1, y = posixlt)) ) }) test_that("bind_cols() gives informative errors", { expect_snapshot({ "# incompatible size" (expect_error(bind_cols(a = 1:2, mtcars))) (expect_error(bind_cols(mtcars, a = 1:3))) }) }) dplyr/tests/testthat/test-group-trim.R0000644000176200001440000000116215106134104017606 0ustar liggesuserstest_that("group_trim() is identity on non grouped data", { expect_identical(group_trim(iris), iris) }) test_that("group_trim() always regroups even if no factors", { res <- mtcars |> group_by(cyl) |> filter(cyl == 6, .preserve = TRUE) |> group_trim() expect_equal(n_groups(res), 1L) }) test_that("group_trim() drops factor levels in data and grouping structure", { res <- iris |> group_by(Species) |> filter(Species == "setosa") |> group_trim() expect_equal(n_groups(res), 1L) expect_equal(levels(res$Species), "setosa") expect_equal(levels(attr(res, "groups")$Species), "setosa") }) dplyr/tests/testthat/test-pull.R0000644000176200001440000000124414266276767016512 0ustar liggesuserstest_that("default extracts last var from data frame", { df <- tibble(x = 1:10, y = 1:10) expect_equal(pull(df), 1:10) }) test_that("can extract by name, or positive/negative position", { x <- 1:10 df <- tibble(x = x, y = runif(10)) expect_equal(pull(df, x), x) expect_equal(pull(df, 1L), x) expect_equal(pull(df, 1), x) expect_equal(pull(df, -2), x) expect_equal(pull(df, -2L), x) }) test_that("can extract named vectors", { x <- 1:10 y <- letters[x] df <- tibble(x = x, y = y) xn <- set_names(x, y) expect_equal(pull(df, x), x) expect_equal(pull(df, x, y), xn) expect_equal(pull(df, 1, 2), xn) expect_equal(names(pull(df, x, y)), y) }) dplyr/tests/testthat/test-join-by.R0000644000176200001440000003071615106134104017057 0ustar liggesusers# ------------------------------------------------------------------------------ # `join_by()` test_that("works with equi conditions", { by <- join_by(x == y, a == b) expect_identical(by$x, c("x", "a")) expect_identical(by$y, c("y", "b")) expect_identical(by$condition, c("==", "==")) expect_identical(by$filter, c("none", "none")) }) test_that("works with non-equi conditions", { by <- join_by(x == y, a > b, a >= b, a < b, a <= b) expect_identical(by$x, c("x", rep("a", 4))) expect_identical(by$y, c("y", rep("b", 4))) expect_identical(by$condition, c("==", ">", ">=", "<", "<=")) }) test_that("works with `closest()`", { by <- join_by(x == y, closest(a >= b)) expect_identical(by$x, c("x", "a")) expect_identical(by$y, c("y", "b")) expect_identical(by$filter, c("none", "max")) expect_identical(by$condition, c("==", ">=")) by <- join_by(x == y, closest(a > b)) expect_identical(by$x, c("x", "a")) expect_identical(by$y, c("y", "b")) expect_identical(by$filter, c("none", "max")) expect_identical(by$condition, c("==", ">")) by <- join_by(x == y, closest(a <= b)) expect_identical(by$x, c("x", "a")) expect_identical(by$y, c("y", "b")) expect_identical(by$filter, c("none", "min")) expect_identical(by$condition, c("==", "<=")) by <- join_by(x == y, closest(a < b)) expect_identical(by$x, c("x", "a")) expect_identical(by$y, c("y", "b")) expect_identical(by$filter, c("none", "min")) expect_identical(by$condition, c("==", "<")) }) test_that("works with single arguments", { by <- join_by(a, b) expect_identical(by$x, c("a", "b")) expect_identical(by$y, c("a", "b")) }) test_that("works with character strings", { by1 <- join_by("a", "b" == "c", closest("d" >= "e")) by2 <- join_by(a, b == c, closest(d >= e)) expect_identical(by1$condition, by2$condition) expect_identical(by1$filter, by2$filter) expect_identical(by1$x, by2$x) expect_identical(by1$y, by2$y) }) test_that("works with explicit referencing", { by <- join_by(x$a == y$b) expect_identical(by$x, "a") expect_identical(by$y, "b") by <- join_by(y$a == x$b) expect_identical(by$x, "b") expect_identical(by$y, "a") }) test_that("join condition is correctly reversed with explicit referencing", { by <- join_by(y$a == x$a, y$a >= x$a, y$a > x$a, y$a <= x$a, y$a < x$a) expect_identical(by$condition, c("==", "<=", "<", ">=", ">")) }) test_that("`closest()` works with explicit referencing", { by <- join_by(closest(y$a <= x$b), closest(y$a > x$b)) expect_identical(by$x, c("b", "b")) expect_identical(by$y, c("a", "a")) expect_identical(by$filter, c("max", "min")) expect_identical(by$condition, c(">=", "<")) }) test_that("between conditions expand correctly", { by <- join_by(between(a, b, c)) expect_identical(by$x, c("a", "a")) expect_identical(by$y, c("b", "c")) by <- join_by(between(y$a, x$b, x$c)) expect_identical(by$x, c("b", "c")) expect_identical(by$y, c("a", "a")) by <- join_by(between(a, b, c, bounds = "[]")) expect_identical(by$condition, c(">=", "<=")) by <- join_by(between(a, b, c, bounds = "[)")) expect_identical(by$condition, c(">=", "<")) by <- join_by(between(a, b, c, bounds = "(]")) expect_identical(by$condition, c(">", "<=")) by <- join_by(between(a, b, c, bounds = "()")) expect_identical(by$condition, c(">", "<")) by <- join_by(between(y$a, x$b, x$c, bounds = "[]")) expect_identical(by$condition, c("<=", ">=")) by <- join_by(between(y$a, x$b, x$c, bounds = "[)")) expect_identical(by$condition, c("<=", ">")) by <- join_by(between(y$a, x$b, x$c, bounds = "(]")) expect_identical(by$condition, c("<", ">=")) by <- join_by(between(y$a, x$b, x$c, bounds = "()")) expect_identical(by$condition, c("<", ">")) }) test_that("within conditions expand correctly", { by <- join_by(within(a, b, c, d)) expect_identical(by$x, c("a", "b")) expect_identical(by$y, c("c", "d")) expect_identical(by$condition, c(">=", "<=")) by <- join_by(within(y$a, y$b, x$b, x$c)) expect_identical(by$x, c("b", "c")) expect_identical(by$y, c("a", "b")) expect_identical(by$condition, c("<=", ">=")) }) test_that("overlaps conditions expand correctly", { by <- join_by(overlaps(a, b, c, d)) expect_identical(by$x, c("a", "b")) expect_identical(by$y, c("d", "c")) by <- join_by(overlaps(y$a, y$b, x$b, x$c)) expect_identical(by$x, c("c", "b")) expect_identical(by$y, c("a", "b")) by <- join_by(overlaps(a, b, c, d, bounds = "[]")) expect_identical(by$condition, c("<=", ">=")) by <- join_by(overlaps(a, b, c, d, bounds = "[)")) expect_identical(by$condition, c("<", ">")) by <- join_by(overlaps(a, b, c, d, bounds = "(]")) expect_identical(by$condition, c("<", ">")) by <- join_by(overlaps(a, b, c, d, bounds = "()")) expect_identical(by$condition, c("<", ">")) by <- join_by(overlaps(y$a, y$b, x$b, x$c, bounds = "[]")) expect_identical(by$condition, c(">=", "<=")) by <- join_by(overlaps(y$a, y$b, x$b, x$c, bounds = "[)")) expect_identical(by$condition, c(">", "<")) by <- join_by(overlaps(y$a, y$b, x$b, x$c, bounds = "(]")) expect_identical(by$condition, c(">", "<")) by <- join_by(overlaps(y$a, y$b, x$b, x$c, bounds = "()")) expect_identical(by$condition, c(">", "<")) }) test_that("between / overlaps / within / closest can use named arguments", { by <- join_by(between(a, y_upper = b, y_lower = c)) expect_identical(by$x, c("a", "a")) expect_identical(by$y, c("c", "b")) by <- join_by(overlaps(y_lower = c, y_upper = d, x_lower = a, x_upper = b)) expect_identical(by$x, c("a", "b")) expect_identical(by$y, c("d", "c")) expect_identical(by$condition, c("<=", ">=")) by <- join_by(overlaps( y_lower = x$c, y_upper = x$d, x_lower = y$a, x_upper = y$b )) expect_identical(by$x, c("d", "c")) expect_identical(by$y, c("a", "b")) expect_identical(by$condition, c(">=", "<=")) by <- join_by(within(y_lower = c, y_upper = d, x_lower = a, x_upper = b)) expect_identical(by$x, c("a", "b")) expect_identical(by$y, c("c", "d")) expect_identical(by$condition, c(">=", "<=")) by <- join_by(within( y_lower = x$c, y_upper = x$d, x_lower = y$a, x_upper = y$b )) expect_identical(by$x, c("c", "d")) expect_identical(by$y, c("a", "b")) expect_identical(by$condition, c("<=", ">=")) by <- join_by(closest(expr = a > b)) expect_identical(by$x, "a") expect_identical(by$y, "b") }) test_that("joining by nothing is an error", { expect_snapshot(error = TRUE, { join_by() }) }) test_that("can pass `...` on to wrapped `join_by()`", { fn <- function(...) { join_by(...) } fn2 <- function(x) { fn({{ x }} == y) } expect_identical(fn(x == y, a <= b), join_by(x == y, a <= b)) expect_identical(fn2(a), join_by(a == y)) }) test_that("can wrap `join_by()` and use embracing to inject columns (#6469)", { fn <- function(x) { join_by({{ x }} == y) } expect_identical(fn("foo"), join_by("foo" == y)) # Expression substitution, not quosure evaluation a <- "foo" expect_identical(fn(a), join_by(a == y)) # But you can inline with `!!` expect_identical(fn(!!a), join_by("foo" == y)) fn <- function(x, top) { join_by(between({{ x }}, lower, {{ top }})) } expect_identical(fn(x, y), join_by(between(x, lower, y))) }) test_that("can wrap `join_by()` and use embracing to inject expressions", { fn <- function(expr) { join_by({{ expr }}, a <= b) } expect_identical(fn(a == b), join_by(a == b, a <= b)) }) test_that("nicely catches required missing arguments when wrapped", { fn <- function(x, y) { join_by({{ x }} == {{ y }}) } expect_snapshot(error = TRUE, fn(a)) }) test_that("allows for namespaced helpers (#6838)", { # Captures namespaced expression for printing expect_snapshot(join_by(dplyr::between(x, left, right))) expect_snapshot(join_by(dplyr::within(xl, xu, yl, yu))) expect_snapshot(join_by(dplyr::overlaps(xl, xu, yl, yu))) expect_snapshot(join_by(dplyr::closest(x < y))) # Underlying values are otherwise the same as non-namespaced version by <- join_by(dplyr::between(x, left, right)) reference <- join_by(between(x, left, right)) expect_identical(by$condition, reference$condition) expect_identical(by$filter, reference$filter) expect_identical(by$x, reference$x) expect_identical(by$y, reference$y) }) test_that("has an informative print method", { expect_snapshot(join_by(a, b)) expect_snapshot(join_by("a", "b")) expect_snapshot(join_by(a == a, b >= c)) expect_snapshot(join_by(a == a, b >= "c")) expect_snapshot(join_by(a == a, closest(b >= c), closest(d < e))) }) test_that("has informative error messages", { # `=` rather than `==` expect_snapshot(error = TRUE, join_by(a = b)) # Empty expression expect_snapshot(error = TRUE, join_by(NULL)) # Improper helper specification expect_snapshot(error = TRUE, join_by(foo(x > y))) # Improper separator expect_snapshot(error = TRUE, join_by(x == y, x^y)) # Improper LHS expect_snapshot(error = TRUE, join_by(x + 1 == y)) # Improper RHS expect_snapshot(error = TRUE, join_by(x == y + 1)) # Garbage input expect_snapshot(error = TRUE, join_by(1)) # Call with non-symbol first element expect_snapshot(error = TRUE, join_by(1())) # Namespace prefixed helper with non-dplyr namespace # (typo or re-export, which currently isn't allowed) expect_snapshot(error = TRUE, join_by(dplyrr::between(x, left, right))) # Top level usage of `$` expect_snapshot(error = TRUE, join_by(x$a)) # `$` must only contain x/y on LHS expect_snapshot(error = TRUE, join_by(z$a == y$b)) expect_snapshot(error = TRUE, join_by(x$a == z$b)) # Extra cautious check for horrible usage of `$` expect_snapshot(error = TRUE, join_by(`$`(x + 1, y) == b)) # Referencing the same table expect_snapshot(error = TRUE, join_by(x$a == x$b)) expect_snapshot(error = TRUE, join_by(y$a == b)) expect_snapshot(error = TRUE, join_by(between(x$a, x$a, x$b))) expect_snapshot(error = TRUE, join_by(within(x$a, x$b, x$a, x$b))) expect_snapshot(error = TRUE, join_by(overlaps(a, b, x$a, x$b))) expect_snapshot(error = TRUE, join_by(closest(x$a >= x$b))) # Referencing different tables in lower/upper bound pairs expect_snapshot(error = TRUE, join_by(between(a, x$a, y$b))) expect_snapshot(error = TRUE, join_by(within(x$a, y$b, y$a, y$b))) expect_snapshot(error = TRUE, join_by(overlaps(x$a, x$b, y$a, x$b))) # Too few arguments expect_snapshot(error = TRUE, join_by(`>`(x))) expect_snapshot(error = TRUE, join_by(between(x))) expect_snapshot(error = TRUE, join_by(within(x))) expect_snapshot(error = TRUE, join_by(overlaps(x))) expect_snapshot(error = TRUE, join_by(closest())) expect_snapshot(error = TRUE, join_by(`$`(x) > y)) # Too many arguments expect_snapshot(error = TRUE, join_by(closest(a >= b, 1))) # `==` in `closest()` expect_snapshot(error = TRUE, join_by(closest(a == b))) # Non-expression in `closest()` expect_snapshot(error = TRUE, join_by(closest(x))) expect_snapshot(error = TRUE, join_by(closest(1))) # Invalid expression in `closest()` expect_snapshot(error = TRUE, join_by(closest(x + y))) # Invalid `bounds` in `between()` and `overlaps()` expect_snapshot(error = TRUE, join_by(between(x, lower, upper, bounds = 1))) expect_snapshot(error = TRUE, join_by(between(x, lower, upper, bounds = "a"))) expect_snapshot( error = TRUE, join_by(overlaps(x, y, lower, upper, bounds = 1)) ) expect_snapshot( error = TRUE, join_by(overlaps(x, y, lower, upper, bounds = "a")) ) # Non-empty dots in `between()` and `overlaps()` expect_snapshot(error = TRUE, join_by(between(x, lower, upper, foo = 1))) expect_snapshot(error = TRUE, join_by(overlaps(x, y, lower, upper, foo = 1))) }) # ------------------------------------------------------------------------------ # `as_join_by()` test_that("as_join_by() emits useful errors", { expect_snapshot(error = TRUE, as_join_by(FALSE)) }) # ------------------------------------------------------------------------------ # `join_by_common()` test_that("automatically finds common variables", { x_names <- c("x", "y") y_names <- c("x", "z") expect_message(by <- join_by_common(x_names, y_names)) expect_identical(by$x, "x") expect_identical(by$y, "x") }) test_that("join_by_common() emits useful information", { # Common by message expect_snapshot(by <- join_by_common(c("x", "y"), c("x", "y"))) # Works with names that need backticks expect_snapshot(by <- join_by_common(c("_x", "foo bar"), c("_x", "foo bar"))) # No common variables error expect_snapshot(error = TRUE, join_by_common(c("x", "y"), c("w", "z"))) }) dplyr/tests/testthat/test-rank.R0000644000176200001440000000655115106134104016443 0ustar liggesusers# ranking functions ------------------------------------------------------- test_that("ranking empty vector returns empty vector (#762)", { x <- numeric() expect_equal(row_number(x), numeric()) expect_equal(min_rank(x), numeric()) expect_equal(dense_rank(x), numeric()) expect_equal(percent_rank(x), numeric()) expect_equal(cume_dist(x), numeric()) expect_equal(ntile(x, 1), numeric()) }) test_that("rank functions deal pass NA (and NaN) through (#774, #1132)", { x <- c(1, 2, NA, 1, 0, NaN) expect_equal(percent_rank(x), c(1 / 3, 1, NA, 1 / 3, 0, NA)) expect_equal(min_rank(x), c(2L, 4L, NA, 2L, 1L, NA)) expect_equal(dense_rank(x), c(2L, 3L, NA, 2L, 1L, NA)) expect_equal(cume_dist(x), c(.75, 1, NA, .75, .25, NA)) expect_equal(row_number(x), c(2L, 4L, NA, 3L, 1L, NA)) }) test_that("ranking functions can handle data frames", { # Explicitly testing partially/fully incomplete rows df <- tibble( year = c(2020, 2020, 2021, 2020, 2020, NA), month = c(3, 2, 1, 2, NA, NA) ) expect_identical(row_number(df), c(3L, 1L, 4L, 2L, NA, NA)) expect_identical(min_rank(df), c(3L, 1L, 4L, 1L, NA, NA)) expect_identical(dense_rank(df), c(2L, 1L, 3L, 1L, NA, NA)) expect_identical(percent_rank(df), c(2 / 3, 0 / 3, 3 / 3, 0 / 3, NA, NA)) expect_identical(cume_dist(df), c(3 / 4, 2 / 4, 4 / 4, 2 / 4, NA, NA)) expect_identical(ntile(df, 2), c(2L, 1L, 2L, 1L, NA, NA)) expect_identical(ntile(df, 4), c(3L, 1L, 4L, 2L, NA, NA)) }) # row_number() -------------------------------------------------------------- test_that("zero-arg row_number() works in mutate", { n <- c(1, 5, 2, 9) df <- tibble(id = rep(letters[1:4], n)) expect_equal(mutate(df, rn = row_number())$rn, 1:sum(n)) gf <- group_by(df, id) expect_equal(mutate(gf, rn = row_number())$rn, sequence(n)) }) # ntile() ------------------------------------------------------------------- test_that("ntile puts biggest groups first (#4995) ", { expect_equal(ntile(1, 5), 1) expect_equal(ntile(1:2, 5), 1:2) expect_equal(ntile(1:3, 5), 1:3) expect_equal(ntile(1:4, 5), 1:4) expect_equal(ntile(1:5, 5), 1:5) expect_equal(ntile(1:6, 5), c(1, 1:5)) expect_equal(ntile(1, 7), 1) expect_equal(ntile(1:2, 7), 1:2) expect_equal(ntile(1:3, 7), 1:3) expect_equal(ntile(1:4, 7), 1:4) expect_equal(ntile(1:5, 7), 1:5) expect_equal(ntile(1:6, 7), 1:6) expect_equal(ntile(1:7, 7), 1:7) expect_equal(ntile(1:8, 7), c(1, 1:7)) }) test_that("ntile ignores NAs", { x <- c(1:3, NA, NA, NA) expect_equal(ntile(x, 3), x) x1 <- c(1L, 1L, 1L, NA, NA, NA) expect_equal(ntile(x, 1), x1) }) test_that("ntile always returns an integer", { expect_equal(ntile(numeric(), 3), integer()) expect_equal(ntile(NA, 3), NA_integer_) }) test_that("ntile() does not overflow (#4186)", { out <- ntile(1:1e5, n = 1e5) expect_equal(out, 1:1e5) }) test_that("ntile() works with one argument (#3418)", { df <- tibble(id = c(1, 1, 2, 2, 2), x = 1:5) expect_equal(mutate(df, out = ntile(n = 3))$out, c(1, 1, 2, 2, 3)) gf <- group_by(df, id) expect_equal(mutate(gf, out = ntile(n = 2))$out, c(1, 2, 1, 1, 2)) }) test_that("ntile() validates `n`", { expect_snapshot(error = TRUE, { ntile(1, n = 1.5) }) expect_snapshot(error = TRUE, { ntile(1, n = c(1, 2)) }) expect_snapshot(error = TRUE, { ntile(1, n = NA_real_) }) expect_snapshot(error = TRUE, { ntile(1, n = 0) }) }) dplyr/tests/testthat/test-select.R0000644000176200001440000001370015106134104016761 0ustar liggesuserstest_that("select preserves grouping", { gf <- group_by(tibble(g = 1:3, x = 3:1), g) i <- count_regroups(out <- select(gf, h = g)) expect_equal(i, 0) expect_equal(group_vars(out), "h") }) test_that("grouping variables preserved with a message, unless already selected (#1511, #5841)", { df <- tibble(g = 1:3, x = 3:1) |> group_by(g) expect_snapshot({ res <- select(df, x) }) expect_named(res, c("g", "x")) df <- tibble(a = 1, b = 2, c = 3) |> group_by(a) expect_equal(df |> select(a = b), tibble(a = 2)) df <- tibble(a = 1, b = 2, c = 3) |> group_by(a, b) expect_snapshot({ expect_equal(df |> select(a = c), tibble(b = 2, a = 3) |> group_by(b)) expect_equal(df |> select(b = c), tibble(a = 1, b = 3) |> group_by(a)) }) }) test_that("non-syntactic grouping variable is preserved (#1138)", { expect_snapshot( df <- tibble(`a b` = 1L) |> group_by(`a b`) |> select() ) expect_named(df, "a b") }) test_that("select doesn't fail if some names missing", { df1 <- data.frame(x = 1:10, y = 1:10, z = 1:10) df2 <- setNames(df1, c("x", "y", "")) # df3 <- setNames(df1, c("x", "", "")) expect_equal(select(df1, x), data.frame(x = 1:10)) expect_equal(select(df2, x), data.frame(x = 1:10)) # expect_equal(select(df3, x), data.frame(x = 1:10)) }) # Special cases ------------------------------------------------- test_that("select with no args returns nothing", { empty <- select(mtcars) expect_equal(df_n_col(empty), 0) expect_equal(nrow(empty), 32) empty <- select(mtcars, !!!list()) expect_equal(df_n_col(empty), 0) expect_equal(nrow(empty), 32) }) test_that("select excluding all vars returns nothing", { expect_equal(dim(select(mtcars, -(mpg:carb))), c(32, 0)) expect_equal(dim(select(mtcars, starts_with("x"))), c(32, 0)) expect_equal(dim(select(mtcars, -matches("."))), c(32, 0)) }) test_that("negating empty match returns everything", { df <- data.frame(x = 1:3, y = 3:1) expect_equal(select(df, -starts_with("xyz")), df) }) test_that("can select with duplicate columns", { df <- tibble(x = 1, x = 2, y = 1, .name_repair = "minimal") # can extract duplicate cols by position expect_named(df |> select(1, 3), c("x", "y")) # can select out non-duplicated columns expect_named(df |> select(y), "y") }) # Select variables ----------------------------------------------- test_that("select can be before group_by (#309)", { df <- data.frame( id = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 5), year = c(2013, 2013, 2012, 2013, 2013, 2013, 2012, 2012, 2013, 2013), var1 = rnorm(10) ) dfagg <- df |> group_by(id, year) |> select(id, year, var1) |> summarise(var1 = mean(var1)) expect_equal(names(dfagg), c("id", "year", "var1")) }) test_that("select succeeds in presence of raw columns (#1803)", { df <- tibble(a = 1:3, b = as.raw(1:3)) expect_identical(select(df, a), df["a"]) expect_identical(select(df, b), df["b"]) expect_identical(select(df, -b), df["a"]) }) test_that("arguments to select() don't match vars_select() arguments", { df <- tibble(a = 1) expect_identical(select(df, var = a), tibble(var = 1)) expect_identical( select(group_by(df, a), var = a), group_by(tibble(var = 1), var) ) expect_identical(select(df, exclude = a), tibble(exclude = 1)) expect_identical(select(df, include = a), tibble(include = 1)) expect_identical( select(group_by(df, a), exclude = a), group_by(tibble(exclude = 1), exclude) ) expect_identical( select(group_by(df, a), include = a), group_by(tibble(include = 1), include) ) }) test_that("can select() with deprecated `.data` pronoun (#2715)", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical(select(mtcars, .data$cyl), select(mtcars, cyl)) }) test_that("can select() with character vectors", { expect_identical( select(mtcars, "cyl", !!"disp", c("cyl", "am", "drat")), mtcars[c("cyl", "disp", "am", "drat")] ) }) test_that("select() treats NULL inputs as empty", { expect_identical(select(mtcars, cyl), select(mtcars, NULL, cyl, NULL)) }) test_that("can select() with strings and character vectors", { vars <- c(foo = "cyl", bar = "am") expect_identical(select(mtcars, !!!vars), select(mtcars, foo = cyl, bar = am)) expect_identical(select(mtcars, !!vars), select(mtcars, foo = cyl, bar = am)) }) test_that("select works on empty names (#3601)", { df <- data.frame(x = 1, y = 2, z = 3) colnames(df) <- c("x", "y", "") expect_identical(select(df, x)$x, 1) colnames(df) <- c("", "y", "z") expect_identical(select(df, y)$y, 2) }) test_that("select works on NA names (#3601)", { df <- data.frame(x = 1, y = 2, z = 3) colnames(df) <- c("x", "y", NA) expect_identical(select(df, x)$x, 1) colnames(df) <- c(NA, "y", "z") expect_identical(select(df, y)$y, 2) }) test_that("select() keeps attributes of raw data frames (#5831)", { df <- data.frame(x = 1) attr(df, "a") <- "b" expect_equal(attr(select(df, x), "a"), "b") }) test_that("select() provides informative errors", { expect_snapshot({ (expect_error(select(mtcars, 1 + ""))) }) }) # dplyr_col_select() ------------------------------------------------------ test_that("dplyr_col_select() aborts when `[` implementation is broken", { local_methods( "[.dplyr_test_broken_operator" = function(x, ...) { unclass(x) }, "[.dplyr_test_operator_wrong_size" = function(x, ...) { data.frame() } ) df1 <- new_tibble( list(x = 1), nrow = 1L, class = "dplyr_test_broken_operator" ) expect_snapshot({ (expect_error( select(df1, 1:2) )) (expect_error( select(df1, 0) )) }) df2 <- new_tibble( list(x = 1), nrow = 1L, class = "dplyr_test_operator_wrong_size" ) expect_error(select(df2, 1:2)) expect_snapshot({ # from vctrs (expect_error( select(df1, 2) )) # not returning a data frame (expect_error( select(df1, 1) )) # unexpected number of columns (expect_error( select(df2, 1) )) }) }) dplyr/tests/testthat/test-colwise-arrange.R0000644000176200001440000000415115106134104020564 0ustar liggesusersdf <- mtcars[1:3] test_that("scoped arrange is identical to manual arrange", { expect_identical(arrange_all(df), arrange(df, mpg, cyl, disp)) expect_identical(arrange_at(df, vars(mpg)), arrange(df, mpg)) expect_identical(arrange_if(iris, is.factor), arrange(iris, Species)) }) test_that(".funs is applied to variables before sorting", { expect_identical(arrange_all(df, `-`), arrange(df, -mpg, -cyl, -disp)) }) test_that("arrange_at can arrange by grouping variables (#3351, #3332, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) expect_identical( arrange_at(tbl, vars(gr1)), arrange(tbl, gr1) ) expect_identical( arrange_at(tbl, vars(-x)), arrange(tbl, gr1, gr2) ) }) test_that("arrange_all arranges by grouping variable (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) expect_identical( arrange_all(tbl), arrange(tbl, gr1, gr2, x) ) expect_identical( arrange_all(tbl, desc), arrange(tbl, desc(gr1), desc(gr2), desc(x)) ) }) test_that("arrange_if arranges by grouping variable (#3351, #3480)", { tbl <- tibble(gr1 = rep(1:2, 4), gr2 = rep(1:2, each = 4), x = 1:8) |> group_by(gr1) expect_identical( arrange_if(tbl, is.integer), arrange(tbl, gr1, gr2, x) ) }) test_that("scoped arrange respect .by_group (#3245)", { d <- group_by(df, cyl) expect_identical( arrange_all(d, .by_group = TRUE), arrange(d, cyl, mpg, disp) ) expect_identical( arrange_if(d, is.numeric, .by_group = TRUE), arrange(d, cyl, mpg, disp) ) expect_identical( arrange_at(d, vars(mpg, disp), .by_group = TRUE), arrange(d, cyl, mpg, disp) ) }) test_that("scoped `arrange()` respects `.locale`", { df <- tibble(x = c("A", "a", "b", "B")) expect_identical( arrange_all(df, .locale = "C"), arrange(df, x, .locale = "C") ) expect_identical( arrange_if(df, is.character, .locale = "C"), arrange(df, x, .locale = "C") ) expect_identical( arrange_at(df, vars(x), .locale = "C"), arrange(df, x, .locale = "C") ) }) dplyr/tests/testthat/test-group-split.R0000644000176200001440000001071315106134104017770 0ustar liggesuserstest_that("group_split() keeps the grouping variables by default", { tbl <- tibble(x = 1:4, g = factor(rep(c("a", "b"), each = 2))) res <- group_split(tbl, g) expect_equal(res, list_of(tbl[1:2, ], tbl[3:4, ])) expect_identical(res, list_of(tbl[1:2, ], tbl[3:4, ])) expect_s3_class(res, "vctrs_list_of") expect_identical( attr(res, "ptype"), tibble(x = integer(), g = factor(levels = c("a", "b"))) ) }) test_that("group_split() can discard the grouping variables with .keep = FALSE", { tbl <- tibble(x = 1:4, g = factor(rep(c("a", "b"), each = 2))) res <- group_split(tbl, g, .keep = FALSE) expect_identical( res, list_of(tbl[1:2, 1, drop = FALSE], tbl[3:4, 1, drop = FALSE]) ) expect_s3_class(res, "vctrs_list_of") expect_identical(attr(res, "ptype"), tibble(x = integer())) }) test_that("group_split() respects empty groups", { tbl <- tibble( x = 1:4, g = factor(rep(c("a", "b"), each = 2), levels = c("a", "b", "c")) ) res <- group_split(tbl, g) expect_identical(res, list_of(tbl[1:2, ], tbl[3:4, ])) expect_s3_class(res, "vctrs_list_of") expect_identical( attr(res, "ptype"), tibble(x = integer(), g = factor(levels = c("a", "b", "c"))) ) res <- group_split(tbl, g, .drop = FALSE) expect_identical(res, list_of(tbl[1:2, ], tbl[3:4, ], tbl[integer(), ])) }) test_that("group_split.grouped_df() warns about `...`", { expect_snapshot({ out <- group_split(group_by(mtcars, cyl), cyl) }) }) test_that("group_split.rowwise_df() warns about `...`", { expect_snapshot({ out <- group_split(rowwise(mtcars), cyl) }) }) test_that("group_split.grouped_df() works", { iris <- as_tibble(iris) expect_identical( iris |> group_by(Species) |> group_split(), iris |> group_split(Species) ) }) test_that("group_split / bind_rows round trip", { setosa <- iris |> filter(Species == "setosa") |> as_tibble() chunks <- setosa |> group_split(Species) expect_identical(length(chunks), 1L) expect_identical(bind_rows(chunks), setosa) chunks <- setosa |> group_split(Species, .drop = FALSE) expect_identical(length(chunks), 3L) expect_identical(bind_rows(chunks), setosa) }) test_that("group_split() works if no grouping column", { expect_identical(group_split(iris), list_of(as_tibble(iris))) }) test_that("group_split(.keep=FALSE) does not try to remove virtual grouping columns (#4045)", { iris3 <- as_tibble(iris[1:3, ]) rows <- list(c(1L, 3L, 2L), c(3L, 2L, 3L)) df <- new_grouped_df( iris3, groups = tibble(.bootstrap = 1:2, .rows := rows) ) res <- group_split(df, .keep = FALSE) expect_identical( res, list_of(iris3[rows[[1L]], ], iris3[rows[[2L]], ]) ) }) test_that("group_split() respects .drop", { chunks <- tibble(f = factor("b", levels = c("a", "b", "c"))) |> group_split(f, .drop = TRUE) expect_identical(length(chunks), 1L) }) test_that("group_split() on a bare data frame returns bare tibbles", { df <- data.frame(x = 1:2) tib <- as_tibble(df) expect <- list_of(vec_slice(tib, 1), vec_slice(tib, 2)) expect_identical(group_split(df, x), expect) }) test_that("group_split() on a grouped df returns a list of tibbles", { df <- tibble(x = 1:2) gdf <- group_by(df, x) expect <- list_of(vec_slice(df, 1), vec_slice(df, 2)) expect_identical(group_split(gdf), expect) }) test_that("group_split() on a rowwise df returns a list of tibbles", { df <- tibble(x = 1:2) rdf <- rowwise(df) expect <- list_of(vec_slice(df, 1), vec_slice(df, 2)) expect_identical(group_split(rdf), expect) }) test_that("group_split() works with subclasses implementing group_by() / ungroup()", { local_foo_df() df <- list(x = c(1, 2, 2)) df <- new_tibble(df, nrow = 3L, class = "foo_df") expect <- list_of(vec_slice(df, 1), vec_slice(df, 2:3)) expect_identical(group_split(df, x), expect) }) test_that("group_split() internally uses dplyr_row_slice()", { local_foo_df() df <- list(x = c(1, 2, 2)) df <- new_tibble(df, nrow = 3L, class = "foo_df") local_methods( dplyr_row_slice.foo_df = function(x, i, ...) { abort(class = "dplyr_row_slice_called") } ) expect_error(group_split(df, x), class = "dplyr_row_slice_called") }) test_that("`keep =` is defunct", { df <- tibble(x = 1) gdf <- group_by(df, x) rdf <- rowwise(df) expect_snapshot(error = TRUE, { group_split(df, keep = TRUE) }) expect_snapshot(error = TRUE, { group_split(gdf, keep = TRUE) }) expect_snapshot(error = TRUE, { group_split(rdf, keep = TRUE) }) }) dplyr/tests/testthat/test-deprec-do.R0000644000176200001440000001367715106134104017361 0ustar liggesusers# Grouped data frames ---------------------------------------------------------- df <- data.frame( g = c(1, 2, 2, 3, 3, 3), x = 1:6, y = 6:1 ) |> group_by(g) test_that("unnamed results bound together by row", { first <- df |> do(head(., 1)) expect_equal(nrow(first), 3) expect_equal(first$g, 1:3) expect_equal(first$x, c(1, 2, 4)) }) test_that("named argument become list columns", { out <- df |> do(nrow = nrow(.), ncol = df_n_col(.)) expect_equal(out$nrow, list(1, 2, 3)) # includes grouping columns expect_equal(out$ncol, list(3, 3, 3)) }) test_that("multiple outputs can access data (#2998)", { out <- do(tibble(a = 1), g = nrow(.), h = nrow(.)) expect_equal(names(out), c("g", "h")) expect_equal(out$g, list(1L)) expect_equal(out$h, list(1L)) }) test_that("columns in output override columns in input", { out <- df |> do(data.frame(g = 1)) expect_equal(names(out), "g") expect_equal(out$g, c(1, 1, 1)) }) test_that("empty results preserved (#597)", { blankdf <- function(x) data.frame(blank = numeric(0)) dat <- data.frame(a = 1:2, b = factor(1:2)) expect_equal( dat |> group_by(b, .drop = FALSE) |> do(blankdf(.)) |> ungroup(), tibble(b = factor(integer(), levels = 1:2), blank = numeric()) ) }) test_that("empty inputs give empty outputs (#597)", { out <- data.frame(a = numeric(), b = factor()) |> group_by(b, .drop = FALSE) |> do(data.frame()) expect_equal(out, data.frame(b = factor()) |> group_by(b, .drop = FALSE)) out <- data.frame(a = numeric(), b = character()) |> group_by(b, .drop = FALSE) |> do(data.frame()) expect_equal(out, data.frame(b = character()) |> group_by(b, .drop = FALSE)) }) test_that("grouped do evaluates args in correct environment", { a <- 10 f <- function(a) { mtcars |> group_by(cyl) |> do(a = a) } expect_equal(f(100)$a, list(100, 100, 100)) }) # Ungrouped data frames -------------------------------------------------------- test_that("ungrouped data frame with unnamed argument returns data frame", { out <- mtcars |> do(head(.)) expect_s3_class(out, "data.frame") expect_equal(dim(out), c(6, 11)) }) test_that("ungrouped data frame with named argument returns list data frame", { out <- mtcars |> do(x = 1, y = 2:10) expect_s3_class(out, "tbl_df") expect_equal(out$x, list(1)) expect_equal(out$y, list(2:10)) }) test_that("ungrouped do evaluates args in correct environment", { a <- 10 f <- function(a) { mtcars |> do(a = a) } expect_equal(f(100)$a, list(100)) }) # Rowwise data frames ---------------------------------------------------------- test_that("can do on rowwise dataframe", { out <- mtcars |> rowwise() |> do(x = 1) exp <- tibble(x = rep(list(1), nrow(mtcars))) |> rowwise() expect_identical(out, exp) }) # Zero row inputs -------------------------------------------------------------- test_that("empty data frames give consistent outputs", { dat <- tibble(x = numeric(0), g = character(0)) grp <- dat |> group_by(g) emt <- grp |> filter(FALSE) dat |> do(data.frame()) |> vapply(pillar::type_sum, character(1)) |> length() |> expect_equal(0) dat |> do(data.frame(y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(y = "int")) dat |> do(data.frame(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr")) dat |> do(data.frame(., y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr", y = "int")) dat |> do(y = df_n_col(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(y = "list")) # Grouped data frame should have same col types as ungrouped, with addition # of grouping variable grp |> do(data.frame()) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr")) grp |> do(data.frame(y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr", y = "int")) grp |> do(data.frame(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr")) grp |> do(data.frame(., y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr", y = "int")) grp |> do(y = df_n_col(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr", y = "list")) # A empty grouped dataset should have same types as grp emt |> do(data.frame()) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr")) emt |> do(data.frame(y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr", y = "int")) emt |> do(data.frame(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr")) emt |> do(data.frame(., y = integer(0))) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(x = "dbl", g = "chr", y = "int")) emt |> do(y = df_n_col(.)) |> vapply(pillar::type_sum, character(1)) |> expect_equal(c(g = "chr", y = "list")) }) test_that("handling of empty data frames in do", { blankdf <- function(x) data.frame(blank = numeric(0)) dat <- data.frame(a = 1:2, b = factor(1:2)) res <- dat |> group_by(b, .drop = FALSE) |> do(blankdf(.)) expect_equal(names(res), c("b", "blank")) }) test_that("do() does not retain .drop attribute (#4176)", { res <- iris |> group_by(Species) |> do(data.frame(n = 1)) expect_null(attr(res, ".drop", exact = TRUE)) }) # Errors -------------------------------------------- test_that("do() gives meaningful error messages", { df <- data.frame( g = c(1, 2, 2, 3, 3, 3), x = 1:6, y = 6:1 ) |> group_by(g) expect_snapshot({ (expect_error(df |> do(head, tail))) # unnamed elements must return data frames (expect_error(df |> ungroup() |> do(1))) (expect_error(df |> do(1))) (expect_error(df |> do("a"))) # can't use both named and unnamed args (expect_error(df |> do(x = 1, 2))) }) }) dplyr/tests/testthat/test-grouped-df.R0000644000176200001440000001442015106134104017536 0ustar liggesuserstest_that("new_grouped_df can create alternative grouping structures (#3837)", { tbl <- new_grouped_df( tibble(x = rnorm(10)), groups = tibble( ".rows" := replicate(5, sample(1:10, replace = TRUE), simplify = FALSE) ) ) res <- summarise(tbl, x = mean(x)) expect_equal(nrow(res), 5L) }) test_that("new_grouped_df does not have rownames (#4173)", { tbl <- new_grouped_df( tibble(x = rnorm(10)), groups = tibble( ".rows" := replicate(5, sample(1:10, replace = TRUE), simplify = FALSE) ) ) expect_false(tibble::has_rownames(tbl)) }) test_that("[ method can remove grouping vars", { df <- tibble(x = 1, y = 2, z = 3) gf <- group_by(df, x, y) expect_equal(gf, gf) expect_equal(gf[1], group_by(df[1], x)) expect_equal(gf[3], df[3]) }) test_that("[ method reuses group_data() if possible", { df <- tibble(x = 1, y = 2, z = 3) gf <- group_by(df, x, y) expect_true(rlang::is_reference(group_data(gf), group_data(gf[1:2]))) expect_true(rlang::is_reference(group_data(gf), group_data(gf[, 1:2]))) }) test_that("[ supports drop=TRUE (#3714)", { df <- tibble(x = 1, y = 2) gf <- group_by(df, x) expect_type(gf[, "y", drop = TRUE], "double") expect_s3_class(gf[, c("x", "y"), drop = TRUE], "tbl_df") }) test_that("$<-, [[<-, and [<- update grouping data if needed", { df <- tibble(x = 1, y = 2) gf <- group_by(df, x) # value has to be past the ellipsis in $<-() expect_equal(group_data(`$<-`(gf, "x", value = 2))$x, 2) expect_equal(group_data(`$<-`(gf, "y", value = 2))$x, 1) expect_equal( group_data({ gf2 <- gf gf2[[1]] <- 3 gf2 })$x, 3 ) expect_equal(group_data(`[<-`(gf, 1, "x", value = 4))$x, 4) }) test_that("can remove grouping cols with subset assignment", { df <- tibble(x = 1, y = 2) gf1 <- gf2 <- gf3 <- group_by(df, x, y) gf1$x <- NULL gf2[["x"]] <- NULL gf3[, "x"] <- NULL expect_named(group_data(gf1), c("y", ".rows")) expect_named(group_data(gf2), c("y", ".rows")) expect_named(group_data(gf3), c("y", ".rows")) }) test_that("names<- updates grouping data", { df <- tibble(x = 1, y = 2, z = 3) gf <- group_by(df, x, y) names(gf) <- c("z1", "z2", "z3") expect_named(group_data(gf), c("z1", "z2", ".rows")) names(gf)[1] <- c("Z1") expect_named(group_data(gf), c("Z1", "z2", ".rows")) }) test_that("names<- doesn't modify group data if not necessary", { df <- tibble(x = 1, y = 2) gf1 <- gf2 <- group_by(df, x) expect_true(rlang::is_reference(group_data(gf1), group_data(gf2))) names(gf1) <- c("x", "Y") expect_true(rlang::is_reference(group_data(gf1), group_data(gf2))) }) test_that("group order is maintained in grouped-df methods (#5040)", { gdf <- group_by(mtcars, cyl, am, vs) x <- gdf[0, ] expect_identical(group_vars(x), group_vars(gdf)) x <- gdf x$am <- 1 expect_identical(group_vars(x), group_vars(gdf)) x <- gdf x["am"] <- 1 expect_identical(group_vars(x), group_vars(gdf)) x <- gdf x[["am"]] <- 1 expect_identical(group_vars(x), group_vars(gdf)) x <- gdf names <- names(x) names[9] <- "am2" names(x) <- names expect_identical(group_vars(x), group_vars(group_by(x, cyl, am2, vs))) }) # validate ---------------------------------------------------------------- test_that("validate_grouped_df() gives useful errors", { df1 <- group_by(tibble(x = 1:4, g = rep(1:2, each = 2)), g) groups <- attr(df1, "groups") groups[[2]] <- 1:2 attr(df1, "groups") <- groups df2 <- group_by(tibble(x = 1:4, g = rep(1:2, each = 2)), g) groups <- attr(df2, "groups") names(groups) <- c("g", "not.rows") attr(df2, "groups") <- groups df3 <- df2 attr(df3, "groups") <- tibble() df4 <- df3 attr(df4, "groups") <- NA df5 <- tibble(x = 1:4, g = rep(1:2, each = 2)) attr(df5, "vars") <- "g" attr(df5, "class") <- c("grouped_df", "tbl_df", "tbl", "data.frame") df6 <- new_grouped_df( tibble(x = 1:10), groups = tibble(".rows" := list(1:5, -1L)) ) df7 <- df6 attr(df7, "groups")$.rows <- list(11L) df8 <- df6 attr(df8, "groups")$.rows <- list(0L) df10 <- df6 attr(df10, "groups") <- tibble() df11 <- df6 attr(df11, "groups") <- NULL expect_snapshot({ # Invalid `groups` attribute (expect_error(validate_grouped_df(df1))) (expect_error(group_data(df1))) (expect_error(validate_grouped_df(df2))) (expect_error(validate_grouped_df(df2))) (expect_error(validate_grouped_df(df3))) (expect_error(validate_grouped_df(df4))) # Older style grouped_df (expect_error(validate_grouped_df(df5))) # validate_grouped_df( (expect_error(validate_grouped_df(df6, check_bounds = TRUE))) (expect_error(validate_grouped_df(df7, check_bounds = TRUE))) (expect_error(validate_grouped_df(df8, check_bounds = TRUE))) (expect_error(validate_grouped_df(df10))) (expect_error(validate_grouped_df(df11))) # new_grouped_df() (expect_error( new_grouped_df( tibble(x = 1:10), tibble(other = list(1:2)) ) )) (expect_error(new_grouped_df(10))) }) }) # compute_group ---------------------------------------------------------- test_that("helper gives meaningful error messages", { expect_snapshot({ (expect_error(grouped_df(data.frame(x = 1), "y", FALSE))) (expect_error(grouped_df(data.frame(x = 1), 1))) }) }) test_that("NA and NaN are in separate groups at the end", { df <- tibble(x = c(NA, NaN, NA, 1)) result <- compute_groups(df, "x") expect_identical(result$x, c(1, NaN, NA)) }) test_that("groups are ordered in the C locale", { df <- tibble(x = c("a", "A", "Z", "b")) result <- compute_groups(df, "x") expect_identical(result$x, c("A", "Z", "a", "b")) }) test_that("using the deprecated global option `dplyr.legacy_locale` forces the system locale", { skip_if_not(has_collate_locale("en_US"), message = "Can't use 'en_US' locale") local_options(dplyr.legacy_locale = TRUE) withr::local_collate("en_US") df <- tibble(x = c("a", "A", "Z", "b")) # Should get deprecation warning about `dplyr.legacy_locale` expect_snapshot({ result <- compute_groups(df, "x") }) expect_identical(result$x, c("a", "A", "b", "Z")) # Confirming that the deprecation warning also shows up in `group_by()` itself expect_snapshot({ result <- group_by(df, x) }) expect_identical(group_data(result)$x, c("a", "A", "b", "Z")) }) dplyr/tests/testthat/test-select-helpers.R0000644000176200001440000000103315106134104020415 0ustar liggesuserstest_that("group_cols() selects grouping variables", { df <- tibble(x = 1:3, y = 1:3) gf <- group_by(df, x) expect_equal(df |> select(group_cols()), df[integer()]) expect_message( expect_equal(gf |> select(group_cols()), gf["x"]), NA ) }) test_that("group_cols() finds groups in scoped helpers", { gf <- group_by(tibble(x = 1, y = 2), x) out <- select_at(gf, vars(group_cols())) expect_named(out, "x") }) test_that("group_cols(vars =) is defunct", { expect_snapshot(error = TRUE, { group_cols("a") }) }) dplyr/tests/testthat/test-if-else.R0000644000176200001440000000526515106134104017035 0ustar liggesuserstest_that("scalar true and false are vectorised", { x <- c(TRUE, TRUE, FALSE, FALSE) expect_equal(if_else(x, 1, 2), c(1, 1, 2, 2)) }) test_that("vector true and false are ok", { x <- c(-1, 0, 1) expect_equal(if_else(x < 0, x, 0), c(-1, 0, 0)) expect_equal(if_else(x > 0, x, 0), c(0, 0, 1)) }) test_that("missing values are missing", { expect_equal(if_else(c(TRUE, NA, FALSE), -1, 1), c(-1, NA, 1)) }) test_that("works with lists", { x <- list(1, 2, 3) expect_equal( if_else(c(TRUE, TRUE, FALSE), x, list(NULL)), list(1, 2, NULL) ) }) test_that("works with data frames", { true <- tibble(x = 1, y = 2) false <- tibble(x = 3, y = 4) expect_identical( if_else(c(TRUE, FALSE, NA, TRUE), true, false), vec_c(true, false, NA, true) ) }) test_that("works with vctrs rcrd types", { true <- new_rcrd(list(x = 1, y = 2)) false <- new_rcrd(list(x = 3, y = 4)) expect_identical( if_else(c(TRUE, FALSE, NA, TRUE), true, false), vec_c(true, false, NA, true) ) }) test_that("takes the common type of `true` and `false` (#6243)", { expect_identical(if_else(TRUE, 1L, 1.5), 1) expect_snapshot(error = TRUE, { if_else(TRUE, 1, "x") }) }) test_that("includes `missing` in the common type computation if used", { expect_identical(if_else(TRUE, 1L, 2L, missing = 3), 1) expect_snapshot(error = TRUE, { if_else(TRUE, 1, 2, missing = "x") }) }) test_that("can recycle to size 0 `condition`", { expect_identical(if_else(logical(), 1, 2, missing = 3), double()) }) test_that("accepts logical conditions with attributes (#6678)", { x <- structure(TRUE, label = "foo") expect_identical(if_else(x, 1, 2), 1) }) test_that("`condition` must be logical (and isn't cast to logical!)", { expect_snapshot(error = TRUE, { if_else(1:10, 1, 2) }) }) test_that("`condition` can't be an array (#7723)", { expect_snapshot(error = TRUE, { if_else(array(TRUE), 1, 2) }) }) test_that("`true`, `false`, and `missing` must recycle to the size of `condition`", { x <- 1:3 bad <- 1:2 expect_snapshot(error = TRUE, { if_else(x < 2, bad, x) }) expect_snapshot(error = TRUE, { if_else(x < 2, x, bad) }) expect_snapshot(error = TRUE, { if_else(x < 2, x, x, missing = bad) }) }) test_that("must have empty dots", { expect_snapshot(error = TRUE, { if_else(TRUE, 1, 2, missing = 3, 4) }) }) test_that("`ptype` overrides the common type", { expect_identical(if_else(TRUE, 2, 1L, ptype = integer()), 2L) expect_snapshot(error = TRUE, { if_else(TRUE, 1L, 2.5, ptype = integer()) }) }) test_that("`size` is deprecated", { expect_snapshot({ x <- if_else(c(TRUE, FALSE), 1, 2, size = 2) }) expect_identical(x, c(1, 2)) }) dplyr/tests/testthat/test-coalesce.R0000644000176200001440000001063515106134104017264 0ustar liggesuserstest_that("non-missing scalar replaces all missing values", { x <- c(NA, 1) expect_equal(coalesce(x, 1), c(1, 1)) }) test_that("coerces to common type", { expect_identical(coalesce(NA, 1), 1) f <- factor("x", levels = c("x", "y")) expect_identical(coalesce(NA, f), f) }) test_that("inputs are recycled to their common size", { expect_identical(coalesce(1, c(2, 3)), c(1, 1)) expect_identical(coalesce(1, 2:3), c(1, 1)) }) test_that("finds non-missing values in multiple positions", { x1 <- c(1L, NA, NA) x2 <- c(NA, 2L, NA) x3 <- c(NA, NA, 3L) expect_equal(coalesce(x1, x2, x3), 1:3) }) test_that("coalesce() gives meaningful error messages", { expect_snapshot(error = TRUE, { coalesce(1:2, 1:3) }) expect_snapshot(error = TRUE, { coalesce(1:2, letters[1:2]) }) }) test_that("coalesce() supports one-dimensional arrays (#5557)", { x <- array(1:10) out <- coalesce(x, 0L) expect_identical(out, x) }) test_that("only updates entirely missing matrix rows", { x <- c(1, NA, NA, NA) x <- matrix(x, nrow = 2, byrow = TRUE) y <- c(2, 2, NA, 1) y <- matrix(y, nrow = 2, byrow = TRUE) expect <- c(1, NA, NA, 1) expect <- matrix(expect, nrow = 2, byrow = TRUE) expect_identical(coalesce(x, y), expect) }) test_that("only updates entirely missing data frame rows", { x <- tibble(x = c(1, NA), y = c(NA, NA)) y <- tibble(x = c(2, NA), y = c(TRUE, TRUE)) expect <- tibble(x = c(1, NA), y = c(NA, TRUE)) expect_identical(coalesce(x, y), expect) }) test_that("only updates entirely missing rcrd observations", { x <- new_rcrd(list(x = c(1, NA), y = c(NA, NA))) y <- new_rcrd(list(x = c(2, NA), y = c(TRUE, TRUE))) expect <- new_rcrd(list(x = c(1, NA), y = c(NA, TRUE))) expect_identical(coalesce(x, y), expect) }) test_that("`.ptype` overrides the common type (r-lib/funs#64)", { x <- c(1L, NA) expect_identical(coalesce(x, 99, .ptype = x), c(1L, 99L)) }) test_that("`.size` overrides the common size", { x <- 1L expect_snapshot(error = TRUE, { coalesce(x, 1:2, .size = vec_size(x)) }) }) test_that("can't be empty", { expect_snapshot(error = TRUE, { coalesce() }) }) test_that("must have at least one non-`NULL` vector", { expect_snapshot(error = TRUE, { coalesce(NULL, NULL) }) }) test_that("`NULL`s are discarded (r-lib/funs#80)", { expect_identical( coalesce(c(1, NA, NA), NULL, c(1, 2, NA), NULL, 3), c(1, 2, 3) ) }) test_that("works with multiple scalars", { expect_identical( coalesce(c(1, NA), 2, 3), c(1, 2) ) }) test_that("works with trailing `NA`", { # Not promoted to `default` expect_identical( coalesce(c(1, NA), NA), c(1, NA) ) }) test_that("resulting names come from all inputs", { expect_named( coalesce( c(x = 1, y = NA), c(a = 3, b = 4) ), c("x", "b") ) # No name if nothing stops the coalesce expect_named( coalesce( c(x = 1, y = NA), c(a = 3, b = NA) ), c("x", "") ) # Unused inputs still force named output. # "Common names" principle, like common type or size. expect_named( coalesce( c(1, 2), c(a = 3, b = 4) ), c("", "") ) expect_named( coalesce( c(1, 2), c(a = NA_real_) ), c("", "") ) # Size 1 default name is recycled if used expect_named( coalesce( c(a = 1, b = NA, c = 2, d = NA), c(e = 0) ), c("a", "e", "c", "e") ) expect_named( coalesce( c(a = 1, b = NA, c = 2, d = NA), c(e = NA) ), c("a", "", "c", "") ) # With multiple scalars that force namedness expect_named( coalesce(c(1, NA), 2, c(a = 3)), c("", "") ) expect_named( coalesce(c(1, NA), c(a = 2), 3), c("", "a") ) }) test_that("inputs must be vectors", { expect_snapshot(error = TRUE, { coalesce(1, environment()) }) }) test_that("names in error messages are indexed correctly", { expect_snapshot(error = TRUE, { coalesce(1, "x") }) expect_snapshot(error = TRUE, { coalesce(1, y = "x") }) expect_snapshot(error = TRUE, { coalesce(1:2, 1:3) }) expect_snapshot(error = TRUE, { coalesce(1:2, y = 1:3) }) # With `NULL`s, which get "dropped" expect_snapshot(error = TRUE, { coalesce(1, NULL, "x") }) expect_snapshot(error = TRUE, { coalesce(1, NULL, y = "x") }) expect_snapshot(error = TRUE, { coalesce(1:2, NULL, 1:3) }) expect_snapshot(error = TRUE, { coalesce(1:2, NULL, y = 1:3) }) }) dplyr/tests/testthat/helper-lazy.R0000644000176200001440000000112715106134104016761 0ustar liggesusersskip_if_no_lazy_character <- function() { new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE) lazy_character_is_materialized <- import_vctrs( "lazy_character_is_materialized", optional = TRUE ) if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) { skip("Lazy character helpers from vctrs are not available.") } invisible() } new_lazy_character <- function(fn) { f <- import_vctrs("new_lazy_character") f(fn) } lazy_character_is_materialized <- function(x) { f <- import_vctrs("lazy_character_is_materialized") f(x) } dplyr/tests/testthat/test-colwise-distinct.R0000644000176200001440000000165715106134104020776 0ustar liggesuserstest_that("scoped distinct is identical to manual distinct", { df <- tibble( x = rep(2:5, each = 2), y = rep(2:3, each = 4), z = "a" ) expect_identical(distinct_all(df), distinct(df, x, y, z)) expect_identical(distinct_at(df, vars(x)), distinct(df, x)) expect_identical(distinct_if(df, is.integer), distinct(df, x, y)) }) test_that(".funs is applied to variables before getting distinct rows", { df <- tibble( x = rep(2:5, each = 2), y = rep(2:3, each = 4) ) expect_identical(distinct_all(df, `-`), distinct(mutate_all(df, `-`), x, y)) }) test_that("scoped distinct applies to grouping variables (#3480)", { df <- tibble( g = rep(1:2, each = 4), x = rep(2:5, each = 2) / 2, y = rep(2:3, each = 4) / 2 ) out <- df[c(1, 3, 5, 8), ] expect_identical(distinct_all(df), out) expect_identical(distinct_at(df, vars(g, x, y)), out) expect_identical(distinct_if(df, is.numeric), out) }) dplyr/tests/testthat/test-group-by.R0000644000176200001440000004666415106134104017265 0ustar liggesusersdf <- data.frame(x = rep(1:3, each = 10), y = rep(1:6, each = 5)) test_that("group_by() with .add = TRUE adds groups", { add_groups1 <- function(tbl) group_by(tbl, x, y, .add = TRUE) add_groups2 <- function(tbl) { group_by(group_by(tbl, x, .add = TRUE), y, .add = TRUE) } expect_equal(group_vars(add_groups1(df)), c("x", "y")) expect_equal(group_vars(add_groups2(df)), c("x", "y")) }) test_that("group_by(, ) computes the expressions on the ungrouped data frame (#5938)", { df <- data.frame( x = 1:4, g = rep(1:2, each = 2) ) count <- 0 out <- df |> group_by(g) |> group_by(big = { count <<- count + 1 x > mean(x) }) expect_equal(out$big, c(FALSE, FALSE, TRUE, TRUE)) expect_equal(count, 1L) expect_equal(group_vars(out), c("big")) count <- 0 out <- df |> group_by(g) |> group_by( big = { count <<- count + 1 x > mean(x) }, .add = TRUE ) expect_equal(out$big, c(FALSE, FALSE, TRUE, TRUE)) expect_equal(count, 1L) expect_equal(group_vars(out), c("g", "big")) count <- 0 out <- df |> group_by(g) |> mutate(big = { count <<- count + 1 x > mean(x) }) |> group_by(big) expect_equal(out$big, c(FALSE, TRUE, FALSE, TRUE)) expect_equal(count, 2L) expect_equal(group_vars(out), c("big")) count <- 0 out <- df |> group_by(g) |> mutate(big = { count <<- count + 1 x > mean(x) }) |> group_by(big, .add = TRUE) expect_equal(out$big, c(FALSE, TRUE, FALSE, TRUE)) expect_equal(count, 2L) expect_equal(group_vars(out), c("g", "big")) }) test_that("joins preserve grouping", { df <- data.frame(x = rep(1:2, each = 4), y = rep(1:4, each = 2)) g <- group_by(df, x) expect_equal( group_vars(inner_join( g, g, by = c("x", "y"), relationship = "many-to-many" )), "x" ) expect_equal( group_vars(left_join( g, g, by = c("x", "y"), relationship = "many-to-many" )), "x" ) expect_equal(group_vars(semi_join(g, g, by = c("x", "y"))), "x") expect_equal(group_vars(anti_join(g, g, by = c("x", "y"))), "x") }) test_that("constructors drops groups", { df <- data.frame(x = 1:3) |> group_by(x) expect_equal(group_vars(as_tibble(df)), character()) }) test_that("grouping by constant adds column (#410)", { grouped <- group_by(mtcars, "cyl") |> summarise(foo = n()) expect_equal(names(grouped), c('"cyl"', "foo")) expect_equal(nrow(grouped), 1L) }) test_that("can partially `ungroup()` (#6606)", { df <- tibble(g1 = 1:2, g2 = 3:4, x = 5:6) gdf <- group_by(df, g1, g2) expect_identical(ungroup(gdf, g1), group_by(df, g2)) expect_identical(ungroup(gdf, g1, g2), df) }) test_that("can't rename while partially `ungroup()`-ing (#6606)", { df <- tibble(g = 1:2, x = 3:4) gdf <- group_by(df, g) expect_snapshot(error = TRUE, { ungroup(gdf, g2 = g) }) }) # Test full range of variable types -------------------------------------------- test_that("local group_by preserves variable types", { df_var <- tibble( l = c(T, F), i = 1:2, d = Sys.Date() + 1:2, f = factor(letters[1:2]), num = 1:2 + 0.5, t = Sys.time() + 1:2, c = letters[1:2] ) attr(df_var$t, "tzone") <- "" for (var in names(df_var)) { expected <- tibble(!!var := sort(unique(df_var[[var]])), n = 1L) summarised <- df_var |> group_by(!!sym(var)) |> summarise(n = n()) expect_equal(summarised, expected) } }) test_that("mutate does not lose variables (#144)", { df <- tibble(a = rep(1:4, 2), b = rep(1:4, each = 2), x = runif(8)) by_ab <- group_by(df, a, b) by_a <- summarise(by_ab, x = sum(x), .groups = "drop_last") by_a_quartile <- group_by(by_a, quartile = ntile(x, 4)) expect_equal(names(by_a_quartile), c("a", "b", "x", "quartile")) }) test_that("group_by uses shallow copy", { skip_if_not_installed("lobstr") m1 <- group_by(mtcars, cyl) expect_equal(group_vars(mtcars), character()) expect_equal( lobstr::obj_addrs(mtcars), lobstr::obj_addrs(m1) ) }) test_that("group_by orders by groups. #242", { df <- data.frame(a = sample(1:10, 3000, replace = TRUE)) |> group_by(a) expect_equal(group_data(df)$a, 1:10) df <- data.frame( a = sample(letters[1:10], 3000, replace = TRUE), stringsAsFactors = FALSE ) |> group_by(a) expect_equal(group_data(df)$a, letters[1:10]) df <- data.frame(a = sample(sqrt(1:10), 3000, replace = TRUE)) |> group_by(a) expect_equal(group_data(df)$a, sqrt(1:10)) }) test_that("Can group_by() a POSIXlt", { skip_if_not_installed("tibble", "2.99.99") df <- tibble( x = 1:5, times = as.POSIXlt(seq.Date(Sys.Date(), length.out = 5, by = "day")) ) g <- group_by(df, times) expect_equal(nrow(group_data(g)), 5L) }) test_that("group_by() handles list as grouping variables", { df <- tibble(x = 1:3, y = list(1:2, 1:3, 1:2)) gdata <- group_data(group_by(df, y)) expect_equal(nrow(gdata), 2L) expect_equal(gdata$y, list(1:2, 1:3)) expect_equal(gdata$.rows, list_of(c(1L, 3L), 2L)) }) test_that("select(group_by(.)) implicitly adds grouping variables (#170)", { expect_snapshot( res <- mtcars |> group_by(vs) |> select(mpg) ) expect_equal(names(res), c("vs", "mpg")) }) test_that("group_by only creates one group for NA (#401)", { x <- as.numeric(c(NA, NA, NA, 10:1, 10:1)) w <- c(20, 30, 40, 1:10, 1:10) * 10 n_distinct(x) # 11 OK res <- data.frame(x = x, w = w) |> group_by(x) |> summarise(n = n()) expect_equal(nrow(res), 11L) }) test_that("there can be 0 groups (#486)", { data <- tibble(a = numeric(0), g = character(0)) |> group_by(g) expect_equal(length(data$a), 0L) expect_equal(length(data$g), 0L) expect_equal(map_int(group_rows(data), length), integer(0)) }) test_that("group_by works with zero-row data frames (#486)", { df <- data.frame(a = numeric(0), b = numeric(0), g = character(0)) dfg <- group_by(df, g, .drop = FALSE) expect_equal(dim(dfg), c(0, 3)) expect_equal(group_vars(dfg), "g") expect_equal(group_size(dfg), integer(0)) x <- summarise(dfg, n = n()) expect_equal(dim(x), c(0, 2)) expect_equal(group_vars(x), character()) x <- mutate(dfg, c = b + 1) expect_equal(dim(x), c(0, 4)) expect_equal(group_vars(x), "g") expect_equal(group_size(x), integer(0)) x <- filter(dfg, a == 100) expect_equal(dim(x), c(0, 3)) expect_equal(group_vars(x), "g") expect_equal(group_size(x), integer(0)) x <- arrange(dfg, a, g) expect_equal(dim(x), c(0, 3)) expect_equal(group_vars(x), "g") expect_equal(group_size(x), integer(0)) expect_snapshot( x <- select(dfg, a) # Only select 'a' column; should result in 'g' and 'a' ) expect_equal(dim(x), c(0, 2)) expect_equal(group_vars(x), "g") expect_equal(group_size(x), integer(0)) }) test_that("[ on grouped_df preserves grouping if subset includes grouping vars", { df <- tibble(x = 1:5, ` ` = 6:10) by_x <- df |> group_by(x) expect_equal(by_x |> groups(), by_x |> (\(.) .[1:2])() |> groups()) # non-syntactic name by_ns <- df |> group_by(` `) expect_equal(by_ns |> groups(), by_ns |> (\(.) .[1:2])() |> groups()) }) test_that("[ on grouped_df drops grouping if subset doesn't include grouping vars", { by_cyl <- mtcars |> group_by(cyl) no_cyl <- by_cyl |> (\(.) .[c(1, 3)])() expect_equal(group_vars(no_cyl), character()) expect_s3_class(no_cyl, "tbl_df") }) test_that("group_by works after arrange (#959)", { df <- tibble(Log = c(1, 2, 1, 2, 1, 2), Time = c(10, 1, 3, 0, 15, 11)) res <- df |> arrange(Time) |> group_by(Log) |> mutate(Diff = Time - lag(Time)) expect_true(all(is.na(res$Diff[c(1, 3)]))) expect_equal(res$Diff[c(2, 4, 5, 6)], c(1, 7, 10, 5)) }) test_that("group_by keeps attributes", { d <- data.frame(x = structure(1:10, foo = "bar")) gd <- group_by(d) expect_equal(attr(gd$x, "foo"), "bar") }) test_that("ungroup.rowwise_df gives a tbl_df (#936)", { res <- mtcars |> rowwise() |> ungroup() |> class() expect_equal(res, c("tbl_df", "tbl", "data.frame")) }) test_that("group_by handles encodings for native strings (#1507)", { local_non_utf8_encoding() special <- get_native_lang_string() df <- data.frame(x = 1:3, Eng = 2:4) for (names_converter in c(enc2native, enc2utf8)) { for (dots_converter in c(enc2native, enc2utf8)) { names(df) <- names_converter(c(special, "Eng")) res <- group_by(df, !!!syms(dots_converter(special))) expect_equal(names(res), names(df)) expect_equal(group_vars(res), special) } } for (names_converter in c(enc2native, enc2utf8)) { names(df) <- names_converter(c(special, "Eng")) res <- group_by(df, !!!special) expect_equal(names(res), c(names(df), deparse(special))) expect_equal(groups(res), list(as.name(enc2native(deparse(special))))) } }) test_that("group_by handles raw columns (#1803)", { df <- tibble(a = 1:3, b = as.raw(1:3)) expect_identical(ungroup(group_by(df, a)), df) expect_identical(ungroup(group_by(df, b)), df) }) test_that("rowwise handles raw columns (#1803)", { df <- tibble(a = 1:3, b = as.raw(1:3)) expect_s3_class(rowwise(df), "rowwise_df") }) test_that("group_by() names pronouns correctly (#2686)", { expect_named(group_by(tibble(x = 1), .data$x), "x") expect_named(group_by(tibble(x = 1), .data[["x"]]), "x") }) test_that("group_by() does not affect input data (#3028)", { x <- data.frame(old1 = c(1, 2, 3), old2 = c(4, 5, 6)) |> group_by(old1) y <- x |> select(new1 = old1, new2 = old2) expect_identical(groups(x), syms(quote(old1))) }) test_that("group_by() does not mutate for nothing when using the .data pronoun (#2752, #3533)", { expect_identical( iris |> group_by(Species) |> group_by(.data$Species), iris |> group_by(Species) ) expect_identical( iris |> group_by(Species) |> group_by(.data[["Species"]]), iris |> group_by(Species) ) df <- tibble(x = 1:5) attr(df, "y") <- 1 expect_equal(df |> group_by(.data$x) |> attr("y"), 1) expect_equal(df |> group_by(.data[["x"]]) |> attr("y"), 1) }) test_that("tbl_sum gets the right number of groups", { res <- data.frame(x = c(1, 1, 2, 2)) |> group_by(x) |> pillar::tbl_sum() expect_equal(res, c("A tibble" = "4 x 1", "Groups" = "x [2]")) }) test_that("group_by ignores empty quosures (3780)", { empty <- quo() expect_equal(group_by(mtcars, cyl), group_by(mtcars, cyl, !!empty)) }) # Zero groups --------------------------------------------------- test_that("mutate handles grouped tibble with 0 groups (#3935)", { df <- tibble(x = integer()) |> group_by(x) res <- mutate(df, y = mean(x), z = +mean(x), n = n()) expect_equal(names(res), c("x", "y", "z", "n")) expect_equal(nrow(res), 0L) expect_equal(res$y, double()) expect_equal(res$z, double()) expect_equal(res$n, integer()) }) test_that("summarise handles grouped tibble with 0 groups (#3935)", { df <- tibble(x = integer()) |> group_by(x) res <- summarise(df, y = mean(x), z = +mean(x), n = n()) expect_equal(names(res), c("x", "y", "z", "n")) expect_equal(nrow(res), 0L) expect_equal(res$y, double()) expect_equal(res$n, integer()) expect_equal(res$z, double()) }) test_that("filter handles grouped tibble with 0 groups (#3935)", { df <- tibble(x = integer()) |> group_by(x) res <- filter(df, x > 3L) expect_identical(df, res) }) test_that("select handles grouped tibble with 0 groups (#3935)", { df <- tibble(x = integer()) |> group_by(x) res <- select(df, x) expect_identical(df, res) }) test_that("arrange handles grouped tibble with 0 groups (#3935)", { df <- tibble(x = integer()) |> group_by(x) res <- arrange(df, x) expect_identical(df, res) }) test_that("group_by() with empty spec produces a grouped data frame with 0 grouping variables", { gdata <- group_data(group_by(iris)) expect_equal(names(gdata), ".rows") expect_equal(gdata$.rows, list_of(1:nrow(iris))) gdata <- group_data(group_by(iris, !!!list())) expect_equal(names(gdata), ".rows") expect_equal(gdata$.rows, list_of(1:nrow(iris))) }) # .drop = TRUE --------------------------------------------------- test_that("group_by(.drop = TRUE) drops empty groups (4061)", { res <- iris |> filter(Species == "setosa") |> group_by(Species, .drop = TRUE) expect_identical( group_data(res), structure( tibble( Species = factor("setosa", levels = levels(iris$Species)), .rows := list_of(1:50) ), .drop = TRUE ) ) expect_true(group_by_drop_default(res)) }) test_that("grouped data frames remember their .drop (#4061)", { res <- iris |> filter(Species == "setosa") |> group_by(Species, .drop = TRUE) res2 <- res |> filter(Sepal.Length > 5) expect_true(group_by_drop_default(res2)) res3 <- res |> filter(Sepal.Length > 5, .preserve = FALSE) expect_true(group_by_drop_default(res3)) res4 <- res3 |> group_by(Species) expect_true(group_by_drop_default(res4)) expect_equal(nrow(group_data(res4)), 1L) }) test_that("grouped data frames remember their .drop = FALSE (#4337)", { res <- iris |> filter(Species == "setosa") |> group_by(Species, .drop = FALSE) expect_false(group_by_drop_default(res)) res2 <- res |> group_by(Species) expect_false(group_by_drop_default(res2)) }) test_that("group_by(.drop = FALSE) preserve ordered factors (#5455)", { df <- tibble(x = ordered("x")) drop <- df |> group_by(x) |> group_data() nodrop <- df |> group_by(x, .drop = FALSE) |> group_data() expect_equal(is.ordered(drop$x), is.ordered(nodrop$x)) expect_true(is.ordered(nodrop$x)) }) test_that("summarise maintains the .drop attribute (#4061)", { df <- tibble( f1 = factor("a", levels = c("a", "b", "c")), f2 = factor("d", levels = c("d", "e", "f", "g")), x = 42 ) res <- df |> group_by(f1, f2, .drop = TRUE) expect_equal(n_groups(res), 1L) res2 <- summarise(res, x = sum(x), .groups = "drop_last") expect_equal(n_groups(res2), 1L) expect_true(group_by_drop_default(res2)) }) test_that("joins maintain the .drop attribute (#4061)", { df1 <- group_by( tibble( f1 = factor(c("a", "b"), levels = c("a", "b", "c")), x = 42:43 ), f1, .drop = TRUE ) df2 <- group_by( tibble( f1 = factor(c("a"), levels = c("a", "b", "c")), y = 1 ), f1, .drop = TRUE ) res <- left_join(df1, df2, by = "f1") expect_equal(n_groups(res), 2L) df2 <- group_by( tibble( f1 = factor(c("a", "c"), levels = c("a", "b", "c")), y = 1:2 ), f1, .drop = TRUE ) res <- full_join(df1, df2, by = "f1") expect_equal(n_groups(res), 3L) }) test_that("group_by(add = TRUE) sets .drop if the origonal data was .drop", { d <- tibble( f1 = factor("b", levels = c("a", "b", "c")), f2 = factor("g", levels = c("e", "f", "g")), x = 48 ) res <- group_by(group_by(d, f1, .drop = TRUE), f2, .add = TRUE) expect_equal(n_groups(res), 1L) expect_true(group_by_drop_default(res)) }) test_that("group_by_drop_default() is forgiving about corrupt grouped df (#4306)", { df <- tibble(x = 1:2, y = 1:2) |> structure(class = c("grouped_df", "tbl_df", "tbl", "data.frame")) expect_true(group_by_drop_default(df)) }) test_that("group_by() puts NA groups last in STRSXP (#4227)", { res <- tibble(x = c("apple", NA, "banana"), y = 1:3) |> group_by(x) |> group_data() expect_identical(res$x, c("apple", "banana", NA_character_)) expect_identical(res$.rows, list_of(1L, 3L, 2L)) }) test_that("group_by() does not create arbitrary NA groups for factors when drop = TRUE (#4460)", { res <- expect_warning(group_data(group_by(iris, Species)[0, ]), NA) expect_equal(nrow(res), 0L) res <- expect_warning(group_data(group_by(iris[0, ], Species)), NA) expect_equal(nrow(res), 0L) }) test_that("group_by() can handle auto splicing in the mutate() step", { expect_identical( iris |> group_by(Species), iris |> group_by(data.frame(Species = Species)) ) expect_identical( iris |> group_by(Species), iris |> group_by(pick(Species)) ) expect_identical( iris |> mutate(across(starts_with("Sepal"), round)) |> group_by(Sepal.Length, Sepal.Width), iris |> group_by(across(starts_with("Sepal"), round)) ) }) test_that("group_by() can combine usual spec and auto-splicing-mutate() step", { expect_identical( iris |> mutate(across(starts_with("Sepal"), round)) |> group_by(Sepal.Length, Sepal.Width, Species), iris |> group_by(across(starts_with("Sepal"), round), Species) ) expect_identical( iris |> mutate(across(starts_with("Sepal"), round)) |> group_by(Species, Sepal.Length, Sepal.Width), iris |> group_by(Species, across(starts_with("Sepal"), round)) ) }) # mutate() semantics test_that("group_by() has mutate() semantics (#4984)", { expect_equal( tibble(a = 1, b = 2) |> group_by(c = a * b, d = c + 1), tibble(a = 1, b = 2) |> mutate(c = a * b, d = c + 1) |> group_by(c, d) ) }) test_that("implicit mutate() operates on ungrouped data (#5598)", { vars <- tibble(x = c(1, 2), y = c(3, 4), z = c(5, 6)) |> dplyr::group_by(y) |> dplyr::group_by(pick(any_of(c('y', 'z')))) |> dplyr::group_vars() expect_equal(vars, c("y", "z")) }) test_that("grouped_df() does not break row.names (#5745)", { groups <- compute_groups(data.frame(x = 1:10), "x") expect_equal(.row_names_info(groups, type = 0), c(NA, -10L)) }) test_that("group_by() keeps attributes unrelated to the grouping (#5760)", { d <- data.frame(x = 453, y = 642) attr(d, "foo") <- "bar" d2 <- group_by(d, x) expect_equal(attr(d2, "foo"), "bar") d3 <- group_by(d2, y, .add = TRUE) expect_equal(attr(d2, "foo"), "bar") d4 <- group_by(d2, y2 = y * 2, .add = TRUE) expect_equal(attr(d2, "foo"), "bar") }) test_that("group_by() works with quosures (tidyverse/lubridate#959)", { ignore <- function(...) NA f <- function(var) { tibble(x = 1) |> group_by(g = ignore({{ var }})) } g <- function(var) { # This used to fail with the extra argument tibble(x = 1) |> group_by(g = ignore({{ var }}, 1)) } expect_equal(f(), tibble(x = 1, g = NA) |> group_by(g)) expect_equal(g(), tibble(x = 1, g = NA) |> group_by(g)) }) # Errors ------------------------------------------------------------------ test_that("group_by() and ungroup() give meaningful error messages", { expect_snapshot({ df <- tibble(x = 1, y = 2) (expect_error(df |> group_by(unknown))) (expect_error(df |> ungroup(x))) (expect_error(df |> group_by(x, y) |> ungroup(z))) (expect_error(df |> group_by(z = a + 1))) }) }) # Deprecation ------------------------------------------------------------- test_that("group_by(add =) is defunct", { # While it was being deprecated, it was getting passed through the `...` # down to `group_by_prepare()`. df <- tibble(x = 1, y = 2) expect_snapshot(error = TRUE, { group_by(df, x, add = TRUE) }) }) test_that("group_by_prepare(add =) is defunct", { df <- tibble(x = 1, y = 2) # We let this say `group_by()` in the error because it is more likely that # that is where it came from expect_snapshot(error = TRUE, { group_by_prepare(df, x, add = TRUE) }) }) test_that("group_by(.dots =) is defunct", { # While it was being deprecated, it was getting passed through the `...` # down to `group_by_prepare()`. df <- tibble(x = 1, y = 1) expect_snapshot(error = TRUE, { group_by(df, .dots = "x") }) }) test_that("group_by_prepare(.dots =) is defunct", { df <- tibble(x = 1, y = 1) # We let this say `group_by()` in the error because it is more likely that # that is where it came from expect_snapshot(error = TRUE, { group_by_prepare(df, .dots = "x") }) }) dplyr/tests/testthat/test-across.R0000644000176200001440000014316615137161765017027 0ustar liggesusers# across ------------------------------------------------------------------ test_that("across() works on one column data.frame", { df <- data.frame(x = 1) out <- df |> mutate(across(everything(), identity)) expect_equal(out, df) }) test_that("across() does not select grouping variables", { df <- data.frame(g = 1, x = 1) out <- df |> group_by(g) |> summarise(x = across(everything(), identity)) |> pull() expect_equal(out, tibble(x = 1)) }) test_that("across() correctly names output columns", { gf <- tibble(x = 1, y = 2, z = 3, s = "") |> group_by(x) expect_named( summarise(gf, across(everything(), identity)), c("x", "y", "z", "s") ) expect_named( summarise(gf, across(everything(), identity, .names = "id_{.col}")), c("x", "id_y", "id_z", "id_s") ) expect_named( summarise(gf, across(where(is.numeric), mean)), c("x", "y", "z") ) expect_named( summarise(gf, across(where(is.numeric), mean, .names = "mean_{.col}")), c("x", "mean_y", "mean_z") ) expect_named( summarise(gf, across(where(is.numeric), list(mean = mean, sum = sum))), c("x", "y_mean", "y_sum", "z_mean", "z_sum") ) expect_named( summarise(gf, across(where(is.numeric), list(mean = mean, sum))), c("x", "y_mean", "y_2", "z_mean", "z_2") ) expect_named( summarise(gf, across(where(is.numeric), list(mean, sum = sum))), c("x", "y_1", "y_sum", "z_1", "z_sum") ) expect_named( summarise(gf, across(where(is.numeric), list(mean, sum))), c("x", "y_1", "y_2", "z_1", "z_2") ) expect_named( summarise( gf, across( where(is.numeric), list(mean = mean, sum = sum), .names = "{.fn}_{.col}" ) ), c("x", "mean_y", "sum_y", "mean_z", "sum_z") ) }) test_that("across(.unpack =) can unpack data frame columns", { fn1 <- function(x) { tibble(a = x, b = x + 1) } fn2 <- function(x) { tibble(c = -x, d = x - 1) } df <- tibble(x = 1:2, y = 3:4) out <- mutate(df, across(x:y, list(one = fn1, two = fn2), .unpack = TRUE)) expect <- tibble( x = 1:2, y = 3:4, x_one_a = x, x_one_b = x + 1, x_two_c = -x, x_two_d = x - 1, y_one_a = y, y_one_b = y + 1, y_two_c = -y, y_two_d = y - 1 ) expect_identical(out, expect) }) test_that("across(.unpack =) allows a glue specification for `.unpack`", { fn <- function(x) { tibble(a = x, b = x + 1) } df <- tibble(x = 1) out <- mutate(df, across(x, fn, .unpack = "{outer}.{inner}")) expect_named(out, c("x", "x.a", "x.b")) # Can use variables from caller env out <- local({ name <- "name" mutate(df, across(x, fn, .unpack = "{name}.{inner}")) }) expect_named(out, c("x", "name.a", "name.b")) }) test_that("across(.unpack =) skips unpacking non-df-cols", { fn <- function(x) { tibble(a = x, b = x + 1) } df <- tibble(x = 1) out <- mutate(df, across(x, list(fn = fn, double = ~ .x * 2), .unpack = TRUE)) expect <- tibble(x = 1, x_fn_a = 1, x_fn_b = 2, x_double = 2) expect_identical(out, expect) }) test_that("across(.unpack =) uses the result of `.names` as `{outer}`", { fn <- function(x) { tibble(a = x, b = x + 1) } df <- tibble(x = 1, y = 2) out <- df |> mutate(across( x:y, list(f = fn), .names = "{.col}.{.fn}", .unpack = "{inner}.{outer}" )) expect_named(out, c("x", "y", "a.x.f", "b.x.f", "a.y.f", "b.y.f")) }) test_that("across(.unpack =) errors if the unpacked data frame has non-unique names", { fn <- function(x) { tibble(a = x, b = x) } df <- tibble(x = 1, y = 2) expect_snapshot(error = TRUE, { mutate(df, across(x:y, fn, .unpack = "{outer}")) }) }) test_that("`.unpack` is validated", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { summarise(df, across(x, mean, .unpack = 1)) }) expect_snapshot(error = TRUE, { summarise(df, across(x, mean, .unpack = c("x", "y"))) }) expect_snapshot(error = TRUE, { summarise(df, across(x, mean, .unpack = NA)) }) }) test_that("across() result locations are aligned with column names (#4967)", { df <- tibble(x = 1:2, y = c("a", "b")) expect <- tibble( x_cls = "integer", x_type = TRUE, y_cls = "character", y_type = FALSE ) x <- summarise(df, across(everything(), list(cls = class, type = is.numeric))) expect_identical(x, expect) }) test_that("across() works sequentially (#4907)", { df <- tibble(a = 1) expect_equal( mutate( df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(where(is.numeric), identity)) ), tibble(a = 1, x = 1L, y = 2L) ) expect_equal( mutate(df, a = "x", y = df_n_col(across(where(is.numeric), identity))), tibble(a = "x", y = 0L) ) expect_equal( mutate(df, x = 1, y = df_n_col(across(where(is.numeric), identity))), tibble(a = 1, x = 1, y = 2L) ) }) test_that("across() retains original ordering", { df <- tibble(a = 1, b = 2) expect_named( mutate(df, a = 2, x = across(everything(), identity))$x, c("a", "b") ) }) test_that("across() throws meaningful error with failure during expansion (#6534)", { df <- tibble(g = 1, x = 1, y = 2, z = 3) gdf <- group_by(df, g) fn <- function() { stop("oh no!") } # Ends up failing inside the `fn()` call, which gets evaluated # during `across()` expansion but outside any group context expect_snapshot(error = TRUE, { summarise(df, across(everything(), fn())) }) expect_snapshot(error = TRUE, { summarise(df, across(everything(), fn()), .by = g) }) expect_snapshot(error = TRUE, { summarise(gdf, across(everything(), fn())) }) }) test_that("across() gives meaningful messages", { expect_snapshot({ # expanding (expect_error( tibble(x = 1) |> summarise(across(where(is.numeric), 42)) )) (expect_error( tibble(x = 1) |> summarise(across(y, mean)) )) # computing (expect_error( tibble(x = 1) |> summarise(res = across(where(is.numeric), 42)) )) (expect_error( tibble(x = 1) |> summarise(z = across(y, mean)) )) (expect_error( tibble(x = 1) |> summarise(res = sum(if_any(where(is.numeric), 42))) )) (expect_error( tibble(x = 1) |> summarise(res = sum(if_all(~ mean(.x)))) )) (expect_error( tibble(x = 1) |> summarise(res = sum(if_any(~ mean(.x)))) )) (expect_error(across())) (expect_error(c_across())) # problem while computing error_fn <- function(.) { if (all(. > 10)) { rlang::abort("too small", call = call("error_fn")) } else { 42 } } (expect_error( # expanding tibble(x = 1:10, y = 11:20) |> summarise(across(everything(), error_fn)) )) (expect_error( # expanding tibble(x = 1:10, y = 11:20) |> mutate(across(everything(), error_fn)) )) (expect_error( # evaluating tibble(x = 1:10, y = 11:20) |> summarise(force(across(everything(), error_fn))) )) (expect_error( # evaluating tibble(x = 1:10, y = 11:20) |> mutate(force(across(everything(), error_fn))) )) # name issue (expect_error( tibble(x = 1) |> summarise(across(everything(), list(f = mean, f = mean))) )) }) }) test_that("monitoring cache - across() can be used twice in the same expression", { df <- tibble(a = 1, b = 2) expect_equal( mutate( df, x = df_n_col(across(where(is.numeric), identity)) + df_n_col(across(a, identity)) ), tibble(a = 1, b = 2, x = 3) ) }) test_that("monitoring cache - across() can be used in separate expressions", { df <- tibble(a = 1, b = 2) expect_equal( mutate( df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(a, identity)) ), tibble(a = 1, b = 2, x = 2, y = 1) ) }) test_that("monitoring cache - across() usage can depend on the group id", { df <- tibble(g = 1:2, a = 1:2, b = 3:4) df <- group_by(df, g) switcher <- function() { if_else(cur_group_id() == 1L, across(a, identity)$a, across(b, identity)$b) } expect <- df expect$x <- c(1L, 4L) expect_equal( mutate(df, x = switcher()), expect ) }) test_that("monitoring cache - across() internal cache key depends on all inputs", { df <- tibble(g = rep(1:2, each = 2), a = 1:4) df <- group_by(df, g) expect_identical( mutate( df, tibble( x = across(where(is.numeric), mean)$a, y = across(where(is.numeric), max)$a ) ), mutate(df, x = mean(a), y = max(a)) ) }) test_that("across() rejects non vectors", { expect_error( data.frame(x = 1) |> summarise(across(everything(), ~ sym("foo"))) ) }) test_that("across() uses tidy recycling rules", { expect_equal( data.frame(x = 1, y = 2) |> reframe(across(everything(), ~ rep(42, .))), data.frame(x = rep(42, 2), y = rep(42, 2)) ) expect_error( data.frame(x = 2, y = 3) |> reframe(across(everything(), ~ rep(42, .))) ) }) test_that("across() returns a data frame with 1 row (#5204)", { df <- tibble(x = 1:42) expect_equal( mutate(df, across(c(), as.factor)), df ) expect_equal( mutate(df, y = across(c(), as.factor))$y, tibble::new_tibble(list(), nrow = 42) ) mutate(df, { res <- across(c(), as.factor) expect_equal(nrow(res), 1L) res }) }) test_that("across(.names=) can use local variables in addition to {col} and {fn}", { res <- local({ prefix <- "MEAN" data.frame(x = 42) |> summarise(across(everything(), mean, .names = "{prefix}_{.col}")) }) expect_identical(res, data.frame(MEAN_x = 42)) }) test_that("across(.unpack=) can use local variables in addition to {outer} and {inner}", { fn <- function(x) { tibble(x = x, y = x + 1) } res <- local({ prefix <- "FN" data.frame(col1 = 42, col2 = 24) |> summarise(across(everything(), fn, .unpack = "{prefix}_{outer}_{inner}")) }) expect_identical( res, data.frame( FN_col1_x = 42, FN_col1_y = 43, FN_col2_x = 24, FN_col2_y = 25 ) ) }) test_that("across() uses environment from the current quosure (#5460)", { # If the data frame `y` is selected, causes a subscript conversion # error since it is fractional df <- data.frame(x = 1, y = 2.4) y <- "x" expect_equal(df |> summarise(across(all_of(y), mean)), data.frame(x = 1)) expect_equal(df |> mutate(across(all_of(y), mean)), df) expect_equal(df |> filter(if_all(all_of(y), ~ .x < 2)), df) # Inherited case expect_error(df |> summarise(local(across(all_of(y), mean)))) expect_equal( df |> summarise(summarise(pick(everything()), across(all_of(y), mean))), df |> summarise(across(all_of(y), mean)) ) }) test_that("across() sees columns in the recursive case (#5498)", { skip_if_not_installed("purrr") df <- tibble( vars = list("foo"), data = list(data.frame(foo = 1, bar = 2)) ) out <- df |> mutate( data = purrr::map2(data, vars, function(.x, .y) { .x |> mutate(across(all_of(.y), ~NA)) }) ) exp <- tibble( vars = list("foo"), data = list(data.frame(foo = NA, bar = 2)) ) expect_identical(out, exp) out <- df |> mutate( data = purrr::map2(data, vars, function(.x, .y) { local({ .y <- "bar" .x |> mutate(across(all_of(.y), ~NA)) }) }) ) exp <- tibble( vars = list("foo"), data = list(data.frame(foo = 1, bar = NA)) ) expect_identical(out, exp) }) test_that("across() works with empty data frames (#5523)", { expect_equal( mutate(tibble(), across(everything(), identity)), tibble() ) }) test_that("lambdas in mutate() + across() can use columns", { df <- tibble(x = 2, y = 4, z = 8) expect_identical( df |> mutate(data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(across(everything(), ~ .x / y)) ) expect_identical( df |> mutate(data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(+across(everything(), ~ .x / y)) ) expect_identical( df |> mutate(data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(across(everything(), ~ .x / .data$y)) ) expect_identical( df |> mutate(data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(+across(everything(), ~ .x / .data$y)) ) }) test_that("lambdas in summarise() + across() can use columns", { df <- tibble(x = 2, y = 4, z = 8) expect_identical( df |> summarise(data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(across(everything(), ~ .x / y)) ) expect_identical( df |> summarise(data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(+across(everything(), ~ .x / y)) ) expect_identical( df |> summarise(data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(across(everything(), ~ .x / .data$y)) ) expect_identical( df |> summarise(data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(+across(everything(), ~ .x / .data$y)) ) }) test_that("lambdas in mutate() + across() can use columns in follow up expressions (#5717)", { df <- tibble(x = 2, y = 4, z = 8) expect_identical( df |> mutate(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(a = 2, across(c(x, y, z), ~ .x / y)) ) expect_identical( df |> mutate(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(a = 2, +across(c(x, y, z), ~ .x / y)) ) expect_identical( df |> mutate(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(a = 2, across(c(x, y, z), ~ .x / .data$y)) ) expect_identical( df |> mutate(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> mutate(a = 2, +across(c(x, y, z), ~ .x / .data$y)) ) }) test_that("lambdas in summarise() + across() can use columns in follow up expressions (#5717)", { df <- tibble(x = 2, y = 4, z = 8) expect_identical( df |> summarise(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(a = 2, across(c(x, y, z), ~ .x / y)) ) expect_identical( df |> summarise(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(a = 2, +across(c(x, y, z), ~ .x / y)) ) expect_identical( df |> summarise(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(a = 2, across(c(x, y, z), ~ .x / .data$y)) ) expect_identical( df |> summarise(a = 2, data.frame(x = x / y, y = y / y, z = z / y)), df |> summarise(a = 2, +across(c(x, y, z), ~ .x / .data$y)) ) }) test_that("functions defined inline can use columns (#5734)", { df <- data.frame(x = 1, y = 2) expect_equal( df |> mutate(across('x', function(.x) .x / y)) |> pull(x), 0.5 ) }) test_that("if_any() and if_all() can be used in mutate() (#5709)", { d <- data.frame(x = c(1, 5, 10, 10), y = c(0, 0, 0, 10), z = c(10, 5, 1, 10)) res <- d |> mutate( any = if_any(x:z, ~ . > 8), all = if_all(x:z, ~ . > 8) ) expect_equal(res$any, c(TRUE, FALSE, TRUE, TRUE)) expect_equal(res$all, c(FALSE, FALSE, FALSE, TRUE)) }) test_that("across() caching not confused when used from if_any() and if_all() (#5782)", { res <- data.frame(x = 1:3) |> mutate( any = if_any(x, ~ . >= 2) + if_any(x, ~ . >= 3), all = if_all(x, ~ . >= 2) + if_all(x, ~ . >= 3) ) expect_equal(res$any, c(0, 1, 2)) expect_equal(res$all, c(0, 1, 2)) }) test_that("if_any() and if_all() respect filter()-like NA handling", { df <- expand.grid( x = c(TRUE, FALSE, NA), y = c(TRUE, FALSE, NA) ) expect_identical( filter(df, x & y), filter(df, if_all(c(x, y), identity)) ) expect_identical( filter(df, x | y), filter(df, if_any(c(x, y), identity)) ) }) test_that("if_any() and if_all() aborts when predicate mistakingly used in .cols= (#5732)", { df <- data.frame(x = 1:10, y = 1:10) expect_snapshot({ # expanded case (expect_error(filter(df, if_any(~ .x > 5)))) (expect_error(filter(df, if_all(~ .x > 5)))) # non expanded case (expect_error(filter(df, !if_any(~ .x > 5)))) (expect_error(filter(df, !if_all(~ .x > 5)))) }) }) test_that("across() correctly reset column", { expect_error(cur_column()) res <- data.frame(x = 1) |> summarise( a = { expect_error(cur_column()) 2 }, across( x, ~ { expect_equal(cur_column(), "x") 3 }, .names = "b" ), # top_across() c = { expect_error(cur_column()) 4 }, force(across( x, ~ { expect_equal(cur_column(), "x") 5 }, .names = "d" )), # across() e = { expect_error(cur_column()) 6 } ) expect_equal(res, data.frame(a = 2, b = 3, c = 4, d = 5, e = 6)) expect_error(cur_column()) res <- data.frame(x = 1) |> mutate( a = { expect_error(cur_column()) 2 }, # top_across() across( x, ~ { expect_equal(cur_column(), "x") 3 }, .names = "b" ), c = { expect_error(cur_column()) 4 }, # across() force(across( x, ~ { expect_equal(cur_column(), "x") 5 }, .names = "d" )), e = { expect_error(cur_column()) 6 } ) expect_equal(res, data.frame(x = 1, a = 2, b = 3, c = 4, d = 5, e = 6)) expect_error(cur_column()) }) test_that("across() can omit dots", { df <- tibble(x = tibble(foo = 1), y = tibble(foo = 2)) # top res <- mutate( df, across( everything(), list ) ) expect_equal(res$x[[1]]$foo, 1) expect_equal(res$y[[1]]$foo, 2) # not top res <- mutate( df, force(across( everything(), list )) ) expect_equal(res$x[[1]]$foo, 1) expect_equal(res$y[[1]]$foo, 2) }) test_that("group variables are in scope (#5832)", { f <- function(x, z) x + z gdf <- data.frame(x = 1:2, y = 3:4, g = 1:2) |> group_by(g) exp <- gdf |> summarise(x = f(x, z = y)) expect_equal( gdf |> summarise(across(x, ~ f(.x, z = y))), exp ) expect_equal( gdf |> summarise(across(x, ~ f(.x, z = y))), exp ) }) test_that("can pass quosure through `across()`", { summarise_mean <- function(data, vars) { data |> summarise(across({{ vars }}, mean)) } gdf <- data.frame(g = c(1, 1, 2), x = 1:3) |> group_by(g) expect_equal( gdf |> summarise_mean(where(is.numeric)), summarise(gdf, x = mean(x)) ) }) test_that("across() inlines formulas", { # Env of captured quosure passed to `as_across_fn_call()`. The # unevaluated lambdas should inherit from that env after inlining. env <- env() lambda <- quo_eval_fns(quo(function(x) fn(x)), mask = env) out <- as_across_fn_call(lambda, quote(var), env, env) expect_equal(out, new_quosure(quote(fn(var)), env)) formula <- quo_eval_fns(quo(~ fn(.x)), mask = env) out <- as_across_fn_call(formula, quote(var), env, env) expect_equal(out, new_quosure(quote(fn(var)), env)) # Evaluated formulas preserve their own env f <- local(~ fn(.x)) fn <- quo_eval_fns(quo(!!f), mask = env) out <- as_across_fn_call(fn, quote(var), env, env) expect_equal(get_env(f), get_env(fn)) expect_equal(out, new_quosure(call2(fn, quote(var)), env)) # Inlining is disabled for complex lambda calls fn <- quo_eval_fns(quo(function(x, y) x), mask = env) out <- as_across_fn_call(fn, quote(var), env, env) expect_equal(out, new_quosure(call2(fn, quote(var)), env)) # Formulas are converted to functions expect_rlang_lambda <- function(fn) { expect_s3_class(fn, "rlang_lambda_function") out <- as_across_fn_call(fn, quote(var), env, env) expect_equal(out, new_quosure(call2(fn, quote(var)), env)) } out <- quo_eval_fns(quo(~.y), mask = env) expect_rlang_lambda(out) out <- quo_eval_fns(quo(list(~.y)), mask = env) expect_type(out, "list") map(out, expect_rlang_lambda) # All formula-lambda arguments are interpolated fn <- quo_eval_fns(quo(~ list(.x, ., .x)), mask = env) out <- as_across_fn_call(fn, quote(var), env, env) expect_equal( out, new_quosure(quote(list(var, var, var)), f_env(f)) ) }) test_that("inlined and non inlined lambdas work", { df <- data.frame(foo = 1:2, bar = 100:101) exp <- data.frame(foo = c(101.5, 102.5), bar = c(200.5, 201.5)) expect_equal(df |> mutate(across(1:2, function(x) x + mean(bar))), exp) expect_equal(df |> mutate((across(1:2, function(x) x + mean(bar)))), exp) expect_equal(df |> mutate(across(1:2, ~ .x + mean(bar))), exp) expect_equal(df |> mutate((across(1:2, ~ .x + mean(bar)))), exp) expect_equal(df |> mutate(across(1:2, ~ ..1 + mean(bar))), exp) expect_equal(df |> mutate((across(1:2, ~ ..1 + mean(bar)))), exp) # Message generated by base R changed skip_if_not_installed("base", "3.6.0") expect_snapshot({ (expect_error(df |> mutate(across(1:2, ~ .y + mean(bar))))) (expect_error(df |> mutate((across(1:2, ~ .y + mean(bar)))))) }) }) test_that("list of lambdas work", { df <- data.frame(foo = 1:2, bar = 100:101) exp <- cbind( df, data.frame(foo_1 = c(101.5, 102.5), bar_1 = c(200.5, 201.5)) ) expect_equal(df |> mutate(across(1:2, list(function(x) x + mean(bar)))), exp) expect_equal( df |> mutate((across(1:2, list(function(x) x + mean(bar))))), exp ) expect_equal(df |> mutate(across(1:2, list(~ .x + mean(bar)))), exp) expect_equal(df |> mutate((across(1:2, list(~ .x + mean(bar))))), exp) }) test_that("anonymous function `.fns` can access the `.data` pronoun even when not inlined", { df <- tibble(x = 1:2, y = 3:4) # Can't access it here, `fn()`'s environment doesn't know about `.data` fn <- function(col) { .data[["x"]] } expect_snapshot(error = TRUE, { mutate(df, across(y, fn)) }) # Can access it with inlinable quosures out <- mutate( df, across(y, function(col) { .data[["x"]] }) ) expect_identical(out$y, out$x) # Can access it with non-inlinable quosures out <- mutate( df, across(y, function(col) { return(.data[["x"]]) }) ) expect_identical(out$y, out$x) }) test_that("across() uses local formula environment (#5881)", { f <- local({ prefix <- "foo" ~ paste(prefix, .x) }) df <- tibble(x = "x") expect_equal( mutate(df, across(x, f)), tibble(x = "foo x") ) expect_equal( mutate(df, across(x, list(f = f))), tibble(x = "x", x_f = "foo x") ) local({ # local() here is not necessary, it's just in case the # code is run directly without the test_that() prefix <- "foo" expect_equal( mutate(df, across(x, ~ paste(prefix, .x))), tibble(x = "foo x") ) expect_equal( mutate(df, across(x, list(f = ~ paste(prefix, .x)))), tibble(x = "x", x_f = "foo x") ) }) expect_equal( data.frame(x = 1) |> mutate(across(1, list(f = local(~ . + 1)))), data.frame(x = 1, x_f = 2) ) expect_equal( data.frame(x = 1) |> mutate(across( 1, local({ `_local_var` <- 1 ~ . + `_local_var` }) )), data.frame(x = 2) ) }) test_that("unevaluated formulas (currently) fail", { df <- tibble(x = "x") expect_error( mutate(df, across(x, quote(~ paste("foo", .x)))) ) }) test_that("across() can access lexical scope (#5862)", { f_across <- function(data, cols, fn) { data |> summarise( across({{ cols }}, fn) ) } df <- data.frame(x = 1:10, y = 1:10) expect_equal( f_across(df, c(x, y), mean), summarise(df, across(c(x, y), mean)) ) }) test_that("across() allows renaming in `.cols` (#6895)", { df <- tibble(x = 1, y = 2, z = 3) cols <- set_names(c("x", "y"), c("a", "b")) expect_identical( mutate(df, across(all_of(cols), identity)), mutate(df, a = x, b = y) ) expect_identical( mutate(df, (across(all_of(cols), identity))), mutate(df, a = x, b = y) ) expect_identical( mutate(df, across(all_of(cols), identity, .names = "{.col}_name")), mutate(df, a_name = x, b_name = y) ) expect_identical( mutate(df, (across(all_of(cols), identity, .names = "{.col}_name"))), mutate(df, a_name = x, b_name = y) ) }) test_that("if_any() and if_all() expansions deal with no inputs or single inputs", { d <- data.frame(x = 1) # No inputs expect_equal( filter(d, if_any(starts_with("c"), ~FALSE)), filter(d, FALSE) ) expect_equal( filter(d, if_all(starts_with("c"), ~FALSE)), filter(d) ) # Single inputs expect_equal( filter(d, if_any(x, ~FALSE)), filter(d, FALSE) ) expect_equal( filter(d, if_all(x, ~FALSE)), filter(d, FALSE) ) }) test_that("if_any() on zero-column selection behaves like any() (#7059, #7077)", { tbl <- tibble( x1 = 1:5, x2 = c(-1, 4, 5, 4, 1), y = c(1, 4, 2, 4, 9), ) expect_equal( filter(tbl, if_any(c(), ~FALSE)), filter(tbl, FALSE) ) expect_equal( filter(tbl, if_any(c(), ~TRUE)), filter(tbl, FALSE) ) expect_equal( pull(mutate(tbl, z = if_any(c(), ~FALSE)), z), rep(FALSE, nrow(tbl)) ) expect_equal( pull(mutate(tbl, z = if_any(c(), ~TRUE)), z), rep(FALSE, nrow(tbl)) ) }) test_that("if_all() on zero-column selection behaves like all() (#7059, #7077)", { tbl <- tibble( x1 = 1:5, x2 = c(-1, 4, 5, 4, 1), y = c(1, 4, 2, 4, 9), ) expect_equal( filter(tbl, if_all(c(), ~FALSE)), filter(tbl, TRUE) ) expect_equal( filter(tbl, if_all(c(), ~TRUE)), filter(tbl, TRUE) ) expect_equal( pull(mutate(tbl, z = if_all(c(), ~FALSE)), z), rep(TRUE, nrow(tbl)) ) expect_equal( pull(mutate(tbl, z = if_all(c(), ~TRUE)), z), rep(TRUE, nrow(tbl)) ) }) test_that("if_any() and if_all() wrapped deal with no inputs or single inputs", { d <- data.frame(x = 1) # No inputs expect_equal( filter(d, (if_any(starts_with("c"), ~FALSE))), filter(d, FALSE) ) expect_equal( filter(d, (if_all(starts_with("c"), ~FALSE))), filter(d, TRUE) ) # Single inputs expect_equal( filter(d, (if_any(x, ~FALSE))), filter(d, FALSE) ) expect_equal( filter(d, (if_all(x, ~FALSE))), filter(d, FALSE) ) expect_equal( filter(d, (if_any(x, ~TRUE))), filter(d, TRUE) ) expect_equal( filter(d, (if_all(x, ~TRUE))), filter(d, TRUE) ) }) test_that("expanded if_any() finds local data", { limit <- 7 df <- data.frame(x = 1:10, y = 10:1) expect_identical( filter(df, if_any(everything(), ~ .x > limit)), filter(df, x > limit | y > limit) ) }) test_that("across() can use named selections", { df <- data.frame(x = 1, y = 2) # no fns expect_equal( df |> summarise(across(c(a = x, b = y))), data.frame(a = 1, b = 2) ) expect_equal( df |> summarise(across(all_of(c(a = "x", b = "y")))), data.frame(a = 1, b = 2) ) # no fns, non expanded expect_equal( df |> summarise((across(c(a = x, b = y)))), data.frame(a = 1, b = 2) ) expect_equal( df |> summarise((across(all_of(c(a = "x", b = "y"))))), data.frame(a = 1, b = 2) ) # one fn expect_equal( df |> summarise(across(c(a = x, b = y), mean)), data.frame(a = 1, b = 2) ) expect_equal( df |> summarise(across(all_of(c(a = "x", b = "y")), mean)), data.frame(a = 1, b = 2) ) # one fn - non expanded expect_equal( df |> summarise((across(c(a = x, b = y), mean))), data.frame(a = 1, b = 2) ) expect_equal( df |> summarise((across(all_of(c(a = "x", b = "y")), mean))), data.frame(a = 1, b = 2) ) # multiple fns expect_equal( df |> summarise(across(c(a = x, b = y), list(mean = mean, sum = sum))), data.frame(a_mean = 1, a_sum = 1, b_mean = 2, b_sum = 2) ) expect_equal( df |> summarise(across( all_of(c(a = "x", b = "y")), list(mean = mean, sum = sum) )), data.frame(a_mean = 1, a_sum = 1, b_mean = 2, b_sum = 2) ) # multiple fns - non expanded expect_equal( df |> summarise((across(c(a = x, b = y), list(mean = mean, sum = sum)))), data.frame(a_mean = 1, a_sum = 1, b_mean = 2, b_sum = 2) ) expect_equal( df |> summarise( (across(all_of(c(a = "x", b = "y")), list(mean = mean, sum = sum))) ), data.frame(a_mean = 1, a_sum = 1, b_mean = 2, b_sum = 2) ) }) test_that("expr_subtitute() stops at lambdas (#5896)", { expect_identical( expr_substitute(expr(map(.x, ~ mean(.x))), quote(.x), quote(a)), expr(map(a, ~ mean(.x))) ) expect_identical( expr_substitute(expr(map(.x, function(.x) mean(.x))), quote(.x), quote(a)), expr(map(a, function(.x) mean(.x))) ) }) test_that("expr_subtitute() keeps at double-sided formula (#5894)", { expect_identical( expr_substitute( expr(case_when(.x < 5 ~ 5, .default = .x)), quote(.x), quote(a) ), expr(case_when(a < 5 ~ 5, .default = a)) ) expect_identical( expr_substitute( expr(case_when(. < 5 ~ 5, .default = .)), quote(.), quote(a) ), expr(case_when(a < 5 ~ 5, .default = a)) ) }) test_that("across() predicates operate on whole data", { df <- tibble( x = c(1, 1, 2), g = c(1, 1, 2) ) out <- df |> mutate(across(where(~ n_distinct(.x) > 1), ~ .x + 10)) exp <- tibble( x = c(11, 11, 12), g = c(11, 11, 12) ) expect_equal(out, exp) out <- df |> group_by(g) |> mutate(across(where(~ n_distinct(.x) > 1), ~ .x + 10)) exp <- tibble( x = c(11, 11, 12), g = c(1, 1, 2) ) |> group_by(g) expect_equal(out, exp) }) test_that("expand_across() expands lambdas", { quo <- quo(across(c(cyl, am), ~ identity(.x))) quo <- new_dplyr_quosure( quo, name = quo, is_named = FALSE, index = 1 ) by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) DataMask$new(mtcars, by, "mutate", call("caller")) expect_equal( map(expand_across(quo), quo_get_expr), exprs( cyl = identity(cyl), am = identity(am) ) ) }) test_that("expand_if_across() expands lambdas", { quo <- quo(if_any(c(cyl, am), ~ . > 4)) quo <- new_dplyr_quosure( quo, name = quo, is_named = FALSE, index = 1 ) by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) DataMask$new(mtcars, by, "mutate", call("caller")) quo <- expand_if_across(quo) # We just need to look for something we know we insert into the expression. # `expect_snapshot()` doesn't seem to play nicely with covr on CI here, the # expression captured seems to contain `covr:::count()` calls. expect_true( grepl( "asNamespace", paste0(expr_deparse(quo_squash(quo)), collapse = " ") ) ) }) test_that("rowwise() preserves list-cols iff no `.fns` (#5951, #6264)", { # TODO: Deprecate this behavior in favor of `pick()`, which doesn't preserve # list-cols but is well-defined as pure macro expansion. rf <- rowwise(tibble(x = list(1:2, 3:5))) # Need to unchop so works like mutate(rf, x = length(x)) out <- mutate(rf, across(everything(), length)) expect_equal(out$x, c(2, 3)) # Need to preserve to create valid data frame out <- mutate(rf, across = list(across(everything()))) expect_equal( out$across, list( tibble(x = list(1:2)), tibble(x = list(3:5)) ) ) }) test_that("`across()` recycle `.fns` results to common size", { df <- tibble( x = c(TRUE, FALSE, TRUE), y = c(1L, 2L, 3L) ) # The `.fns` results are recycled within just the `across()` inputs first, not # immediately to the whole group size. The returned data frame from `across()` # is what is then recycled to the whole group size. fn <- function(x) { if (is.logical(x)) { x } else { TRUE } } expect_identical( mutate(df, across(c(x, y), fn)), tibble(x = df$x, y = rep(TRUE, times = nrow(df))) ) expect_identical( mutate(df, (across(c(x, y), fn))), tibble(x = df$x, y = rep(TRUE, times = nrow(df))) ) # Not forcing the result of `.fns` to immediately recycle to the group size is # useful for niche cases where you want to compute something with `across()` # but it isn't actually what you return fn <- function(x) { c(mean(x), median(x)) } expect_identical( mutate(df, { # Maybe your `across()` call returns something of length 2 values <- across(c(x, y), fn) # But then you manipulate it to return something compatible with the group size new_tibble(map(values, max)) }), tibble(x = c(1, 1, 1), y = c(2, 2, 2)) ) # Unrecyclable expect_snapshot(error = TRUE, { # TODO: This error is bad mutate(df, across(c(x, y), fn)) }) expect_snapshot(error = TRUE, { mutate(df, (across(c(x, y), fn))) }) }) test_that("`if_any()` and `if_all()` have consistent behavior across `filter()` and `mutate()`", { # Tests a full suite comparing: # - `filter()` vs `mutate()` # - `filter()`'s evaluation vs expansion models # - With and without `.fns` # `w` and `x` cover all combinations of `|` and `&` df <- data.frame( w = c(TRUE, FALSE, NA, TRUE, FALSE, TRUE, FALSE, NA, NA), x = c(TRUE, FALSE, NA, FALSE, TRUE, NA, NA, TRUE, FALSE), y = 1:9, z = 10:18, g = c("a", "b", "a", "b", "b", "a", "c", "a", "a") ) # Zero inputs expect_identical( filter(df, if_any(c())), filter(df, FALSE) ) expect_identical( filter(df, (if_any(c()))), filter(df, FALSE) ) expect_identical( mutate(df, a = if_any(c())), mutate(df, a = FALSE) ) expect_identical( filter(df, if_any(c(), identity)), filter(df, FALSE) ) expect_identical( filter(df, (if_any(c(), identity))), filter(df, FALSE) ) expect_identical( mutate(df, a = if_any(c(), identity)), mutate(df, a = FALSE) ) expect_identical( filter(df, if_all(c())), filter(df, TRUE) ) expect_identical( filter(df, (if_all(c()))), filter(df, TRUE) ) expect_identical( mutate(df, a = if_all(c())), mutate(df, a = TRUE) ) expect_identical( filter(df, if_all(c(), identity)), filter(df, TRUE) ) expect_identical( filter(df, (if_all(c(), identity))), filter(df, TRUE) ) expect_identical( mutate(df, a = if_all(c(), identity)), mutate(df, a = TRUE) ) # One input expect_identical( filter(df, if_any(w)), filter(df, w) ) expect_identical( filter(df, (if_any(w))), filter(df, w) ) expect_identical( mutate(df, a = if_any(w)), mutate(df, a = w) ) expect_identical( filter(df, if_any(w, identity)), filter(df, w) ) expect_identical( filter(df, (if_any(w, identity))), filter(df, w) ) expect_identical( mutate(df, a = if_any(w, identity)), mutate(df, a = w) ) expect_identical( filter(df, if_all(w)), filter(df, w) ) expect_identical( filter(df, (if_all(w))), filter(df, w) ) expect_identical( mutate(df, a = if_all(w)), mutate(df, a = w) ) expect_identical( filter(df, if_all(w, identity)), filter(df, w) ) expect_identical( filter(df, (if_all(w, identity))), filter(df, w) ) expect_identical( mutate(df, a = if_all(w, identity)), mutate(df, a = w) ) # Two inputs expect_identical( filter(df, if_any(c(w, x))), filter(df, w | x) ) expect_identical( filter(df, (if_any(c(w, x)))), filter(df, w | x) ) expect_identical( mutate(df, a = if_any(c(w, x))), mutate(df, a = w | x) ) expect_identical( filter(df, if_any(c(w, x), identity)), filter(df, w | x) ) expect_identical( filter(df, (if_any(c(w, x), identity))), filter(df, w | x) ) expect_identical( mutate(df, a = if_any(c(w, x), identity)), mutate(df, a = w | x) ) expect_identical( filter(df, if_all(c(w, x))), filter(df, w & x) ) expect_identical( filter(df, (if_all(c(w, x)))), filter(df, w & x) ) expect_identical( mutate(df, a = if_all(c(w, x))), mutate(df, a = w & x) ) expect_identical( filter(df, if_all(c(w, x), identity)), filter(df, w & x) ) expect_identical( filter(df, (if_all(c(w, x), identity))), filter(df, w & x) ) expect_identical( mutate(df, a = if_all(c(w, x), identity)), mutate(df, a = w & x) ) # Two inputs (grouped) expect_identical( filter(df, if_any(c(w, x)), .by = g), filter(df, w | x, .by = g) ) expect_identical( filter(df, (if_any(c(w, x))), .by = g), filter(df, w | x, .by = g) ) expect_identical( mutate(df, a = if_any(c(w, x)), .by = g), mutate(df, a = w | x, .by = g) ) expect_identical( filter(df, if_any(c(w, x), identity), .by = g), filter(df, w | x, .by = g) ) expect_identical( filter(df, (if_any(c(w, x), identity)), .by = g), filter(df, w | x, .by = g) ) expect_identical( mutate(df, a = if_any(c(w, x), identity), .by = g), mutate(df, a = w | x, .by = g) ) expect_identical( filter(df, if_all(c(w, x)), .by = g), filter(df, w & x, .by = g) ) expect_identical( filter(df, (if_all(c(w, x))), .by = g), filter(df, w & x, .by = g) ) expect_identical( mutate(df, a = if_all(c(w, x)), .by = g), mutate(df, a = w & x, .by = g) ) expect_identical( filter(df, if_all(c(w, x), identity), .by = g), filter(df, w & x, .by = g) ) expect_identical( filter(df, (if_all(c(w, x), identity)), .by = g), filter(df, w & x, .by = g) ) expect_identical( mutate(df, a = if_all(c(w, x), identity), .by = g), mutate(df, a = w & x, .by = g) ) # One non-logical input (all error) expect_snapshot(error = TRUE, filter(df, if_any(y))) expect_snapshot(error = TRUE, filter(df, (if_any(y)))) expect_snapshot(error = TRUE, mutate(df, a = if_any(y))) expect_snapshot(error = TRUE, filter(df, if_any(y, identity))) expect_snapshot(error = TRUE, filter(df, (if_any(y, identity)))) expect_snapshot(error = TRUE, mutate(df, a = if_any(y, identity))) expect_snapshot(error = TRUE, filter(df, if_all(y))) expect_snapshot(error = TRUE, filter(df, (if_all(y)))) expect_snapshot(error = TRUE, mutate(df, a = if_all(y))) expect_snapshot(error = TRUE, filter(df, if_all(y, identity))) expect_snapshot(error = TRUE, filter(df, (if_all(y, identity)))) expect_snapshot(error = TRUE, mutate(df, a = if_all(y, identity))) # Two non-logical inputs (all error) expect_snapshot(error = TRUE, filter(df, if_any(c(y, z)))) expect_snapshot(error = TRUE, filter(df, (if_any(c(y, z))))) expect_snapshot(error = TRUE, mutate(df, a = if_any(c(y, z)))) expect_snapshot(error = TRUE, filter(df, if_any(c(y, z), identity))) expect_snapshot(error = TRUE, filter(df, (if_any(c(y, z), identity)))) expect_snapshot(error = TRUE, mutate(df, a = if_any(c(y, z), identity))) expect_snapshot(error = TRUE, filter(df, if_all(c(y, z)))) expect_snapshot(error = TRUE, filter(df, (if_all(c(y, z))))) expect_snapshot(error = TRUE, mutate(df, a = if_all(c(y, z)))) expect_snapshot(error = TRUE, filter(df, if_all(c(y, z), identity))) expect_snapshot(error = TRUE, filter(df, (if_all(c(y, z), identity)))) expect_snapshot(error = TRUE, mutate(df, a = if_all(c(y, z), identity))) # Two non-logical inputs (grouped) (all error) expect_snapshot(error = TRUE, { filter(df, if_any(c(y, z)), .by = g) }) expect_snapshot(error = TRUE, { filter(df, (if_any(c(y, z))), .by = g) }) expect_snapshot(error = TRUE, { mutate(df, a = if_any(c(y, z)), .by = g) }) expect_snapshot(error = TRUE, { filter(df, if_any(c(y, z), identity), .by = g) }) expect_snapshot(error = TRUE, { filter(df, (if_any(c(y, z), identity)), .by = g) }) expect_snapshot(error = TRUE, { mutate(df, a = if_any(c(y, z), identity), .by = g) }) expect_snapshot(error = TRUE, { filter(df, if_all(c(y, z)), .by = g) }) expect_snapshot(error = TRUE, { filter(df, (if_all(c(y, z))), .by = g) }) expect_snapshot(error = TRUE, { mutate(df, a = if_all(c(y, z)), .by = g) }) expect_snapshot(error = TRUE, { filter(df, if_all(c(y, z), identity), .by = g) }) expect_snapshot(error = TRUE, { filter(df, (if_all(c(y, z), identity)), .by = g) }) expect_snapshot(error = TRUE, { mutate(df, a = if_all(c(y, z), identity), .by = g) }) }) test_that("`if_any()` and `if_all()` recycle `.fns` results to common size", { df <- data.frame( x = c(TRUE, FALSE, NA), y = c(1L, 2L, 3L) ) # `.fns` results recycle. Both `across()` and `if_any()`/`if_all()` recycle to # a common size amongst their inputs (here, size 1), then that data frame is # recycled to the group size. fn <- function(x) { if (is.logical(x)) { c(TRUE, FALSE, TRUE) } else { TRUE } } expect_identical( filter(df, if_any(c(x, y), fn)), filter(df, TRUE) ) expect_identical( filter(df, (if_any(c(x, y), fn))), filter(df, TRUE) ) expect_identical( mutate(df, a = if_any(c(x, y), fn)), mutate(df, a = TRUE) ) expect_identical( filter(df, if_all(c(x, y), fn)), filter(df, c(TRUE, FALSE, TRUE)) ) expect_identical( filter(df, (if_all(c(x, y), fn))), filter(df, c(TRUE, FALSE, TRUE)) ) expect_identical( mutate(df, a = if_all(c(x, y), fn)), mutate(df, a = c(TRUE, FALSE, TRUE)) ) # Unrecyclable (all error, can't recycle to group size) # It is correct that these show `..1` in the error for `filter()`. The error # is about recycling of the result of `if_any()`, i.e. the data frame in the # 1st argument slot. fn <- function(x) c(TRUE, FALSE) expect_snapshot(error = TRUE, filter(df, if_any(c(x, y), fn))) expect_snapshot(error = TRUE, filter(df, (if_any(c(x, y), fn)))) expect_snapshot(error = TRUE, mutate(df, a = if_any(c(x, y), fn))) expect_snapshot(error = TRUE, filter(df, if_all(c(x, y), fn))) expect_snapshot(error = TRUE, filter(df, (if_all(c(x, y), fn)))) expect_snapshot(error = TRUE, mutate(df, a = if_all(c(x, y), fn))) }) # c_across ---------------------------------------------------------------- test_that("selects and combines columns", { df <- data.frame(x = 1:2, y = 3:4) out <- df |> summarise(z = list(c_across(x:y))) expect_equal(out$z, list(1:4)) }) test_that("can't rename during selection (#6522)", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { mutate(df, z = c_across(c(y = x))) }) }) test_that("can't explicitly select grouping columns (#6522)", { # Related to removing the mask layer from the quosure environments df <- tibble(g = 1, x = 2) gdf <- group_by(df, g) expect_snapshot(error = TRUE, { mutate(gdf, y = c_across(g)) }) }) test_that("`all_of()` is evaluated in the correct environment (#6522)", { # Related to removing the mask layer from the quosure environments df <- tibble(x = 1, y = 2) # We expect an "object not found" error, but we don't control that # so we aren't going to snapshot it, especially since the call reported # by those kinds of errors changed in R 4.3. expect_error(mutate(df, z = c_across(all_of(y)))) y <- "x" expect <- df[["x"]] out <- mutate(df, z = c_across(all_of(y))) expect_identical(out$z, expect) }) # cols deprecation -------------------------------------------------------- test_that("across() applies old `.cols = everything()` default with a warning", { local_options(lifecycle_verbosity = "warning") df <- tibble(g = c(1, 2), x = c(1, 2), y = c(3, 4)) gdf <- group_by(df, g) times_two <- function(x) x * 2 # Expansion path expect_snapshot(out <- mutate(df, across(.fns = times_two))) expect_identical(out$g, df$g * 2) expect_identical(out$x, df$x * 2) expect_identical(out$y, df$y * 2) expect_snapshot(out <- mutate(gdf, across(.fns = times_two))) expect_identical(out$g, df$g) expect_identical(out$x, df$x * 2) expect_identical(out$y, df$y * 2) # Evaluation path expect_snapshot(out <- mutate(df, (across(.fns = times_two)))) expect_identical(out$g, df$g * 2) expect_identical(out$x, df$x * 2) expect_identical(out$y, df$y * 2) expect_snapshot(out <- mutate(gdf, (across(.fns = times_two)))) expect_identical(out$g, df$g) expect_identical(out$x, df$x * 2) expect_identical(out$y, df$y * 2) }) test_that("if_any() and if_all() apply old `.cols = everything()` default with a warning", { local_options(lifecycle_verbosity = "warning") df <- tibble(x = c(TRUE, FALSE, TRUE), y = c(FALSE, FALSE, TRUE)) gdf <- mutate(df, g = c(1, 1, 2), .before = 1) gdf <- group_by(gdf, g) # Expansion path expect_snapshot(out <- filter(df, if_any())) expect_identical(out, df[c(1, 3), ]) expect_snapshot(out <- filter(gdf, if_any())) expect_identical(out, gdf[c(1, 3), ]) expect_snapshot(out <- filter(df, if_all())) expect_identical(out, df[3, ]) expect_snapshot(out <- filter(gdf, if_all())) expect_identical(out, gdf[3, ]) # Evaluation path expect_snapshot(out <- filter(df, (if_any()))) expect_identical(out, df[c(1, 3), ]) expect_snapshot(out <- filter(gdf, (if_any()))) expect_identical(out, gdf[c(1, 3), ]) expect_snapshot(out <- filter(df, (if_all()))) expect_identical(out, df[3, ]) expect_snapshot(out <- filter(gdf, (if_all()))) expect_identical(out, gdf[3, ]) }) test_that("c_across() applies old `cols = everything()` default with a warning", { local_options(lifecycle_verbosity = "warning") df <- tibble(x = c(1, 3), y = c(2, 4)) df <- rowwise(df) # Will see 2 warnings because verbosity option forces it to warn every time expect_snapshot(out <- mutate(df, z = sum(c_across()))) expect_identical(out$z, c(3, 7)) }) # fns deprecation --------------------------------------------------------- test_that("across() applies old `.fns = NULL` default", { df <- tibble(x = 1, y = 2) # Expansion path out <- mutate(df, z = across(everything())) expect_identical(out$z, df) # Evaluation path out <- mutate(df, z = (across(everything()))) expect_identical(out$z, df) }) test_that("if_any() and if_all() apply old `.fns = NULL` default", { df <- tibble(x = c(TRUE, FALSE, TRUE), y = c(FALSE, FALSE, TRUE)) # Expansion path expect_identical(filter(df, if_any(everything())), df[c(1, 3), ]) expect_identical(filter(df, if_all(everything())), df[3, ]) # Evaluation path expect_identical(filter(df, (if_any(everything()))), df[c(1, 3), ]) expect_identical(filter(df, (if_all(everything()))), df[3, ]) }) test_that("across errors with non-empty dots and no `.fns` supplied (#6638)", { df <- tibble(x = 1) expect_snapshot( error = TRUE, mutate(df, across(x, .funs = ~ . * 1000)) ) }) # dots -------------------------------------------------------------------- test_that("across(...) is deprecated", { df <- tibble(x = c(1, NA)) expect_snapshot(summarise(df, across(everything(), mean, na.rm = TRUE))) }) test_that("across() passes ... to functions", { options(lifecycle_verbosity = "quiet") df <- tibble(x = c(1, NA)) expect_equal( summarise(df, across(everything(), mean, na.rm = TRUE)), tibble(x = 1) ) expect_equal( summarise( df, across(everything(), list(mean = mean, median = median), na.rm = TRUE) ), tibble(x_mean = 1, x_median = 1) ) }) test_that("across() passes unnamed arguments following .fns as ... (#4965)", { options(lifecycle_verbosity = "quiet") df <- tibble(x = 1) expect_equal(mutate(df, across(x, `+`, 1)), tibble(x = 2)) }) test_that("across() avoids simple argument name collisions with ... (#4965)", { options(lifecycle_verbosity = "quiet") df <- tibble(x = c(1, 2)) expect_equal(summarize(df, across(x, tail, n = 1)), tibble(x = 2)) }) test_that("across() evaluates ... with promise semantics (#5813)", { options(lifecycle_verbosity = "quiet") df <- tibble(x = tibble(foo = 1), y = tibble(foo = 2)) res <- mutate( df, across(everything(), mutate, foo = foo + 1) ) expect_equal(res$x$foo, 2) expect_equal(res$y$foo, 3) # Dots are evaluated only once new_counter <- function() { n <- 0L function() { n <<- n + 1L n } } counter <- new_counter() list_second <- function(...) { list(..2) } res <- mutate( df, across(everything(), list_second, counter()) ) expect_equal(res$x[[1]], 1) expect_equal(res$y[[1]], 1) }) test_that("arguments in dots are evaluated once per group", { options(lifecycle_verbosity = "quiet") set.seed(0) out <- data.frame(g = 1:3, var = NA) |> group_by(g) |> mutate(across(var, function(x, y) y, rnorm(1))) |> pull(var) set.seed(0) expect_equal(out, rnorm(3)) }) test_that("group variables are in scope when passed in dots (#5832)", { options(lifecycle_verbosity = "quiet") f <- function(x, z) x + z gdf <- data.frame(x = 1:2, y = 3:4, g = 1:2) |> group_by(g) exp <- gdf |> summarise(x = f(x, z = y)) expect_equal( gdf |> summarise(across(x, f, z = y)), exp ) expect_equal( gdf |> summarise((across(x, f, z = y))), exp ) }) test_that("symbols are looked up as list or functions (#6545)", { df <- tibble(mean = 1:5) exp <- summarise(df, across(everything(), function(x) mean(x))) expect_equal( summarise(df, across(everything(), mean)), exp ) expect_equal( summarise(df, (across(everything(), mean))), exp ) exp <- summarise(df, across(everything(), list(function(x) mean(x)))) expect_equal( summarize(df, across(everything(), list(mean))), exp ) expect_equal( summarize(df, (across(everything(), list(mean)))), exp ) }) test_that("non-inlinable but maskable lambdas give precedence to function arguments", { df <- data.frame( foo = 1, bar = "a" ) out <- mutate(df, across(1:2, function(foo) return(foo))) expect_equal(out, df) }) test_that("maskable lambdas can refer to their lexical environment", { foo <- "OK" df <- tibble(bar = "a") # Non-inlinable expect_equal( mutate(df, across(1, function(x) return(paste(x, foo)))), tibble(bar = "a OK") ) expect_equal( mutate(df, across(1, ~ return(paste(.x, foo)))), tibble(bar = "a OK") ) # Inlinable expect_equal( mutate(df, across(1, function(x) paste(x, foo))), tibble(bar = "a OK") ) expect_equal( mutate(df, across(1, ~ paste(.x, foo))), tibble(bar = "a OK") ) }) dplyr/tests/testthat/test-distinct.R0000644000176200001440000001172215106134104017325 0ustar liggesuserstest_that("distinct equivalent to local unique when keep_all is TRUE", { df <- data.frame( x = c(1, 1, 1, 1), y = c(1, 1, 2, 2), z = c(1, 2, 1, 2) ) expect_equal(distinct(df), unique(df)) }) test_that("distinct for single column works as expected (#1937)", { df <- tibble( x = c(1, 1, 1, 1), y = c(1, 1, 2, 2), z = c(1, 2, 1, 2) ) expect_equal(distinct(df, x, .keep_all = FALSE), unique(df["x"])) expect_equal(distinct(df, y, .keep_all = FALSE), unique(df["y"])) }) test_that("distinct works for 0-sized columns (#1437)", { df <- tibble(x = 1:10) |> select(-x) ddf <- distinct(df) expect_equal(df_n_col(ddf), 0L) }) test_that("if no variables specified, uses all", { df <- tibble(x = c(1, 1), y = c(2, 2)) expect_equal(distinct(df), tibble(x = 1, y = 2)) }) test_that("distinct keeps only specified cols", { df <- tibble(x = c(1, 1, 1), y = c(1, 1, 1)) expect_equal(df |> distinct(x), tibble(x = 1)) }) test_that("unless .keep_all = TRUE", { df <- tibble(x = c(1, 1, 1), y = 3:1) expect_equal(df |> distinct(x), tibble(x = 1)) expect_equal(df |> distinct(x, .keep_all = TRUE), tibble(x = 1, y = 3L)) }) test_that("distinct doesn't duplicate columns", { df <- tibble(a = 1:3, b = 4:6) expect_named(df |> distinct(a, a), "a") expect_named(df |> group_by(a) |> distinct(a), "a") }) test_that("grouped distinct always includes group cols", { df <- tibble(g = c(1, 2), x = c(1, 2)) out <- df |> group_by(g) |> distinct(x) expect_named(out, c("g", "x")) }) test_that("empty grouped distinct equivalent to empty ungrouped", { df <- tibble(g = c(1, 2), x = c(1, 2)) df1 <- df |> distinct() |> group_by(g) df2 <- df |> group_by(g) |> distinct() expect_equal(df1, df2) }) test_that("distinct on a new, mutated variable is equivalent to mutate followed by distinct", { df <- tibble(g = c(1, 2), x = c(1, 2)) df1 <- df |> distinct(aa = g * 2) df2 <- df |> mutate(aa = g * 2) |> distinct(aa) expect_equal(df1, df2) }) test_that("distinct on a new, copied variable is equivalent to mutate followed by distinct (#3234)", { df <- tibble(g = c(1, 2), x = c(1, 2)) df1 <- df |> distinct(aa = g) df2 <- df |> mutate(aa = g) |> distinct(aa) expect_equal(df1, df2) }) test_that("distinct on a dataframe or tibble with columns of type list throws an error", { df <- tibble( a = c("1", "1", "2", "2", "3", "3"), b = list("A") ) df2 <- data.frame(x = 1:5, y = I(list(1:3, 2:4, 3:5, 4:6, 5:7))) expect_identical(df2 |> distinct(), df2) expect_identical(df |> distinct(), df |> slice(c(1, 3, 5))) }) test_that("distinct handles 0 columns edge case (#2954)", { d <- select(data.frame(x = c(1, 1)), one_of(character(0))) res <- distinct(d) expect_equal(nrow(res), 1L) expect_equal(nrow(distinct(tibble())), 0L) }) test_that("distinct respects order of the specified variables (#3195, #6156)", { d <- data.frame(x = 1:2, y = 3:4) expect_named(distinct(d, y, x), c("y", "x")) }) test_that("distinct adds grouping variables to front if missing", { d <- data.frame(x = 1:2, y = 3:4) expect_named(distinct(group_by(d, y), x), c("y", "x")) expect_named(distinct(group_by(d, y), x, y), c("x", "y")) }) test_that("distinct() understands both NA variants (#4516)", { df <- data.frame(col_a = c(1, NA, NA)) df$col_a <- df$col_a + 0 df$col_a[2] <- NA_real_ expect_equal(nrow(distinct(df)), 2L) df_1 <- data.frame(col_a = c(1, NA)) df_2 <- data.frame(col_a = c(1, NA)) df_1$col_a <- df_1$col_a + 0 df_2$col_a <- df_2$col_a + 0 df_1$col_a[2] <- NA expect_equal(nrow(setdiff(df_1, df_2)), 0L) }) test_that("distinct() handles auto splicing", { expect_equal( iris |> distinct(Species), iris |> distinct(data.frame(Species = Species)) ) expect_equal( iris |> distinct(Species), iris |> distinct(pick(Species)) ) expect_equal( iris |> mutate(across(starts_with("Sepal"), round)) |> distinct(Sepal.Length, Sepal.Width), iris |> distinct(across(starts_with("Sepal"), round)) ) }) test_that("distinct preserves grouping", { gf <- group_by(tibble(x = c(1, 1, 2, 2), y = x), x) i <- count_regroups(out <- distinct(gf)) expect_equal(i, 0) expect_equal(group_vars(out), "x") i <- count_regroups(out <- distinct(gf, x = x + 2)) expect_equal(i, 1) expect_equal(group_vars(out), "x") }) test_that("distinct() preserves attributes on bare data frames (#6318)", { df <- vctrs::data_frame(x = c(1, 1)) attr(df, "foo") <- "bar" out <- distinct(df, x) expect_identical(attr(out, "foo"), "bar") out <- distinct(df, y = x + 1L) expect_identical(attr(out, "foo"), "bar") }) # Errors ------------------------------------------------------------------ test_that("distinct errors when selecting an unknown column (#3140)", { expect_snapshot({ df <- tibble(g = c(1, 2), x = c(1, 2)) (expect_error(df |> distinct(aa, x))) (expect_error(df |> distinct(aa, bb))) (expect_error(df |> distinct(.data$aa))) (expect_error(df |> distinct(y = a + 1))) }) }) dplyr/tests/testthat/test-recode.R0000644000176200001440000001210515106134104016741 0ustar liggesuserstest_that("positional substitution works", { expect_equal(recode(1:2, "a", "b"), c("a", "b")) }) test_that("names override positions", { expect_equal(recode(1:2, `2` = "b", `1` = "a"), c("a", "b")) }) test_that("named substitution works", { x1 <- letters[1:3] x2 <- factor(x1) expect_equal( recode(x1, a = "apple", .default = NA_character_), c("apple", NA, NA) ) expect_equal( recode(x2, a = "apple", .default = NA_character_), factor(c("apple", NA, NA)) ) }) test_that("missing values replaced by missing argument", { expect_equal(recode(c(1, NA), "a"), c("a", NA)) expect_equal(recode(c(1, NA), "a", .missing = "b"), c("a", "b")) expect_equal( recode(c(letters[1:3], NA), .missing = "A"), c("a", "b", "c", "A") ) }) test_that("unmatched value replaced by default argument", { expect_warning(expect_equal(recode(c(1, 2), "a"), c("a", NA))) expect_equal(recode(c(1, 2), "a", .default = "b"), c("a", "b")) expect_equal(recode(letters[1:3], .default = "A"), c("A", "A", "A")) }) test_that("missing and default place nicely together", { expect_equal( recode(c(1, 2, NA), "a", .default = "b", .missing = "c"), c("a", "b", "c") ) }) test_that("can give name x", { expect_equal(recode("x", x = "a"), "a") }) test_that(".default works when not all values are named", { x <- rep(1:3, 3) expect_equal(recode(x, `3` = 10L, .default = x), rep(c(1L, 2L, 10L), 3)) }) test_that(".default is aliased to .x when missing and compatible", { x <- letters[1:3] expect_equal(recode(x, a = "A"), c("A", "b", "c")) n <- 1:3 expect_equal(recode(n, `1` = 10L), c(10L, 2L, 3L)) }) test_that(".default is not aliased to .x when missing and not compatible", { x <- letters[1:3] expect_warning(expect_equal(recode(x, a = 1), c(1L, NA, NA))) n <- 1:3 expect_warning(expect_equal(recode(n, `1` = "a"), c("a", NA, NA))) }) test_that("conversion of unreplaced values to NA gives warning", { expect_warning(recode(1:3, `1` = "a"), "treated as NA") expect_warning(recode_factor(letters[1:3], b = 1, c = 2)) }) test_that(".dot argument works correctly (PR #2110)", { x1 <- letters[1:3] x2 <- 1:3 x3 <- factor(x1) expect_equal( recode(x1, a = "apple", b = "banana", .default = NA_character_), recode(x1, .default = NA_character_, !!!list(a = "apple", b = "banana")) ) expect_equal( recode(x1, a = "apple", b = "banana", .default = NA_character_), recode(x1, a = "apple", .default = NA_character_, !!!list(b = "banana")) ) expect_equal( recode(x2, "1" = 4, "2" = 5, .default = NA_real_), recode(x2, .default = NA_real_, !!!list("1" = 4, "2" = 5)) ) expect_equal( recode(x2, "1" = 4, "2" = 5, .default = NA_real_), recode(x2, "1" = 4, .default = NA_real_, !!!list("2" = 5)) ) expect_equal( recode_factor(x3, a = "apple", b = "banana", .default = NA_character_), recode_factor( x3, .default = NA_character_, !!!list(a = "apple", b = "banana") ) ) }) # factor ------------------------------------------------------------------ test_that("default .default works with factors", { expect_equal(recode(factor(letters[1:3]), a = "A"), factor(c("A", "b", "c"))) }) test_that("can recode factor to double", { f <- factor(letters[1:3]) expect_equal(recode(f, a = 1, b = 2, c = 3), c(1, 2, 3)) expect_equal(recode(f, a = 1, b = 2), c(1, 2, NA)) expect_equal(recode(f, a = 1, b = 2, .default = 99), c(1, 2, 99)) }) test_that("recode_factor() handles .missing and .default levels", { x <- c(1:3, NA) expect_warning( expect_equal( recode_factor(x, `1` = "z", `2` = "y"), factor(c("z", "y", NA, NA), levels = c("z", "y")) ) ) expect_equal( recode_factor(x, `1` = "z", `2` = "y", .default = "D"), factor(c("z", "y", "D", NA), levels = c("z", "y", "D")) ) expect_equal( recode_factor(x, `1` = "z", `2` = "y", .default = "D", .missing = "M"), factor(c("z", "y", "D", "M"), c("z", "y", "D", "M")) ) }) test_that("recode_factor() handles vector .default", { expected <- factor(c("a", "z", "y"), levels = c("z", "y", "a")) x1 <- letters[1:3] x2 <- factor(x1) expect_equal(recode_factor(x1, b = "z", c = "y"), expected) expect_equal(recode_factor(x2, b = "z", c = "y"), expected) expect_equal(recode_factor(x1, b = "z", c = "y", .default = x1), expected) expect_equal(recode_factor(x2, b = "z", c = "y", .default = x1), expected) }) test_that("can recode factor with redundant levels", { expect_equal( recode(factor(letters[1:4]), d = "c", b = "a"), factor(c("a", "a", "c", "c"), levels = c("a", "c")) ) expect_equal( recode_factor(letters[1:4], d = "c", b = "a"), factor(c("a", "a", "c", "c"), levels = c("c", "a")) ) }) # Errors -------------------------------------------- test_that("recode() gives meaningful error messages", { expect_snapshot({ (expect_error(recode(factor("a"), a = 5, .missing = 10))) (expect_error(recode("a", b = 5, "c"))) (expect_error(recode(factor("a"), b = 5, "c"))) # no replacement (expect_error(recode(1:5))) (expect_error(recode("a"))) (expect_error(recode(factor("a")))) }) }) dplyr/tests/testthat/test-slice.R0000644000176200001440000004434515106134104016612 0ustar liggesuserstest_that("empty slice drops all rows (#6573)", { df <- tibble(g = c(1, 1, 2), x = 1:3) gdf <- group_by(df, g) rdf <- rowwise(df) expect_identical(slice(df), df[integer(), ]) expect_identical(slice(gdf), gdf[integer(), ]) expect_identical(slice(rdf), rdf[integer(), ]) }) test_that("slicing data.frame yields data.frame", { df <- data.frame(x = 1:3) expect_equal(slice(df, 1), data.frame(x = 1L)) }) test_that("slice keeps positive indices, ignoring out of range (#226)", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) out <- slice(gf, 1) expect_equal(out$id, c(1, 2, 4)) out <- slice(gf, 2) expect_equal(out$id, c(3, 5)) }) test_that("slice drops negative indices, ignoring out of range (#3073)", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) out <- slice(gf, -1) expect_equal(out$id, c(3, 5, 6)) out <- slice(gf, -(1:2)) expect_equal(out$id, 6) }) test_that("slice errors if positive and negative indices mixed", { expect_snapshot(error = TRUE, { slice(tibble(), 1, -1) }) }) test_that("slice ignores 0 and NA (#3313, #1235)", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) out <- slice(gf, 0) expect_equal(out$id, integer()) out <- slice(gf, 0, 1) expect_equal(out$id, c(1, 2, 4)) out <- slice(gf, NA) expect_equal(out$id, integer()) out <- slice(gf, NA, -1) expect_equal(out$id, c(3, 5, 6)) }) test_that("slicing with one-column matrix is deprecated", { df <- tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6) expect_snapshot({ out <- slice(df, matrix(c(1, 3))) }) expect_equal(out$id, c(1, 3)) }) test_that("slice errors if index is not numeric", { expect_snapshot(error = TRUE, { slice(tibble(), "a") }) }) test_that("slice preserves groups iff requested", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) out <- slice(gf, 2, 3) expect_equal(group_keys(out), tibble(g = c(2, 3))) expect_equal(group_rows(out), list_of(1, c(2, 3))) out <- slice(gf, 2, 3, .preserve = TRUE) expect_equal(group_keys(out), tibble(g = c(1, 2, 3))) expect_equal(group_rows(out), list_of(integer(), 1, c(2, 3))) }) test_that("slice handles zero-row and zero-column inputs (#1219, #2490)", { df <- tibble(x = numeric()) expect_equal(slice(df, 1), df) df <- tibble(.rows = 10) expect_equal(slice(df, 1), tibble(.rows = 1)) }) test_that("user errors are correctly labelled", { df <- tibble(x = 1:3) expect_snapshot(error = TRUE, { slice(df, 1 + "") slice(group_by(df, x), 1 + "") }) }) test_that("`...` can't be named (#6554)", { df <- tibble(g = 1, x = 1) expect_snapshot(error = TRUE, { slice(df, 1, foo = g) }) }) test_that("slice keeps zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal(group_size(slice(df, 1)), c(1, 1, 0)) }) test_that("slicing retains labels for zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), g = c(1, 1, 2, 2), x = c(1, 2, 1, 4) ) df <- group_by(df, e, f, g, .drop = FALSE) expect_equal( ungroup(count(slice(df, 1))), tibble( e = 1, f = factor(1:3), g = c(1, 2, NA), n = c(1L, 1L, 0L) ) ) }) test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) out <- slice(df, n(), .by = g) expect_identical(out$g, c(1, 2)) expect_identical(out$x, c(2, 3)) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains bare data.frame class", { df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) out <- slice(df, n(), .by = g) expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains data frame attributes", { # With data.frames or tibbles df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 3)) tbl <- as_tibble(df) attr(df, "foo") <- "bar" attr(tbl, "foo") <- "bar" out <- slice(df, n(), .by = g) expect_identical(attr(out, "foo"), "bar") out <- slice(tbl, n(), .by = g) expect_identical(attr(out, "foo"), "bar") }) test_that("transient grouping orders by first appearance", { df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) out <- slice(df, which(x == max(x)), .by = g) expect_identical(out$g, c(2, 1, 0)) expect_identical(out$x, c(8, 2, 5)) }) test_that("can't use `.by` with `.preserve`", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { slice(df, .by = x, .preserve = TRUE) }) }) test_that("catches `.by` with grouped-df", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { slice(gdf, .by = x) }) }) test_that("catches `.by` with rowwise-df", { df <- tibble(x = 1) rdf <- rowwise(df) expect_snapshot(error = TRUE, { slice(rdf, .by = x) }) }) test_that("catches `by` typo (#6647)", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { slice(df, by = x) }) }) # Slice variants ---------------------------------------------------------- test_that("slice_helpers() call get_slice_size()", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { slice_head(df, n = "a") slice_tail(df, n = "a") slice_min(df, x, n = "a") slice_max(df, x, n = "a") slice_sample(df, n = "a") }) }) test_that("get_slice_size() validates its inputs", { expect_snapshot(error = TRUE, { get_slice_size(n = 1, prop = 1) get_slice_size(n = "a") get_slice_size(prop = "a") }) }) test_that("get_slice_size() snapshots", { expect_snapshot({ body(get_slice_size(prop = 0)) body(get_slice_size(prop = 0.4)) body(get_slice_size(prop = 2)) body(get_slice_size(prop = 2, allow_outsize = TRUE)) body(get_slice_size(prop = -0.4)) body(get_slice_size(prop = -2)) body(get_slice_size(n = 0)) body(get_slice_size(n = 4)) body(get_slice_size(n = 20)) body(get_slice_size(n = 20, allow_outsize = TRUE)) body(get_slice_size(n = -4)) body(get_slice_size(n = -20)) }) }) test_that("get_slice_size() standardises prop", { expect_equal(get_slice_size(prop = 0)(10), 0) expect_equal(get_slice_size(prop = 0.4)(10), 4) expect_equal(get_slice_size(prop = 2)(10), 10) expect_equal(get_slice_size(prop = 2, allow_outsize = TRUE)(10), 20) expect_equal(get_slice_size(prop = -0.4)(10), 6) expect_equal(get_slice_size(prop = -2)(10), 0) }) test_that("get_slice_size() standardises n", { expect_equal(get_slice_size(n = 0)(10), 0) expect_equal(get_slice_size(n = 4)(10), 4) expect_equal(get_slice_size(n = 20)(10), 10) expect_equal(get_slice_size(n = 20, allow_outsize = TRUE)(10), 20) expect_equal(get_slice_size(n = -4)(10), 6) expect_equal(get_slice_size(n = -20)(10), 0) }) test_that("get_slice_size() rounds prop in the right direction", { expect_equal(get_slice_size(prop = 0.16)(10), 1) expect_equal(get_slice_size(prop = -0.16)(10), 9) }) test_that("n must be an integer", { df <- tibble(x = 1:5) expect_snapshot(slice_head(df, n = 1.1), error = TRUE) }) test_that("functions silently truncate results", { # only test positive n because get_slice_size() converts all others df <- tibble(x = 1:5) expect_equal(nrow(slice_head(df, n = 6)), 5) expect_equal(nrow(slice_tail(df, n = 6)), 5) expect_equal(nrow(slice_min(df, x, n = 6)), 5) expect_equal(nrow(slice_max(df, x, n = 6)), 5) expect_equal(nrow(slice_sample(df, n = 6)), 5) }) test_that("slice helpers with n = 0 return no rows", { df <- tibble(x = 1:5) expect_equal(nrow(slice_head(df, n = 0)), 0) expect_equal(nrow(slice_tail(df, n = 0)), 0) expect_equal(nrow(slice_min(df, x, n = 0)), 0) expect_equal(nrow(slice_max(df, x, n = 0)), 0) expect_equal(nrow(slice_sample(df, n = 0)), 0) }) test_that("slice_*() doesn't look for `n` in data (#6089)", { df <- data.frame(x = 1:10, n = 10:1, g = rep(1:2, each = 5)) expect_error(slice_max(df, order_by = n), NA) expect_error(slice_min(df, order_by = n), NA) expect_error(slice_sample(df, weight_by = n, n = 1L), NA) df <- group_by(df, g) expect_error(slice_max(df, order_by = n), NA) expect_error(slice_min(df, order_by = n), NA) expect_error(slice_sample(df, weight_by = n, n = 1L), NA) }) test_that("slice_*() checks that `n=` is explicitly named and ... is empty", { # i.e. that every function calls check_slice_dots() df <- data.frame(x = 1:10) expect_snapshot(error = TRUE, { slice_head(df, 5) slice_tail(df, 5) slice_min(df, x, 5) slice_max(df, x, 5) slice_sample(df, 5) }) # And works with namespace prefix (#6946) expect_snapshot(error = TRUE, { dplyr::slice_head(df, 5) dplyr::slice_tail(df, 5) dplyr::slice_min(df, x, 5) dplyr::slice_max(df, x, 5) dplyr::slice_sample(df, 5) }) expect_snapshot(error = TRUE, { slice_head(df, 5, 2) slice_tail(df, 5, 2) slice_min(df, x, 5, 2) slice_max(df, x, 5, 2) slice_sample(df, 5, 2) }) }) test_that("slice_helpers do call slice() and benefit from dispatch (#6084)", { local_methods( slice.noisy = function(.data, ..., .preserve = FALSE) { warning("noisy") NextMethod() } ) nf <- tibble(x = 1:10, g = rep(1:2, each = 5)) |> group_by(g) class(nf) <- c("noisy", class(nf)) expect_warning(slice(nf, 1:2), "noisy") expect_warning(slice_sample(nf, n = 2), "noisy") expect_warning(slice_head(nf, n = 2), "noisy") expect_warning(slice_tail(nf, n = 2), "noisy") expect_warning(slice_min(nf, x, n = 2), "noisy") expect_warning(slice_max(nf, x, n = 2), "noisy") expect_warning(sample_n(nf, 2), "noisy") expect_warning(sample_frac(nf, .5), "noisy") }) test_that("slice_helper `by` errors use correct error context and correct `by_arg`", { df <- tibble(x = 1) gdf <- group_by(df, x) expect_snapshot(error = TRUE, { slice_head(gdf, n = 1, by = x) slice_tail(gdf, n = 1, by = x) slice_min(gdf, order_by = x, by = x) slice_max(gdf, order_by = x, by = x) slice_sample(gdf, n = 1, by = x) }) }) test_that("slice_helper catches `.by` typo (#6647)", { df <- tibble(x = 1) expect_snapshot(error = TRUE, { slice_head(df, n = 1, .by = x) slice_tail(df, n = 1, .by = x) slice_min(df, order_by = x, .by = x) slice_max(df, order_by = x, .by = x) slice_sample(df, n = 1, .by = x) }) }) # slice_min/slice_max ----------------------------------------------------- test_that("min and max return ties by default", { df <- tibble(id = 1:5, x = c(1, 1, 1, 2, 2)) expect_equal(slice_min(df, x)$id, c(1, 2, 3)) expect_equal(slice_max(df, x)$id, c(4, 5)) expect_equal(slice_min(df, x, with_ties = FALSE)$id, 1) expect_equal(slice_max(df, x, with_ties = FALSE)$id, 4) }) test_that("min and max reorder results", { df <- data.frame(id = 1:4, x = c(2, 3, 1, 2)) expect_equal(slice_min(df, x, n = 2)$id, c(3, 1, 4)) expect_equal(slice_max(df, x, n = 2)$id, c(2, 1, 4)) expect_equal(slice_min(df, x, n = 2, with_ties = FALSE)$id, c(3, 1)) expect_equal(slice_max(df, x, n = 2, with_ties = FALSE)$id, c(2, 1)) }) test_that("min and max include NAs when appropriate", { df <- tibble(id = 1:3, x = c(1, NA, NA)) expect_equal(slice_min(df, x, n = 1)$id, 1) expect_equal(slice_max(df, x, n = 1)$id, 1) expect_equal(slice_min(df, x, n = 2)$id, c(1, 2, 3)) expect_equal(slice_min(df, x, n = 2, with_ties = FALSE)$id, c(1, 2)) df <- tibble(id = 1:4, x = NA) expect_equal(slice_min(df, x, n = 2, na_rm = TRUE)$id, integer()) expect_equal(slice_max(df, x, n = 2, na_rm = TRUE)$id, integer()) }) test_that("min and max ignore NA's when requested (#4826)", { df <- tibble(id = 1:4, x = c(2, NA, 1, 2)) expect_equal(slice_min(df, x, n = 2, na_rm = TRUE)$id, c(3, 1, 4)) expect_equal(slice_max(df, x, n = 2, na_rm = TRUE)$id, c(1, 4)) # Check with list to confirm use full vctrs support df <- tibble(id = 1:4, x = list(NULL, 1, NULL, NULL)) expect_equal(slice_min(df, x, n = 2, na_rm = TRUE)$id, 2) expect_equal(slice_max(df, x, n = 2, na_rm = TRUE)$id, 2) # Drop when any element is missing df <- tibble(id = 1:3, a = c(1, 2, NA), b = c(2, NA, NA)) expect_equal(slice_min(df, tibble(a, b), n = 3, na_rm = TRUE)$id, 1) expect_equal(slice_max(df, tibble(a, b), n = 3, na_rm = TRUE)$id, 1) }) test_that("slice_min/max() count from back with negative n/prop", { df <- tibble(id = 1:4, x = c(2, 3, 1, 4)) expect_equal(slice_min(df, x, n = -1), slice_min(df, x, n = 3)) expect_equal(slice_max(df, x, n = -1), slice_max(df, x, n = 3)) # and can be larger than group size expect_equal(slice_min(df, x, n = -10), df[0, ]) expect_equal(slice_max(df, x, n = -10), df[0, ]) }) test_that("slice_min/max() can order by multiple variables (#6176)", { df <- tibble(id = 1:4, x = 1, y = c(1, 4, 2, 3)) expect_equal(slice_min(df, tibble(x, y), n = 1)$id, 1) expect_equal(slice_max(df, tibble(x, y), n = 1)$id, 2) }) test_that("slice_min/max() work with `by`", { df <- tibble(g = c(2, 2, 1, 1), x = c(1, 2, 3, 1)) expect_identical(slice_min(df, x, by = g), df[c(1, 4), ]) expect_identical(slice_max(df, x, by = g), df[c(2, 3), ]) }) test_that("slice_min/max() inject `with_ties` and `na_rm` (#6725)", { # So columns named `with_ties` and `na_rm` don't mask those arguments df <- tibble(x = c(1, 1, 2, 2), with_ties = 1:4) expect_identical(slice_min(df, x, n = 1), vec_slice(df, 1:2)) expect_identical(slice_min(df, x, n = 1, with_ties = FALSE), vec_slice(df, 1)) expect_identical(slice_max(df, x, n = 1), vec_slice(df, 3:4)) expect_identical(slice_max(df, x, n = 1, with_ties = FALSE), vec_slice(df, 3)) df <- tibble(x = c(1, NA), na_rm = 1:2) expect_identical(slice_min(df, x, n = 2), df) expect_identical(slice_min(df, x, n = 2, na_rm = TRUE), vec_slice(df, 1)) expect_identical(slice_max(df, x, n = 2), df) expect_identical(slice_max(df, x, n = 2, na_rm = TRUE), vec_slice(df, 1)) }) test_that("slice_min/max() check size of `order_by=` (#5922)", { expect_snapshot(error = TRUE, { slice_min(data.frame(x = 1:10), 1:6) slice_max(data.frame(x = 1:10), 1:6) }) }) test_that("slice_min/max() validate simple arguments", { expect_snapshot(error = TRUE, { slice_min(data.frame(x = 1:10)) slice_max(data.frame(x = 1:10)) slice_min(data.frame(x = 1:10), x, with_ties = 1) slice_max(data.frame(x = 1:10), x, with_ties = 1) slice_min(data.frame(x = 1:10), x, na_rm = 1) slice_max(data.frame(x = 1:10), x, na_rm = 1) }) }) # slice_sample ------------------------------------------------------------ test_that("slice_sample() respects weight_by and replaces", { df <- tibble(x = 1:100, wt = c(1, rep(0, 99))) out <- slice_sample(df, n = 1, weight_by = wt) expect_equal(out$x, 1) out <- slice_sample(df, n = 2, weight_by = wt, replace = TRUE) expect_equal(out$x, c(1, 1)) }) test_that("slice_sample() can increase rows iff replace = TRUE", { df <- tibble(x = 1:10) expect_equal(nrow(slice_sample(df, n = 20, replace = FALSE)), 10) expect_equal(nrow(slice_sample(df, n = 20, replace = TRUE)), 20) }) test_that("slice_sample() checks size of `weight_by=` (#5922)", { df <- tibble(x = 1:10) expect_snapshot(slice_sample(df, n = 2, weight_by = 1:6), error = TRUE) }) test_that("slice_sample() works with zero-row data frame (#5729)", { df <- tibble(x = character(), w = numeric()) out <- slice_sample(df, prop = 0.5, weight_by = w) expect_equal(out, df) }) test_that("`slice_sample()` validates `replace`", { df <- tibble() expect_snapshot(error = TRUE, { slice_sample(df, replace = 1) slice_sample(df, replace = NA) }) }) test_that("slice_sample() injects `replace` (#6725)", { # So a column named `replace` doesn't mask that argument df <- tibble(replace = 1) expect_identical(slice_sample(df, n = 2), df) expect_identical( slice_sample(df, n = 2, replace = TRUE), vec_slice(df, c(1, 1)) ) }) test_that("slice_sample() handles positive n= and prop=", { gf <- group_by(tibble(a = 1, b = 1), a) expect_equal(slice_sample(gf, n = 3, replace = TRUE), gf[c(1, 1, 1), ]) expect_equal(slice_sample(gf, prop = 3, replace = TRUE), gf[c(1, 1, 1), ]) }) test_that("slice_sample() handles negative n= and prop= (#6402)", { df <- tibble(a = 1:2) expect_equal(nrow(slice_sample(df, n = -1)), 1) expect_equal(nrow(slice_sample(df, prop = -0.5)), 1) # even if larger than n expect_equal(nrow(slice_sample(df, n = -3)), 0) expect_equal(nrow(slice_sample(df, prop = -2)), 0) }) test_that("slice_sample() works with `by`", { df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) expect_identical(slice_sample(df, n = 2, by = g)$g, c(2, 2, 1)) }) # slice_head/slice_tail --------------------------------------------------- test_that("slice_head/slice_tail keep positive values", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) expect_equal(slice_head(gf, n = 1)$id, c(1, 2, 4)) expect_equal(slice_head(gf, n = 2)$id, c(1, 2, 3, 4, 5)) expect_equal(slice_tail(gf, n = 1)$id, c(1, 3, 6)) expect_equal(slice_tail(gf, n = 2)$id, c(1, 2, 3, 5, 6)) }) test_that("slice_head/tail() count from back with negative n/prop", { df <- tibble(id = 1:4, x = c(2, 3, 1, 4)) expect_equal(slice_head(df, n = -1), slice_head(df, n = 3)) expect_equal(slice_tail(df, n = -1), slice_tail(df, n = 3)) # and can be larger than group size expect_equal(slice_head(df, n = -10), df[0, ]) expect_equal(slice_tail(df, n = -10), df[0, ]) }) test_that("slice_head/slice_tail drop from opposite end when n/prop negative", { gf <- group_by(tibble(g = c(1, 2, 2, 3, 3, 3), id = 1:6), g) expect_equal(slice_head(gf, n = -1)$id, c(2, 4, 5)) expect_equal(slice_head(gf, n = -2)$id, 4) expect_equal(slice_tail(gf, n = -1)$id, c(3, 5, 6)) expect_equal(slice_tail(gf, n = -2)$id, 6) }) test_that("slice_head/slice_tail handle infinite n/prop", { df <- tibble(x = 1) expect_identical(slice_head(df, n = Inf), df) expect_identical(slice_tail(df, n = Inf), df) expect_identical(slice_head(df, n = -Inf), df[0, ]) expect_identical(slice_tail(df, n = -Inf), df[0, ]) expect_identical(slice_head(df, prop = Inf), df) expect_identical(slice_tail(df, prop = Inf), df) expect_identical(slice_head(df, prop = -Inf), df[0, ]) expect_identical(slice_tail(df, prop = -Inf), df[0, ]) }) test_that("slice_head/slice_tail work with `by`", { df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) expect_identical(slice_head(df, n = 2, by = g), df[c(1, 2, 4), ]) expect_identical(slice_tail(df, n = 2, by = g), df[c(2, 3, 4), ]) }) dplyr/tests/testthat.R0000644000176200001440000000006613663216626014546 0ustar liggesuserslibrary(testthat) library(dplyr) test_check("dplyr") dplyr/MD50000644000176200001440000006154115140333347011726 0ustar liggesusers4ca10dc9ec0d46fde5b6208270cadc39 *DESCRIPTION 1fa27e33e5d11d6f7ad224481cd8a618 *LICENSE 88e3340c6cccd4ddf36c1e6b39daf2f1 *NAMESPACE f00f9128880c2080bf9067b9abdfa672 *NEWS.md 12a204efd533ad2050fb89527254a57c *R/across.R 4e7aa335c11f90afbffaa572f94b4fe0 *R/all-equal.R 95c0e11c1ee6a5382b78100f02d0b5ac *R/arrange.R de12bcad153efe7d18fb6f0bf98cf44a *R/bind-cols.R 0df182e1c198e5985b4ca0e977b50f71 *R/bind-rows.R ca131f16a67f6f9443dfe66a8833b24b *R/by.R 7df2b21568fb8ad90bff08a2d9291951 *R/case-match.R bbcd9992e33e9c1b77dda7f7bfaaf401 *R/case-when.R c1e519be2c2a4136bbc90d3e4c1c6812 *R/coalesce.R a2d315f463de2c4ef74ea2c373c48e9f *R/colwise-arrange.R dc5c1c34a15c351229949177ce8a6fb9 *R/colwise-distinct.R a96c6580ae74964363e946865722f1ed *R/colwise-filter.R 2d12caa4fe176fe231b7e23cddd07355 *R/colwise-funs.R 0e199cc62beb3b9e3c0e6b35fee6b89d *R/colwise-group-by.R 96c21c84848a804b83f46661762c98f1 *R/colwise-mutate.R b550ce85c24bc8df37d7070a438dd431 *R/colwise-select.R d60dd72d772aae809df7d2beacdf4378 *R/colwise.R 689d6a8d8b0cdec20ba7d803ca87a746 *R/compat-dbplyr.R 7aceefe1161ba8adbc26b4cb9bbf6c26 *R/compat-name-repair.R 83f247d573c42e0c3ee6dc776b3eec83 *R/compute-collect.R eaff53eaf8cb1ebc34f3a91218d348c1 *R/conditions.R a546372943ed514b622e9bcbcaf4fc11 *R/consecutive-id.R be6dcdc280a91c9838f2076aeb12315f *R/context.R 0c36b56f70d8b062d06209cd54430ad4 *R/copy-to.R b2305688ed6d55ebcd89f8d7a6251729 *R/count-tally.R 5b876ab8c3207fd6b7d12c64085d3f74 *R/data-bands.R e8dffe1c9fd2c88227e0269b63d6ec6e *R/data-mask.R 3d5101b0c532799921a8278ca363f10c *R/data-starwars.R 93cbc7c5aaa7d335a1552b876f0be1bb *R/data-storms.R 057cc89b489db6cb44ef2f05ed3fcbdd *R/dbplyr.R 54ff45e8176ebed557f05c2614dafbb9 *R/defunct-each.R 9726af40bf8ded9be20bfd033a41f8a1 *R/defunct-lazyeval.R 072dec837ae37e4456d719e0ef4c7335 *R/defunct.R 74cfdf22e2062da4370d748309004031 *R/deprec-context.R 6f75b53ee8e8c06f9b656f046c0da8e7 *R/deprec-do.R 01267d01b9dc3d2034ae3bee8b9d37d7 *R/deprec-funs.R 2ebd2250a5222af7e20069c66054a6ad *R/desc.R a3e4157c32a9b600674f20c373fdc17e *R/distinct.R e1a9c3e0a54a300b6dc08cc8bec02ee0 *R/doc-methods.R d6802b61428b201738f7445cf52961fe *R/doc-params.R 15be552cf4dafce0dadf0fecc648d83d *R/dplyr.R afb8acbbed05a93882c6df31d50a74be *R/error.R f6a4aabb00676e043166dfb27423d091 *R/explain.R 2a71239978a2de699d405ec69f7745f5 *R/filter.R 3548393701acb48c06bcd5871ecde353 *R/funs.R 5d7048401e7ef3c12c4e7ad155e90368 *R/generics.R efe1734357c63a6f24ec7fc3b089581f *R/group-by.R 740100ff284e43462825e5a56850c33b *R/group-data.R 5a8abc5e73039f76f80655f519cce836 *R/group-map.R 70d95f293e379a1d9e3d14764d693c7a *R/group-nest.R b58c5f04a00a73989007bae1302f1fc4 *R/group-split.R 96503257ebd9e8354521d702c0ee4269 *R/group-trim.R b07325afd1446ad2a1320e65bf024fba *R/grouped-df.R b9235d007a7833abfa411065e6e60a03 *R/groups-with.R 51fa7348e8aad9098aa3d6bf57057ffc *R/if-else.R 8c5f08458cfd4e7f75a27924b2789c83 *R/import-standalone-obj-type.R 302d9aa7fe4217f36d09bdf2496b82ac *R/import-standalone-purrr.R 09c045dfb7be704d32aec050f68a17a3 *R/import-standalone-types-check.R d725395ec5531d160cd4081d1ade9759 *R/join-by.R 92051e57ad08c8939d9bc9227a84d379 *R/join-cols.R 92f1f8758fcac9a6bbf4a79bfe65c93f *R/join-common-by.R a1fa1bc5e209aa01124e176497ad62df *R/join-cross.R 7d889b7f2b364bdf3e11cf7295050de2 *R/join-rows.R f6f0d34f073206a86fea476172db865d *R/join.R 3db3f9f9a7fae22c84c2b60f1c51acfa *R/lead-lag.R 4c5190dc4a316d50901332ea156472cf *R/locale.R 28794e18f254d82b1afb09ea6ad076ce *R/mutate.R c3e8af38ed384ab63b32e8e111c4c8b1 *R/n-col.R e88a7e03664da18eb88b92840c0ba369 *R/n-distinct.R 0ad0d3178e9bf397f0e1e6ced4cc3564 *R/na-if.R 9f92869c84547da11884ad5523fc4697 *R/near.R 5e803717913c65abebc557cca8e4d10a *R/nest-by.R 9a0628b02b12918b019bc40f552e0a7b *R/nth-value.R 1b12fa5fc953c40eb6b4e0ac91e3b2a2 *R/order-by.R 05dcd21d6c1babdcfc8a1fc52f25d5cd *R/pick.R 001cef67825a4f4513207279cb951597 *R/progress.R e24dacc24febbc2e7c0c99515bfa796e *R/pull.R 54e7c5a01a495094265336db79baba06 *R/rank.R d3f1ba4d9f8cc51f3c7c0a1231088451 *R/recode-values.R 0316e302661210ae5d293773b1fd0f76 *R/recode.R 9aab2581ff22019c5db573e06686f2e1 *R/reexport-magrittr.R 33f0441e9fa2771f280b778addc2b3be *R/reexport-pillar.R a77255a9f80ab02feb7dabca4d829344 *R/reexport-tibble.R c3b94b102cd42403ae9b177d6be2a907 *R/reframe.R fa6f187dac0a521a013d5585e9ab88b0 *R/relocate.R f2ea1bfc87ee5c59098543b5b3318943 *R/rename.R bcc4ffe66cb9dac7ccad9854f0d3e4a0 *R/rows.R 9a6edfc728adc30d5b7a496331dae4c0 *R/rowwise.R 20ac68d17893fe2b5d33bd33986aea9e *R/sample.R f39a9ff5c4dbc25deed0cb8c27a6bee3 *R/select-helpers.R 665c6ab5118f00cefee36c801262e766 *R/select.R 9877610f84918b709c33960e7e7adb83 *R/sets.R e2cc4986ef832ac1cfbf685c73c2f880 *R/slice.R f2cf2fa94689b0f7179774dcc466ffd7 *R/src-dbi.R c3b80d897d87e20f702dc2b1ed49f9b0 *R/src.R 574213c55ce3e06db70c389292154e92 *R/summarise.R f2ec1b5f2698c35e0836426aece8ed5b *R/tbl.R 218b2cce7123098c0e604c0ba886bd68 *R/top-n.R b802ad9cba79f88e2dc6cbcc77e3aea4 *R/transmute.R 482c17e9bda3631c27dcd7d0b5282b1d *R/ts.R 4c23ccc79b0837ad8f647980d3afb384 *R/utils-format.R af7b631a3b93ef46ae7037db17c94a5b *R/utils-tidy-eval.R fc07037e519ca45c8110e4e3c2de465a *R/utils.R cc73e032f17b3add4de5a67545a6a8f7 *R/vctrs.R 4b2881644fa008423f8eb649e61d3d4c *R/when.R 35a43916c1ee18e83138abbaf8ed48af *R/zzz.R 2afb0ef7277d85d625c17bb95a0279ab *README.md 390908e28f427ed5a451e3ed149272f8 *build/stage23.rdb 59609aad6461d04932882d9a524792b7 *build/vignette.rds a79561c8013e7a7f3c23d509f4918bf8 *data/band_instruments.rda 3aa4b1478fc31219480e88c876c3aeed *data/band_instruments2.rda 4d44ad5e4198daccbd4227dca895750b *data/band_members.rda b7f0799100f2923b9a3e00b87e5db157 *data/starwars.rda dd7b89df8dd60498eaf9be647de81148 *data/storms.rda f3f852a1998c77dc3e69e6a6974cdb26 *inst/doc/base.R 2b80f7fdabdad54fef3c43b9b249d90f *inst/doc/base.Rmd 38e6be59bad36c7a3fdc5308948e9f8a *inst/doc/base.html 68f7c9b19ec8664b14518359f44c9e78 *inst/doc/colwise.R 5683c040c5190a9c4bb8aa7c45e85a1c *inst/doc/colwise.Rmd 23c5a7950099473ed612bf35fc82703b *inst/doc/colwise.html 4849181e2d36fe7eded09a4cbc48c306 *inst/doc/dplyr.R be7224a6557fcb932cb5ff32148ac78d *inst/doc/dplyr.Rmd 91c5203ef8771a5bfa42c01f4417129e *inst/doc/dplyr.html d4203f3345df17d4b97aaf7f81d9f250 *inst/doc/grouping.R 45eb4773ba8147e92298bd2e8c061445 *inst/doc/grouping.Rmd 47b2911d259d0d21c9be0bb5de338926 *inst/doc/grouping.html 4a8ed3350978fa0f5740270844cc99e3 *inst/doc/in-packages.R fe4d82679348405f16f72240c684b2fd *inst/doc/in-packages.Rmd d09458b838b1732c9940ceb2583bb47a *inst/doc/in-packages.html 80d649b40621580bbd1e53c9363d2123 *inst/doc/programming.R 8ff0f0c2f68dce47a933fbfdbd1b410b *inst/doc/programming.Rmd 207fb11e4b2549ff0b4fda20b109470e *inst/doc/programming.html b2bedc768cfff0c90da67d5cbf1b3d93 *inst/doc/recoding-replacing.R 7848fb38e9829bfa4f6070080094079d *inst/doc/recoding-replacing.Rmd 29dee08eb6f5a602aae2349e2be50b84 *inst/doc/recoding-replacing.html c6c5bb95169a9224b0ddc359f1fdb7a7 *inst/doc/rowwise.R ce2fd8447f3ddb567f2c476d6a84f4d9 *inst/doc/rowwise.Rmd 2279bef31a1e6fef226be04f6cec62f1 *inst/doc/rowwise.html f699337a2ada4b9cd9a798c613bbd28b *inst/doc/two-table.R 34bcf97cd32148f9cb7bd86004211316 *inst/doc/two-table.Rmd 7055bae1b8f216d0613e3370b1ed8fd9 *inst/doc/two-table.html 686ce0c0ed7c50aa068e7863c7352e87 *inst/doc/window-functions.R 9c75a2060ef61f4c0ca448358e454b78 *inst/doc/window-functions.Rmd 65e2951b2da923370a5d56fc05fb679c *inst/doc/window-functions.html 7beb72bea949d4af58f5db7f1537cf1d *man/across.Rd 5e5865f79f9151d43c2d1a26807fd077 *man/all_equal.Rd fc3ca5a449dfba1b9c181d703989edcf *man/all_vars.Rd 7e0cbb020473fdd515448ef2ab23bcc7 *man/args_by.Rd 109b45b8cd6d8ae66f1c9fa852bd4c2c *man/arrange.Rd fb5d2e7a97578a7654eab6ab6d96bd04 *man/arrange_all.Rd c0986838d8faabf1cedeec484a840a65 *man/auto_copy.Rd 1fe1b4696a46635cf0566543c167fcea *man/backend_dbplyr.Rd 95f8bf5158b02ad31c35c25101661326 *man/band_members.Rd f88d80a26102343989a74f4434a30b8f *man/between.Rd 6452d27792ace197bd1debc150688a6f *man/bind_cols.Rd 0afa1ffc285d2834c5a41964dedb6107 *man/bind_rows.Rd a3c23ae2ee8a21f5d65331489183f46d *man/c_across.Rd 76cbf5f43659eb0e3808e306c0bf34ac *man/case-and-replace-when.Rd 2670a9f56c5897874545ac3c7fc90aef *man/case_match.Rd 408ca2c722f9c0be2eb5429f11fdc7fb *man/check_dbplyr.Rd 3d14987a0f01d8c5d684ca764449b14d *man/coalesce.Rd dde6e8ebe2d80016fa0e555593df4736 *man/common_by.Rd acb68751f66a44758a04564303190cb1 *man/compute.Rd 49635cadb3b2660e196a7275b3e5c1fd *man/consecutive_id.Rd 26e4268215421f571da56a315a8375f9 *man/context.Rd 27267e225bb75e5bfabe22b36542d468 *man/copy_to.Rd a69b705edf3600b10e9acd0960712818 *man/count.Rd 13dc8db480d1284208ab6597ebb9ff59 *man/cross_join.Rd 7d3aa62f07bed9e9c8637dac348040e0 *man/cumall.Rd 9863afedd6c5f95504c33cc14efb63ca *man/defunct-each.Rd b39b56af82327c513a999e08d5f1ac8c *man/defunct-lazyeval.Rd 9372d06480eeb64ac80c827d79782c19 *man/defunct.Rd ad2b164deec391787f59795a9c25e905 *man/deprec-context.Rd 05216644f8efa9c26a97d38c7207e1a2 *man/desc.Rd f3844446fa59fedc153689579defecb0 *man/dim_desc.Rd ab551e289d0a5ad555aa5fad3e061414 *man/distinct.Rd f38d8386af87e6000c068ce6c122a092 *man/distinct_all.Rd 32542005fa1b85f655358e1e2b553825 *man/do.Rd c0737068100b3495d8174b2c9d52624c *man/dplyr-locale.Rd b9aad8050f72309d5cbabc61e7fb8b5b *man/dplyr-package.Rd 5bcd95b10ad8adc049217022efbe8148 *man/dplyr_by.Rd a8a41bb128dbb47187566c45bfdfca04 *man/dplyr_data_masking.Rd 9894b613cc89cb16ce1b6932eb617310 *man/dplyr_extending.Rd 77bad0ab5896994bc755cf7e4b404c75 *man/dplyr_tidy_select.Rd 28643e7ff9cb2dc026fc3db666c0158e *man/explain.Rd cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg 952b59dc07b171b97d5d982924244f61 *man/figures/lifecycle-maturing.svg 27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg 46de21252239c5a23d400eae83ec6b2d *man/figures/lifecycle-retired.svg 6902bbfaf963fbc4ed98b86bda80caa2 *man/figures/lifecycle-soft-deprecated.svg 53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg 1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg 1c9275a703baa038ba079452e917859c *man/figures/logo.png a1ec84e39bd65d32dbd695f60312ca82 *man/filter-joins.Rd 8b8693fbf0c5f2b88a2fd12fad604ec0 *man/filter.Rd a87bdbd545a96906e138a0cc5e1a4e51 *man/filter_all.Rd 4a981d03b2890607274edf8c2bb56b3c *man/funs.Rd 507ac66fa13f805f3cabd9f918d9f8a6 *man/glimpse.Rd 65500b263234f8483a6aaa44fed5912b *man/group_by.Rd c337d26a09f2c25a696061a1b239d9b3 *man/group_by_all.Rd 3a1ad74bf4dd5ad8ce1874da13c675db *man/group_by_drop_default.Rd ea8fc1b365d72661a1673577e9ef3176 *man/group_by_prepare.Rd 163c803b20550ad7c358aa9e98f2bc90 *man/group_cols.Rd 06aab5bc1b3e3aacf23ef86731918a31 *man/group_data.Rd d63f27f5ab9ea1d83ba512314b4d8c54 *man/group_map.Rd b468d1dfd6c34d8356531297ca8a5e61 *man/group_nest.Rd 4a736662ae202159b42d5b501356d09a *man/group_split.Rd 27f4ac5a90dd0faf70e604038d65ae24 *man/group_trim.Rd 10bb88f3a0e92972d8310247f42a5326 *man/grouped_df.Rd f1cb723967400b1953c3b0188df9f92a *man/ident.Rd a4292f544a07ea3a8352e6ab03397afb *man/if_else.Rd 1d1139ec2bfe64b5a3513f21d292fe4d *man/join_by.Rd 73c4160301b17422973415f7246cea67 *man/last_dplyr_warnings.Rd 8c37d3bfe0b863bcf2da03803c60f539 *man/lead-lag.Rd 1caecd2aca352fbb3107bd9f6518ceca *man/make_tbl.Rd 786b5a4dbc91c07cbec3da9ae24a26da *man/mutate-joins.Rd 61fd1c736452228362f4c6e355ea5c67 *man/mutate.Rd 27dd4547d99082b0a772755c54bed773 *man/mutate_all.Rd e5fc124fa2e4d735d686adfa459ee2ce *man/n_distinct.Rd 010b2e6733ba1f8c91566b025ad452b9 *man/na_if.Rd 0c99aebdb1662bb9e9fc97334212cc4d *man/near.Rd b81df64b1ecdb70e5fc7958926d0067b *man/nest_by.Rd 0965dc37659996e774aa4441cc9d8916 *man/nest_join.Rd c63195ca8a28bc11dfbed073f3436264 *man/new_grouped_df.Rd 4d1947391e9210cea1bc6cde3f488ae7 *man/nth.Rd f5db068d8fa8477b3c54ed168301f45e *man/ntile.Rd 535b4dbee16f84764a5a889687d83159 *man/order_by.Rd 775253850568ce83bc89f3778a000b47 *man/percent_rank.Rd 2cf9b230c5653fa9e686bec95859a409 *man/pick.Rd 771a1b83656a2c73444c14f67231fc8b *man/progress_estimated.Rd d127ca73719972837e267e1c84d8c846 *man/pull.Rd 0db91dd541f11ec0d40f665844da35cc *man/recode-and-replace-values.Rd 9ca38238d01a236c746c1ca4a768243d *man/recode.Rd 7b3a653c018c43b68b45af0e891406d7 *man/reexports.Rd af33135129043b7780a741e8dbafe45d *man/reframe.Rd 29e79b0c6b341845d98deeda31c1ffbc *man/relocate.Rd fae253bc7d56873ad8eea91e0652239b *man/rename.Rd e4f204d228eedd343567a17ffc1ff4c8 *man/rmd/by.Rmd b08113cbec5e246ced3dae5437c3aa35 *man/rmd/overview.Rmd 6d879a98671c8cfa53b57feb32afba55 *man/rmd/select.Rmd a753fe73d0a62db5b4dcd4d094346956 *man/row_number.Rd 2868c2097b7a287618f1a6bad0ab8e33 *man/rows.Rd 22d3c635aa610dbb9231314c9ddd1b2e *man/rowwise.Rd 988ce8f1ec9a57ceca65bb3454cc1d20 *man/same_src.Rd 07d7cb164ad6e346243c7379b8fd1f07 *man/sample_n.Rd 51c5d5f440f86ff70438da9ef9ff671e *man/scoped.Rd 7242f98bd731807bd7b2a60983783c71 *man/select.Rd fb88790d1f45fa7e8bf05d317d46631e *man/select_all.Rd 1438b8a19e545dcc119495bae06148d3 *man/setops.Rd 3dfb75d3ec59a69245d45bd2f2b2c0af *man/slice.Rd 58452b35baf6118d7567b5cc0ce5aa9d *man/sql.Rd b095c446446ccec5ad5a71f7c67abb2e *man/src.Rd aa497adc8e0bf111028008b2c9fbfc2d *man/src_tbls.Rd 34225333bb8746a5cc36d133da79382d *man/starwars.Rd e354523b3b7ea5dcf44fd7237654a6ed *man/storms.Rd 3065821cb93bfaf38520bed845242955 *man/summarise.Rd 1e0ef0b2866e0792e9f709debb1ff177 *man/summarise_all.Rd c116f7273062b1cea8dbb10ecb2ebfd0 *man/tbl.Rd ff0579184a0570e6393a9487639a9bc8 *man/tbl_ptype.Rd 12b8d60664b65b89bdabf47c859bce32 *man/tbl_vars.Rd c2e6c2efbbf4d3935f37a736a4bff145 *man/tidyeval-compat.Rd d97f9107ae5c1c3c290cefff4a6f7082 *man/top_n.Rd 248651e2177fb50827b58603ab1ede0f *man/transmute.Rd 7ec42f01d5daceaf0c324cdd3e4001c6 *man/vars.Rd 85cffcfa7a19eb7706387386db6b6057 *man/when-any-all.Rd 665dd00b0cea25b3106ca1df01b21e0c *man/with_groups.Rd 0f716a00c40a985424f71afd4d758a80 *man/with_order.Rd b54ecd9fca5ba3ef0fa5f1296a48a180 *src/chop.cpp d37ec2b37f9888ea569c958930196483 *src/dplyr.h d71c88ed4cb9f4b14930c023d6e2b426 *src/filter.cpp 61d1b344819372c36879d75d4c0ff591 *src/funs.cpp 3f53ec1012ce7ebc98f5de1829a98764 *src/group_by.cpp 67ccfa80646c08b438ebf67a38667549 *src/group_data.cpp f84cc19491ab8b057f3e5ba251d61c47 *src/imports.cpp e026b22c1da8b255026390760724278f *src/init.cpp f63b614b4a8837867349630ee83bd311 *src/mask.cpp c7355467bfd2895af9e6f7db98efbff3 *src/mutate.cpp 42f0a0da29d8577ccb28d3c28887a9b0 *src/reconstruct.cpp e677f0014a3e2b0c0fe9746e91268859 *src/slice.cpp 9763ca71d49e53ddad538b16ea150f37 *src/summarise.cpp d28bcadabdbb2b99dfeca9181c55eb93 *src/utils.h 60c24a9c9c03f728e0d81d86fa6ca4d0 *tests/testthat.R 09fe626e79e5a99d1dca2ba79cd7e0b2 *tests/testthat/_snaps/across.md 239b530d0c8ec531136121ffc1279f97 *tests/testthat/_snaps/all-equal.md a0d5c974718f4327865bd6e2ab253ba8 *tests/testthat/_snaps/arrange.md 53712b17e76e8e65a58039553f709a1b *tests/testthat/_snaps/bind-cols.md d4262e4f1d27bee8d3d12a123e0fa9de *tests/testthat/_snaps/bind-rows.md ca8069fe3007b535a24f3c80a562fc34 *tests/testthat/_snaps/by.md dad40f9c769677e881ea516dbf9cabe8 *tests/testthat/_snaps/case-match.md 5247c658d6df128f9a4fd3ea1c587cfd *tests/testthat/_snaps/case-when.md 1fc37c52ad99ff5fae8da7f23a6de783 *tests/testthat/_snaps/coalesce.md a3be2f2082973753d912e3d4417e4360 *tests/testthat/_snaps/colwise-filter.md 59092c6fb36e1834e6d52a27e89348e8 *tests/testthat/_snaps/colwise-mutate.md d4cad704241424436eb2c16fbd5be619 *tests/testthat/_snaps/colwise-select.md 344c594226a02e072933b23638c3f9b0 *tests/testthat/_snaps/colwise.md 7d1096ba89cc8088b437dfd419ee36a7 *tests/testthat/_snaps/conditions.md 8e9c4225047ba6eb3c06eec9264bb481 *tests/testthat/_snaps/consecutive-id.md ef6207a499a8e561c3336e0748c69364 *tests/testthat/_snaps/context.md 6abc687e7ef86d1809fd19312ddd5864 *tests/testthat/_snaps/copy-to.md 69e4cd4b74ced6de327451d0c6a4078b *tests/testthat/_snaps/count-tally.md 591fbaad37b4f37d2b14cea2b754b7fe *tests/testthat/_snaps/defunct-each.md 8c5c97e0dec7767419d11e82948c400a *tests/testthat/_snaps/defunct-lazyeval.md 2fa12b01ed7c334e03cee3a881db533e *tests/testthat/_snaps/defunct.md c19889046852acef5930ccf8c3a9229a *tests/testthat/_snaps/deprec-context.md f5daf236b5111acd635cfe0cb0ecaa6f *tests/testthat/_snaps/deprec-do.md cc527570dc25d1a4ba00caec5002d8a1 *tests/testthat/_snaps/deprec-funs.md 399e68e740d4f683ef080eb1e526f1a8 *tests/testthat/_snaps/desc.md db2c9199aceb103196fe27dc77a56c6b *tests/testthat/_snaps/distinct.md d44984c92b34cfc88c38f759597d897f *tests/testthat/_snaps/filter.md 6fe1f31adefb8468db4ace6072824d25 *tests/testthat/_snaps/funs.md 7918f14576659fe7d379fb9c9288d750 *tests/testthat/_snaps/group-by.md 3aae9c89d14b42864d4300953fbc53bc *tests/testthat/_snaps/group-data.md d5cb8431973452be3c01ca9c223cf332 *tests/testthat/_snaps/group-map.md a60da778f7b36d7069db545d447b052d *tests/testthat/_snaps/group-nest.md f9dd8d461783a3dbeb9e0ae4084633c9 *tests/testthat/_snaps/group-split.md f3c173f54665682eb2d2eb2171664052 *tests/testthat/_snaps/grouped-df.md 4058e4761dae6a5dc5f462a64435aaee *tests/testthat/_snaps/if-else.md 4d778608e3fbccc770d45af80e2793c6 *tests/testthat/_snaps/join-by.md 72b0bc8a58f17c321637e811bf24baeb *tests/testthat/_snaps/join-cols.md dac9cfecd2c566cbee2400a970fc1a22 *tests/testthat/_snaps/join-cross.md 95cd294f578d6600f9b68677cbd0bd73 *tests/testthat/_snaps/join-rows.md ceab7c96ab407163ac1982310fc0b30c *tests/testthat/_snaps/join.md 667a66fa2a8d955121ea7155b84b94af *tests/testthat/_snaps/lead-lag.md 0d87ea3e51515a272c7e266fceef9d3c *tests/testthat/_snaps/locale.md c7dabb9cbe22d8a0e5e5af69dc7afd52 *tests/testthat/_snaps/mutate.md 8a812ca96383b875a7869f0da2527cfb *tests/testthat/_snaps/n-distinct.md 8292e5c401cbcb087ff4e2225c3e5e5e *tests/testthat/_snaps/na-if.md 726bd4dbbbb8e0ce4606e727b6dc2792 *tests/testthat/_snaps/nth-value.md 78ffbe99f6ff0a217abafa32a3c5e020 *tests/testthat/_snaps/order-by.md 79d6455e6e9cee991831f08b18577911 *tests/testthat/_snaps/pick.md 5a6cf7c95915ce5c018962f4bd3c4c9e *tests/testthat/_snaps/rank.md c51f78331639907049434e6139d024bc *tests/testthat/_snaps/recode-values.md 42c0f06f90606b881ca1059575e41d87 *tests/testthat/_snaps/recode.md 7ad69224d2e27b23a41aa11565ede869 *tests/testthat/_snaps/reframe.md bc055b59f8f0decc34e9e85f34801dae *tests/testthat/_snaps/relocate.md 6046982d2ca8766341f2b34a699aed5f *tests/testthat/_snaps/rename.md 0aba62069e295a9874e28996251995c9 *tests/testthat/_snaps/rows.md 53489be7738869efc66645feff3ae75e *tests/testthat/_snaps/rowwise.md 00b5d8781aedf0c2e2a320fb663c0673 *tests/testthat/_snaps/sample.md 2c5e135a4395020f2e5c7fca63c99dee *tests/testthat/_snaps/select-helpers.md 94751a12ce5d0d72bd432d3e9fe4f9de *tests/testthat/_snaps/select.md dca965b05cbfad13401d9e605ca021bb *tests/testthat/_snaps/sets.md d5a2f9e6a064e69ee817b3df501bb170 *tests/testthat/_snaps/slice.md de657c4e4467926fe30af0b82725d470 *tests/testthat/_snaps/summarise.md 72b2f086d5f8154b91f7817d7ac2c51d *tests/testthat/_snaps/top-n.md 07e61ca097e0511318a0a59f689fa458 *tests/testthat/_snaps/transmute.md 99009301e243bd8f583545bafbd5c28e *tests/testthat/_snaps/when.md ce71038c4d0b3e45faf92a87c141938e *tests/testthat/helper-dplyr.R fb4f3b8f833f39bbdac9828a3947101c *tests/testthat/helper-encoding.R ee63564f6f23b276c9a5e43e05b283fa *tests/testthat/helper-lazy.R 184b9a3701693b11c886455f587fd9d4 *tests/testthat/helper-pick.R e96ec1478ea3b93273f7eb1011b4223f *tests/testthat/helper-s3.R 5c40bc3557d7e68c3f963b4f428f5c20 *tests/testthat/helper-torture.R 9b1454daf972b9b05ae86c6d72ce47a4 *tests/testthat/test-across.R 0d2494c8e5bc18a980af3ed04df0bf66 *tests/testthat/test-all-equal.R 8709c0fab055bfdee09ec24ce4ed3982 *tests/testthat/test-arrange.R 720cceed003c8f840d9d44bb7194bb8f *tests/testthat/test-bind-cols.R 05790cda87eb2a48f85340801aa9aa8d *tests/testthat/test-bind-rows.R 3a77f6be418406835f35687008522dd6 *tests/testthat/test-by.R 6ea50dc4d2a9f278ef34ceb3a6670de2 *tests/testthat/test-case-match.R 18c46522de12dcdb4320fe6080e59a34 *tests/testthat/test-case-when.R e180f028e3b82e72c2e766c6f21a002e *tests/testthat/test-coalesce.R 3558b9d7126d5e56320f50399c1e3ab8 *tests/testthat/test-colwise-arrange.R 66d3a2da5ec9b15614781234738955ba *tests/testthat/test-colwise-distinct.R b81f92717e94ff752581c25ca29b53ee *tests/testthat/test-colwise-filter.R 989caa75d2c5938cdb53e2cbae96e8cb *tests/testthat/test-colwise-funs.R b2a77fc853f97a75fdf3cbd755ddfae4 *tests/testthat/test-colwise-group-by.R 9a7805fc73fc46ddcfabf672532fe474 *tests/testthat/test-colwise-mutate.R 1303a7e203a1d14496082ddb1842aa9f *tests/testthat/test-colwise-select.R f4220656298d2ca1fdf3e1ebf7c3eed5 *tests/testthat/test-colwise.R c3bff8ef681c7f356f912065589a44a2 *tests/testthat/test-conditions.R 2d0baee20f63a6d778bd1a133a01d5ad *tests/testthat/test-consecutive-id.R 274b55875fa6736747f558556684f8cc *tests/testthat/test-context.R ea3dac87c76870b942e3010ca1431b56 *tests/testthat/test-copy-to.R c9e36357e6e014b5b32e3f230927ef71 *tests/testthat/test-count-tally.R 2ee1f7101d72de5512d1e45a1ddf3825 *tests/testthat/test-data-mask.R 5b39c8c68f11c207ac9d095911da7a13 *tests/testthat/test-defunct-each.R e4156e05cc8b96037b0fe4c3a6fa15e0 *tests/testthat/test-defunct-lazyeval.R d6a855fd3a881b25c79b23ea81b22c73 *tests/testthat/test-defunct.R 0c2118d61d5bdffe107cb3ae57df762c *tests/testthat/test-deprec-context.R b355a0ac47b2cadcb0965a0b54b6f384 *tests/testthat/test-deprec-do.R 89736e89bd513ef16e47346f40f345d9 *tests/testthat/test-deprec-funs.R acf56d27f1cab0e1fa47a047d04152cf *tests/testthat/test-desc.R db9b94ee31473e78e1049a9e12e0a0b6 *tests/testthat/test-distinct.R 4b0c2adaa6a8342076cc31ea5e25a9eb *tests/testthat/test-filter.R 80d8711519cb81d883530e321e22c972 *tests/testthat/test-funs.R 262890aaa2d363ccba6c2afefe91384b *tests/testthat/test-generics.R 64062bd9c3d19acb6478bece9f100aaa *tests/testthat/test-group-by.R be8bb02f79b3ad121c935a34096633be *tests/testthat/test-group-data.R eb14e52f00d3f146b7adb7f130fda313 *tests/testthat/test-group-map.R a8d3e9db32e4576191f9d776dd0cd346 *tests/testthat/test-group-nest.R 3dcc0bfdec8b5c5d0a1cad743b900f2d *tests/testthat/test-group-split.R 7b52d9fc875197ed0cd26dbf4ab0d5ae *tests/testthat/test-group-trim.R 5785ce4259ef7fe2395c5e96f7a7f0fb *tests/testthat/test-grouped-df.R 02220a49617b5e8dbb284e3d07661bf4 *tests/testthat/test-groups-with.R 570bab13dcc5c6abd2d8f6160156cbc2 *tests/testthat/test-if-else.R 1faf8ccdb6af1d55a6406d7327cbb4bf *tests/testthat/test-join-by.R c1a9322ce10f6cb534eb1353508c619e *tests/testthat/test-join-cols.R fa27f5dbfe980b2090e642f3205072e1 *tests/testthat/test-join-cross.R c88ce2cdb75fd602977ae5d8edf82622 *tests/testthat/test-join-rows.R d89f517eb88fae2cccf071aea0cabf82 *tests/testthat/test-join.R 73c948ca1084a0db496696dda447836f *tests/testthat/test-lead-lag.R de0f770a9cb0ab5f4110b4c2eec359e2 *tests/testthat/test-locale.R 3a79d15b98923c4637650e32c087637c *tests/testthat/test-mutate.R 6db6e2391ed314adbebd6bade950e858 *tests/testthat/test-n-distinct.R 03a733f0a357a8551e1d2163ec106402 *tests/testthat/test-na-if.R 5ea93280062e3dc5258f73736989706d *tests/testthat/test-near.R 08a7f7ff9cdd521013ea3a3a56ff92f5 *tests/testthat/test-nest-by.R 433ca06f6e54815613373a8b817054e2 *tests/testthat/test-nth-value.R af8dc848df3244214c86ec45fb98e0c8 *tests/testthat/test-order-by.R 0b3d873cce7af1b52d91e3facc248910 *tests/testthat/test-pick.R 65c13e813a1ba14107e7b5e72e09ee0a *tests/testthat/test-pull.R 66283b197eab97b3145dd20c3e9711df *tests/testthat/test-rank.R c5826a46b0933937b75b9cd036467bd5 *tests/testthat/test-recode-values.R 19d902f3a8ccb2eaeafd42297d645d26 *tests/testthat/test-recode.R ba4f7d3f50ec3c6a793e7b370ed82353 *tests/testthat/test-reframe.R 97ec922dd2976459db36aef2b54ebcfa *tests/testthat/test-relocate.R 7c49cc6facacaca98bc291df4f050150 *tests/testthat/test-rename.R 5999d22542d11551fb87e2506cc9f15b *tests/testthat/test-rows.R 2a2f5301fb5b989eb82d42f473728479 *tests/testthat/test-rowwise.R 381d8e13710138c4f8323c334912ff94 *tests/testthat/test-sample.R 419861650696e8f2410bcc75d49ee2fa *tests/testthat/test-select-helpers.R d0e6c6b0d25fcc08e12f93082fd3060d *tests/testthat/test-select.R 3d905f5428709a73640126fd5a73f503 *tests/testthat/test-sets.R 162433bc74e5ac104f15c79afacc6345 *tests/testthat/test-slice.R db565b0d20e096ffb506dd27c7d20f55 *tests/testthat/test-src-dbi.R 328ebb68adc9802e8c76ef7443d5ae3a *tests/testthat/test-summarise.R f97a14a5701b40eba1580202c66cad3b *tests/testthat/test-tbl.R a921848945d326ce71b4f89f35f5be4a *tests/testthat/test-top-n.R 80cc2521bcfc65b1290ffca047657364 *tests/testthat/test-transmute.R f4666100b9074677d59e84c2901cc8c8 *tests/testthat/test-utils.R 2e41e334b58685459473b82d142fa0c7 *tests/testthat/test-when.R a1f7324daafb911eaef71fe3069e7d83 *tests/testthat/utf-8.txt 2b80f7fdabdad54fef3c43b9b249d90f *vignettes/base.Rmd 5683c040c5190a9c4bb8aa7c45e85a1c *vignettes/colwise.Rmd be7224a6557fcb932cb5ff32148ac78d *vignettes/dplyr.Rmd 45eb4773ba8147e92298bd2e8c061445 *vignettes/grouping.Rmd fe4d82679348405f16f72240c684b2fd *vignettes/in-packages.Rmd 8ff0f0c2f68dce47a933fbfdbd1b410b *vignettes/programming.Rmd 7848fb38e9829bfa4f6070080094079d *vignettes/recoding-replacing.Rmd ce2fd8447f3ddb567f2c476d6a84f4d9 *vignettes/rowwise.Rmd 34bcf97cd32148f9cb7bd86004211316 *vignettes/two-table.Rmd 9c75a2060ef61f4c0ca448358e454b78 *vignettes/window-functions.Rmd dplyr/R/0000755000176200001440000000000015137234433011612 5ustar liggesusersdplyr/R/group-by.R0000644000176200001440000002214715106134104013475 0ustar liggesusers#' Group by one or more variables #' #' @description #' Most data operations are done on groups defined by variables. #' `group_by()` takes an existing tbl and converts it into a grouped tbl #' where operations are performed "by group". `ungroup()` removes grouping. #' #' @family grouping functions #' @inheritParams arrange #' @param ... <[`data-masking`][rlang::args_data_masking]> In `group_by()`, #' variables or computations to group by. Computations are always done on the #' ungrouped data frame. To perform computations on the grouped data, you need #' to use a separate `mutate()` step before the `group_by()`. #' Computations are not allowed in `nest_by()`. #' In `ungroup()`, variables to remove from the grouping. #' @param .add When `FALSE`, the default, `group_by()` will #' override existing groups. To add to the existing groups, use #' `.add = TRUE`. #' @param .drop Drop groups formed by factor levels that don't appear in the #' data? The default is `TRUE` except when `.data` has been previously #' grouped with `.drop = FALSE`. See [group_by_drop_default()] for details. #' @return A grouped data frame with class [`grouped_df`][grouped_df], #' unless the combination of `...` and `add` yields a empty set of #' grouping columns, in which case a tibble will be returned. #' @section Methods: #' These function are **generic**s, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' #' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. #' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. #' #' @section Ordering: #' Currently, `group_by()` internally orders the groups in ascending order. This #' results in ordered output from functions that aggregate groups, such as #' [summarise()]. #' #' When used as grouping columns, character vectors are ordered in the C locale #' for performance and reproducibility across R sessions. If the resulting #' ordering of your grouped operation matters and is dependent on the locale, #' you should follow up the grouped operation with an explicit call to #' [arrange()] and set the `.locale` argument. For example: #' #' ``` #' data |> #' group_by(chr) |> #' summarise(avg = mean(x)) |> #' arrange(chr, .locale = "en") #' ``` #' #' This is often useful as a preliminary step before generating content intended #' for humans, such as an HTML table. #' #' ## Legacy behavior #' #' `r lifecycle::badge("deprecated")` #' #' Prior to dplyr 1.1.0, character vector grouping columns were ordered in the #' system locale. Setting the global option `dplyr.legacy_locale` to `TRUE` #' retains this legacy behavior, but this has been deprecated. Update existing #' code to explicitly call `arrange(.locale = )` instead. Run #' `Sys.getlocale("LC_COLLATE")` to determine your system locale, and compare #' that against the list in [stringi::stri_locale_list()] to find an appropriate #' value for `.locale`, i.e. for American English, `"en_US"`. #' #' @export #' @examples #' by_cyl <- mtcars |> group_by(cyl) #' #' # grouping doesn't change how the data looks (apart from listing #' # how it's grouped): #' by_cyl #' #' # It changes how it acts with the other dplyr verbs: #' by_cyl |> summarise( #' disp = mean(disp), #' hp = mean(hp) #' ) #' by_cyl |> filter(disp == max(disp)) #' #' # Each call to summarise() removes a layer of grouping #' by_vs_am <- mtcars |> group_by(vs, am) #' by_vs <- by_vs_am |> summarise(n = n()) #' by_vs #' by_vs |> summarise(n = sum(n)) #' #' # To removing grouping, use ungroup #' by_vs |> #' ungroup() |> #' summarise(n = sum(n)) #' #' # By default, group_by() overrides existing grouping #' by_cyl |> #' group_by(vs, am) |> #' group_vars() #' #' # Use add = TRUE to instead append #' by_cyl |> #' group_by(vs, am, .add = TRUE) |> #' group_vars() #' #' # You can group by expressions: this is a short-hand #' # for a mutate() followed by a group_by() #' mtcars |> #' group_by(vsam = vs + am) #' #' # The implicit mutate() step is always performed on the #' # ungrouped data. Here we get 3 groups: #' mtcars |> #' group_by(vs) |> #' group_by(hp_cut = cut(hp, 3)) #' #' # If you want it to be performed by groups, #' # you have to use an explicit mutate() call. #' # Here we get 3 groups per value of vs #' mtcars |> #' group_by(vs) |> #' mutate(hp_cut = cut(hp, 3)) |> #' group_by(hp_cut) #' #' # when factors are involved and .drop = FALSE, groups can be empty #' tbl <- tibble( #' x = 1:10, #' y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) #' ) #' tbl |> #' group_by(y, .drop = FALSE) |> #' group_rows() #' group_by <- function( .data, ..., .add = FALSE, .drop = group_by_drop_default(.data) ) { UseMethod("group_by") } #' @export group_by.data.frame <- function( .data, ..., .add = FALSE, .drop = group_by_drop_default(.data) ) { groups <- group_by_prepare( .data, ..., .add = .add, error_call = current_env() ) grouped_df(groups$data, groups$group_names, .drop) } #' @rdname group_by #' @export #' @param x A [tbl()] ungroup <- function(x, ...) { UseMethod("ungroup") } #' @export ungroup.grouped_df <- function(x, ...) { if (missing(...)) { as_tibble(x) } else { old_groups <- group_vars(x) to_remove <- tidyselect::eval_select( expr = expr(c(...)), data = x, allow_rename = FALSE ) to_remove <- names(to_remove) new_groups <- setdiff(old_groups, to_remove) group_by(x, !!!syms(new_groups)) } } #' @export ungroup.rowwise_df <- function(x, ...) { check_dots_empty() as_tibble(x) } #' @export ungroup.data.frame <- function(x, ...) { check_dots_empty() x } #' Prepare for grouping and other operations #' #' `*_prepare()` performs standard manipulation that is needed prior #' to actual data processing. They are only be needed by packages #' that implement dplyr backends. #' #' @return A list #' \item{data}{Modified tbl} #' \item{groups}{Modified groups} #' @export #' @keywords internal group_by_prepare <- function( .data, ..., .add = FALSE, .dots = deprecated(), add = deprecated(), error_call = caller_env() ) { error_call <- dplyr_error_call(error_call) if (!missing(add)) { lifecycle::deprecate_stop("1.0.0", "group_by(add = )", "group_by(.add = )") } if (!missing(.dots)) { lifecycle::deprecate_stop("1.0.0", "group_by(.dots = )") } new_groups <- enquos(..., .ignore_empty = "all") # If any calls, use mutate to add new columns, then group by those computed_columns <- add_computed_columns( .data, new_groups, error_call = error_call ) out <- computed_columns$data group_names <- computed_columns$added_names if (.add) { group_names <- union(group_vars(.data), group_names) } unknown <- setdiff(group_names, tbl_vars(out)) if (length(unknown) > 0) { bullets <- c( "Must group by variables found in `.data`.", x = glue("Column `{unknown}` is not found.") ) abort(bullets, call = error_call) } list( data = out, groups = syms(group_names), group_names = group_names ) } add_computed_columns <- function(.data, vars, error_call = caller_env()) { is_symbol <- map_lgl(vars, quo_is_variable_reference) needs_mutate <- have_name(vars) | !is_symbol if (any(needs_mutate)) { # TODO: use less of a hack if (inherits(.data, "data.frame")) { bare_data <- ungroup(.data) by <- compute_by(by = NULL, data = bare_data) cols <- mutate_cols( bare_data, dplyr_quosures(!!!vars), by = by, error_call = error_call ) out <- dplyr_col_modify(.data, cols) col_names <- names(cols) } else { out <- mutate(.data, !!!vars) col_names <- names(exprs_auto_name(vars)) } } else { out <- .data col_names <- names(exprs_auto_name(vars)) } list(data = out, added_names = col_names) } quo_is_variable_reference <- function(quo) { if (quo_is_symbol(quo)) { return(TRUE) } if (quo_is_call(quo, n = 2)) { expr <- quo_get_expr(quo) if (is_call(expr, c("$", "[["))) { if (!identical(expr[[2]], sym(".data"))) { return(FALSE) } param <- expr[[3]] if (is_symbol(param) || is_string(param)) { return(TRUE) } } } FALSE } #' Default value for .drop argument of group_by #' #' @param .tbl A data frame #' #' @return `TRUE` unless `.tbl` is a grouped data frame that was previously #' obtained by `group_by(.drop = FALSE)` #' #' @examples #' group_by_drop_default(iris) #' #' iris |> #' group_by(Species) |> #' group_by_drop_default() #' #' iris |> #' group_by(Species, .drop = FALSE) |> #' group_by_drop_default() #' #' @keywords internal #' @export group_by_drop_default <- function(.tbl) { UseMethod("group_by_drop_default") } #' @export group_by_drop_default.default <- function(.tbl) { TRUE } #' @export group_by_drop_default.grouped_df <- function(.tbl) { tryCatch( { !identical(attr(group_data(.tbl), ".drop"), FALSE) }, error = function(e) { TRUE } ) } dplyr/R/compat-name-repair.R0000644000176200001440000001244315106134104015410 0ustar liggesusers# compat-name-repair (last updated: tibble 2.0.1.9000) # This file serves as a reference for compatibility functions for # name repair in tibble, until name repair is available in rlang. error_name_length_required <- function() { "`n` must be specified, when the `names` attribute is `NULL`." } minimal_names <- function(name, n) { if (is.null(name) && missing(n)) { abort(error_name_length_required()) } ## TODO: address scenarios where name is not NULL and n != length(name)? if (is.null(name)) { rep_len("", n) } else { name %|% "" } } set_minimal_names <- function(x) { new_names <- minimal_names(names(x), n = length(x)) set_names(x, new_names) } unique_names <- function(name, quiet = FALSE, transform = identity) { min_name <- minimal_names(name) naked_name <- strip_pos(min_name) naked_is_empty <- (naked_name == "") new_name <- transform(naked_name) new_name <- append_pos(new_name, needs_suffix = naked_is_empty) duped_after <- duplicated(new_name) | duplicated(new_name, fromLast = TRUE) new_name <- append_pos(new_name, duped_after) if (!quiet) { describe_repair(name, new_name) } new_name } set_unique_names <- function(x, quiet = FALSE) { x <- set_minimal_names(x) new_names <- unique_names(names(x), quiet = quiet) set_names(x, new_names) } universal_names <- function(name, quiet = FALSE) { unique_names(name, quiet = quiet, transform = make_syntactic) } set_universal_names <- function(x, quiet = FALSE) { x <- set_minimal_names(x) new_names <- universal_names(names(x), quiet = quiet) set_names(x, new_names) } ## makes each individual name syntactic ## does not enforce unique-ness make_syntactic <- function(name) { name[is.na(name)] <- "" name[name == ""] <- "." name[name == "..."] <- "...." name <- sub("^_", "._", name) new_name <- make.names(name) X_prefix <- grepl("^X", new_name) & !grepl("^X", name) new_name[X_prefix] <- sub("^X", "", new_name[X_prefix]) dot_suffix <- which(new_name == paste0(name, ".")) new_name[dot_suffix] <- sub("^(.*)[.]$", ".\\1", new_name[dot_suffix]) ## illegal characters have been replaced with '.' via make.names() ## however, we have: ## * declined its addition of 'X' prefixes ## * turned its '.' suffixes to '.' prefixes regex <- paste0( "^(?[.]{0,2})", "(?[0-9]*)", "(?[^0-9]?.*$)" ) re <- re_match(new_name, pattern = regex) needs_dots <- which(re$numbers != "") needs_third_dot <- (re$leftovers[needs_dots] == "") re$leading_dots[needs_dots] <- ifelse(needs_third_dot, "...", "..") new_name <- paste0(re$leading_dots, re$numbers, re$leftovers) new_name } append_pos <- function(name, needs_suffix) { need_append_pos <- which(needs_suffix) name[need_append_pos] <- paste0(name[need_append_pos], "..", need_append_pos) name } strip_pos <- function(name) { rx <- "[.][.][1-9][0-9]*$" gsub(rx, "", name) %|% "" } describe_repair <- function(orig_name, name) { if (length(orig_name) != length(name)) { abort(c( "`orig_name` and `name` have different sizes.", i = glue("`orig_name` is of size {length(orig_name)}."), i = glue("`name` is of size {length(name)}.") )) } new_names <- name != minimal_names(orig_name) if (any(new_names)) { msg <- bullets( "New names:", paste0( tick_if_needed(orig_name[new_names]), " -> ", tick_if_needed(name[new_names]), .problem = "" ) ) message(msg) } } ## from rematch2, except we don't add tbl_df or tbl classes to the return value re_match <- function(text, pattern, perl = TRUE, ...) { if (!is.character(pattern) || length(pattern) != 1L || is.na(pattern)) { abort(c( "incompatible `pattern`.", i = "`pattern` should be a scalar string." )) } text <- as.character(text) match <- regexpr(pattern, text, perl = perl, ...) start <- as.vector(match) length <- attr(match, "match.length") end <- start + length - 1L matchstr <- substring(text, start, end) matchstr[start == -1] <- NA_character_ res <- data.frame( stringsAsFactors = FALSE, .text = text, .match = matchstr ) if (!is.null(attr(match, "capture.start"))) { gstart <- attr(match, "capture.start") glength <- attr(match, "capture.length") gend <- gstart + glength - 1L groupstr <- substring(text, gstart, gend) groupstr[gstart == -1] <- NA_character_ dim(groupstr) <- dim(gstart) res <- cbind(groupstr, res, stringsAsFactors = FALSE) } names(res) <- c(attr(match, "capture.names"), ".text", ".match") res } # A better version (with far more dependencies) exists in msg-format.R bullets <- function(header, ..., .problem) { problems <- c(...) MAX_BULLETS <- 6L if (length(problems) >= MAX_BULLETS) { n_more <- length(problems) - MAX_BULLETS + 1L problems[[MAX_BULLETS]] <- "..." length(problems) <- MAX_BULLETS } paste0( header, "\n", paste0("* ", problems, collapse = "\n") ) } # FIXME: Also exists in pillar, do we need to export? tick <- function(x) { ifelse(is.na(x), "NA", encodeString(x, quote = "`")) } is_syntactic <- function(x) { ret <- (make_syntactic(x) == x) ret[is.na(x)] <- FALSE ret } tick_if_needed <- function(x) { needs_ticks <- !is_syntactic(x) x[needs_ticks] <- tick(x[needs_ticks]) x } dplyr/R/nth-value.R0000644000176200001440000001127315106134104013632 0ustar liggesusers#' Extract the first, last, or nth value from a vector #' #' These are useful helpers for extracting a single value from a vector. They #' are guaranteed to return a meaningful value, even when the input is shorter #' than expected. You can also provide an optional secondary vector that defines #' the ordering. #' #' @details #' For most vector types, `first(x)`, `last(x)`, and `nth(x, n)` work like #' `x[[1]]`, `x[[length(x)]`, and `x[[n]]`, respectively. The primary exception #' is data frames, where they instead retrieve rows, i.e. `x[1, ]`, `x[nrow(x), #' ]`, and `x[n, ]`. This is consistent with the tidyverse/vctrs principle which #' treats data frames as a vector of rows, rather than a vector of columns. #' #' @param x A vector #' @param n For `nth()`, a single integer specifying the position. #' Negative integers index from the end (i.e. `-1L` will return the #' last value in the vector). #' @param order_by An optional vector the same size as `x` used to determine the #' order. #' @param default A default value to use if the position does not exist in `x`. #' #' If `NULL`, the default, a missing value is used. #' #' If supplied, this must be a single value, which will be cast to the type of #' `x`. #' #' When `x` is a list , `default` is allowed to be any value. There are no #' type or size restrictions in this case. #' @param na_rm Should missing values in `x` be removed before extracting the #' value? #' #' @return #' If `x` is a list, a single element from that list. Otherwise, a vector the #' same type as `x` with size 1. #' #' @export #' @examples #' x <- 1:10 #' y <- 10:1 #' #' first(x) #' last(y) #' #' nth(x, 1) #' nth(x, 5) #' nth(x, -2) #' #' # `first()` and `last()` are often useful in `summarise()` #' df <- tibble(x = x, y = y) #' df |> #' summarise( #' across(x:y, first, .names = "{col}_first"), #' y_last = last(y) #' ) #' #' # Selecting a position that is out of bounds returns a default value #' nth(x, 11) #' nth(x, 0) #' #' # This out of bounds behavior also applies to empty vectors #' first(integer()) #' #' # You can customize the default value with `default` #' nth(x, 11, default = -1L) #' first(integer(), default = 0L) #' #' # `order_by` provides optional ordering #' last(x) #' last(x, order_by = y) #' #' # `na_rm` removes missing values before extracting the value #' z <- c(NA, NA, 1, 3, NA, 5, NA) #' first(z) #' first(z, na_rm = TRUE) #' last(z, na_rm = TRUE) #' nth(z, 3, na_rm = TRUE) #' #' # For data frames, these select entire rows #' df <- tibble(a = 1:5, b = 6:10) #' first(df) #' nth(df, 4) nth <- function(x, n, order_by = NULL, default = NULL, na_rm = FALSE) { size <- vec_size(x) vec_check_size(n, size = 1L) n <- vec_cast(n, to = integer()) if (!is.null(order_by)) { vec_check_size(order_by, size = size) } default <- check_nth_default(default, x = x) check_bool(na_rm) if (na_rm && vec_any_missing(x)) { not_missing <- !vec_detect_missing(x) x <- vec_slice(x, not_missing) size <- vec_size(x) if (!is.null(order_by)) { order_by <- vec_slice(order_by, not_missing) } } if (is.na(n)) { abort("`n` can't be `NA`.") } if (n < 0L) { # Negative values index from RHS n <- size + n + 1L } if (n <= 0L || n > size) { return(default) } if (!is.null(order_by)) { order <- vec_order_radix(order_by) n <- order[[n]] } vec_slice2(x, n) } #' @export #' @rdname nth first <- function(x, order_by = NULL, default = NULL, na_rm = FALSE) { nth(x, 1L, order_by = order_by, default = default, na_rm = na_rm) } #' @export #' @rdname nth last <- function(x, order_by = NULL, default = NULL, na_rm = FALSE) { nth(x, -1L, order_by = order_by, default = default, na_rm = na_rm) } check_nth_default <- function(default, x, ..., error_call = caller_env()) { check_dots_empty0(...) if (obj_is_list(x)) { # Very special behavior for lists, since we use `[[` on them. # Valid to use any `default` here (even non-vectors). # And `default = NULL` is the correct default `default` for lists. return(default) } if (is.null(default)) { return(vec_init(x)) } vec_check_size(default, size = 1L, call = error_call) default <- vec_cast( x = default, to = x, x_arg = "default", to_arg = "x", call = error_call ) default } vec_slice2 <- function(x, i) { # Our unimplemented vctrs equivalent of `[[` # https://github.com/r-lib/vctrs/pull/1228/ # A real implementation would use this, but it is too slow right now # and we know `i` is a valid integer index (#6682) # i <- vec_as_location2(i, vec_size(x)) if (obj_is_list(x)) { out <- .subset2(x, i) } else { out <- vec_slice(x, i) out <- vec_set_names(out, NULL) } out } dplyr/R/lead-lag.R0000644000176200001440000000720015106134104013370 0ustar liggesusers#' Compute lagged or leading values #' #' Find the "previous" (`lag()`) or "next" (`lead()`) values in a vector. Useful #' for comparing values behind of or ahead of the current values. #' #' @param x A vector #' @param n Positive integer of length 1, giving the number of positions to #' lag or lead by #' @param default The value used to pad `x` back to its original size after the #' lag or lead has been applied. The default, `NULL`, pads with a missing #' value. If supplied, this must be a vector with size 1, which will be cast #' to the type of `x`. #' @param order_by An optional secondary vector that defines the ordering to use #' when applying the lag or lead to `x`. If supplied, this must be the same #' size as `x`. #' @param ... Not used. #' #' @return #' A vector with the same type and size as `x`. #' #' @name lead-lag #' @examples #' lag(1:5) #' lead(1:5) #' #' x <- 1:5 #' tibble(behind = lag(x), x, ahead = lead(x)) #' #' # If you want to look more rows behind or ahead, use `n` #' lag(1:5, n = 1) #' lag(1:5, n = 2) #' #' lead(1:5, n = 1) #' lead(1:5, n = 2) #' #' # If you want to define a value to pad with, use `default` #' lag(1:5) #' lag(1:5, default = 0) #' #' lead(1:5) #' lead(1:5, default = 6) #' #' # If the data are not already ordered, use `order_by` #' scrambled <- slice_sample( #' tibble(year = 2000:2005, value = (0:5) ^ 2), #' prop = 1 #' ) #' #' wrong <- mutate(scrambled, previous_year_value = lag(value)) #' arrange(wrong, year) #' #' right <- mutate(scrambled, previous_year_value = lag(value, order_by = year)) #' arrange(right, year) NULL #' @export #' @rdname lead-lag lag <- function(x, n = 1L, default = NULL, order_by = NULL, ...) { if (inherits(x, "ts")) { abort("`x` must be a vector, not a , do you want `stats::lag()`?") } check_dots_empty0(...) check_number_whole(n) if (n < 0L) { abort("`n` must be positive.") } shift(x, n = n, default = default, order_by = order_by) } #' @export #' @rdname lead-lag lead <- function(x, n = 1L, default = NULL, order_by = NULL, ...) { check_dots_empty0(...) check_number_whole(n) if (n < 0L) { abort("`n` must be positive.") } shift(x, n = -n, default = default, order_by = order_by) } shift <- function( x, ..., n = 1L, default = NULL, order_by = NULL, error_call = caller_env() ) { check_dots_empty0(...) if (!is.null(order_by)) { out <- with_order( order_by = order_by, fun = shift, x = x, n = n, default = default, error_call = error_call ) return(out) } obj_check_vector(x, call = error_call) check_number_whole(n) n <- vec_cast(n, integer(), call = error_call) if (!is.null(default)) { vec_check_size(default, size = 1L, call = error_call) default <- vec_cast( x = default, to = x, x_arg = "default", to_arg = "x", call = error_call ) } lag <- n >= 0L n <- abs(n) size <- vec_size(x) if (n > size) { n <- size } if (is.null(default)) { shift_slice(x, n, size, lag) } else { shift_c(x, n, size, lag, default) } } shift_slice <- function(x, n, size, lag) { loc_default <- vec_rep(NA_integer_, n) if (lag) { loc <- seq2(1L, size - n) loc <- vec_c(loc_default, loc) vec_slice(x, loc) } else { loc <- seq2(1L + n, size) loc <- vec_c(loc, loc_default) vec_slice(x, loc) } } shift_c <- function(x, n, size, lag, default) { default <- vec_rep(default, n) if (lag) { loc <- seq2(1L, size - n) x <- vec_slice(x, loc) vec_c(default, x, .ptype = x) } else { loc <- seq2(1L + n, size) x <- vec_slice(x, loc) vec_c(x, default, .ptype = x) } } dplyr/R/groups-with.R0000644000176200001440000000244615106134104014221 0ustar liggesusers#' Perform an operation with temporary groups #' #' @description #' `r lifecycle::badge("superseded")` #' #' This was an experimental function that allows you to modify the grouping #' variables for a single operation; it is superseded in favour of using the #' `.by` argument to individual verbs. #' #' @param .data A data frame #' @param .groups <[`tidy-select`][dplyr_tidy_select]> One or more variables #' to group by. Unlike [group_by()], you can only group by existing variables, #' and you can use tidy-select syntax like `c(x, y, z)` to select multiple #' variables. #' #' Use `NULL` to temporarily **un**group. #' @param .f Function to apply to regrouped data. #' Supports purrr-style `~` syntax #' @param ... Additional arguments passed on to `...`. #' @keywords internal #' @export #' @examples #' df <- tibble(g = c(1, 1, 2, 2, 3), x = runif(5)) #' #' # Old #' df |> #' with_groups(g, mutate, x_mean = mean(x)) #' # New #' df |> mutate(x_mean = mean(x), .by = g) with_groups <- function(.data, .groups, .f, ...) { lifecycle::signal_stage("experimental", "with_groups()") loc <- tidyselect::eval_select(enquo(.groups), data = tbl_ptype(.data)) val <- syms(names(.data)[loc]) out <- group_by(.data, !!!val) .f <- as_function(.f) out <- .f(out, ...) dplyr_reconstruct(out, .data) } dplyr/R/progress.R0000644000176200001440000001064415137161765013615 0ustar liggesusers#' Progress bar with estimated time. #' #' @description #' `r lifecycle::badge("deprecated")` #' #' This progress bar has been deprecated since providing progress bars is not #' the responsibility of dplyr. Instead, you might try the more powerful #' [progress](https://github.com/r-lib/progress) package. #' #' This reference class represents a text progress bar displayed estimated #' time remaining. When finished, it displays the total duration. The #' automatic progress bar can be disabled by setting option #' `dplyr.show_progress` to `FALSE`. #' #' @param n Total number of items #' @param min_time Progress bar will wait until at least `min_time` #' seconds have elapsed before displaying any results. #' @return A ref class with methods `tick()`, `print()`, #' `pause()`, and `stop()`. #' @keywords internal #' @export #' @examples #' p <- progress_estimated(3) #' p$tick() #' p$tick() #' p$tick() #' #' p <- progress_estimated(3) #' for (i in 1:3) p$pause(0.1)$tick()$print() #' #' p <- progress_estimated(3) #' p$tick()$print()$ #' pause(1)$stop() #' #' # If min_time is set, progress bar not shown until that many #' # seconds have elapsed #' p <- progress_estimated(3, min_time = 3) #' for (i in 1:3) p$pause(0.1)$tick()$print() #' #' \dontrun{ #' p <- progress_estimated(10, min_time = 3) #' for (i in 1:10) p$pause(0.5)$tick()$print() #' } progress_estimated <- function(n, min_time = 0) { # Moved to `always = TRUE` in 1.2.0 lifecycle::deprecate_warn( "1.0.0", "dplyr::progress_estimated()", always = TRUE, id = "dplyr-progress-estimated" ) Progress$new(n, min_time = min_time) } Progress <- R6::R6Class( "Progress", public = list( n = NULL, i = 0, init_time = NULL, stopped = FALSE, stop_time = NULL, min_time = NULL, last_update = NULL, initialize = function(n, min_time = 0, ...) { self$n <- n self$min_time <- min_time self$begin() }, begin = function() { "Initialise timer. Call this before beginning timing." self$i <- 0 self$last_update <- self$init_time <- now() self$stopped <- FALSE self }, pause = function(x) { "Sleep for x seconds. Useful for testing." Sys.sleep(x) self }, width = function() { getOption("width") - nchar("|100% ~ 99.9 h remaining") - 2 }, tick = function() { "Process one element" if (self$stopped) { return(self) } if (self$i == self$n) { abort("No more ticks.") } self$i <- self$i + 1 self }, stop = function() { if (self$stopped) { return(self) } self$stopped <- TRUE self$stop_time <- now() self }, print = function(...) { if ( isFALSE(getOption("dplyr.show_progress", default = TRUE)) || # user specifies no progress !interactive() || # not an interactive session !is.null(getOption("knitr.in.progress")) # dplyr used within knitr document ) { return(invisible(self)) } now_ <- now() if ( now_ - self$init_time < self$min_time || now_ - self$last_update < 0.05 ) { return(invisible(self)) } self$last_update <- now_ if (self$stopped) { overall <- show_time(self$stop_time - self$init_time) if (self$i == self$n) { cat_line("Completed after ", overall) cat("\n") } else { cat_line("Killed after ", overall) cat("\n") } return(invisible(self)) } avg <- (now() - self$init_time) / self$i time_left <- (self$n - self$i) * avg nbars <- trunc(self$i / self$n * self$width()) cat_line( "|", str_rep("=", nbars), str_rep(" ", self$width() - nbars), "|", format(round(self$i / self$n * 100), width = 3), "% ", "~", show_time(time_left), " remaining" ) invisible(self) } ) ) cat_line <- function(...) { msg <- paste(..., sep = "", collapse = "") gap <- max(c(0, getOption("width") - nchar(msg, "width"))) cat("\r", msg, rep.int(" ", gap), sep = "") utils::flush.console() } str_rep <- function(x, i) { paste(rep.int(x, i), collapse = "") } show_time <- function(x) { if (x < 60) { paste(round(x), "s") } else if (x < 60 * 60) { paste(round(x / 60), "m") } else { paste(round(x / (60 * 60)), "h") } } now <- function() proc.time()[[3]] dplyr/R/utils-format.R0000644000176200001440000000136214366556340014374 0ustar liggesusers#' Describing dimensions #' #' Prints the dimensions of an array-like object in a user-friendly manner, #' substituting `NA` with ?? (for SQL queries). #' #' @param x Object to show dimensions for. #' @export #' @keywords internal #' @examples #' dim_desc(mtcars) dim_desc <- function(x) { d <- dim(x) d2 <- big_mark(d) d2[is.na(d)] <- "??" paste0("[", paste0(d2, collapse = " x "), "]") } # function for the thousand separator, # returns "," unless it's used for the decimal point, in which case returns "." big_mark <- function(x, ...) { mark <- if (identical(getOption("OutDec"), ",")) "." else "," formatC(x, big.mark = mark, ...) } rule <- function(pad = "-", gap = 2L) { paste0(rep(pad, getOption("width") - gap), collapse = "") } dplyr/R/grouped-df.R0000644000176200001440000002045715106134104013767 0ustar liggesusers#' A grouped data frame. #' #' @description #' The easiest way to create a grouped data frame is to call the `group_by()` #' method on a data frame or tbl: this will take care of capturing #' the unevaluated expressions for you. #' #' These functions are designed for programmatic use. For data analysis #' purposes see [group_data()] for the accessor functions that retrieve #' various metadata from a grouped data frames. #' #' @keywords internal #' @param data a tbl or data frame. #' @param vars A character vector. #' @param drop When `.drop = TRUE`, empty groups are dropped. #' #' @export grouped_df <- function(data, vars, drop = group_by_drop_default(data)) { if (!is.data.frame(data)) { abort("`data` must be a data frame.") } if (!is.character(vars)) { abort("`vars` must be a character vector.") } if (length(vars) == 0) { as_tibble(data) } else { groups <- compute_groups(data, vars, drop = drop) new_grouped_df(data, groups) } } compute_groups <- function(data, vars, drop = FALSE) { unknown <- setdiff(vars, names(data)) if (length(unknown) > 0) { vars <- paste0(encodeString(vars, quote = "`"), collapse = ", ") abort(glue("`vars` missing from `data`: {vars}.")) } # Only train the dictionary based on selected columns group_vars <- as_tibble(data)[vars] split_key_loc <- dplyr_locate_sorted_groups(group_vars) old_keys <- split_key_loc$key old_rows <- split_key_loc$loc signal("", class = "dplyr_regroup") groups <- tibble(!!!old_keys, ".rows" := old_rows) if (!isTRUE(drop) && any(map_lgl(old_keys, is.factor))) { # Extra work is needed to auto expand empty groups uniques <- map(old_keys, function(.) { if (is.factor(.)) . else vec_unique(.) }) # Internally we only work with integers # # so for any grouping column that is not a factor # we need to match the values to the unique values positions <- map2(old_keys, uniques, function(.x, .y) { if (is.factor(.x)) .x else vec_match(.x, .y) }) # Expand groups internally adds empty groups recursively # we get back: # - indices: a list of how to vec_slice the current keys # to get the new keys # # - rows: the new list of rows (i.e. the same as old rows, # but with some extra empty integer(0) added for empty groups) expanded <- expand_groups(groups, positions, vec_size(old_keys)) new_indices <- expanded$indices new_rows <- expanded$rows # Make the new keys from the old keys and the new_indices new_keys <- pmap( list(old_keys, new_indices, uniques), function(key, index, unique) { if (is.factor(key)) { if (is.ordered(key)) { new_ordered(index, levels = levels(key)) } else { new_factor(index, levels = levels(key)) } } else { vec_slice(unique, index) } } ) names(new_keys) <- vars groups <- tibble(!!!new_keys, ".rows" := new_rows) } attr(groups, ".drop") <- drop groups } count_regroups <- function(code) { i <- 0 withCallingHandlers(code, dplyr_regroup = function(cnd) { i <<- i + 1 }) i } show_regroups <- function(code) { withCallingHandlers(code, dplyr_regroup = function(cnd) { cat("Regrouping...\n") }) } #' Low-level construction and validation for the grouped_df and rowwise_df classes #' #' `new_grouped_df()` and `new_rowwise_df()` are constructors designed to be high-performance so only #' check types, not values. This means it is the caller's responsibility #' to create valid values, and hence this is for expert use only. #' #' @param x A data frame #' @param groups The grouped structure, `groups` should be a data frame. #' Its last column should be called `.rows` and be #' a list of 1 based integer vectors that all are between 1 and the number of rows of `.data`. #' @param class additional class, will be prepended to canonical classes. #' @param ... additional attributes #' #' @examples #' # 5 bootstrap samples #' tbl <- new_grouped_df( #' tibble(x = rnorm(10)), #' groups = tibble(".rows" := replicate(5, sample(1:10, replace = TRUE), simplify = FALSE)) #' ) #' # mean of each bootstrap sample #' summarise(tbl, x = mean(x)) #' #' @keywords internal #' @export new_grouped_df <- function(x, groups, ..., class = character()) { if (!is.data.frame(x)) { abort("`x` must be a data frame.") } if (!is.data.frame(groups)) { abort("`groups` must be a data frame") } if (tail(names(groups), 1L) != ".rows") { abort('The last column of `groups` must be called ".rows".') } new_tibble( x, groups = groups, ..., nrow = NROW(x), class = c(class, "grouped_df") ) } #' @description #' `validate_grouped_df()` and `validate_rowwise_df()` validate the attributes #' of a `grouped_df` or a `rowwise_df`. #' #' @param check_bounds whether to check all indices for out of bounds problems in `grouped_df` objects #' @rdname new_grouped_df #' @export validate_grouped_df <- function(x, check_bounds = FALSE) { if (is.null(attr(x, "groups")) && !is.null(attr(x, "vars"))) { bullets <- c( "Corrupt `grouped_df` using old (< 0.8.0) format.", i = "Strip off old grouping with `ungroup()`." ) abort(bullets) } result <- .Call(`dplyr_validate_grouped_df`, x, check_bounds) if (!is.null(result)) { abort(result) } x } setOldClass(c("grouped_df", "tbl_df", "tbl", "data.frame")) #' @rdname grouped_df #' @export is.grouped_df <- function(x) inherits(x, "grouped_df") #' @rdname grouped_df #' @export is_grouped_df <- is.grouped_df group_sum <- function(x) { grps <- n_groups(x) paste0(commas(group_vars(x)), " [", big_mark(grps), "]") } #' @importFrom pillar tbl_sum #' @export tbl_sum.grouped_df <- function(x, ...) { c( NextMethod(), c("Groups" = group_sum(x)) ) } #' @export as.data.frame.grouped_df <- function( x, row.names = NULL, optional = FALSE, ... ) { new_data_frame(vec_data(x), n = nrow(x)) } #' @export as_tibble.grouped_df <- function(x, ...) { new_tibble(vec_data(x), nrow = nrow(x)) } #' @export `[.grouped_df` <- function(x, i, j, drop = FALSE) { out <- NextMethod() if (!is.data.frame(out)) { return(out) } if (drop) { as_tibble(out) } else { groups <- group_intersect(x, out) if ((missing(i) || nargs() == 2) && identical(groups, group_vars(x))) { new_grouped_df(out, group_data(x)) } else { grouped_df(out, groups, group_by_drop_default(x)) } } } #' @export `$<-.grouped_df` <- function(x, name, ..., value) { out <- NextMethod() if (name %in% group_vars(x)) { grouped_df(out, group_intersect(x, out), group_by_drop_default(x)) } else { out } } #' @export `[<-.grouped_df` <- function(x, i, j, ..., value) { out <- NextMethod() grouped_df(out, group_intersect(x, out), group_by_drop_default(x)) } #' @export `[[<-.grouped_df` <- function(x, ..., value) { out <- NextMethod() grouped_df(out, group_intersect(x, out), group_by_drop_default(x)) } #' @export `names<-.grouped_df` <- function(x, value) { data <- as.data.frame(x) names(data) <- value groups <- group_data(x) group_loc <- match(intersect(names(groups), names(x)), names(x)) group_names <- c(value[group_loc], ".rows") if (!identical(group_names, names(groups))) { names(groups) <- c(value[group_loc], ".rows") } new_grouped_df(data, groups) } #' @method rbind grouped_df #' @export rbind.grouped_df <- function(...) { bind_rows(...) } #' @method cbind grouped_df #' @export cbind.grouped_df <- function(...) { bind_cols(...) } group_data_trim <- function(group_data, preserve = FALSE) { if (preserve) { return(group_data) } non_empty <- lengths(group_data$".rows") > 0 group_data[non_empty, , drop = FALSE] } # Helpers ----------------------------------------------------------------- expand_groups <- function(old_groups, positions, nr) { .Call(`dplyr_expand_groups`, old_groups, positions, nr) } dplyr_locate_sorted_groups <- function(x) { out <- vec_locate_sorted_groups(x, nan_distinct = TRUE) out$loc <- new_list_of(out$loc, ptype = integer()) if (dplyr_legacy_locale()) { # Temporary legacy support for respecting the system locale. # Matches legacy `arrange()` ordering. out <- vec_slice(out, dplyr_order_legacy(out$key)) } out } group_intersect <- function(x, new) { intersect(group_vars(x), names(new)) } dplyr/R/context.R0000644000176200001440000001061415106134104013411 0ustar liggesusers#' Information about the "current" group or variable #' #' @description #' These functions return information about the "current" group or "current" #' variable, so only work inside specific contexts like [summarise()] and #' [mutate()]. #' #' * `n()` gives the current group size. #' * `cur_group()` gives the group keys, a tibble with one row and one column #' for each grouping variable. #' * `cur_group_id()` gives a unique numeric identifier for the current group. #' * `cur_group_rows()` gives the row indices for the current group. #' * `cur_column()` gives the name of the current column (in [across()] only). #' #' See [group_data()] for equivalent functions that return values for all #' groups. #' #' See [pick()] for a way to select a subset of columns using tidyselect syntax #' while inside `summarise()` or `mutate()`. #' #' @section data.table: #' If you're familiar with data.table: #' #' * `cur_group_id()` <-> `.GRP` #' * `cur_group()` <-> `.BY` #' * `cur_group_rows()` <-> `.I` #' #' See [pick()] for an equivalent to `.SD`. #' #' @name context #' @examples #' df <- tibble( #' g = sample(rep(letters[1:3], 1:3)), #' x = runif(6), #' y = runif(6) #' ) #' gf <- df |> group_by(g) #' #' gf |> summarise(n = n()) #' #' gf |> mutate(id = cur_group_id()) #' gf |> reframe(row = cur_group_rows()) #' gf |> summarise(data = list(cur_group())) #' #' gf |> mutate(across(everything(), ~ paste(cur_column(), round(.x, 2)))) NULL #' @rdname context #' @export n <- function() { peek_mask()$get_current_group_size() } #' @rdname context #' @export cur_group <- function() { peek_mask()$current_key() } #' @rdname context #' @export cur_group_id <- function() { peek_mask()$get_current_group_id() } #' @rdname context #' @export cur_group_rows <- function() { peek_mask()$current_rows() } group_labels_details <- function(keys) { keys <- map_chr(keys, pillar::format_glimpse) labels <- vec_paste0(names(keys), " = ", keys) labels <- cli_collapse(labels, last = ", ", sep2 = ", ") cli::format_inline("{.code {labels}}") } cur_group_label <- function( type = mask_type(), id = cur_group_id(), group = cur_group() ) { switch( type, ungrouped = "", grouped = glue("group {id}: {label}", label = group_labels_details(group)), rowwise = glue("row {id}"), stop_mask_type(type) ) } cur_group_data <- function(mask_type) { switch( mask_type, ungrouped = list(), grouped = list(id = cur_group_id(), group = cur_group()), rowwise = list(id = cur_group_id()), stop_mask_type(mask_type) ) } stop_mask_type <- function(type) { cli::cli_abort( "Unexpected mask type {.val {type}}.", .internal = TRUE ) } cnd_data <- function(cnd, ctxt, mask, call) { mask_type <- mask_type(mask) has_group_data <- has_active_group_context(mask) if (has_group_data) { group_data <- cur_group_data(mask_type) } else { group_data <- NULL } list( cnd = cnd, name = ctxt$error_name, expr = ctxt$error_expr, type = mask_type, has_group_data = has_group_data, group_data = group_data, call = call ) } #' @rdname context #' @export cur_column <- function() { peek_column() } # context accessors ------------------------------------------------------- context_env <- new_environment() context_poke <- function(name, value) { old <- context_env[[name]] context_env[[name]] <- value old } context_peek_bare <- function(name) { context_env[[name]] } context_peek <- function(name, location, call = caller_env()) { context_peek_bare(name) %||% abort(glue("Must only be used inside {location}."), call = call) } context_local <- function(name, value, frame = caller_env()) { old <- context_poke(name, value) # FIXME: Pass `after = TRUE` once we depend on R 3.5. Currently this # doesn't restore in the correct order (FIFO) when context-local # functions are called multiple times within the same frame. expr <- expr(on.exit(context_poke(!!name, !!old), add = TRUE)) eval_bare(expr, frame) value } peek_column <- function(call = caller_env()) { context_peek("column", "`across()`", call) } local_column <- function(x, frame = caller_env()) { context_local("column", x, frame = frame) } peek_mask <- function(call = caller_env()) { context_peek( "mask", "data-masking verbs like `mutate()`, `filter()`, and `group_by()`", call ) } local_mask <- function(x, frame = caller_env()) { context_local("mask", x, frame = frame) } dplyr/R/mutate.R0000644000176200001440000003666415106134104013241 0ustar liggesusers#' Create, modify, and delete columns #' #' `mutate()` creates new columns that are functions of existing variables. #' It can also modify (if the name is the same as an existing #' column) and delete columns (by setting their value to `NULL`). #' #' @section Useful mutate functions: #' #' * [`+`], [`-`], [log()], etc., for their usual mathematical meanings #' #' * [lead()], [lag()] #' #' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()], #' [cume_dist()], [ntile()] #' #' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] #' #' * [na_if()], [coalesce()] #' #' * [if_else()], [recode()], [case_when()] #' #' @section Grouped tibbles: #' #' Because mutating expressions are computed within groups, they may #' yield different results on grouped tibbles. This will be the case #' as soon as an aggregating, lagging, or ranking function is #' involved. Compare this ungrouped mutate: #' #' ``` #' starwars |> #' select(name, mass, species) |> #' mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) #' ``` #' #' With the grouped equivalent: #' #' ``` #' starwars |> #' select(name, mass, species) |> #' group_by(species) |> #' mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) #' ``` #' #' The former normalises `mass` by the global average whereas the #' latter normalises by the averages within species levels. #' #' @export #' @inheritParams arrange #' @param ... <[`data-masking`][rlang::args_data_masking]> Name-value pairs. #' The name gives the name of the column in the output. #' #' The value can be: #' #' * A vector of length 1, which will be recycled to the correct length. #' * A vector the same length as the current group (or the whole data frame #' if ungrouped). #' * `NULL`, to remove the column. #' * A data frame or tibble, to create multiple columns in the output. #' @family single table verbs #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Columns from `.data` will be preserved according to the `.keep` argument. #' * Existing columns that are modified by `...` will always be returned in #' their original location. #' * New columns created through `...` will be placed according to the #' `.before` and `.after` arguments. #' * The number of rows is not affected. #' * Columns given the value `NULL` will be removed. #' * Groups will be recomputed if a grouping variable is mutated. #' * Data frame attributes are preserved. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. #' @examples #' # Newly created variables are available immediately #' starwars |> #' select(name, mass) |> #' mutate( #' mass2 = mass * 2, #' mass2_squared = mass2 * mass2 #' ) #' #' # As well as adding new variables, you can use mutate() to #' # remove variables and modify existing variables. #' starwars |> #' select(name, height, mass, homeworld) |> #' mutate( #' mass = NULL, #' height = height * 0.0328084 # convert to feet #' ) #' #' # Use across() with mutate() to apply a transformation #' # to multiple columns in a tibble. #' starwars |> #' select(name, homeworld, species) |> #' mutate(across(!name, as.factor)) #' # see more in ?across #' #' # Window functions are useful for grouped mutates: #' starwars |> #' select(name, mass, homeworld) |> #' group_by(homeworld) |> #' mutate(rank = min_rank(desc(mass))) #' # see `vignette("window-functions")` for more details #' #' # By default, new columns are placed on the far right. #' df <- tibble(x = 1, y = 2) #' df |> mutate(z = x + y) #' df |> mutate(z = x + y, .before = 1) #' df |> mutate(z = x + y, .after = x) #' #' # By default, mutate() keeps all columns from the input data. #' df <- tibble(x = 1, y = 2, a = "a", b = "b") #' df |> mutate(z = x + y, .keep = "all") # the default #' df |> mutate(z = x + y, .keep = "used") #' df |> mutate(z = x + y, .keep = "unused") #' df |> mutate(z = x + y, .keep = "none") #' #' # Grouping ---------------------------------------- #' # The mutate operation may yield different results on grouped #' # tibbles because the expressions are computed within groups. #' # The following normalises `mass` by the global average: #' starwars |> #' select(name, mass, species) |> #' mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) #' #' # Whereas this normalises `mass` by the averages within species #' # levels: #' starwars |> #' select(name, mass, species) |> #' group_by(species) |> #' mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) #' #' # Indirection ---------------------------------------- #' # Refer to column names stored as strings with the `.data` pronoun: #' vars <- c("mass", "height") #' mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) #' # Learn more in ?rlang::args_data_masking mutate <- function(.data, ...) { UseMethod("mutate") } #' @rdname mutate #' #' @inheritParams args_by #' #' @param .keep #' Control which columns from `.data` are retained in the output. Grouping #' columns and columns created by `...` are always kept. #' #' * `"all"` retains all columns from `.data`. This is the default. #' * `"used"` retains only the columns used in `...` to create new #' columns. This is useful for checking your work, as it displays inputs #' and outputs side-by-side. #' * `"unused"` retains only the columns _not_ used in `...` to create new #' columns. This is useful if you generate new columns, but no longer need #' the columns used to generate them. #' * `"none"` doesn't retain any extra columns from `.data`. Only the grouping #' variables and columns created by `...` are kept. #' @param .before,.after #' <[`tidy-select`][dplyr_tidy_select]> Optionally, control where new columns #' should appear (the default is to add to the right hand side). See #' [relocate()] for more details. #' @export mutate.data.frame <- function( .data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL ) { keep <- arg_match0(.keep, values = c("all", "used", "unused", "none")) by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- mutate_cols(.data, dplyr_quosures(...), by) used <- attr(cols, "used") out <- dplyr_col_modify(.data, cols) names_original <- names(.data) out <- mutate_relocate( out = out, before = {{ .before }}, after = {{ .after }}, names_original = names_original ) names_new <- names(cols) names_groups <- by$names out <- mutate_keep( out = out, keep = keep, used = used, names_new = names_new, names_groups = names_groups ) out } # Helpers ----------------------------------------------------------------- mutate_relocate <- function(out, before, after, names_original) { before <- enquo(before) after <- enquo(after) if (quo_is_null(before) && quo_is_null(after)) { return(out) } # Only change the order of completely new columns that # didn't exist in the original data names <- names(out) names <- setdiff(names, names_original) relocate( out, all_of(names), .before = !!before, .after = !!after ) } mutate_keep <- function(out, keep, used, names_new, names_groups) { names <- names(out) if (keep == "all") { names_out <- names } else { names_keep <- switch( keep, used = names(used)[used], unused = names(used)[!used], none = character(), abort("Unknown `keep`.", .internal = TRUE) ) names_out <- intersect(names, c(names_new, names_groups, names_keep)) } dplyr_col_select(out, names_out) } mutate_cols <- function(data, dots, by, error_call = caller_env()) { # Collect dots before setting up error handlers (#6178) force(dots) error_call <- dplyr_error_call(error_call) mask <- DataMask$new(data, by, "mutate", error_call = error_call) old_current_column <- context_peek_bare("column") on.exit(context_poke("column", old_current_column), add = TRUE) on.exit(mask$forget(), add = TRUE) new_columns <- set_names(list(), character()) warnings_state <- env(warnings = list()) local_error_context(dots, 0L, mask = mask) withCallingHandlers( for (i in seq_along(dots)) { poke_error_context(dots, i, mask = mask) context_poke("column", old_current_column) new_columns <- mutate_col(dots[[i]], data, mask, new_columns) }, error = dplyr_error_handler( dots = dots, mask = mask, bullets = mutate_bullets, error_call = error_call, error_class = "dplyr:::mutate_error" ), warning = dplyr_warning_handler( state = warnings_state, mask = mask, error_call = error_call ) ) signal_warnings(warnings_state, error_call) is_zap <- map_lgl(new_columns, inherits, "rlang_zap") new_columns[is_zap] <- rep(list(NULL), sum(is_zap)) used <- mask$get_used() names(used) <- mask$current_vars() attr(new_columns, "used") <- used new_columns } mutate_col <- function(dot, data, mask, new_columns) { rows <- mask$get_rows() # get results from all the quosures that are expanded from ..i # then ingest them after dot <- expand_pick(dot, mask) quosures <- expand_across(dot) quosures_results <- vector(mode = "list", length = length(quosures)) # First pass for (k in seq_along(quosures)) { quo <- quosures[[k]] quo_data <- attr(quo, "dplyr:::data") if (!is.null(quo_data$column)) { context_poke("column", quo_data$column) } # a list in which each element is the result of # evaluating the quosure in the "sliced data mask" # recycling it appropriately to match the group size # # TODO: reinject hybrid evaluation at the R level chunks <- NULL # result after unchopping the chunks result <- NULL if (quo_is_symbol(quo)) { name <- as_string(quo_get_expr(quo)) if (name %in% names(new_columns)) { # already have result and chunks result <- new_columns[[name]] chunks <- mask$resolve(name) } else if (name %in% names(data)) { # column from the original data result <- data[[name]] chunks <- mask$resolve(name) } if (mask$is_rowwise() && obj_is_list(result)) { sizes <- list_sizes(result) wrong <- which(sizes != 1) if (length(wrong)) { # same error as would have been generated by mask$eval_all_mutate() group <- wrong[1L] mask$set_current_group(group) abort( class = c( "dplyr:::mutate_incompatible_size", "dplyr:::internal_error" ), dplyr_error_data = list( result_size = sizes[group], expected_size = 1 ) ) } result_ptype <- attr(result, "ptype", exact = TRUE) if (length(result) == 0 && is.null(result_ptype)) { # i.e. `vec_ptype_finalise(unspecified())` (#6369) result <- logical() } else { result <- list_unchop(result, ptype = result_ptype) } } } else if (!quo_is_symbolic(quo) && !is.null(quo_get_expr(quo))) { # constant, we still need both `result` and `chunks` result <- quo_get_expr(quo) result <- withCallingHandlers( vec_recycle(result, vec_size(data)), error = function(cnd) { abort( class = c( "dplyr:::mutate_constant_recycle_error", "dplyr:::internal_error" ), constant_size = vec_size(result), data_size = vec_size(data) ) } ) chunks <- vec_chop(result, rows) } if (is.null(chunks)) { if (is.null(quo_data$column)) { chunks <- mask$eval_all_mutate(quo) } else { chunks <- withCallingHandlers( mask$eval_all_mutate(quo), error = function(cnd) { name <- dplyr_quosure_name(quo_data) msg <- glue("Can't compute column `{name}`.") abort(msg, call = call("across"), parent = cnd) } ) } } if (is.null(chunks)) { next } # only unchop if needed if (is.null(result)) { if (length(rows) == 1) { result <- chunks[[1]] } else { # `name` specified lazily chunks <- dplyr_vec_cast_common( chunks, name = dplyr_quosure_name(quo_data) ) result <- list_unchop(chunks, indices = rows) } } quosures_results[[k]] <- list(result = result, chunks = chunks) } # Second pass for (k in seq_along(quosures)) { quo <- quosures[[k]] quo_data <- attr(quo, "dplyr:::data") quo_result <- quosures_results[[k]] if (is.null(quo_result)) { if (quo_data$is_named) { name <- dplyr_quosure_name(quo_data) new_columns[[name]] <- zap() mask$remove(name) } next } result <- quo_result$result chunks <- quo_result$chunks if (!quo_data$is_named && is.data.frame(result)) { types <- vec_ptype(result) types_names <- names(types) chunks_extracted <- .Call(dplyr_extract_chunks, chunks, types) for (j in seq_along(types)) { mask$add_one( types_names[j], chunks_extracted[[j]], result = result[[j]] ) } new_columns[types_names] <- result } else { # treat as a single output otherwise name <- dplyr_quosure_name(quo_data) mask$add_one(name = name, chunks = chunks, result = result) new_columns[[name]] <- result } } new_columns } mutate_bullets <- function(cnd, ...) { UseMethod("mutate_bullets") } #' @export `mutate_bullets.dplyr:::mutate_incompatible_size` <- function(cnd, ...) { label <- ctxt_error_label() result_size <- cnd$dplyr_error_data$result_size expected_size <- cnd$dplyr_error_data$expected_size c( glue("`{label}` must be size {or_1(expected_size)}, not {result_size}."), i = cnd_bullet_rowwise_unlist() ) } #' @export `mutate_bullets.dplyr:::mutate_mixed_null` <- function(cnd, ...) { label <- ctxt_error_label() c( glue("`{label}` must return compatible vectors across groups."), x = "Can't combine NULL and non NULL results.", i = cnd_bullet_rowwise_unlist() ) } #' @export `mutate_bullets.dplyr:::mutate_not_vector` <- function(cnd, ...) { label <- ctxt_error_label() result <- cnd$dplyr_error_data$result c( glue("`{label}` must be a vector, not {obj_type_friendly(result)}."), i = cnd_bullet_rowwise_unlist() ) } #' @export `mutate_bullets.dplyr:::error_incompatible_combine` <- function(cnd, ...) { # the details are included in the parent error c() } #' @export `mutate_bullets.dplyr:::mutate_constant_recycle_error` <- function(cnd, ...) { label <- ctxt_error_label() constant_size <- cnd$constant_size data_size <- cnd$data_size c( glue( "Inlined constant `{label}` must be size {or_1(data_size)}, not {constant_size}." ) ) } check_muffled_warning <- function(cnd) { early_exit <- TRUE # Cancel early exits, e.g. from an exiting handler. This way we can # still instrument caught warnings to avoid confusing # inconsistencies. on.exit( if (early_exit) { return(FALSE) } ) muffled <- withRestarts( muffleWarning = function(...) TRUE, { signalCondition(cnd) FALSE } ) early_exit <- FALSE muffled } dplyr/R/import-standalone-types-check.R0000644000176200001440000002556514406402754017630 0ustar liggesusers# Standalone file: do not edit by hand # Source: # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-types-check.R # last-updated: 2023-02-15 # license: https://unlicense.org # dependencies: standalone-obj-type.R # --- # # ## Changelog # # 2023-02-15: # - Added `check_logical()`. # # - `check_bool()`, `check_number_whole()`, and # `check_number_decimal()` are now implemented in C. # # - For efficiency, `check_number_whole()` and # `check_number_decimal()` now take a `NULL` default for `min` and # `max`. This makes it possible to bypass unnecessary type-checking # and comparisons in the default case of no bounds checks. # # 2022-10-07: # - `check_number_whole()` and `_decimal()` no longer treat # non-numeric types such as factors or dates as numbers. Numeric # types are detected with `is.numeric()`. # # 2022-10-04: # - Added `check_name()` that forbids the empty string. # `check_string()` allows the empty string by default. # # 2022-09-28: # - Removed `what` arguments. # - Added `allow_na` and `allow_null` arguments. # - Added `allow_decimal` and `allow_infinite` arguments. # - Improved errors with absent arguments. # # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Added changelog. # # nocov start # Scalars ----------------------------------------------------------------- .standalone_types_check_dot_call <- .Call check_bool <- function(x, ..., allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x) && .standalone_types_check_dot_call(ffi_standalone_is_bool_1.0.7, x, allow_na, allow_null)) { return(invisible(NULL)) } stop_input_type( x, c("`TRUE`", "`FALSE`"), ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_string <- function(x, ..., allow_empty = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = allow_empty, allow_na = allow_na, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a single string", ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .rlang_check_is_string <- function(x, allow_empty, allow_na, allow_null) { if (is_string(x)) { if (allow_empty || !is_string(x, "")) { return(TRUE) } } if (allow_null && is_null(x)) { return(TRUE) } if (allow_na && (identical(x, NA) || identical(x, na_chr))) { return(TRUE) } FALSE } check_name <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = FALSE, allow_na = FALSE, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a valid name", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } IS_NUMBER_true <- 0 IS_NUMBER_false <- 1 IS_NUMBER_oob <- 2 check_number_decimal <- function(x, ..., min = NULL, max = NULL, allow_infinite = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = TRUE, min, max, allow_infinite, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = TRUE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_number_whole <- function(x, ..., min = NULL, max = NULL, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = FALSE, min, max, allow_infinite = FALSE, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = FALSE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .stop_not_number <- function(x, ..., exit_code, allow_decimal, min, max, allow_na, allow_null, arg, call) { if (exit_code == IS_NUMBER_oob) { min <- min %||% -Inf max <- max %||% Inf if (min > -Inf && max < Inf) { what <- sprintf("a number between %s and %s", min, max) } else if (x < min) { what <- sprintf("a number larger than %s", min) } else if (x > max) { what <- sprintf("a number smaller than %s", max) } else { abort("Unexpected state in OOB check", .internal = TRUE) } } else if (allow_decimal) { what <- "a number" } else { what <- "a whole number" } stop_input_type( x, what, ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_symbol <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a symbol", ..., allow_null = allow_null, arg = arg, call = call ) } check_arg <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an argument name", ..., allow_null = allow_null, arg = arg, call = call ) } check_call <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_call(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a defused call", ..., allow_null = allow_null, arg = arg, call = call ) } check_environment <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_environment(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an environment", ..., allow_null = allow_null, arg = arg, call = call ) } check_function <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_function(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a function", ..., allow_null = allow_null, arg = arg, call = call ) } check_closure <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_closure(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an R function", ..., allow_null = allow_null, arg = arg, call = call ) } check_formula <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_formula(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a formula", ..., allow_null = allow_null, arg = arg, call = call ) } # Vectors ----------------------------------------------------------------- check_character <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_character(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a character vector", ..., allow_null = allow_null, arg = arg, call = call ) } check_logical <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_logical(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a logical vector", ..., allow_null = allow_null, arg = arg, call = call ) } # nocov end dplyr/R/summarise.R0000644000176200001440000003720415137161765013757 0ustar liggesusers#' Summarise each group down to one row #' #' @description #' `summarise()` creates a new data frame. It returns one row for each #' combination of grouping variables; if there are no grouping variables, the #' output will have a single row summarising all observations in the input. It #' will contain one column for each grouping variable and one column for each of #' the summary statistics that you have specified. #' #' `summarise()` and `summarize()` are synonyms. #' #' @section Useful functions: #' #' * Center: [mean()], [median()] #' * Spread: [sd()], [IQR()], [mad()] #' * Range: [min()], [max()], #' * Position: [first()], [last()], [nth()], #' * Count: [n()], [n_distinct()] #' * Logical: [any()], [all()] #' #' @section Backend variations: #' #' The data frame backend supports creating a variable and using it in the #' same summary. This means that previously created summary variables can be #' further transformed or combined within the summary, as in [mutate()]. #' However, it also means that summary variables with the same names as previous #' variables overwrite them, making those variables unavailable to later summary #' variables. #' #' This behaviour may not be supported in other backends. To avoid unexpected #' results, consider using new names for your summary variables, especially when #' creating multiple summaries. #' #' @inheritParams arrange #' @inheritParams args_by #' #' @param ... <[`data-masking`][rlang::args_data_masking]> Name-value pairs of #' summary functions. The name will be the name of the variable in the result. #' #' The value can be: #' * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. #' * A data frame with 1 row, to add multiple columns from a single expression. #' #' @param .groups `r lifecycle::badge("experimental")` Grouping structure of the #' result. #' #' * `"drop_last"`: drops the last level of grouping. This was the #' only supported option before version 1.0.0. #' * `"drop"`: All levels of grouping are dropped. #' * `"keep"`: Same grouping structure as `.data`. #' * `"rowwise"`: Each row is its own group. #' #' When `.groups` is not specified, it is set to `"drop_last"` for a grouped #' data frame, and `"keep"` for a rowwise data frame. In addition, a message #' informs you of how the result will be grouped unless the result is #' ungrouped, the option `"dplyr.summarise.inform"` is set to `FALSE`, or when #' `summarise()` is called from a function in a package. #' #' @returns #' An object _usually_ of the same type as `.data`. #' #' * The rows come from the underlying [group_keys()]. #' * The columns are a combination of the grouping keys and the summary #' expressions that you provide. #' * The grouping structure is controlled by the `.groups=` argument, the #' output may be another [grouped_df], a [tibble] or a [rowwise] data frame. #' * Data frame attributes are **not** preserved, because `summarise()` #' fundamentally creates a new data frame. #' #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. #' #' @family single table verbs #' @export #' @examples #' # A summary applied to ungrouped tbl returns a single row #' mtcars |> #' summarise(mean = mean(disp), n = n()) #' #' # Usually, you'll want to group first #' mtcars |> #' group_by(cyl) |> #' summarise(mean = mean(disp), n = n()) #' #' # Each summary call removes one grouping level (since that group #' # is now just a single row) #' mtcars |> #' group_by(cyl, vs) |> #' summarise(cyl_n = n()) |> #' group_vars() #' #' # BEWARE: reusing variables may lead to unexpected results #' mtcars |> #' group_by(cyl) |> #' summarise(disp = mean(disp), sd = sd(disp)) #' #' # Refer to column names stored as strings with the `.data` pronoun: #' var <- "mass" #' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) #' # Learn more in ?rlang::args_data_masking summarise <- function(.data, ..., .by = NULL, .groups = NULL) { by <- enquo(.by) if (!quo_is_null(by) && !is.null(.groups)) { abort("Can't supply both `.by` and `.groups`.") } UseMethod("summarise") } #' @rdname summarise #' @export summarize <- summarise #' @export summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) { by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- summarise_cols(.data, dplyr_quosures(...), by, "summarise") out <- summarise_build(by, cols, "summarise") if (!is_tibble(.data)) { # The `by` group data we build from is always a tibble, # so we have to manually downcast as needed out <- as.data.frame(out) } if (identical(.groups, "rowwise")) { out <- rowwise_df(out, character()) } out } #' @export summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) { # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- summarise_cols(.data, dplyr_quosures(...), by, "summarise") out <- summarise_build(by, cols, "summarise") verbose <- summarise_verbose(.groups, caller_env()) if (is.null(.groups)) { .groups <- "drop_last" } old_groups <- by$names if (identical(.groups, "drop_last")) { n <- length(old_groups) if (n > 1) { new_groups <- old_groups[-n] if (verbose) { inform_implicit_drop_last_for_grouped_df(old_groups, new_groups) } out <- grouped_df(out, new_groups, group_by_drop_default(.data)) } } else if (identical(.groups, "keep")) { out <- grouped_df(out, old_groups, group_by_drop_default(.data)) } else if (identical(.groups, "rowwise")) { out <- rowwise_df(out, old_groups) } else if (!identical(.groups, "drop")) { bullets <- c( paste0("`.groups` can't be ", as_label(.groups)), i = 'Possible values are NULL (default), "drop_last", "drop", "keep", and "rowwise"' ) abort(bullets) } out } #' @export summarise.rowwise_df <- function(.data, ..., .by = NULL, .groups = NULL) { # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- summarise_cols(.data, dplyr_quosures(...), by, "summarise") out <- summarise_build(by, cols, "summarise") verbose <- summarise_verbose(.groups, caller_env()) if (is.null(.groups)) { .groups <- "keep" } old_groups <- by$names if (identical(.groups, "keep")) { if (verbose && length(old_groups) > 0L) { inform_implicit_keep_for_rowwise_df(old_groups) } out <- grouped_df(out, old_groups) } else if (identical(.groups, "rowwise")) { out <- rowwise_df(out, old_groups) } else if (!identical(.groups, "drop")) { bullets <- c( paste0("`.groups` can't be ", as_label(.groups)), i = 'Possible values are NULL (default), "drop", "keep", and "rowwise"' ) abort(bullets) } out } summarise_cols <- function(data, dots, by, verb, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) mask <- DataMask$new(data, by, verb, error_call = error_call) on.exit(mask$forget(), add = TRUE) n_groups <- mask$get_n_groups() old_current_column <- context_peek_bare("column") on.exit(context_poke("column", old_current_column), add = TRUE) warnings_state <- env(warnings = list()) chunks <- list() types <- list() results <- list() out_names <- character() local_error_context(dots, 0L, mask = mask) withCallingHandlers( { for (i in seq_along(dots)) { poke_error_context(dots, i, mask = mask) context_poke("column", old_current_column) dot <- dots[[i]] # - expand dot <- expand_pick(dot, mask) quosures <- expand_across(dot) # - compute quosures_results <- map(quosures, summarise_eval_one, mask = mask) # - structure for (k in seq_along(quosures)) { quo <- quosures[[k]] quo_data <- attr(quo, "dplyr:::data") quo_result <- quosures_results[[k]] if (is.null(quo_result)) { next } types_k <- quo_result$types chunks_k <- quo_result$chunks results_k <- quo_result$results if (!quo_data$is_named && is.data.frame(types_k)) { chunks_extracted <- .Call(dplyr_extract_chunks, chunks_k, types_k) types_k_names <- names(types_k) for (j in seq_along(chunks_extracted)) { mask$add_one( name = types_k_names[j], chunks = chunks_extracted[[j]], result = results_k[[j]] ) } chunks <- append(chunks, chunks_extracted) types <- append(types, as.list(types_k)) results <- append(results, results_k) out_names <- c(out_names, types_k_names) } else { name <- dplyr_quosure_name(quo_data) mask$add_one(name = name, chunks = chunks_k, result = results_k) chunks <- append(chunks, list(chunks_k)) types <- append(types, list(types_k)) results <- append(results, list(results_k)) out_names <- c(out_names, name) } } } if (verb == "summarise") { # For `summarise()`, check that all results are size 1. .Call(`dplyr_summarise_check_all_size_one`, chunks, n_groups) group_sizes <- NULL } else { # For `reframe()`, recycle horizontally across expressions within a # single group. Modifies `chunks` and `results` in place for efficiency! group_sizes <- .Call( `dplyr_reframe_recycle_horizontally_in_place`, chunks, results, n_groups ) # Regenerate any `results` that were `NULL`ed in place during the # recycling process due to recycling of `chunks` changing the size for (i in seq_along(chunks)) { if (is.null(results[[i]])) { results[[i]] <- vec_c(!!!chunks[[i]], .ptype = types[[i]]) } } } }, error = function(cnd) { if (inherits(cnd, "dplyr:::reframe_incompatible_size")) { action <- "recycle" i <- cnd$dplyr_error_data$index_expression } else if (inherits(cnd, "dplyr:::summarise_incompatible_size")) { action <- "compute" i <- cnd$dplyr_error_data$index_expression } else { action <- "compute" i <- i } handler <- dplyr_error_handler( dots = dots, mask = mask, bullets = summarise_bullets, error_call = error_call, action = action ) handler(cnd) }, warning = dplyr_warning_handler( state = warnings_state, mask = mask, error_call = error_call ) ) # Build output `cols`, assigning by name so `summarise(df, a = expr, a = expr)` # only retains the 2nd assignment cols <- list() for (i in seq_along(results)) { cols[[out_names[i]]] <- results[[i]] } signal_warnings(warnings_state, error_call) list(new = cols, group_sizes = group_sizes) } summarise_eval_one <- function(quo, mask) { quo_data <- attr(quo, "dplyr:::data") if (!is.null(quo_data$column)) { context_poke("column", quo_data$column) # wrap the error when this has been expanded chunks_k <- withCallingHandlers( mask$eval_all_summarise(quo), error = function(cnd) { name <- dplyr_quosure_name(quo_data) msg <- glue("Can't compute column `{name}`.") abort(msg, call = call("across"), parent = cnd) } ) } else { # no wrapping otherwise chunks_k <- mask$eval_all_summarise(quo) } if (is.null(chunks_k)) { return(NULL) } # `name` specified lazily types_k <- dplyr_vec_ptype_common( chunks = chunks_k, name = dplyr_quosure_name(quo_data) ) chunks_k <- vec_cast_common(!!!chunks_k, .to = types_k) result_k <- vec_c(!!!chunks_k, .ptype = types_k) list(chunks = chunks_k, types = types_k, results = result_k) } summarise_build <- function(by, cols, verb) { out <- group_keys0(by$data) if (verb == "reframe") { # Repeat keys for `reframe()` out <- vec_rep_each(out, cols$group_sizes) } dplyr_col_modify(out, cols$new) } summarise_bullets <- function(cnd, ...) { UseMethod("summarise_bullets") } #' @export `summarise_bullets.dplyr:::summarise_unsupported_type` <- function(cnd, ...) { result <- cnd$dplyr_error_data$result error_name <- ctxt_error_label() c( glue("`{error_name}` must be a vector, not {obj_type_friendly(result)}."), i = cnd_bullet_rowwise_unlist() ) } #' @export `summarise_bullets.dplyr:::summarise_incompatible_size` <- function(cnd, ...) { index_group <- cnd$dplyr_error_data$index_group actual_size <- cnd$dplyr_error_data$actual_size error_context <- peek_error_context() error_name <- ctxt_error_label(error_context) # FIXME: So that cnd_bullet_cur_group_label() correctly reports the # faulty group peek_mask()$set_current_group(index_group) c( cli::format_inline( "{.code {error_name}} must be size 1, not {actual_size}." ), i = cli::format_inline( "To return more or less than 1 row per group, use {.fn reframe}." ) ) } #' @export `summarise_bullets.dplyr:::reframe_incompatible_size` <- function(cnd, ...) { index_group <- cnd$dplyr_error_data$index_group actual_size <- cnd$dplyr_error_data$actual_size expected_size <- cnd$dplyr_error_data$expected_size error_context <- peek_error_context() error_name <- ctxt_error_label(error_context) # FIXME: So that cnd_bullet_cur_group_label() correctly reports the # faulty group peek_mask()$set_current_group(index_group) c( cli::format_inline( "{.code {error_name}} must be size {or_1(expected_size)}, not {actual_size}." ), i = cli::format_inline( "An earlier column had size {expected_size}." ) ) } #' @export `summarise_bullets.dplyr:::summarise_mixed_null` <- function(cnd, ...) { error_name <- ctxt_error_label() c( glue("`{error_name}` must return compatible vectors across groups."), x = "Can't combine NULL and non NULL results." ) } # messaging --------------------------------------------------------------- summarise_verbose <- function(.groups, .env) { if (!is.null(.groups)) { # User supplied `.groups` return(FALSE) } inform <- getOption("dplyr.summarise.inform") if (is_true(inform) || is_false(inform)) { # User supplied global option return(inform) } is_reference(topenv(.env), global_env()) } inform_implicit_drop_last_for_grouped_df <- function(old, new) { # Only going to show this message if `length(old) > 1`, so don't need to # worry about the length 0 or length 1 cases. by <- paste0("c(", paste0(old, collapse = ", "), ")") inform(cli_format_each_inline( "{.fn summarise} has regrouped the output.", i = "Summaries were computed grouped by {cli::col_blue(old)}.", i = "Output is grouped by {cli::col_blue(new)}.", i = "Use {.code summarise(.groups = \"drop_last\")} to silence this message.", i = "Use {.code summarise(.by = {by})} for {.topic [per-operation grouping](dplyr::dplyr_by)} instead." )) } inform_implicit_keep_for_rowwise_df <- function(groups) { inform(cli_format_each_inline( "{.fn summarise} has converted the output from a rowwise data frame to a grouped data frame.", i = "Summaries were computed rowwise.", i = "Output is grouped by {cli::col_blue(groups)}.", i = "Use {.code summarise(.groups = \"keep\")} to silence this message." )) } dplyr/R/pull.R0000644000176200001440000000423515106134104012703 0ustar liggesusers#' Extract a single column #' #' `pull()` is similar to `$`. It's mostly useful because it looks a little #' nicer in pipes, it also works with remote data frames, and it can optionally #' name the output. #' #' @inheritParams arrange #' @inheritParams tidyselect::vars_pull #' @param name An optional parameter that specifies the column to be used #' as names for a named vector. Specified in a similar manner as \code{var}. #' @param ... For use by methods. #' @return A vector the same size as `.data`. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("pull")}. #' @export #' @examples #' mtcars |> pull(-1) #' mtcars |> pull(1) #' mtcars |> pull(cyl) #' @examplesIf requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE) #' #' # Also works for remote sources #' df <- dbplyr::memdb_frame(x = 1:10, y = 10:1, .name = "pull-ex") #' df |> #' mutate(z = x * y) |> #' pull() #' @examples #' #' # Pull a named vector #' starwars |> pull(height, name) pull <- function(.data, var = -1, name = NULL, ...) { check_dots_used() UseMethod("pull") } #' @export pull.data.frame <- function(.data, var = -1, name = NULL, ...) { var <- tidyselect::vars_pull(names(.data), !!enquo(var)) name <- enquo(name) if (quo_is_null(name)) { return(.data[[var]]) } name <- tidyselect::vars_pull(names(.data), !!name) set_names(.data[[var]], nm = .data[[name]]) } find_var <- function(expr, vars) { var_env <- set_names(as.list(seq_along(vars)), vars) var <- eval_tidy(expr, var_env) if (!is.numeric(var) || length(var) != 1) { abort("`var` must evaluate to a single number.") } var <- as.integer(var) n <- length(vars) if (is.na(var) || abs(var) > n || var == 0L) { abort( "`var` must be a value between {-n} and {n} (excluding zero), not {var}." ) } if (var < 0) { var <- var + n + 1 } vars[[var]] } dplyr/R/data-mask.R0000644000176200001440000002427615137161765013621 0ustar liggesusersDataMask <- R6Class( "DataMask", public = list( initialize = function(data, by, verb, error_call) { rows <- by$data$.rows if (length(rows) == 0) { # Specially handle case of zero groups rows <- new_list_of(list(integer()), ptype = integer()) } private$rows <- rows frame <- caller_env(n = 2) local_mask(self, frame) names <- names(data) if (is.null(names)) { cli::cli_abort( "Can't transform a data frame with `NULL` names.", call = error_call ) } if (vec_any_missing(names)) { cli::cli_abort( "Can't transform a data frame with missing names.", call = error_call ) } names_bindings <- chr_unserialise_unicode(names) if (any(names_bindings == "")) { # `names2()` converted potential `NA` names to `""` already abort( "Can't transform a data frame with `NA` or `\"\"` names.", call = error_call ) } if (anyDuplicated(names_bindings)) { abort( "Can't transform a data frame with duplicate names.", call = error_call ) } names(data) <- names_bindings private$size <- nrow(data) private$current_data <- dplyr_new_list(data) private$grouped <- by$type == "grouped" private$rowwise <- by$type == "rowwise" # `duplicate(0L)` is necessary to ensure that the value we modify by # reference is "fresh" and completely owned by this instance of the # `DataMask`. Otherwise nested `mutate()` calls can end up modifying # the same value (#6762). private$env_current_group_info <- new_environment(list( `dplyr:::current_group_id` = duplicate(0L), `dplyr:::current_group_size` = duplicate(0L) )) private$chops <- .Call( dplyr_lazy_vec_chop_impl, data, rows, private$env_current_group_info, private$grouped, private$rowwise ) private$env_mask_bindings <- .Call( dplyr_make_mask_bindings, private$chops, data ) private$keys <- group_keys0(by$data) private$n_groups <- nrow(private$keys) private$by_names <- by$names private$verb <- verb }, add_one = function(name, chunks, result) { if (self$is_rowwise()) { is_scalar_list <- function(.x) { obj_is_list(.x) && length(.x) == 1L } if (all(map_lgl(chunks, is_scalar_list))) { chunks <- map(chunks, `[[`, 1L) } } .Call(`dplyr_mask_binding_add`, private, name, result, chunks) }, remove = function(name) { .Call(`dplyr_mask_binding_remove`, private, name) }, resolve = function(name) { private$chops[[name]] }, eval_all = function(quo) { .Call(`dplyr_mask_eval_all`, quo, private) }, eval_all_summarise = function(quo) { # Wrap in a function called `eval()` so that rlang ignores the # call in error messages. This only concerns errors that occur # directly in `quo`. eval <- function() .Call(`dplyr_mask_eval_all_summarise`, quo, private) eval() }, eval_all_mutate = function(quo) { eval <- function() .Call(`dplyr_mask_eval_all_mutate`, quo, private) eval() }, eval_all_filter = function(quos, invert, env_filter) { eval <- function() { .Call( `dplyr_mask_eval_all_filter`, quos, invert, private, private$size, env_filter ) } eval() }, pick_current = function(vars) { # Only used for deprecated `cur_data()`, `cur_data_all()`, and # `across(.fns = NULL)`. We should remove this when we defunct those. cols <- self$current_cols(vars) if (self$is_rowwise()) { cols <- map2(cols, names(cols), function(col, name) { if (obj_is_list(private$current_data[[name]])) { col <- list(col) } col }) } dplyr_new_tibble(cols, size = self$get_current_group_size_mutable()) }, current_cols = function(vars) { env_get_list(private$env_mask_bindings, vars) }, current_rows = function() { private$rows[[self$get_current_group_id_mutable()]] }, current_key = function() { keys <- private$keys if (vec_size(keys) == 0L) { # Specially handle case of zero groups, like in `$initialize()`. # We always evaluate at least 1 group, so the slice call would attempt # to do `vec_slice(<0-row-df>, 1L)`, which is an error. keys } else { vec_slice(keys, self$get_current_group_id_mutable()) } }, current_vars = function() { names(private$current_data) }, current_non_group_vars = function() { setdiff(self$current_vars(), private$by_names) }, # This pair of functions provides access to `dplyr:::current_group_id`. # - `dplyr:::current_group_id` is modified by reference at the C level. # - If you access it ephemerally, the mutable version can be used. # - If you access it persistently, like in `cur_group_id()`, it must be # duplicated on the way out. # - For maximal performance, we inline the mutable function definition into # the non-mutable version. get_current_group_id = function() { duplicate( private[["env_current_group_info"]][["dplyr:::current_group_id"]] ) }, get_current_group_id_mutable = function() { private[["env_current_group_info"]][["dplyr:::current_group_id"]] }, # This pair of functions provides access to `dplyr:::current_group_size`. # - `dplyr:::current_group_size` is modified by reference at the C level. # - If you access it ephemerally, the mutable version can be used. # - If you access it persistently, like in `n()`, it must be duplicated on # the way out. # - For maximal performance, we inline the mutable function definition into # the non-mutable version. get_current_group_size = function() { duplicate( private[["env_current_group_info"]][["dplyr:::current_group_size"]] ) }, get_current_group_size_mutable = function() { private[["env_current_group_info"]][["dplyr:::current_group_size"]] }, set_current_group = function(group) { # Only to be used right before throwing an error. # We `duplicate()` both values to be extremely conservative, because there # is an extremely small chance we could modify this by reference and cause # issues with the `group` variable in the caller, but this has never been # seen. We generally assume `length()` always returns a fresh variable, so # we probably don't need to duplicate there, but it seems better to be # extremely safe here. env_current_group_info <- private[["env_current_group_info"]] env_current_group_info[["dplyr:::current_group_id"]] <- duplicate(group) env_current_group_info[["dplyr:::current_group_size"]] <- duplicate(length(private$rows[[group]])) }, get_used = function() { .Call(env_resolved, private$chops, names(private$current_data)) }, unused_vars = function() { used <- self$get_used() current_vars <- self$current_vars() current_vars[!used] }, get_rows = function() { private$rows }, get_current_data = function(groups = TRUE) { out <- private$current_data if (!groups) { out <- out[self$current_non_group_vars()] } out }, forget = function() { names_bindings <- self$current_vars() verb <- private$verb osbolete_promise_fn <- function(name) { abort( c( "Obsolete data mask.", x = glue( "Too late to resolve `{name}` after the end of `dplyr::{verb}()`." ), i = glue( "Did you save an object that uses `{name}` lazily in a column in the `dplyr::{verb}()` expression ?" ) ), call = NULL ) } promises <- map(names_bindings, function(.x) { expr(osbolete_promise_fn(!!.x)) }) env_mask_bindings <- private$env_mask_bindings suppressWarnings({ rm(list = names_bindings, envir = env_mask_bindings) env_bind_lazy(env_mask_bindings, !!!set_names(promises, names_bindings)) }) }, is_grouped = function() { private$grouped }, is_rowwise = function() { private$rowwise }, get_keys = function() { private$keys }, get_n_groups = function() { private$n_groups }, get_size = function() { private$size }, get_rlang_mask = function() { # Mimicking the data mask that is created during typical # expression evaluations, like in `DataMask$eval_all_mutate()`. # Important to insert a `.data` pronoun! mask <- new_data_mask(private$env_mask_bindings) mask[[".data"]] <- as_data_pronoun(private$env_mask_bindings) mask } ), private = list( # environment that contains lazy vec_chop()s for each input column # and list of result chunks as they get added. chops = NULL, # Environment which contains the: # - Current group id # - Current group size # Both of which are updated by reference at the C level. # This environment is the parent environment of `chops`. env_current_group_info = NULL, # Environment with active bindings for each column. # Expressions are evaluated in a fresh data mask created from this # environment. Each group gets its own newly created data mask to avoid # cross group contamination of the data mask by lexical side effects, like # usage of `<-` (#6666). env_mask_bindings = NULL, # ptypes of all the variables current_data = list(), # names of the `by` variables by_names = character(), # list of indices, one integer vector per group rows = NULL, # data frame of keys, one row per group keys = NULL, # number of groups, computed as number of rows in `keys` n_groups = NULL, # number of rows in `data` size = NULL, # Type of data frame grouped = NULL, rowwise = NULL, verb = character() ) ) dplyr/R/na-if.R0000644000176200001440000000531115106134104012715 0ustar liggesusers#' Convert values to `NA` #' #' This is a translation of the SQL command `NULLIF`. It is useful if you want #' to convert an annoying value to `NA`. #' #' @param x Vector to modify #' @param y Value or vector to compare against. When `x` and `y` are equal, the #' value in `x` will be replaced with `NA`. #' #' `y` is [cast][vctrs::theory-faq-coercion] to the type of `x` before #' comparison. #' #' `y` is [recycled][vctrs::theory-faq-recycling] to the size of `x` before #' comparison. This means that `y` can be a vector with the same size as `x`, #' but most of the time this will be a single value. #' @return A modified version of `x` that replaces any values that #' are equal to `y` with `NA`. #' #' @seealso #' #' - [coalesce()] to replace `NA`s with the first non-missing value. #' #' - [replace_values()] for making arbitrary replacements by value. #' #' - [replace_when()] for making arbitrary replacements using logical #' conditions. #' #' @export #' @examples #' # `na_if()` is useful for replacing a single problematic value with `NA` #' na_if(c(-99, 1, 4, 3, -99, 5), -99) #' na_if(c("abc", "def", "", "ghi"), "") #' #' # You can use it to standardize `NaN`s to `NA` #' na_if(c(1, NaN, NA, 2, NaN), NaN) #' #' # Because `na_if()` is an R translation of SQL's `NULLIF` command, #' # it compares `x` and `y` element by element. Where `x` and `y` are #' # equal, the value in `x` is replaced with an `NA`. #' na_if( #' x = c(1, 2, 5, 5, 6), #' y = c(0, 2, 3, 5, 4) #' ) #' #' # If you have multiple problematic values that you'd like to replace with #' # `NA`, then `replace_values()` is a better choice than `na_if()` #' x <- c(-99, 1, 4, 0, -99, 5, -1, 0, 5) #' replace_values(x, c(0, -1, -99) ~ NA) #' #' # You'd have to nest `na_if()`s to achieve this #' try(na_if(x, c(0, -1, -99))) #' na_if(na_if(na_if(x, 0), -1), -99) #' #' # If you'd like to replace values that match a logical condition with `NA`, #' # use `replace_when()` #' replace_when(x, x < 0 ~ NA) #' #' # If you'd like to replace `NA` with some other value, use `replace_values()` #' x <- c(NA, 5, 2, NA, 0, 3) #' replace_values(x, NA ~ 0) #' #' # `na_if()` is particularly useful inside `mutate()` #' starwars |> #' select(name, eye_color) |> #' mutate(eye_color = na_if(eye_color, "unknown")) #' #' # `na_if()` can also be used with `mutate()` and `across()` #' # to alter multiple columns #' starwars |> #' mutate(across(where(is.character), ~na_if(., "unknown"))) na_if <- function(x, y) { # Type and size stable on `x` y <- vec_cast(x = y, to = x, x_arg = "y", to_arg = "x") y <- vec_recycle(y, size = vec_size(x), x_arg = "y") na <- vec_init(x) where <- vec_equal(x, y, na_equal = TRUE) x <- vec_assign(x, where, na) x } dplyr/R/group-nest.R0000644000176200001440000000533615106134104014035 0ustar liggesusersgroup_nest_impl <- function(.tbl, .key, keep = FALSE) { mutate(group_keys(.tbl), !!.key := group_split(.tbl, .keep = keep)) } #' Nest a tibble using a grouping specification #' #' @description #' `r lifecycle::badge("experimental")` #' #' Nest a tibble using a grouping specification #' #' @section Lifecycle: #' `group_nest()` is not stable because [`tidyr::nest(.by =)`][tidyr::nest()] #' provides very similar behavior. It may be deprecated in the future. #' #' @section Grouped data frames: #' #' The primary use case for [group_nest()] is with already grouped data frames, #' typically a result of [group_by()]. In this case [group_nest()] only uses #' the first argument, the grouped tibble, and warns when `...` is used. #' #' @section Ungrouped data frames: #' #' When used on ungrouped data frames, [group_nest()] forwards the `...` to #' [group_by()] before nesting, therefore the `...` are subject to the data mask. #' #' @param .tbl A tbl #' @param ... Grouping specification, forwarded to [group_by()] #' @param .key the name of the list column #' @param keep Should the grouping columns be kept in the list column. #' @return A tbl with one row per unique combination of the grouping variables. #' The first columns are the grouping variables, followed by a list column of tibbles #' with matching rows of the remaining columns. #' @keywords internal #' @family grouping functions #' @export #' @examples #' #' #----- use case 1: a grouped data frame #' iris |> #' group_by(Species) |> #' group_nest() #' #' # this can be useful if the grouped data has been altered before nesting #' iris |> #' group_by(Species) |> #' filter(Sepal.Length > mean(Sepal.Length)) |> #' group_nest() #' #' #----- use case 2: using group_nest() on a ungrouped data frame with #' # a grouping specification that uses the data mask #' starwars |> #' group_nest(species, homeworld) group_nest <- function(.tbl, ..., .key = "data", keep = FALSE) { lifecycle::signal_stage("experimental", "group_nest()") UseMethod("group_nest") } #' @export group_nest.data.frame <- function(.tbl, ..., .key = "data", keep = FALSE) { if (dots_n(...)) { group_nest_impl(group_by(.tbl, ...), .key = .key, keep = keep) } else { tibble(!!.key := list(.tbl)) } } #' @export group_nest.grouped_df <- function(.tbl, ..., .key = "data", keep = FALSE) { if (dots_n(...)) { warn_ignores_dots( "group_nest", "grouped_df", "group_by(..., .add = TRUE) |> group_nest()" ) } group_nest_impl(.tbl, .key = .key, keep = keep) } # This is not a deprecation warning, just giving advice warn_ignores_dots <- function(fn, class, with) { cli::cli_warn( "Calling {.fn {fn}} on a {.cls {class}} ignores `...`. Please use {.code {with}}." ) } dplyr/R/join-cross.R0000644000176200001440000000510315106134104014010 0ustar liggesusers#' Cross join #' #' @description #' Cross joins match each row in `x` to every row in `y`, resulting in a data #' frame with `nrow(x) * nrow(y)` rows. #' #' Since cross joins result in all possible matches between `x` and `y`, they #' technically serve as the basis for all [mutating joins][mutate-joins], which #' can generally be thought of as cross joins followed by a filter. In practice, #' a more specialized procedure is used for better performance. #' #' @inheritParams left_join #' #' @returns #' An object of the same type as `x` (including the same groups). The output has #' the following properties: #' #' - There are `nrow(x) * nrow(y)` rows returned. #' #' - Output columns include all columns from both `x` and `y`. Column name #' collisions are resolved using `suffix`. #' #' - The order of the rows and columns of `x` is preserved as much as possible. #' #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("cross_join")}. #' #' @family joins #' @export #' @examples #' # Cross joins match each row in `x` to every row in `y`. #' # Data within the columns is not used in the matching process. #' cross_join(band_instruments, band_members) #' #' # Control the suffix added to variables duplicated in #' # `x` and `y` with `suffix`. #' cross_join(band_instruments, band_members, suffix = c("", "_y")) cross_join <- function(x, y, ..., copy = FALSE, suffix = c(".x", ".y")) { UseMethod("cross_join") } #' @export cross_join.data.frame <- function( x, y, ..., copy = FALSE, suffix = c(".x", ".y") ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) x_names <- tbl_vars(x) y_names <- tbl_vars(y) # Empty join by with no keys by <- new_join_by() # Particular value isn't too important, as there are no keys to keep/drop keep <- FALSE vars <- join_cols( x_names = x_names, y_names = y_names, by = by, suffix = suffix, keep = keep ) x_in <- as_tibble(x, .name_repair = "minimal") y_in <- as_tibble(y, .name_repair = "minimal") x_size <- vec_size(x_in) y_size <- vec_size(y_in) x_out <- set_names(x_in, names(vars$x$out)) y_out <- set_names(y_in, names(vars$y$out)) x_out <- vec_rep_each(x_out, times = y_size) y_out <- vec_rep(y_out, times = x_size) x_out[names(y_out)] <- y_out dplyr_reconstruct(x_out, x) } dplyr/R/explain.R0000644000176200001440000000264015106134104013365 0ustar liggesusers#' Explain details of a tbl #' #' This is a generic function which gives more details about an object than #' [print()], and is more focused on human readable output than #' [str()]. #' #' @section Databases: #' Explaining a `tbl_sql` will run the SQL `EXPLAIN` command which #' will describe the query plan. This requires a little bit of knowledge about #' how `EXPLAIN` works for your database, but is very useful for #' diagnosing performance problems. #' #' @export #' @param x An object to explain #' @param ... Other parameters possibly used by generic #' @return The first argument, invisibly. #' @examplesIf requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE) #' \donttest{ #' lahman_s <- dbplyr::lahman_sqlite() #' batting <- tbl(lahman_s, "Batting") #' batting |> show_query() #' batting |> explain() #' #' # The batting database has indices on all ID variables: #' # SQLite automatically picks the most restrictive index #' batting |> filter(lgID == "NL" & yearID == 2000L) |> explain() #' #' # OR's will use multiple indexes #' batting |> filter(lgID == "NL" | yearID == 2000) |> explain() #' #' # Joins will use indexes in both tables #' teams <- tbl(lahman_s, "Teams") #' batting |> left_join(teams, c("yearID", "teamID")) |> explain() #' } explain <- function(x, ...) { UseMethod("explain") } #' @export #' @rdname explain show_query <- function(x, ...) { UseMethod("show_query") } dplyr/R/dbplyr.R0000644000176200001440000001211015106134104013212 0ustar liggesusers#' Database and SQL generics. #' #' The `sql_` generics are used to build the different types of SQL queries. #' The default implementations in dbplyr generates ANSI 92 compliant SQL. #' The `db_` generics execute actions on the database. The default #' implementations in dbplyr typically just call the standard DBI S4 #' method. #' #' A few backend methods do not call the standard DBI S4 methods including #' #' * `db_data_type()`: Calls [DBI::dbDataType()] for every field #' (e.g. data frame column) and returns a vector of corresponding SQL data #' types #' #' * `db_save_query()`: Builds and executes a #' `CREATE [TEMPORARY] TABLE ...` SQL command. #' #' * `db_create_index()`: Builds and executes a #' `CREATE INDEX ON
` SQL command. #' #' * `db_drop_table()`: Builds and executes a #' `DROP TABLE [IF EXISTS]
` SQL command. #' #' * `db_analyze()`: Builds and executes an #' `ANALYZE
` SQL command. #' #' Currently, [copy_to()] is the only user of `db_begin()`, `db_commit()`, #' `db_rollback()`, `db_write_table()`, `db_create_indexes()`, `db_drop_table()` and #' `db_analyze()`. If you find yourself overriding many of these #' functions it may suggest that you should just override `copy_to()` #' instead. #' #' `db_create_table()` and `db_insert_into()` have been deprecated #' in favour of `db_write_table()`. #' #' @return Usually a logical value indicating success. Most failures should generate #' an error. However, `db_has_table()` should return `NA` if #' temporary tables cannot be listed with [DBI::dbListTables()] (due to backend #' API limitations for example). As a result, you methods will rely on the #' backend to throw an error if a table exists when it shouldn't. #' @name backend_dbplyr #' @param con A database connection. #' @keywords internal NULL #' @name backend_dbplyr #' @export db_desc <- function(x) UseMethod("db_desc") #' @name backend_dbplyr #' @export sql_translate_env <- function(con) UseMethod("sql_translate_env") #' @name backend_dbplyr #' @export db_list_tables <- function(con) UseMethod("db_list_tables") #' @name backend_dbplyr #' @export #' @param table A string, the table name. db_has_table <- function(con, table) UseMethod("db_has_table") #' @name backend_dbplyr #' @export #' @param fields A list of fields, as in a data frame. db_data_type <- function(con, fields) UseMethod("db_data_type") #' @export #' @name backend_dbplyr #' @export db_save_query <- function(con, sql, name, temporary = TRUE, ...) { UseMethod("db_save_query") } #' @name backend_dbplyr #' @export db_begin <- function(con, ...) UseMethod("db_begin") #' @name backend_dbplyr #' @export db_commit <- function(con, ...) UseMethod("db_commit") #' @name backend_dbplyr #' @export db_rollback <- function(con, ...) UseMethod("db_rollback") #' @name backend_dbplyr #' @export db_write_table <- function(con, table, types, values, temporary = FALSE, ...) { UseMethod("db_write_table") } #' @name backend_dbplyr #' @export db_create_table <- function(con, table, types, temporary = FALSE, ...) { UseMethod("db_create_table") } #' @name backend_dbplyr #' @export db_insert_into <- function(con, table, values, ...) { UseMethod("db_insert_into") } #' @name backend_dbplyr #' @export db_create_indexes <- function(con, table, indexes = NULL, unique = FALSE, ...) { UseMethod("db_create_indexes") } #' @name backend_dbplyr #' @export db_create_index <- function( con, table, columns, name = NULL, unique = FALSE, ... ) { UseMethod("db_create_index") } #' @name backend_dbplyr #' @export db_drop_table <- function(con, table, force = FALSE, ...) { UseMethod("db_drop_table") } #' @name backend_dbplyr #' @export db_analyze <- function(con, table, ...) UseMethod("db_analyze") #' @export #' @rdname backend_dbplyr db_explain <- function(con, sql, ...) { UseMethod("db_explain") } #' @rdname backend_dbplyr #' @export db_query_fields <- function(con, sql, ...) { UseMethod("db_query_fields") } #' @rdname backend_dbplyr #' @export db_query_rows <- function(con, sql, ...) { UseMethod("db_query_rows") } #' @rdname backend_dbplyr #' @export sql_select <- function( con, select, from, where = NULL, group_by = NULL, having = NULL, order_by = NULL, limit = NULL, distinct = FALSE, ... ) { UseMethod("sql_select") } #' @export #' @rdname backend_dbplyr sql_subquery <- function(con, from, name = random_table_name(), ...) { UseMethod("sql_subquery") } random_table_name <- function(n = 10) { paste0(sample(letters, n, replace = TRUE), collapse = "") } #' @rdname backend_dbplyr #' @export sql_join <- function(con, x, y, vars, type = "inner", by = NULL, ...) { UseMethod("sql_join") } #' @rdname backend_dbplyr #' @export sql_semi_join <- function(con, x, y, anti = FALSE, by = NULL, ...) { UseMethod("sql_semi_join") } #' @rdname backend_dbplyr #' @export sql_set_op <- function(con, x, y, method) { UseMethod("sql_set_op") } #' @rdname backend_dbplyr #' @export sql_escape_string <- function(con, x) UseMethod("sql_escape_string") #' @rdname backend_dbplyr #' @export sql_escape_ident <- function(con, x) UseMethod("sql_escape_ident") dplyr/R/count-tally.R0000644000176200001440000001555515137161765014232 0ustar liggesusers#' Count the observations in each group #' #' @description #' `count()` lets you quickly count the unique values of one or more variables: #' `df |> count(a, b)` is roughly equivalent to #' `df |> group_by(a, b) |> summarise(n = n())`. #' `count()` is paired with `tally()`, a lower-level helper that is equivalent #' to `df |> summarise(n = n())`. Supply `wt` to perform weighted counts, #' switching the summary from `n = n()` to `n = sum(wt)`. #' #' `add_count()` and `add_tally()` are equivalents to `count()` and `tally()` #' but use `mutate()` instead of `summarise()` so that they add a new column #' with group-wise counts. #' #' @param x A data frame, data frame extension (e.g. a tibble), or a #' lazy data frame (e.g. from dbplyr or dtplyr). #' @param ... <[`data-masking`][rlang::args_data_masking]> Variables to group #' by. #' @param wt <[`data-masking`][rlang::args_data_masking]> Frequency weights. #' Can be `NULL` or a variable: #' #' * If `NULL` (the default), counts the number of rows in each group. #' * If a variable, computes `sum(wt)` for each group. #' @param sort If `TRUE`, will show the largest groups at the top. #' @param name The name of the new column in the output. #' #' If omitted, it will default to `n`. If there's already a column called `n`, #' it will use `nn`. If there's a column called `n` and `nn`, it'll use #' `nnn`, and so on, adding `n`s until it gets a new name. #' @param .drop Handling of factor levels that don't appear in the data, passed #' on to [group_by()]. #' #' For `count()`: if `FALSE` will include counts for empty groups (i.e. for #' levels of factors that don't exist in the data). #' #' `r lifecycle::badge("defunct")` For `add_count()`: defunct since it #' can't actually affect the output. #' @return #' An object of the same type as `.data`. `count()` and `add_count()` #' group transiently, so the output has the same groups as the input. #' @export #' @examples #' # count() is a convenient way to get a sense of the distribution of #' # values in a dataset #' starwars |> count(species) #' starwars |> count(species, sort = TRUE) #' starwars |> count(sex, gender, sort = TRUE) #' starwars |> count(birth_decade = round(birth_year, -1)) #' #' # use the `wt` argument to perform a weighted count. This is useful #' # when the data has already been aggregated once #' df <- tribble( #' ~name, ~gender, ~runs, #' "Max", "male", 10, #' "Sandra", "female", 1, #' "Susan", "female", 4 #' ) #' # counts rows: #' df |> count(gender) #' # counts runs: #' df |> count(gender, wt = runs) #' #' # When factors are involved, `.drop = FALSE` can be used to retain factor #' # levels that don't appear in the data #' df2 <- tibble( #' id = 1:5, #' type = factor(c("a", "c", "a", NA, "a"), levels = c("a", "b", "c")) #' ) #' df2 |> count(type) #' df2 |> count(type, .drop = FALSE) #' #' # Or, using `group_by()`: #' df2 |> group_by(type, .drop = FALSE) |> count() #' #' # tally() is a lower-level function that assumes you've done the grouping #' starwars |> tally() #' starwars |> group_by(species) |> tally() #' #' # both count() and tally() have add_ variants that work like #' # mutate() instead of summarise #' df |> add_count(gender, wt = runs) #' df |> add_tally(wt = runs) count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) { UseMethod("count") } #' @export #' @rdname count count.data.frame <- function( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x) ) { dplyr_local_error_call() if (!missing(...)) { out <- group_by(x, ..., .add = TRUE, .drop = .drop) } else { out <- x } wt <- compat_wt(enquo(wt)) out <- tally(out, wt = !!wt, sort = sort, name = name) # Ensure grouping is transient out <- dplyr_reconstruct(out, x) out } #' @export #' @rdname count tally <- function(x, wt = NULL, sort = FALSE, name = NULL) { UseMethod("tally") } #' @export tally.data.frame <- function(x, wt = NULL, sort = FALSE, name = NULL) { name <- check_n_name(name, group_vars(x)) dplyr_local_error_call() wt <- compat_wt(enquo(wt)) n <- tally_n(x, wt) local_options(dplyr.summarise.inform = FALSE) out <- summarise(x, !!name := !!n) if (sort) { arrange(out, desc(!!sym(name))) } else { out } } #' @export #' @rdname count add_count <- function( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated() ) { UseMethod("add_count") } #' @export add_count.default <- function( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated() ) { add_count_impl( x, ..., wt = {{ wt }}, sort = sort, name = name, .drop = .drop ) } #' @export add_count.data.frame <- function( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated() ) { out <- add_count_impl( x, ..., wt = {{ wt }}, sort = sort, name = name, .drop = .drop ) dplyr_reconstruct(out, x) } add_count_impl <- function( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated(), error_call = caller_env(), user_env = caller_env(2) ) { if (!is_missing(.drop)) { lifecycle::deprecate_stop("1.0.0", "add_count(.drop = )", env = error_call) } dplyr_local_error_call(error_call) if (!missing(...)) { out <- group_by(x, ..., .add = TRUE) } else { out <- x } wt <- compat_wt(enquo(wt), env = error_call, user_env = user_env) add_tally(out, wt = !!wt, sort = sort, name = name) } #' @rdname count #' @export add_tally <- function(x, wt = NULL, sort = FALSE, name = NULL) { name <- check_n_name(name, tbl_vars(x)) dplyr_local_error_call() wt <- compat_wt(enquo(wt)) n <- tally_n(x, wt) out <- mutate(x, !!name := !!n) if (sort) { arrange(out, desc(!!sym(name))) } else { out } } # Helpers ----------------------------------------------------------------- tally_n <- function(x, wt) { if (quo_is_null(wt)) { expr(dplyr::n()) } else { expr(base::sum(!!wt, na.rm = TRUE)) } } compat_wt <- function(wt, env = caller_env(), user_env = caller_env(2)) { if (!is_call(quo_get_expr(wt), "n", n = 0)) { return(wt) } # Provided only by dplyr 1.0.0. See #5349 for discussion. lifecycle::deprecate_warn( when = "1.0.1", what = I("`wt = n()`"), details = "You can now omit the `wt` argument.", env = env, user_env = user_env, always = TRUE, id = "dplyr-count-wt" ) quo(NULL) } check_n_name <- function( name, vars, arg = caller_arg(name), call = caller_env() ) { if (is.null(name)) { name <- n_name(vars) if (name != "n") { inform(c( glue("Storing counts in `{name}`, as `n` already present in input"), i = "Use `name = \"new_name\"` to pick a new name." )) } } else { check_string(name, arg = arg, call = call) } name } n_name <- function(x) { name <- "n" while (name %in% x) { name <- paste0("n", name) } name } dplyr/R/generics.R0000644000176200001440000002334015137161765013545 0ustar liggesusers#' Extending dplyr with new data frame subclasses #' #' @description #' `r lifecycle::badge("experimental")` #' #' These three functions, along with `names<-` and 1d numeric `[` #' (i.e. `x[loc]`) methods, provide a minimal interface for extending dplyr #' to work with new data frame subclasses. This means that for simple cases #' you should only need to provide a couple of methods, rather than a method #' for every dplyr verb. #' #' These functions are a stop-gap measure until we figure out how to solve #' the problem more generally, but it's likely that any code you write to #' implement them will find a home in what comes next. #' #' # Basic advice #' #' This section gives you basic advice if you want to extend dplyr to work with #' your custom data frame subclass, and you want the dplyr methods to behave #' in basically the same way. #' #' * If you have data frame attributes that don't depend on the rows or columns #' (and should unconditionally be preserved), you don't need to do anything. #' The one exception to this is if your subclass extends a data.frame #' directly rather than extending a tibble. The `[.data.frame` method does not #' preserve attributes, so you'll need to write a `[` method for your subclass #' that preserves attributes important for your class. #' #' * If you have __scalar__ attributes that depend on __rows__, implement a #' `dplyr_reconstruct()` method. Your method should recompute the attribute #' depending on rows now present. #' #' * If you have __scalar__ attributes that depend on __columns__, implement a #' `dplyr_reconstruct()` method and a 1d `[` method. For example, if your #' class requires that certain columns be present, your method should return #' a data.frame or tibble when those columns are removed. #' #' * If your attributes are __vectorised__ over __rows__, implement a #' `dplyr_row_slice()` method. This gives you access to `i` so you can #' modify the row attribute accordingly. You'll also need to think carefully #' about how to recompute the attribute in `dplyr_reconstruct()`, and #' you will need to carefully verify the behaviour of each verb, and provide #' additional methods as needed. #' #' * If your attributes that are __vectorised__ over __columns__, implement #' `dplyr_col_modify()`, 1d `[`, and `names<-` methods. All of these methods #' know which columns are being modified, so you can update the column #' attribute according. You'll also need to think carefully about how to #' recompute the attribute in `dplyr_reconstruct()`, and you will need to #' carefully verify the behaviour of each verb, and provide additional #' methods as needed. #' #' # Current usage #' #' * `arrange()`, `filter()` (and `filter_out()`), `slice()` (and the rest of #' the `slice_*()` family), `semi_join()`, and `anti_join()` work by #' generating a vector of row indices, and then subsetting with #' `dplyr_row_slice()`. #' #' * `mutate()` generates a list of new column value (using `NULL` to indicate #' when columns should be deleted), then passes that to `dplyr_col_modify()`. #' It also uses 1d `[` to implement `.keep`, and will call `relocate()` if #' either `.before` or `.after` are supplied. #' #' * `summarise()` and `reframe()` work similarly to `mutate()` but the data #' modified by `dplyr_col_modify()` comes from `group_data()` or is built #' from `.by`. Note that this means that the data frames returned by #' `summarise()` and `reframe()` are fundamentally new data frames, and #' will not retain any custom subclasses or attributes. #' #' * `select()` uses 1d `[` to select columns, then `names<-` to rename them. #' `rename()` just uses `names<-`. `relocate()` just uses 1d `[`. #' #' * `inner_join()`, `left_join()`, `right_join()`, and `full_join()` #' coerce `x` to a tibble, modify the rows, then use `dplyr_reconstruct()` #' to convert back to the same type as `x`. #' #' * `nest_join()` converts both `x` and `y` to tibbles, modifies the rows, #' and uses `dplyr_col_modify()` to handle modified key variables and the #' list-column that `y` becomes. It also uses `dplyr_reconstruct()` to convert #' the outer result back to the type of `x`, and to convert the nested tibbles #' back to the type of `y`. #' #' * `distinct()` does a `mutate()` if any expressions are present, then #' uses 1d `[` to select variables to keep, then `dplyr_row_slice()` to #' select distinct rows. #' #' Note that `group_by()` and `ungroup()` don't use any of these generics and #' you'll need to provide methods for them directly, or rely on `.by` for #' per-operation grouping. #' #' @keywords internal #' @param data A tibble. We use tibbles because they avoid some inconsistent #' subset-assignment use cases. #' @name dplyr_extending NULL #' @export #' @rdname dplyr_extending #' @param i A numeric or logical vector that indexes the rows of `data`. dplyr_row_slice <- function(data, i, ...) { if (!is.numeric(i) && !is.logical(i)) { abort("`i` must be a numeric or logical vector.") } UseMethod("dplyr_row_slice") } #' @export dplyr_row_slice.data.frame <- function(data, i, ...) { dplyr_reconstruct(vec_slice(data, i), data) } #' @export dplyr_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { out <- vec_slice(as.data.frame(data), i) # Index into group_indices, then use that to restore the grouping structure groups <- group_data(data) new_id <- vec_slice(group_indices(data), i) new_grps <- vec_group_loc(new_id) rows <- rep(list_of(integer()), length.out = nrow(groups)) rows[new_grps$key] <- new_grps$loc groups$.rows <- rows if (!preserve && isTRUE(attr(groups, ".drop"))) { groups <- group_data_trim(groups) } new_grouped_df(out, groups) } #' @export dplyr_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) { out <- vec_slice(data, i) group_data <- vec_slice(group_keys(data), i) new_rowwise_df(out, group_data) } #' @export #' @rdname dplyr_extending #' @param cols A named list used to modify columns. A `NULL` value should remove #' an existing column. dplyr_col_modify <- function(data, cols) { UseMethod("dplyr_col_modify") } #' @export dplyr_col_modify.data.frame <- function(data, cols) { # Must be implemented from first principles to avoiding edge cases in # [.data.frame and [.tibble (2.1.3 and earlier). # Apply tidyverse recycling rules cols <- vec_recycle_common(!!!cols, .size = nrow(data)) # Transform to list to avoid stripping inner names with `[[<-` out <- as.list(vec_data(data)) nms <- as_utf8_character(names2(cols)) names(out) <- as_utf8_character(names2(out)) for (i in seq_along(cols)) { nm <- nms[[i]] out[[nm]] <- cols[[i]] } # Transform back to data frame before reconstruction row_names <- .row_names_info(data, type = 0L) out <- new_data_frame(out, n = nrow(data), row.names = row_names) dplyr_reconstruct(out, data) } #' @export dplyr_col_modify.grouped_df <- function(data, cols) { out <- dplyr_col_modify(as_tibble(data), cols) if (any(names(cols) %in% group_vars(data))) { # regroup grouped_df(out, group_vars(data), drop = group_by_drop_default(data)) } else { new_grouped_df(out, group_data(data)) } } #' @export dplyr_col_modify.rowwise_df <- function(data, cols) { out <- dplyr_col_modify(as_tibble(data), cols) rowwise_df(out, group_vars(data)) } #' @param template Template data frame to use for restoring attributes. #' @export #' @rdname dplyr_extending dplyr_reconstruct <- function(data, template) { # Strip attributes before dispatch to make it easier to implement # methods and prevent unexpected leaking of irrelevant attributes. # This also enforces that `data` is a well-formed data frame. data <- dplyr_new_data_frame(data) return(dplyr_reconstruct_dispatch(data, template)) UseMethod("dplyr_reconstruct", template) } dplyr_reconstruct_dispatch <- function(data, template) { UseMethod("dplyr_reconstruct", template) } #' @export dplyr_reconstruct.data.frame <- function(data, template) { .Call(ffi_dplyr_reconstruct, data, template) } #' @export dplyr_reconstruct.grouped_df <- function(data, template) { group_vars <- group_intersect(template, data) grouped_df(data, group_vars, drop = group_by_drop_default(template)) } #' @export dplyr_reconstruct.rowwise_df <- function(data, template) { group_vars <- group_intersect(template, data) rowwise_df(data, group_vars) } dplyr_col_select <- function(.data, loc, error_call = caller_env()) { loc <- vec_as_location(loc, n = df_n_col(.data), names = names(.data)) out <- .data[loc] if (!inherits(out, "data.frame")) { classes_data <- glue_collapse(class(.data), sep = "/") classes_out <- glue_collapse(class(out), sep = "/") bullets <- c( "Can't reconstruct data frame.", x = glue( "The `[` method for class <{classes_data}> must return a data frame." ), i = glue("It returned a <{classes_out}>.") ) abort(bullets, call = error_call) } if (length(out) != length(loc)) { classes_data <- glue_collapse(class(.data), sep = "/") classes_out <- glue_collapse(class(out), sep = "/") s <- function(x) if (length(x) == 1) "" else "s" bullets <- c( "Can't reconstruct data frame.", x = glue( "The `[` method for class <{classes_data}> must return a data frame with {length(loc)} column{s(loc)}." ), i = glue("It returned a <{classes_out}> of {length(out)} column{s(out)}.") ) abort(bullets, call = error_call) } # Patch base data frames and data.table (#6171) to restore extra attributes that `[.data.frame` drops. # We require `[` methods to keep extra attributes for all data frame subclasses. if ( identical(class(.data), "data.frame") || identical(class(.data), c("data.table", "data.frame")) ) { out <- dplyr_reconstruct(out, .data) } out } dplyr/R/colwise.R0000644000176200001440000002472415106134104013401 0ustar liggesusers#' Operate on a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' The variants suffixed with `_if`, `_at` or `_all` apply an #' expression (sometimes several) to all variables within a specified #' subset. This subset can contain all variables (`_all` variants), a #' [vars()] selection (`_at` variants), or variables selected with a #' predicate (`_if` variants). #' #' The verbs with scoped variants are: #' #' * [mutate()], [transmute()] and [summarise()]. See [summarise_all()]. #' * [filter()]. See [filter_all()]. #' * [group_by()]. See [group_by_all()]. #' * [rename()] and [select()]. See [select_all()]. #' * [arrange()]. See [arrange_all()] #' #' There are three kinds of scoped variants. They differ in the scope #' of the variable selection on which operations are applied: #' #' * Verbs suffixed with `_all()` apply an operation on all variables. #' #' * Verbs suffixed with `_at()` apply an operation on a subset of #' variables specified with the quoting function [vars()]. This #' quoting function accepts [tidyselect::vars_select()] helpers like #' [starts_with()]. Instead of a [vars()] selection, you can also #' supply an [integerish][rlang::is_integerish] vector of column #' positions or a character vector of column names. #' #' * Verbs suffixed with `_if()` apply an operation on the subset of #' variables for which a predicate function returns `TRUE`. Instead #' of a predicate function, you can also supply a logical vector. #' #' @param .tbl A `tbl` object. #' @param .funs A function `fun`, a quosure style lambda `~ fun(.)` or a list of either form. #' #' @param .vars A list of columns generated by [vars()], #' a character vector of column names, a numeric vector of column #' positions, or `NULL`. #' @param .predicate A predicate function to be applied to the columns #' or a logical vector. The variables for which `.predicate` is or #' returns `TRUE` are selected. This argument is passed to #' [rlang::as_function()] and thus supports quosure-style lambda #' functions and strings representing function names. #' @param ... Additional arguments for the function calls in #' `.funs`. These are evaluated only once, with [tidy #' dots][rlang::tidy-dots] support. #' #' @section Grouping variables: #' #' Most of these operations also apply on the grouping variables when #' they are part of the selection. This includes: #' #' * [arrange_all()], [arrange_at()], and [arrange_if()] #' * [distinct_all()], [distinct_at()], and [distinct_if()] #' * [filter_all()], [filter_at()], and [filter_if()] #' * [group_by_all()], [group_by_at()], and [group_by_if()] #' * [select_all()], [select_at()], and [select_if()] #' #' This is not the case for summarising and mutating variants where #' operations are *not* applied on grouping variables. The behaviour #' depends on whether the selection is **implicit** (`all` and `if` #' selections) or **explicit** (`at` selections). Grouping variables #' covered by explicit selections (with [summarise_at()], #' [mutate_at()], and [transmute_at()]) are always an error. For #' implicit selections, the grouping variables are always ignored. In #' this case, the level of verbosity depends on the kind of operation: #' #' * Summarising operations ([summarise_all()] and [summarise_if()]) #' ignore grouping variables silently because it is obvious that #' operations are not applied on grouping variables. #' #' * On the other hand it isn't as obvious in the case of mutating #' operations ([mutate_all()], [mutate_if()], [transmute_all()], and #' [transmute_if()]). For this reason, they issue a message #' indicating which grouping variables are ignored. #' #' @name scoped NULL #' Select variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' `vars()` is superseded because it is only needed for the scoped verbs (i.e. #' [mutate_at()], [summarise_at()], and friends), which have been been #' superseded in favour of [across()]. See `vignette("colwise")` for details. #' #' This helper is intended to provide tidy-select semantics for scoped verbs #' like `mutate_at()` and `summarise_at()`. Note that anywhere you can supply #' `vars()` specification, you can also supply a numeric vector of column #' positions or a character vector of column names. #' #' @param ... <[`tidy-select`][dplyr_tidy_select]> Variables to operate on. #' @seealso [all_vars()] and [any_vars()] for other quoting #' functions that you can use with scoped verbs. #' @export vars <- function(...) { quos(...) } #' Apply predicate to all variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' `all_vars()` and `any_vars()` were only needed for the scoped verbs, which #' have been superseded by the use of [across()] in an existing verb. See #' `vignette("colwise")` for details. #' #' These quoting functions signal to scoped filtering verbs #' (e.g. [filter_if()] or [filter_all()]) that a predicate expression #' should be applied to all relevant variables. The `all_vars()` #' variant takes the intersection of the predicate expressions with #' `&` while the `any_vars()` variant takes the union with `|`. #' #' @param expr <[`data-masking`][rlang::args_data_masking]> An expression that #' returns a logical vector, using `.` to refer to the "current" variable. #' @seealso [vars()] for other quoting functions that you #' can use with scoped verbs. #' @export all_vars <- function(expr) { lifecycle::signal_stage("superseded", "all_vars()") structure(enquo(expr), class = c("all_vars", "quosure", "formula")) } #' @rdname all_vars #' @export any_vars <- function(expr) { lifecycle::signal_stage("superseded", "any_vars()") structure(enquo(expr), class = c("any_vars", "quosure", "formula")) } #' @export print.all_vars <- function(x, ...) { cat("\n") NextMethod() } #' @export print.any_vars <- function(x, ...) { cat("\n") NextMethod() } # Requires tbl_vars() method tbl_at_vars <- function( tbl, vars, .include_group_vars = FALSE, error_call = caller_env() ) { if (.include_group_vars) { tibble_vars <- tbl_vars(tbl) } else { tibble_vars <- tbl_nongroup_vars(tbl) } if (is_null(vars)) { character() } else if (is_integerish(vars)) { tibble_vars[vars] } else if (is_quosures(vars) || is_character(vars)) { out <- tidyselect::vars_select(tibble_vars, !!!vars) if (!any(have_name(vars))) { names(out) <- NULL } out } else { msg <- glue( "`.vars` must be a character/numeric vector or a `vars()` object, not {obj_type_friendly(vars)}." ) abort(msg, call = error_call) } } tbl_at_syms <- function( tbl, vars, .include_group_vars = FALSE, error_call = caller_env() ) { vars <- tbl_at_vars( tbl, vars, .include_group_vars = .include_group_vars, error_call = error_call ) set_names(syms(vars), names(vars)) } # Requires tbl_vars(), `[[`() and length() methods tbl_if_vars <- function( .tbl, .p, .env, ..., .include_group_vars = FALSE, error_call = caller_env() ) { if (.include_group_vars) { tibble_vars <- tbl_vars(.tbl) } else { tibble_vars <- tbl_nongroup_vars(.tbl) } if (is_logical(.p)) { if (length(.p) != length(tibble_vars)) { bullets <- c( "`.p` is invalid.", x = "`.p` should have the same size as the number of variables in the tibble.", i = glue("`.p` is size {length(.p)}."), i = glue( "The tibble has {length(tibble_vars)} columns, {including} the grouping variables.", including = if (.include_group_vars) "including" else "non including" ) ) abort(bullets, call = error_call) } return(syms(tibble_vars[.p])) } .tbl <- tbl_ptype(.tbl) if (is_fun_list(.p) || is_list(.p)) { if (length(.p) != 1) { msg <- glue("`.predicate` must have length 1, not {length(.p)}.") abort(msg, call = error_call) } .p <- .p[[1]] } if (is_quosure(.p)) { .p <- quo_as_function(.p) } else { .p <- as_function(.p, .env) } n <- length(tibble_vars) selected <- new_logical(n) for (i in seq_len(n)) { column <- pull(.tbl, tibble_vars[[.env$i]]) cond <- eval_tidy(.p(column, ...)) if (!is.logical(cond) || length(cond) != 1) { bullets <- c( "`.p` is invalid.", x = "`.p` should return a single logical.", i = if (is.logical(cond)) { glue( "`.p` returns a size {length(cond)} for column `{tibble_vars[[i]]}`." ) } else { glue( "`.p` returns a <{vec_ptype_full(cond)}> for column `{tibble_vars[[i]]}`." ) } ) abort(bullets, call = error_call) } selected[[i]] <- isTRUE(cond) } tibble_vars[selected] } tbl_if_syms <- function( .tbl, .p, .env, ..., .include_group_vars = FALSE, error_call = caller_env() ) { syms(tbl_if_vars( .tbl, .p, .env, ..., .include_group_vars = .include_group_vars, error_call = error_call )) } #' Return a prototype of a tbl #' #' Used in `_if` functions to enable type-based selection even when the data #' is lazily generated. Should either return the complete tibble, or if that #' can not be computed quickly, a 0-row tibble where the columns are of #' the correct type. #' #' @export #' @keywords internal tbl_ptype <- function(.data) { UseMethod("tbl_ptype") } #' @export tbl_ptype.default <- function(.data) { if (inherits(.data, "tbl_lazy")) { # TODO: remove once moved to dplyr inform("Applying predicate on the first 100 rows") collect(.data, n = 100) } else { .data } } # The lambda must inherit from: # - Execution environment (bound arguments with purrr lambda syntax) # - Lexical environment (local variables) # - Data mask (other columns) # # So we need: # - Inheritance from closure -> lexical # - A maskable quosure as_inlined_function <- function(f, env, ...) { # Process unquote operator at inlining time f <- expr_interp(f) # Transform to a purrr-like lambda fn <- as_function(f, env = env) body(fn) <- expr({ # Force all arguments base::pairlist(...) # Transform the lambda body into a maskable quosure inheriting # from the execution environment `_quo` <- rlang::quo(!!body(fn)) # Evaluate the quosure in the mask rlang::eval_bare(`_quo`, base::parent.frame()) }) structure(fn, class = "inline_colwise_function", formula = f) } dplyr/R/dplyr.R0000644000176200001440000000130114406402754013063 0ustar liggesusers#' @description #' To learn more about dplyr, start with the vignettes: #' `browseVignettes(package = "dplyr")` #' @useDynLib dplyr, .registration = TRUE #' @keywords internal #' @import rlang #' @rawNamespace import(vctrs, except = data_frame) #' @importFrom glue glue glue_collapse glue_data #' @importFrom tibble new_tibble is_tibble #' @importFrom stats setNames update #' @importFrom utils head tail #' @importFrom methods setOldClass #' @importFrom lifecycle deprecated #' @importFrom R6 R6Class "_PACKAGE" # We're importing vctrs without `data_frame()` because we currently # reexport the deprecated `tibble::data_frame()` function on_load(local_use_cli()) # Singletons the <- new_environment() dplyr/R/doc-params.R0000644000176200001440000000225714472225345013774 0ustar liggesusers#' Argument type: tidy-select #' #' @description #' This page describes the `` argument modifier which indicates #' the argument supports **tidy selections**. Tidy selection provides a concise #' dialect of R for selecting variables based on their names or properties. #' #' Tidy selection is a variant of tidy evaluation. This means that inside #' functions, tidy-select arguments require special attention, as described in #' the *Indirection* section below. If you've never heard of tidy evaluation #' before, start with `vignette("programming")`. #' #' #' # Overview of selection features #' #' ```{r, child = "man/rmd/overview.Rmd"} #' ``` #' #' #' # Indirection #' #' There are two main cases: #' #' * If you have a character vector of column names, use `all_of()` #' or `any_of()`, depending on whether or not you want unknown variable #' names to cause an error, e.g. `select(df, all_of(vars))`, #' `select(df, !any_of(vars))`. #' #' * If you want the user to be able to supply a tidyselect specification in #' a function argument, embrace the function argument, e.g. #' `select(df, {{ vars }})`. #' #' @keywords internal #' @name dplyr_tidy_select NULL dplyr/R/pick.R0000644000176200001440000001550215106134104012654 0ustar liggesusers#' Select a subset of columns #' #' @description #' `pick()` provides a way to easily select a subset of columns from your data #' using [select()] semantics while inside a #' ["data-masking"][rlang::args_data_masking] function like [mutate()] or #' [summarise()]. `pick()` returns a data frame containing the selected columns #' for the current group. #' #' `pick()` is complementary to [across()]: #' - With `pick()`, you typically apply a function to the full data frame. #' - With `across()`, you typically apply a function to each column. #' #' @details #' Theoretically, `pick()` is intended to be replaceable with an equivalent call #' to `tibble()`. For example, `pick(a, c)` could be replaced with #' `tibble(a = a, c = c)`, and `pick(everything())` on a data frame with cols #' `a`, `b`, and `c` could be replaced with `tibble(a = a, b = b, c = c)`. #' `pick()` specially handles the case of an empty selection by returning a 1 #' row, 0 column tibble, so an exact replacement is more like: #' #' ``` #' size <- vctrs::vec_size_common(..., .absent = 1L) #' out <- vctrs::vec_recycle_common(..., .size = size) #' tibble::new_tibble(out, nrow = size) #' ``` #' #' @param ... <[`tidy-select`][dplyr_tidy_select]> #' #' Columns to pick. #' #' You can't pick grouping columns because they are already automatically #' handled by the verb (i.e. [summarise()] or [mutate()]). #' #' @returns #' A tibble containing the selected columns for the current group. #' #' @seealso [across()] #' @export #' @examples #' df <- tibble( #' x = c(3, 2, 2, 2, 1), #' y = c(0, 2, 1, 1, 4), #' z1 = c("a", "a", "a", "b", "a"), #' z2 = c("c", "d", "d", "a", "c") #' ) #' df #' #' # `pick()` provides a way to select a subset of your columns using #' # tidyselect. It returns a data frame. #' df |> mutate(cols = pick(x, y)) #' #' # This is useful for functions that take data frames as inputs. #' # For example, you can compute a joint rank between `x` and `y`. #' df |> mutate(rank = dense_rank(pick(x, y))) #' #' # `pick()` is also useful as a bridge between data-masking functions (like #' # `mutate()` or `group_by()`) and functions with tidy-select behavior (like #' # `select()`). For example, you can use `pick()` to create a wrapper around #' # `group_by()` that takes a tidy-selection of columns to group on. For more #' # bridge patterns, see #' # https://rlang.r-lib.org/reference/topic-data-mask-programming.html#bridge-patterns. #' my_group_by <- function(data, cols) { #' group_by(data, pick({{ cols }})) #' } #' #' df |> my_group_by(c(x, starts_with("z"))) #' #' # Or you can use it to dynamically select columns to `count()` by #' df |> count(pick(starts_with("z"))) pick <- function(...) { # This is the evaluation fallback for `pick()`, which runs: # - When users call `pick()` outside of a mutate-like context. # - When users wrap `pick()` into their own helper functions, preventing # `pick()` expansion from occurring. mask <- peek_mask() if (dots_n(...) == 0L) { stop_pick_empty() } # Evaluates `pick()` on current columns. # Mimicking expansion as much as possible, which should match the idea of # replacing the `pick()` call directly with `tibble()`, like: # pick(a, b, starts_with("foo")) -> tibble(a = a, b = b, foo1 = foo1) non_group_vars <- mask$current_non_group_vars() data <- mask$current_cols(non_group_vars) # `pick()` is evaluated in a data mask so we need to remove the # mask layer from the quosure environments (same as `across()`) (#5460) quos <- enquos(..., .named = NULL) quos <- map(quos, quo_set_env_to_data_mask_top) expr <- expr(c(!!!quos)) sel <- tidyselect::eval_select( expr = expr, data = data, allow_rename = FALSE ) data <- data[sel] data <- dplyr_pick_tibble(!!!data) data } # ------------------------------------------------------------------------------ expand_pick <- function(quo, mask) { error_call <- call("pick") out <- expand_pick_quo(quo, mask, error_call = error_call) out <- new_dplyr_quosure(out, !!!attr(quo, "dplyr:::data")) out } expand_pick_quo <- function(quo, mask, error_call = caller_env()) { env <- quo_get_env(quo) expr <- quo_get_expr(quo) if (is_missing(expr)) { return(quo) } if (is_quosure(expr)) { expr <- expand_pick_quo(expr, mask, error_call = error_call) } else if (is_call(expr)) { expr <- expand_pick_call(expr, env, mask, error_call = error_call) } new_quosure(expr, env = env) } expand_pick_call <- function(expr, env, mask, error_call = caller_env()) { if (is_call(expr, name = "pick", ns = c("", "dplyr"))) { expr <- as_pick_selection(expr, error_call) out <- eval_pick(expr, env, mask, error_call) out <- as_pick_expansion(out) return(out) } if (is_call(expr, name = c("~", "function"))) { # Never expand across anonymous function boundaries return(expr) } index <- seq2(2L, length(expr)) for (i in index) { elt <- expr[[i]] if (is_missing(elt)) { next } if (is_quosure(elt)) { expr[[i]] <- expand_pick_quo(elt, mask, error_call = error_call) } else if (is_call(elt)) { expr[[i]] <- expand_pick_call(elt, env, mask, error_call = error_call) } } expr } eval_pick <- function(expr, env, mask, error_call = caller_env()) { # Evaluates `pick()` on the full version of the "current" columns. # Remove grouping variables, which are never allowed to be selected as # variables to `pick()`. This includes variables specified in # `rowwise(.data, ...)`. data <- mask$get_current_data(groups = FALSE) out <- tidyselect::eval_select( expr = expr, env = env, data = data, error_call = error_call, allow_rename = FALSE ) names(out) } as_pick_selection <- function(expr, error_call) { # Drop `pick()`, get the arguments expr <- expr[-1] if (is.null(expr)) { stop_pick_empty(call = error_call) } # Turn arguments into list of expressions expr <- as.list(expr) # Inline into `c()` call for tidy-selection expr <- expr(c(!!!expr)) expr } as_pick_expansion <- function(names) { out <- set_names(syms(names), names) expr(asNamespace("dplyr")$dplyr_pick_tibble(!!!out)) } dplyr_pick_tibble <- function(...) { error_call <- call("pick") out <- list2(...) # Allow recycling between selected columns, in case it is called from # a `reframe()` call that modified columns in an earlier expression like # `reframe(df, x = 1, y = pick(x, z))`. This also closely mimics expansion # into `y = tibble(x, z)`, with an empty selection being an exception that # is like `y = tibble(.rows = 1L)` for recycling purposes (#6685). size <- vec_size_common(!!!out, .absent = 1L, .call = error_call) out <- vec_recycle_common(!!!out, .size = size, .call = error_call) dplyr_new_tibble(out, size = size) } stop_pick_empty <- function(call = caller_env()) { abort("Must supply at least one input to `pick()`.", call = call) } dplyr/R/import-standalone-obj-type.R0000644000176200001440000002026714406402754017134 0ustar liggesusers# Standalone file: do not edit by hand # Source: # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-obj-type.R # last-updated: 2022-10-04 # license: https://unlicense.org # --- # # ## Changelog # # 2022-10-04: # - `obj_type_friendly(value = TRUE)` now shows numeric scalars # literally. # - `stop_friendly_type()` now takes `show_value`, passed to # `obj_type_friendly()` as the `value` argument. # # 2022-10-03: # - Added `allow_na` and `allow_null` arguments. # - `NULL` is now backticked. # - Better friendly type for infinities and `NaN`. # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Prefixed usage of rlang functions with `rlang::`. # # 2022-06-22: # - `friendly_type_of()` is now `obj_type_friendly()`. # - Added `obj_type_oo()`. # # 2021-12-20: # - Added support for scalar values and empty vectors. # - Added `stop_input_type()` # # 2021-06-30: # - Added support for missing arguments. # # 2021-04-19: # - Added support for matrices and arrays (#141). # - Added documentation. # - Added changelog. # # nocov start #' Return English-friendly type #' @param x Any R object. #' @param value Whether to describe the value of `x`. Special values #' like `NA` or `""` are always described. #' @param length Whether to mention the length of vectors and lists. #' @return A string describing the type. Starts with an indefinite #' article, e.g. "an integer vector". #' @noRd obj_type_friendly <- function(x, value = TRUE) { if (is_missing(x)) { return("absent") } if (is.object(x)) { if (inherits(x, "quosure")) { type <- "quosure" } else { type <- paste(class(x), collapse = "/") } return(sprintf("a <%s> object", type)) } if (!is_vector(x)) { return(.rlang_as_friendly_type(typeof(x))) } n_dim <- length(dim(x)) if (!n_dim) { if (!is_list(x) && length(x) == 1) { if (is_na(x)) { return(switch( typeof(x), logical = "`NA`", integer = "an integer `NA`", double = if (is.nan(x)) { "`NaN`" } else { "a numeric `NA`" }, complex = "a complex `NA`", character = "a character `NA`", .rlang_stop_unexpected_typeof(x) )) } show_infinites <- function(x) { if (x > 0) { "`Inf`" } else { "`-Inf`" } } str_encode <- function(x, width = 30, ...) { if (nchar(x) > width) { x <- substr(x, 1, width - 3) x <- paste0(x, "...") } encodeString(x, ...) } if (value) { if (is.numeric(x) && is.infinite(x)) { return(show_infinites(x)) } if (is.numeric(x) || is.complex(x)) { number <- as.character(round(x, 2)) what <- if (is.complex(x)) "the complex number" else "the number" return(paste(what, number)) } return(switch( typeof(x), logical = if (x) "`TRUE`" else "`FALSE`", character = { what <- if (nzchar(x)) "the string" else "the empty string" paste(what, str_encode(x, quote = "\"")) }, raw = paste("the raw value", as.character(x)), .rlang_stop_unexpected_typeof(x) )) } return(switch( typeof(x), logical = "a logical value", integer = "an integer", double = if (is.infinite(x)) show_infinites(x) else "a number", complex = "a complex number", character = if (nzchar(x)) "a string" else "\"\"", raw = "a raw value", .rlang_stop_unexpected_typeof(x) )) } if (length(x) == 0) { return(switch( typeof(x), logical = "an empty logical vector", integer = "an empty integer vector", double = "an empty numeric vector", complex = "an empty complex vector", character = "an empty character vector", raw = "an empty raw vector", list = "an empty list", .rlang_stop_unexpected_typeof(x) )) } } vec_type_friendly(x) } vec_type_friendly <- function(x, length = FALSE) { if (!is_vector(x)) { abort("`x` must be a vector.") } type <- typeof(x) n_dim <- length(dim(x)) add_length <- function(type) { if (length && !n_dim) { paste0(type, sprintf(" of length %s", length(x))) } else { type } } if (type == "list") { if (n_dim < 2) { return(add_length("a list")) } else if (is.data.frame(x)) { return("a data frame") } else if (n_dim == 2) { return("a list matrix") } else { return("a list array") } } type <- switch( type, logical = "a logical %s", integer = "an integer %s", numeric = , double = "a double %s", complex = "a complex %s", character = "a character %s", raw = "a raw %s", type = paste0("a ", type, " %s") ) if (n_dim < 2) { kind <- "vector" } else if (n_dim == 2) { kind <- "matrix" } else { kind <- "array" } out <- sprintf(type, kind) if (n_dim >= 2) { out } else { add_length(out) } } .rlang_as_friendly_type <- function(type) { switch( type, list = "a list", NULL = "`NULL`", environment = "an environment", externalptr = "a pointer", weakref = "a weak reference", S4 = "an S4 object", name = , symbol = "a symbol", language = "a call", pairlist = "a pairlist node", expression = "an expression vector", char = "an internal string", promise = "an internal promise", ... = "an internal dots object", any = "an internal `any` object", bytecode = "an internal bytecode object", primitive = , builtin = , special = "a primitive function", closure = "a function", type ) } .rlang_stop_unexpected_typeof <- function(x, call = caller_env()) { abort( sprintf("Unexpected type <%s>.", typeof(x)), call = call ) } #' Return OO type #' @param x Any R object. #' @return One of `"bare"` (for non-OO objects), `"S3"`, `"S4"`, #' `"R6"`, or `"R7"`. #' @noRd obj_type_oo <- function(x) { if (!is.object(x)) { return("bare") } class <- inherits(x, c("R6", "R7_object"), which = TRUE) if (class[[1]]) { "R6" } else if (class[[2]]) { "R7" } else if (isS4(x)) { "S4" } else { "S3" } } #' @param x The object type which does not conform to `what`. Its #' `obj_type_friendly()` is taken and mentioned in the error message. #' @param what The friendly expected type as a string. Can be a #' character vector of expected types, in which case the error #' message mentions all of them in an "or" enumeration. #' @param show_value Passed to `value` argument of `obj_type_friendly()`. #' @param ... Arguments passed to [abort()]. #' @inheritParams args_error_context #' @noRd stop_input_type <- function(x, what, ..., allow_na = FALSE, allow_null = FALSE, show_value = TRUE, arg = caller_arg(x), call = caller_env()) { # From standalone-cli.R cli <- env_get_list( nms = c("format_arg", "format_code"), last = topenv(), default = function(x) sprintf("`%s`", x), inherit = TRUE ) if (allow_na) { what <- c(what, cli$format_code("NA")) } if (allow_null) { what <- c(what, cli$format_code("NULL")) } if (length(what)) { what <- oxford_comma(what) } message <- sprintf( "%s must be %s, not %s.", cli$format_arg(arg), what, obj_type_friendly(x, value = show_value) ) abort(message, ..., call = call, arg = arg) } oxford_comma <- function(chr, sep = ", ", final = "or") { n <- length(chr) if (n < 2) { return(chr) } head <- chr[seq_len(n - 1)] last <- chr[n] head <- paste(head, collapse = sep) # Write a or b. But a, b, or c. if (n > 2) { paste0(head, sep, final, " ", last) } else { paste0(head, " ", final, " ", last) } } # nocov end dplyr/R/n-col.R0000644000176200001440000000135715106134104012741 0ustar liggesusers# Masks `ncol()` to avoid accidentally materializing ALTREP duckplyr # data frames. ncol <- function(x) { abort("Use `df_n_col()` or `mat_n_col()` instead.") } # Alternative to `ncol()` which avoids `dim()`. # # `dim()` also requires knowing the number of rows, # which forces ALTREP duckplyr data frames to materialize. # # This function makes the same assertion as vctrs about data frame structure, # i.e. if `x` inherits from `"data.frame"`, then it is a VECSXP with length # equal to the number of columns. df_n_col <- function(x) { x <- unclass(x) obj_check_list(x) length(x) } # In a few places we call `ncol()` on matrices, and in those # cases we want to continue using the base version. mat_n_col <- function(x) { base::ncol(x) } dplyr/R/deprec-context.R0000644000176200001440000000176115137161765014675 0ustar liggesusers#' Information about the "current" group or variable #' #' @description #' `r lifecycle::badge("deprecated")` #' #' These functions were deprecated in dplyr 1.1.0. #' #' * `cur_data()` is deprecated in favor of [pick()]. #' * `cur_data_all()` is deprecated but does not have a direct replacement as #' selecting the grouping variables is not well-defined and is unlikely to #' ever be useful. #' #' @keywords internal #' @name deprec-context NULL #' @rdname deprec-context #' @export cur_data <- function() { lifecycle::deprecate_warn( when = "1.1.0", what = "cur_data()", with = "pick()", id = "dplyr-cur-data" ) mask <- peek_mask() vars <- mask$current_non_group_vars() mask$pick_current(vars) } #' @rdname deprec-context #' @export cur_data_all <- function() { lifecycle::deprecate_warn( when = "1.1.0", what = "cur_data_all()", with = "pick()", id = "dplyr-cur-data-all" ) mask <- peek_mask() vars <- mask$current_vars() mask$pick_current(vars) } dplyr/R/by.R0000644000176200001440000000740015106134104012336 0ustar liggesusers#' Helper for consistent documentation of `.by` #' #' Use `@inheritParams args_by` to consistently document `.by`. #' #' @param .by #' #' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to #' group by for just this operation, functioning as an alternative to [group_by()]. For #' details and examples, see [?dplyr_by][dplyr_by]. #' #' @name args_by #' @keywords internal NULL #' Per-operation grouping with `.by`/`by` #' #' ```{r, echo = FALSE, results = "asis"} #' result <- rlang::with_options( #' knitr::knit_child("man/rmd/by.Rmd"), #' dplyr.summarise.inform = TRUE #' ) #' cat(result, sep = "\n") #' ``` #' #' @name dplyr_by NULL compute_by <- function( by, data, ..., by_arg = "by", data_arg = "data", error_call = caller_env() ) { check_dots_empty0(...) error_call <- dplyr_error_call(error_call) by <- enquo(by) check_by( by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call ) if (is_grouped_df(data)) { type <- "grouped" names <- group_vars(data) data <- group_data(data) } else if (is_rowwise_df(data)) { type <- "rowwise" names <- group_vars(data) data <- group_data(data) } else { if (quo_is_null(by)) { # Much faster than `eval_select_by()` for this common case by <- character() } else { by <- eval_select_by(by, data, error_call = error_call) } if (length(by) == 0L) { # `by = NULL` or empty selection type <- "ungrouped" names <- by data <- group_data(data) data <- dplyr_new_tibble(data, size = vec_size(data)) } else { type <- "grouped" names <- by data <- compute_by_groups(data, by, error_call = error_call) } } new_by(type = type, names = names, data = data) } compute_by_groups <- function(data, names, error_call = caller_env()) { data <- dplyr_col_select(data, names, error_call = error_call) info <- vec_group_loc(data) size <- vec_size(info) out <- dplyr_new_list(info$key) out[[".rows"]] <- new_list_of(info$loc, ptype = integer()) out <- dplyr_new_tibble(out, size = size) out } check_by <- function( by, data, ..., by_arg = "by", data_arg = "data", error_call = caller_env() ) { check_dots_empty0(...) if (quo_is_null(by)) { return(invisible(NULL)) } if (is_grouped_df(data)) { message <- paste0( "Can't supply {.arg {by_arg}} when ", "{.arg {data_arg}} is a grouped data frame." ) cli::cli_abort(message, call = error_call) } if (is_rowwise_df(data)) { message <- paste0( "Can't supply {.arg {by_arg}} when ", "{.arg {data_arg}} is a rowwise data frame." ) cli::cli_abort(message, call = error_call) } invisible(NULL) } eval_select_by <- function(by, data, error_call = caller_env()) { out <- tidyselect::eval_select( expr = by, data = data, allow_rename = FALSE, error_call = error_call ) names(out) } new_by <- function(type, names, data) { structure(list(type = type, names = names, data = data), class = "dplyr_by") } check_by_typo <- function(..., by = NULL, error_call = caller_env()) { check_by_typo_impl( wrong = "by", right = ".by", by = {{ by }}, error_call = error_call ) } check_dot_by_typo <- function(..., .by = NULL, error_call = caller_env()) { check_by_typo_impl( wrong = ".by", right = "by", by = {{ .by }}, error_call = error_call ) } check_by_typo_impl <- function( wrong, right, by = NULL, error_call = caller_env() ) { by <- enquo(by) if (quo_is_null(by)) { return(invisible()) } message <- c( "Can't specify an argument named {.code {wrong}} in this verb.", i = "Did you mean to use {.code {right}} instead?" ) cli::cli_abort(message, call = error_call) } dplyr/R/relocate.R0000644000176200001440000001155715106134104013532 0ustar liggesusers#' Change column order #' #' Use `relocate()` to change column positions, using the same syntax as #' `select()` to make it easy to move blocks of columns at once. #' #' @inheritParams arrange #' @param ... <[`tidy-select`][dplyr_tidy_select]> Columns to move. #' @param .before,.after <[`tidy-select`][dplyr_tidy_select]> Destination of #' columns selected by `...`. Supplying neither will move columns to the #' left-hand side; specifying both is an error. #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Rows are not affected. #' * The same columns appear in the output, but (usually) in a different place #' and possibly renamed. #' * Data frame attributes are preserved. #' * Groups are not affected. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("relocate")}. #' @export #' @examples #' df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") #' df |> relocate(f) #' df |> relocate(a, .after = c) #' df |> relocate(f, .before = b) #' df |> relocate(a, .after = last_col()) #' #' # relocated columns can change name #' df |> relocate(ff = f) #' #' # Can also select variables based on their type #' df |> relocate(where(is.character)) #' df |> relocate(where(is.numeric), .after = last_col()) #' # Or with any other select helper #' df |> relocate(any_of(c("a", "e", "i", "o", "u"))) #' #' # When .before or .after refers to multiple variables they will be #' # moved to be immediately before/after the selected variables. #' df2 <- tibble(a = 1, b = "a", c = 1, d = "a") #' df2 |> relocate(where(is.numeric), .after = where(is.character)) #' df2 |> relocate(where(is.numeric), .before = where(is.character)) relocate <- function(.data, ..., .before = NULL, .after = NULL) { UseMethod("relocate") } #' @export relocate.data.frame <- function(.data, ..., .before = NULL, .after = NULL) { loc <- eval_relocate( expr = expr(c(...)), data = .data, before = enquo(.before), after = enquo(.after), before_arg = ".before", after_arg = ".after" ) out <- dplyr_col_select(.data, loc) out <- set_names(out, names(loc)) out } eval_relocate <- function( expr, data, ..., before = NULL, after = NULL, before_arg = "before", after_arg = "after", env = caller_env(), error_call = caller_env() ) { # `eval_relocate()` returns a named integer vector of size `ncol(data)` # describing how to rearrange `data`. Each location in the range # `seq2(1L, ncol(data))` is represented once. The names are the new names to # assign to those columns. They are typically the same as the original names, # but `expr` does allow for renaming. check_dots_empty0(...) sel <- tidyselect::eval_select( expr = expr, data = data, env = env, error_call = error_call ) # Enforce the invariant that relocating can't change the number of columns by # retaining only the last instance of a column that is renamed multiple times # TODO: https://github.com/r-lib/vctrs/issues/1442 # `sel <- vec_unique(sel, which = "last")` loc_last <- which(!duplicated(sel, fromLast = TRUE)) sel <- vec_slice(sel, loc_last) n <- length(data) before <- as_quosure(before, env = env) after <- as_quosure(after, env = env) has_before <- !quo_is_null(before) has_after <- !quo_is_null(after) if (has_before && has_after) { message <- glue("Can't supply both `{before_arg}` and `{after_arg}`.") abort(message, call = error_call) } if (has_before) { # TODO: Use `allow_rename = FALSE`. https://github.com/r-lib/tidyselect/issues/221 where <- tidyselect::eval_select( before, data, env = env, error_call = error_call ) where <- unname(where) if (length(where) == 0L) { # Empty `before` selection pushes `sel` to the front where <- 1L } else { where <- min(where) } } else if (has_after) { # TODO: Use `allow_rename = FALSE`. https://github.com/r-lib/tidyselect/issues/221 where <- tidyselect::eval_select( after, data, env = env, error_call = error_call ) where <- unname(where) if (length(where) == 0L) { # Empty `after` selection pushes `sel` to the back where <- n } else { where <- max(where) } where <- where + 1L } else { # Defaults to `before = everything()` if neither `before` nor `after` are supplied where <- 1L } lhs <- seq2(1L, where - 1L) rhs <- seq2(where, n) lhs <- setdiff(lhs, sel) rhs <- setdiff(rhs, sel) names <- names(data) names(lhs) <- names[lhs] names(rhs) <- names[rhs] sel <- vec_c(lhs, sel, rhs) sel } dplyr/R/transmute.R0000644000176200001440000000522315106134104013747 0ustar liggesusers#' Create, modify, and delete columns #' #' @description #' `r lifecycle::badge("superseded")` #' #' `transmute()` creates a new data frame containing only the specified #' computations. It's superseded because you can perform the same job #' with `mutate(.keep = "none")`. #' #' @inheritParams mutate #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("transmute")}. #' @returns An object of the same type as `.data`. The output has the following #' properties: #' #' * Columns created or modified through `...` will be returned in the order #' specified by `...`. #' * Unmodified grouping columns will be placed at the front. #' * The number of rows is not affected. #' * Columns given the value `NULL` will be removed. #' * Groups will be recomputed if a grouping variable is mutated. #' * Data frame attributes are preserved. #' @keywords internal #' @export transmute <- function(.data, ...) { # dplyr 1.1.0 lifecycle::signal_stage( "superseded", "transmute()", I("mutate(.keep = 'none')") ) UseMethod("transmute") } #' @export transmute.data.frame <- function(.data, ...) { dots <- check_transmute_args(...) dots <- dplyr_quosures(!!!dots) # We don't expose `.by` because `transmute()` is superseded by <- compute_by(by = NULL, data = .data) cols <- mutate_cols(.data, dots, by) out <- dplyr_col_modify(.data, cols) # Compact out `NULL` columns that got removed. # These won't exist in `out`, but we don't want them to look "new". # Note that `dplyr_col_modify()` makes it impossible to `NULL` a group column, # which we rely on below. cols <- compact_null(cols) # Retain expression columns in order of their appearance cols_expr <- names(cols) # Retain untouched group variables up front cols_group <- by$names cols_group <- setdiff(cols_group, cols_expr) cols_retain <- c(cols_group, cols_expr) dplyr_col_select(out, cols_retain) } # helpers ----------------------------------------------------------------- check_transmute_args <- function( ..., .keep, .before, .after, error_call = caller_env() ) { if (!missing(.keep)) { abort("The `.keep` argument is not supported.", call = error_call) } if (!missing(.before)) { abort("The `.before` argument is not supported.", call = error_call) } if (!missing(.after)) { abort("The `.after` argument is not supported.", call = error_call) } enquos(...) } dplyr/R/copy-to.R0000644000176200001440000000441015106134104013314 0ustar liggesusers#' Copy a local data frame to a remote src #' #' This function uploads a local data frame into a remote data source, creating #' the table definition as needed. Wherever possible, the new object will be #' temporary, limited to the current connection to the source. #' #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("copy_to")}. #' @param dest remote data source #' @param df local data frame #' @param name name for new remote table. #' @param overwrite If `TRUE`, will overwrite an existing table with #' name `name`. If `FALSE`, will throw an error if `name` already #' exists. #' @param ... other parameters passed to methods. #' @seealso [collect()] for the opposite action; downloading remote data into #' a local dbl. #' @return a `tbl` object in the remote source #' @export #' @examples #' \dontrun{ #' iris2 <- dbplyr::src_memdb() |> copy_to(iris, overwrite = TRUE) #' iris2 #' } copy_to <- function( dest, df, name = deparse(substitute(df)), overwrite = FALSE, ... ) { UseMethod("copy_to") } #' Copy tables to same source, if necessary #' #' @param x,y `y` will be copied to `x`, if necessary. #' @param copy If `x` and `y` are not from the same data source, #' and `copy` is `TRUE`, then `y` will be copied into the #' same src as `x`. This allows you to join tables across srcs, but #' it is a potentially expensive operation so you must opt into it. #' @param ... Other arguments passed on to methods. #' @export auto_copy <- function(x, y, copy = FALSE, ...) { if (same_src(x, y)) { return(y) } if (!copy) { bullets <- c( "`x` and `y` must share the same src.", i = cli::format_inline("`x` is {obj_type_friendly(x)}."), i = cli::format_inline("`y` is {obj_type_friendly(y)}."), i = "Set `copy = TRUE` if `y` can be copied to the same source as `x` (may be slow)." ) abort(bullets) } UseMethod("auto_copy") } #' @export auto_copy.data.frame <- function(x, y, copy = FALSE, ...) { as.data.frame(y) } dplyr/R/nest-by.R0000644000176200001440000000703615106134104013312 0ustar liggesusers#' Nest by one or more variables #' #' @description #' `r lifecycle::badge("experimental")` #' #' `nest_by()` is closely related to [group_by()]. However, instead of storing #' the group structure in the metadata, it is made explicit in the data, #' giving each group key a single row along with a list-column of data frames #' that contain all the other data. #' #' `nest_by()` returns a [rowwise] data frame, which makes operations on the #' grouped data particularly elegant. See `vignette("rowwise")` for more #' details. #' #' @details #' Note that `df |> nest_by(x, y)` is roughly equivalent to #' #' ``` #' df |> #' group_by(x, y) |> #' summarise(data = list(pick(everything()))) |> #' rowwise() #' ``` #' #' If you want to unnest a nested data frame, you can either use #' `tidyr::unnest()` or take advantage of `reframe()`s multi-row behaviour: #' #' ``` #' nested |> #' reframe(data) #' ``` #' #' @section Lifecycle: #' `nest_by()` is not stable because [`tidyr::nest(.by =)`][tidyr::nest()] #' provides very similar behavior. It may be deprecated in the future. #' #' @return #' A [rowwise] data frame. The output has the following properties: #' #' * The rows come from the underlying [group_keys()]. #' * The columns are the grouping keys plus one list-column of data frames. #' * Data frame attributes are **not** preserved, because `nest_by()` #' fundamentally creates a new data frame. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("nest_by")}. #' #' @inheritParams group_by #' @param .key Name of the list column #' @param .keep Should the grouping columns be kept in the list column. #' @return A tbl with one row per unique combination of the grouping variables. #' The first columns are the grouping variables, followed by a list column of tibbles #' with matching rows of the remaining columns. #' @keywords internal #' @export #' @examples #' # After nesting, you get one row per group #' iris |> nest_by(Species) #' starwars |> nest_by(species) #' #' # The output is grouped by row, which makes modelling particularly easy #' models <- mtcars |> #' nest_by(cyl) |> #' mutate(model = list(lm(mpg ~ wt, data = data))) #' models #' #' models |> summarise(rsq = summary(model)$r.squared) #' @examplesIf requireNamespace("broom", quietly = TRUE) #' #' # This is particularly elegant with the broom functions #' models |> summarise(broom::glance(model)) #' models |> reframe(broom::tidy(model)) #' @examples #' #' # Note that you can also `reframe()` to unnest the data #' models |> reframe(data) nest_by <- function(.data, ..., .key = "data", .keep = FALSE) { lifecycle::signal_stage("experimental", "nest_by()") UseMethod("nest_by") } #' @export nest_by.data.frame <- function(.data, ..., .key = "data", .keep = FALSE) { .data <- group_by(.data, ...) nest_by.grouped_df(.data, .key = .key, .keep = .keep) } #' @export nest_by.grouped_df <- function(.data, ..., .key = "data", .keep = FALSE) { if (!missing(...)) { bullets <- c( "Can't re-group while nesting", i = "Either `ungroup()` first or don't supply arguments to `nest_by()" ) abort(bullets) } vars <- group_vars(.data) keys <- group_keys(.data) keys <- mutate(keys, !!.key := group_split(.env$.data, .keep = .keep)) rowwise(keys, tidyselect::all_of(vars)) } dplyr/R/src-dbi.R0000644000176200001440000000065215106134104013251 0ustar liggesusers#' @export tbl.DBIConnection <- function(src, from, ...) { check_dbplyr() tbl(dbplyr::src_dbi(src, auto_disconnect = FALSE), from = from, ...) } #' @export copy_to.DBIConnection <- function( dest, df, name = deparse(substitute(df)), overwrite = FALSE, ... ) { check_dbplyr() copy_to( dbplyr::src_dbi(dest, auto_disconnect = FALSE), df = df, name = name, overwrite = overwrite, ... ) } dplyr/R/compat-dbplyr.R0000644000176200001440000000356514272553254014527 0ustar liggesusers#' dbplyr compatibility functions #' #' @description #' In dplyr 0.7.0, a number of database and SQL functions moved from dplyr to #' dbplyr. The generic functions stayed in dplyr (since there is no easy way #' to conditionally import a generic from different packages), but many other #' SQL and database helper functions moved. If you have written a backend, #' these functions generate the code you need to work with both dplyr 0.5.0 #' dplyr 0.7.0. #' #' @keywords internal #' @export #' @examplesIf requireNamespace("dbplyr", quietly = TRUE) #' wrap_dbplyr_obj("build_sql") #' wrap_dbplyr_obj("base_agg") check_dbplyr <- function() { check_installed("dbplyr", "to communicate with database backends.") } #' @export #' @rdname check_dbplyr wrap_dbplyr_obj <- function(obj_name) { # Silence R CMD check NOTE `UQ<-` <- NULL obj <- getExportedValue("dbplyr", obj_name) obj_sym <- sym(obj_name) dbplyr_sym <- call("::", quote(dbplyr), obj_sym) dplyr_sym <- call("::", quote(dplyr), obj_sym) if (is.function(obj)) { args <- formals() pass_on <- map(set_names(names(args)), sym) dbplyr_call <- expr((!!dbplyr_sym)(!!!pass_on)) dplyr_call <- expr((!!dplyr_sym)(!!!pass_on)) } else { args <- list() dbplyr_call <- dbplyr_sym dplyr_call <- dplyr_sym } body <- expr({ if (utils::packageVersion("dplyr") > "0.5.0") { dplyr::check_dbplyr() !!dbplyr_call } else { !!dplyr_call } }) wrapper <- new_function(args, body, caller_env()) expr(!!obj_sym <- !!get_expr(wrapper)) } utils::globalVariables("!<-") #' @inherit dbplyr::sql #' @export sql <- function(...) { check_dbplyr() dbplyr::sql(...) } #' @inherit dbplyr::ident #' @export #' @examples #' # Identifiers are escaped with " #' @examplesIf requireNamespace("dbplyr", quietly = TRUE) #' ident("x") ident <- function(...) { check_dbplyr() dbplyr::ident(...) } dplyr/R/bind-rows.R0000644000176200001440000000566615106134104013644 0ustar liggesusers#' Bind multiple data frames by row #' #' Bind any number of data frames by row, making a longer result. This is #' similar to `do.call(rbind, dfs)`, but the output will contain all columns #' that appear in any of the inputs. #' #' @param ... Data frames to combine. Each argument can either be a data frame, #' a list that could be a data frame, or a list of data frames. Columns are #' matched by name, and any missing columns will be filled with `NA`. #' @param .id The name of an optional identifier column. Provide a string to #' create an output column that identifies each input. The column will use #' names if available, otherwise it will use positions. #' @returns A data frame the same type as the first element of `...`. #' @aliases bind #' @export #' @examples #' df1 <- tibble(x = 1:2, y = letters[1:2]) #' df2 <- tibble(x = 4:5, z = 1:2) #' #' # You can supply individual data frames as arguments: #' bind_rows(df1, df2) #' #' # Or a list of data frames: #' bind_rows(list(df1, df2)) #' #' # When you supply a column name with the `.id` argument, a new #' # column is created to link each row to its original data frame #' bind_rows(list(df1, df2), .id = "id") #' bind_rows(list(a = df1, b = df2), .id = "id") bind_rows <- function(..., .id = NULL) { dots <- list2(...) # bind_rows() has weird legacy squashing behaviour is_flattenable <- function(x) !is_named(x) if (length(dots) == 1 && is_bare_list(dots[[1]])) { dots <- dots[[1]] } dots <- list_flatten(dots, fn = is_flattenable) dots <- discard(dots, is.null) # Used to restore type if (length(dots) == 0) { first <- NULL } else { first <- dots[[1L]] } if (is_named(dots) && !all(map_lgl(dots, dataframe_ish))) { # This is hit by map_dfr() so we can't easily deprecate return(as_tibble(dots)) } for (i in seq_along(dots)) { .x <- dots[[i]] if (!dataframe_ish(.x)) { abort(glue("Argument {i} must be a data frame or a named atomic vector.")) } if (obj_is_list(.x)) { dots[[i]] <- vctrs::data_frame(!!!.x, .name_repair = "minimal") } } if (!is_null(.id)) { check_string(.id) if (!is_named(dots)) { # Replace `NA` or `""` names with their index, # but leave existing names in place (#7100) dots_with_names <- have_name(dots) dots_without_names <- which(!dots_with_names) names(dots)[dots_without_names] <- as.character(dots_without_names) } } else { # Don't let `vec_rbind(.id = NULL)` promote input names to row names names(dots) <- NULL } out <- vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()) # Override vctrs coercion rules and instead derive class from first input if (is.data.frame(first)) { out <- dplyr_reconstruct(out, first) } else { out <- as_tibble(out) } out } # helpers ----------------------------------------------------------------- dataframe_ish <- function(.x) { is.data.frame(.x) || (vec_is(.x) && is_named(.x)) } dplyr/R/data-storms.R0000644000176200001440000000374215106134104014167 0ustar liggesusers#' Storm tracks data #' #' This dataset is the NOAA Atlantic hurricane database best track data, #' . The data includes the positions and #' attributes of storms from `r min(storms$year)`-`r max(storms$year)`. Storms #' from 1979 onward are measured every six hours during the lifetime of the #' storm. Storms in earlier years have some missing data. #' #' @seealso The script to create the storms data set: #' #' #' @format A tibble with `r format(nrow(storms), big.mark = ",")` observations #' and `r df_n_col(storms)` variables: #' \describe{ #' \item{name}{Storm Name} #' \item{year,month,day}{Date of report} #' \item{hour}{Hour of report (in UTC)} #' \item{lat,long}{Location of storm center} #' \item{status}{Storm classification (Tropical Depression, Tropical Storm, #' or Hurricane)} #' \item{category}{Saffir-Simpson hurricane category calculated from wind speed. #' \itemize{ #' \item `NA`: Not a hurricane #' \item 1: 64+ knots #' \item 2: 83+ knots #' \item 3: 96+ knots #' \item 4: 113+ knots #' \item 5: 137+ knots #' } #' } #' \item{wind}{storm's maximum sustained wind speed (in knots)} #' \item{pressure}{Air pressure at the storm's center (in millibars)} #' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the #' area experiencing tropical storm strength winds (34 knots or above). #' Only available starting in 2004.} #' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area #' experiencing hurricane strength winds (64 knots or above). Only available #' starting in 2004.} #' } #' @examples #' storms #' #' # Show a few recent storm paths #' if (requireNamespace("ggplot2", quietly = TRUE)) { #' library(ggplot2) #' storms |> #' filter(year >= 2000) |> #' ggplot(aes(long, lat, color = paste(year, name))) + #' geom_path(show.legend = FALSE) + #' facet_wrap(~year) #' } #' #' storms "storms" dplyr/R/conditions.R0000644000176200001440000002414115106134104014076 0ustar liggesusers#' Local error call for dplyr verbs #' @noRd dplyr_local_error_call <- function(call = frame, frame = caller_env()) { # This doesn't implement the semantics of a `local_` function # perfectly in order to be as fast as possible frame$.__dplyr_error_call__. <- call invisible(NULL) } # Takes the local call by default. If the caller of the verb has # called `dplyr_local_error_call()`, we used that call instead. This # logic is slightly different than in checking functions or error # helpers, where the error call is always taken from the parent by # default. dplyr_error_call <- function(call) { if (is_missing(call)) { call <- caller_env() } while (is_environment(call)) { caller <- eval_bare(quote(base::parent.frame()), call) caller_call <- caller[[".__dplyr_error_call__."]] if (is_null(caller_call)) { break } call <- caller_call } call } cnd_bullet_cur_group_label <- function(what = "error") { label <- cur_group_label() if (label != "") { glue("In {label}.") } } cnd_bullet_rowwise_unlist <- function() { if (peek_mask()$is_rowwise()) { glue_data( peek_error_context(), "Did you mean: `{error_name} = list({expr_as_label(error_expr)})` ?" ) } } or_1 <- function(x) { if (x == 1L) { "1" } else { glue("{x} or 1") } } has_active_group_context <- function(mask) { mask$get_current_group_id_mutable() != 0L } # Common ------------------------------------------------------------------ is_data_pronoun <- function(x) { is_call(x, c("[[", "$")) && identical(x[[2]], sym(".data")) } # Because as_label() strips off .data$<> and .data[[<>]] expr_as_label <- function(expr) { if (is_data_pronoun(expr)) { deparse(expr)[[1]] } else { with_no_rlang_infix_labeling(as_label(expr)) } } local_error_context <- function(dots, i, mask, frame = caller_env()) { ctxt <- new_error_context(dots, i, mask = mask) context_local("dplyr_error_context", ctxt, frame = frame) } peek_error_context <- function() { context_peek("dplyr_error_context", "dplyr error handling") } new_error_context <- function(dots, i, mask) { if (!length(dots) || i == 0L) { env( error_name = "", error_expr = NULL, mask = mask ) } else { # Saving the expression rather than the result of `expr_as_label()` to avoid # slow label creation unless required. Not saving the quosure itself because # carrying around its environment past the scope of a dplyr verb's lifetime # can be very expensive (#7649)! env( error_name = names(dots)[[i]], error_expr = quo_get_expr(dots[[i]]), mask = mask ) } } # Doesn't restore values. To be called within a # `local_error_context()` in charge of restoring. poke_error_context <- function(dots, i, mask) { ctxt <- new_error_context(dots, i, mask = mask) context_poke("dplyr_error_context", ctxt) } mask_type <- function(mask = peek_mask()) { if (mask$get_size() > 0) { if (mask$is_grouped()) { return("grouped") } else if (mask$is_rowwise()) { return("rowwise") } } "ungrouped" } ctxt_error_label <- function(ctxt = peek_error_context()) { error_label(ctxt$error_name, ctxt$error_expr) } error_label <- function(name, expr) { if (is_null(name) || !nzchar(name)) { expr_as_label(expr) } else { name } } ctxt_error_label_named <- function(ctxt = peek_error_context()) { error_label_named(ctxt$error_name, ctxt$error_expr) } error_label_named <- function(name, expr) { if (is_null(name) || !nzchar(name)) { expr_as_label(expr) } else { paste0(name, " = ", expr_as_label(expr)) } } cnd_bullet_header <- function(what) { ctxt <- peek_error_context() label <- ctxt_error_label_named(ctxt) if (is_string(what, "recycle")) { glue("Can't {what} `{label}`.") } else { c("i" = glue("In argument: `{label}`.")) } } cnd_bullet_combine_details <- function(x, arg) { id <- as.integer(sub("^..", "", arg)) group <- peek_mask()$get_keys()[id, ] details <- cur_group_label(id = group, group = group) glue("Result of type <{vec_ptype_full(x)}> for {details}.") } err_vars <- function(x) { if (is.logical(x)) { x <- which(x) } if (is.character(x)) { x <- encodeString(x, quote = "`") } glue_collapse(x, sep = ", ", last = if (length(x) <= 2) " and " else ", and ") } err_locs <- function(x) { if (!is.integer(x)) { abort("`x` must be an integer vector of locations.", .internal = TRUE) } size <- length(x) if (size == 0L) { abort("`x` must have at least 1 location.", .internal = TRUE) } if (size > 5L) { x <- x[1:5] extra <- glue(" and {size - 5L} more") } else { extra <- "" } x <- glue_collapse(x, sep = ", ") glue("`c({x})`{extra}") } dplyr_internal_error <- function(class = NULL, data = list()) { abort(class = c(class, "dplyr:::internal_error"), dplyr_error_data = data) } dplyr_internal_signal <- function(class) { signal( message = "Internal dplyr signal", class = c(class, "dplyr:::internal_signal") ) } skip_internal_condition <- function(cnd) { if (inherits(cnd, "dplyr:::internal_error")) { cnd$parent } else { cnd } } dplyr_error_handler <- function( dots, mask, bullets, error_call, action = "compute", error_class = NULL, i_sym = "i", frame = caller_env() ) { force(frame) function(cnd) { local_error_context(dots, i = frame[[i_sym]], mask = mask) if (inherits(cnd, "dplyr:::internal_error")) { parent <- error_cnd(message = bullets(cnd)) } else { parent <- cnd } # FIXME: Must be after calling `bullets()` because the # `dplyr:::summarise_incompatible_size` and # `dplyr:::reframe_incompatible_size` methods set the correct group by side # effect message <- c( cnd_bullet_header(action), "i" = if (has_active_group_context(mask)) cnd_bullet_cur_group_label() ) abort( message, class = error_class, parent = parent, call = error_call ) } } # Warnings ------------------------------------------------------------- #' Show warnings from the last command #' #' Warnings that occur inside a dplyr verb like `mutate()` are caught #' and stashed away instead of being emitted to the console. This #' prevents rowwise and grouped data frames from flooding the console #' with warnings. To see the original warnings, use #' `last_dplyr_warnings()`. #' #' @param n Passed to [head()] so that only the first `n` warnings are #' displayed. #' @keywords internal #' @export last_dplyr_warnings <- function(n = 5) { if (!identical(n, Inf)) { check_number_whole(n) stopifnot(n >= 0) } warnings <- the$last_warnings n_remaining <- max(length(warnings) - n, 0L) warnings <- head(warnings, n = n) warnings <- map(warnings, new_dplyr_warning) structure( warnings, class = c("last_dplyr_warnings", "list"), n_shown = n, n_remaining = n_remaining ) } on_load({ the$last_warnings <- list() the$last_cmd_frame <- "" }) dplyr_warning_handler <- function(state, mask, error_call) { # `error_call()` does some non-trivial work, e.g. climbing frame # environments to find generic calls. We avoid evaluating it # repeatedly in the loop by assigning it here (lazily as we only # need it for the error path). delayedAssign("error_call_forced", error_call(error_call)) function(cnd) { # Don't entrace more than 5 warnings because this is very costly if (is_null(cnd$trace) && length(state$warnings) < 5) { cnd$trace <- trace_back(bottom = error_call) } new <- cnd_data( cnd = cnd, ctxt = peek_error_context(), mask = mask, call = error_call_forced ) state$warnings <- c(state$warnings, list(new)) maybe_restart("muffleWarning") } } # Flushes warnings if a new top-level command is detected push_dplyr_warnings <- function(warnings) { last <- the$last_cmd_frame current <- obj_address(sys.frame(1)) if (!identical(last, current)) { reset_dplyr_warnings() the$last_cmd_frame <- current } the$last_warnings <- c(the$last_warnings, warnings) } # Also used in tests reset_dplyr_warnings <- function() { the$last_warnings <- list() } signal_warnings <- function(state, error_call) { warnings <- state$warnings n <- length(warnings) if (!n) { return() } push_dplyr_warnings(warnings) first <- new_dplyr_warning(warnings[[1]]) call <- format_error_call(error_call) if (nzchar(names2(cnd_header(first))[[1]])) { prefix <- NULL } else { prefix <- paste0(cli::col_yellow("!"), " ") } msg <- paste_line( cli::format_warning(c( "There {cli::qty(n)} {?was/were} {n} warning{?s} in {call}.", if (n > 1) { "The first warning was:" } )), paste0(prefix, cnd_message(first)), if (n > 1) { cli::format_warning(c( i = "Run {.run dplyr::last_dplyr_warnings()} to see the {n - 1} remaining warning{?s}." )) } ) warn(msg, use_cli_format = FALSE) } new_dplyr_warning <- function(data) { if (data$has_group_data) { group_label <- cur_group_label( data$type, data$group_data$id, data$group_data$group ) } else { group_label <- "" } label <- error_label_named(data$name, data$expr) msg <- c( "i" = glue::glue("In argument: `{label}`."), "i" = if (nzchar(group_label)) glue("In {group_label}.") ) warning_cnd( message = msg, parent = data$cnd, call = data$call, trace = data$cnd$trace ) } #' @export print.last_dplyr_warnings <- function(x, ...) { # Opt into experimental grayed out tree local_options( "rlang:::trace_display_tree" = TRUE ) print(unstructure(x), ..., simplify = "none") n_remaining <- attr(x, "n_remaining") if (n_remaining) { n_more <- attr(x, "n_shown") * 2 cli::cli_bullets(c( "... with {n_remaining} more warning{?s}.", "i" = "Run {.run dplyr::last_dplyr_warnings(n = {n_more})} to show more." )) } } # rlang should export this routine error_call <- function(call) { tryCatch( abort("", call = call), error = conditionCall ) } cnd_message_lines <- function(cnd, ...) { c( "!" = cnd_header(cnd, ...), cnd_body(cnd, ...), cnd_footer(cnd, ...) ) } dplyr/R/order-by.R0000644000176200001440000000405514406402754013465 0ustar liggesusers#' A helper function for ordering window function output #' #' This function makes it possible to control the ordering of window functions #' in R that don't have a specific ordering parameter. When translated to SQL #' it will modify the order clause of the OVER function. #' #' This function works by changing the `call` to instead call #' [with_order()] with the appropriate arguments. #' #' @param order_by a vector to order_by #' @param call a function call to a window function, where the first argument #' is the vector being operated on #' @export #' @examples #' order_by(10:1, cumsum(1:10)) #' x <- 10:1 #' y <- 1:10 #' order_by(x, cumsum(y)) #' #' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) #' scrambled <- df[sample(nrow(df)), ] #' #' wrong <- mutate(scrambled, running = cumsum(value)) #' arrange(wrong, year) #' #' right <- mutate(scrambled, running = order_by(year, cumsum(value))) #' arrange(right, year) order_by <- function(order_by, call) { quo <- enquo(call) expr <- quo_get_expr(quo) if (!is_call(expr)) { if (is_symbol(expr)) { bullets <- c( glue("`call` must be a function call, not a symbol."), i = glue("Did you mean `arrange({as_label(enquo(order_by))}, {expr})`?") ) abort(bullets) } else { type <- obj_type_friendly(expr) msg <- glue("`call` must be a function call, not { type }.") abort(msg) } } fn <- set_expr(quo, expr[[1]]) args <- map(expr[-1], new_quosure, quo_get_env(quo)) expr <- expr(with_order(!!order_by, !!fn, !!!args)) eval_tidy(expr) } #' Run a function with one order, translating result back to original order #' #' This is used to power the ordering parameters of dplyr's window functions #' #' @param order_by vector to order by #' @param fun window function #' @param x,... arguments to `f` #' @keywords internal #' @export with_order <- function(order_by, fun, x, ...) { vec_check_size(order_by, size = vec_size(x)) o <- vec_order_radix(order_by) x <- vec_slice(x, o) out <- fun(x, ...) o <- vec_order_radix(o) vec_slice(out, o) } dplyr/R/rank.R0000644000176200001440000001270615106134104012664 0ustar liggesusers#' Integer ranking functions #' #' @description #' Three ranking functions inspired by SQL2003. They differ primarily in how #' they handle ties: #' #' * `row_number()` gives every input a unique rank, so that `c(10, 20, 20, 30)` #' would get ranks `c(1, 2, 3, 4)`. It's equivalent to #' `rank(ties.method = "first")`. #' #' * `min_rank()` gives every tie the same (smallest) value so that #' `c(10, 20, 20, 30)` gets ranks `c(1, 2, 2, 4)`. It's the way that ranks #' are usually computed in sports and is equivalent to #' `rank(ties.method = "min")`. #' #' * `dense_rank()` works like `min_rank()`, but doesn't leave any gaps, #' so that `c(10, 20, 20, 30)` gets ranks `c(1, 2, 2, 3)`. #' #' @param x A vector to rank #' #' By default, the smallest values will get the smallest ranks. Use [desc()] #' to reverse the direction so the largest values get the smallest ranks. #' #' Missing values will be given rank `NA`. Use `coalesce(x, Inf)` or #' `coalesce(x, -Inf)` if you want to treat them as the largest or smallest #' values respectively. #' #' To rank by multiple columns at once, supply a data frame. #' @return An integer vector. #' @family ranking functions #' @examples #' x <- c(5, 1, 3, 2, 2, NA) #' row_number(x) #' min_rank(x) #' dense_rank(x) #' #' # Ranking functions can be used in `filter()` to select top/bottom rows #' df <- data.frame( #' grp = c(1, 1, 1, 2, 2, 2, 3, 3, 3), #' x = c(3, 2, 1, 1, 2, 2, 1, 1, 1), #' y = c(1, 3, 2, 3, 2, 2, 4, 1, 2), #' id = 1:9 #' ) #' # Always gives exactly 1 row per group #' df |> group_by(grp) |> filter(row_number(x) == 1) #' # May give more than 1 row if ties #' df |> group_by(grp) |> filter(min_rank(x) == 1) #' # Rank by multiple columns (to break ties) by selecting them with `pick()` #' df |> group_by(grp) |> filter(min_rank(pick(x, y)) == 1) #' # See slice_min() and slice_max() for another way to tackle the same problem #' #' # You can use row_number() without an argument to refer to the "current" #' # row number. #' df |> group_by(grp) |> filter(row_number() == 1) #' #' # It's easiest to see what this does with mutate(): #' df |> group_by(grp) |> mutate(grp_id = row_number()) #' @export row_number <- function(x) { if (missing(x)) { seq_len(n()) } else { vec_rank(x, ties = "sequential", incomplete = "na") } } #' @export #' @rdname row_number min_rank <- function(x) { vec_rank(x, ties = "min", incomplete = "na") } #' @export #' @rdname row_number dense_rank <- function(x) { vec_rank(x, ties = "dense", incomplete = "na") } #' Bucket a numeric vector into `n` groups #' #' @description #' `ntile()` is a sort of very rough rank, which breaks the input vector into #' `n` buckets. If `length(x)` is not an integer multiple of `n`, the size of #' the buckets will differ by up to one, with larger buckets coming first. #' #' Unlike other ranking functions, `ntile()` ignores ties: it will create #' evenly sized buckets even if the same value of `x` ends up in different #' buckets. #' #' @inheritParams row_number #' @param n Number of groups to bucket into #' @export #' @family ranking functions #' @examples #' x <- c(5, 1, 3, 2, 2, NA) #' ntile(x, 2) #' ntile(x, 4) #' #' # If the bucket sizes are uneven, the larger buckets come first #' ntile(1:8, 3) #' #' # Ties are ignored #' ntile(rep(1, 8), 3) ntile <- function(x = row_number(), n) { # Avoid recomputation in default case: # row_number(row_number(x)) == row_number(x) if (!missing(x)) { x <- row_number(x) } len <- vec_size(x) - sum(vec_detect_missing(x)) check_number_whole(n) n <- vec_cast(n, integer()) if (n <= 0L) { abort("`n` must be positive.") } # Definition from # https://techcommunity.microsoft.com/t5/sql-server/ranking-functions-rank-dense-rank-and-ntile/ba-p/383384 if (len == 0L) { rep(NA_integer_, vec_size(x)) } else { n_larger <- as.integer(len %% n) n_smaller <- as.integer(n - n_larger) size <- len / n larger_size <- as.integer(ceiling(size)) smaller_size <- as.integer(floor(size)) larger_threshold <- larger_size * n_larger bins <- if_else( x <= larger_threshold, (x + (larger_size - 1L)) / larger_size, (x + (-larger_threshold + smaller_size - 1L)) / smaller_size + n_larger ) as.integer(floor(bins)) } } #' Proportional ranking functions #' #' @description #' These two ranking functions implement two slightly different ways to #' compute a percentile. For each `x_i` in `x`: #' #' * `cume_dist(x)` counts the total number of values less than #' or equal to `x_i`, and divides it by the number of observations. #' #' * `percent_rank(x)` counts the total number of values less than #' `x_i`, and divides it by the number of observations minus 1. #' #' In both cases, missing values are ignored when counting the number #' of observations. #' #' @inheritParams row_number #' @returns A numeric vector containing a proportion. #' @family ranking functions #' @export #' @examples #' x <- c(5, 1, 3, 2, 2) #' #' cume_dist(x) #' percent_rank(x) #' #' # You can understand what's going on by computing it by hand #' sapply(x, function(xi) sum(x <= xi) / length(x)) #' sapply(x, function(xi) sum(x < xi) / (length(x) - 1)) #' # The real computations are a little more complex in order to #' # correctly deal with missing values percent_rank <- function(x) { (min_rank(x) - 1) / (sum(vec_detect_complete(x)) - 1) } #' @export #' @rdname percent_rank cume_dist <- function(x) { vec_rank(x, ties = "max", incomplete = "na") / sum(vec_detect_complete(x)) } dplyr/R/colwise-arrange.R0000644000176200001440000000447015106134104015012 0ustar liggesusers#' Arrange rows by a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' These [scoped] variants of [arrange()] sort a data frame by a #' selection of variables. Like [arrange()], you can modify the #' variables before ordering with the `.funs` argument. #' #' @inheritParams scoped #' @inheritParams arrange #' #' @section Grouping variables: #' #' The grouping variables that are part of the selection participate #' in the sorting of the data frame. #' #' @export #' @keywords internal #' @examples #' df <- as_tibble(mtcars) #' arrange_all(df) #' # -> #' arrange(df, pick(everything())) #' #' arrange_all(df, desc) #' # -> #' arrange(df, across(everything(), desc)) arrange_all <- function( .tbl, .funs = list(), ..., .by_group = FALSE, .locale = NULL ) { lifecycle::signal_stage("superseded", "arrange_all()") funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_all" ) if (!length(funs)) { funs <- syms(tbl_vars(.tbl)) } arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } #' @rdname arrange_all #' @export arrange_at <- function( .tbl, .vars, .funs = list(), ..., .by_group = FALSE, .locale = NULL ) { lifecycle::signal_stage("superseded", "arrange_at()") funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_at" ) if (!length(funs)) { funs <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE) } arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } #' @rdname arrange_all #' @export arrange_if <- function( .tbl, .predicate, .funs = list(), ..., .by_group = FALSE, .locale = NULL ) { lifecycle::signal_stage("superseded", "arrange_if()") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_if" ) if (!length(funs)) { funs <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE) } arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } dplyr/R/reframe.R0000644000176200001440000001001315137161765013360 0ustar liggesusers#' Transform each group to an arbitrary number of rows #' #' @description #' #' While [summarise()] requires that each argument returns a single value, and #' [mutate()] requires that each argument returns the same number of rows as the #' input, `reframe()` is a more general workhorse with no requirements on the #' number of rows returned per group. #' #' `reframe()` creates a new data frame by applying functions to columns of an #' existing data frame. It is most similar to `summarise()`, with two big #' differences: #' #' - `reframe()` can return an arbitrary number of rows per group, while #' `summarise()` reduces each group down to a single row. #' #' - `reframe()` always returns an ungrouped data frame, while `summarise()` #' might return a grouped or rowwise data frame, depending on the scenario. #' #' We expect that you'll use `summarise()` much more often than `reframe()`, but #' `reframe()` can be particularly helpful when you need to apply a complex #' function that doesn't return a single summary value. #' #' @inheritParams args_by #' @inheritParams arrange #' #' @param ... <[`data-masking`][rlang::args_data_masking]> #' #' Name-value pairs of functions. The name will be the name of the variable in #' the result. The value can be a vector of any length. #' #' Unnamed data frame values add multiple columns from a single expression. #' #' @return #' If `.data` is a tibble, a tibble. Otherwise, a data.frame. #' #' * The rows originate from the underlying grouping keys. #' * The columns are a combination of the grouping keys and the #' expressions that you provide. #' * The output is always ungrouped. #' * Data frame attributes are **not** preserved, because `reframe()` #' fundamentally creates a new data frame. #' #' @section Connection to tibble: #' `reframe()` is theoretically connected to two functions in tibble, #' [tibble::enframe()] and [tibble::deframe()]: #' #' * `enframe()`: vector -> data frame #' * `deframe()`: data frame -> vector #' * `reframe()`: data frame -> data frame #' #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("reframe")}. #' #' @family single table verbs #' @export #' @examples #' table <- c("a", "b", "d", "f") #' #' df <- tibble( #' g = c(1, 1, 1, 2, 2, 2, 2), #' x = c("e", "a", "b", "c", "f", "d", "a") #' ) #' #' # `reframe()` allows you to apply functions that return #' # an arbitrary number of rows #' df |> #' reframe(x = intersect(x, table)) #' #' # Functions are applied per group, and each group can return a #' # different number of rows. #' df |> #' reframe(x = intersect(x, table), .by = g) #' #' # The output is always ungrouped, even when using `group_by()` #' df |> #' group_by(g) |> #' reframe(x = intersect(x, table)) #' #' # You can add multiple columns at once using a single expression by returning #' # a data frame. #' quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { #' tibble( #' val = quantile(x, probs, na.rm = TRUE), #' quant = probs #' ) #' } #' #' x <- c(10, 15, 18, 12) #' quantile_df(x) #' #' starwars |> #' reframe(quantile_df(height)) #' #' starwars |> #' reframe(quantile_df(height), .by = homeworld) #' #' starwars |> #' reframe( #' across(c(height, mass), quantile_df, .unpack = TRUE), #' .by = homeworld #' ) reframe <- function(.data, ..., .by = NULL) { UseMethod("reframe") } #' @export reframe.data.frame <- function(.data, ..., .by = NULL) { by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- summarise_cols(.data, dplyr_quosures(...), by, "reframe") out <- summarise_build(by, cols, "reframe") if (!is_tibble(.data)) { # The `by` group data we build from is always a tibble, # so we have to manually downcast as needed out <- as.data.frame(out) } out } dplyr/R/join-cols.R0000644000176200001440000001465115106134104013627 0ustar liggesusersjoin_cols <- function( x_names, y_names, by, ..., suffix = c(".x", ".y"), keep = NULL, error_call = caller_env() ) { check_dots_empty0(...) if (is_false(keep) && any(by$condition != "==")) { abort( "Can't set `keep = FALSE` when using an inequality, rolling, or overlap join.", call = error_call ) } check_duplicate_vars(x_names, "x", error_call = error_call) check_duplicate_vars(y_names, "y", error_call = error_call) check_join_vars(by$x, x_names, by$condition, "x", error_call = error_call) check_join_vars(by$y, y_names, by$condition, "y", error_call = error_call) suffix <- standardise_join_suffix(suffix, error_call = error_call) x_by <- set_names(match(by$x, x_names), by$x) y_by <- set_names(match(by$y, y_names), by$y) x_loc <- seq_along(x_names) names(x_loc) <- x_names if (is_null(keep)) { # In x_out, equi key variables need to keep the same name, and non-equi # key variables and aux variables need suffixes for duplicates that appear # in y_out. This is equivalent to `keep = TRUE` for the non-equi keys and # `keep = FALSE` for the equi keys. equi <- by$condition == "==" y_aux <- setdiff(y_names, c(by$x[equi], by$y[equi])) x_ignore <- by$x[equi] x_check <- !x_names %in% x_ignore names(x_loc)[x_check] <- add_suffixes( x_names[x_check], c(x_ignore, y_aux), suffix$x ) } else if (is_false(keep)) { # In x_out, key variables need to keep the same name, and aux # variables need suffixes for duplicates that appear in y_out y_aux <- setdiff(y_names, c(by$x, by$y)) x_ignore <- by$x x_check <- !x_names %in% x_ignore names(x_loc)[x_check] <- add_suffixes( x_names[x_check], c(x_ignore, y_aux), suffix$x ) } else { # In x_out, key variables and aux variables need suffixes # for duplicates that appear in y_out names(x_loc) <- add_suffixes(x_names, y_names, suffix$x) } y_loc <- seq_along(y_names) names(y_loc) <- add_suffixes(y_names, x_names, suffix$y) if (is_null(keep)) { equi <- by$condition == "==" y_ignore <- by$y[equi] y_loc <- y_loc[!y_names %in% y_ignore] } else if (is_false(keep)) { y_ignore <- by$y y_loc <- y_loc[!y_names %in% y_ignore] } # key = named locations to use for matching # out = named locations to use in output list( x = list(key = x_by, out = x_loc), y = list(key = y_by, out = y_loc) ) } check_join_vars <- function( vars, names, condition, input, ..., error_call = caller_env() ) { check_dots_empty0(...) if (!is.character(vars)) { message <- glue("Join columns in `{input}` must be character vectors.") abort(message, call = error_call) } na <- is.na(vars) if (any(na)) { bullets <- c( glue("Join columns in `{input}` can't be `NA`."), x = glue("Problem at position {err_vars(na)}.") ) abort(bullets, call = error_call) } # Columns are allowed to appear in more than one non-equi condition # (but not in a mix of non-equi and equi conditions). # When non-equi conditions are present, `keep` can't be `FALSE` so we don't # have to worry about merging into the same key column multiple times (#6499). non_equi <- condition != "==" vars <- c(vars[!non_equi], unique(vars[non_equi])) dup <- duplicated(vars) if (any(dup)) { vars <- unique(vars[dup]) bullets <- c( glue("Join columns in `{input}` must be unique."), x = glue("Problem with {err_vars(vars)}.") ) abort(bullets, call = error_call) } missing <- setdiff(vars, names) if (length(missing) > 0) { bullets <- c( glue("Join columns in `{input}` must be present in the data."), x = glue("Problem with {err_vars(missing)}.") ) abort(bullets, call = error_call) } } check_duplicate_vars <- function(vars, input, ..., error_call = caller_env()) { check_dots_empty0(...) dup <- duplicated(vars) if (any(dup)) { bullets <- c( glue("Input columns in `{input}` must be unique."), x = glue("Problem with {err_vars(vars[dup])}.") ) abort(bullets, call = error_call) } } standardise_join_suffix <- function(x, ..., error_call = caller_env()) { check_dots_empty0(...) if (!is.character(x) || length(x) != 2) { bullets <- glue( "`suffix` must be a character vector of length 2, not {obj_type_friendly(x)} of length {length(x)}." ) abort(bullets, call = error_call) } if (any(is.na(x))) { msg <- glue("`suffix` can't be `NA`.") abort(msg, call = error_call) } list(x = x[[1]], y = x[[2]]) } # `join_cols()` checks that `x` and `y` are individually unique, # which plays into assumptions made here add_suffixes <- function(x, y, suffix) { if (identical(suffix, "")) { return(x) } x <- c(y, x) # Never marks the "first" duplicate (i.e. never anything in `y`) dup <- duplicated(x) while (any(dup)) { x[dup] <- paste0(x[dup], suffix) dup <- duplicated(x) } loc <- seq2(length(y) + 1L, length(x)) x <- x[loc] x } join_cast_common <- function(x, y, vars, error_call = caller_env()) { ptype <- join_ptype_common(x, y, vars, error_call = error_call) vec_cast_common(x = x, y = y, .to = ptype, .call = error_call) } join_ptype_common <- function(x, y, vars, error_call = caller_env()) { # Explicit `x/y_arg = ""` to avoid auto naming in `cnd$x_arg` ptype <- try_fetch( vec_ptype2(x, y, x_arg = "", y_arg = "", call = error_call), vctrs_error_incompatible_type = function(cnd) { rethrow_error_join_incompatible_type(cnd, vars, error_call) } ) # Finalize unspecified columns (#6804) ptype <- vec_ptype_finalise(ptype) ptype } rethrow_error_join_incompatible_type <- function(cnd, vars, call) { x_name <- cnd$x_arg y_name <- cnd$y_arg # Remap `y_name` to actual name from `y`. Useful for `join_by(a == b)` # where the name from `x` is used when determining the common type and will # be in the error `cnd`, but we need to tell the user about the name in `y`. loc <- match(y_name, names(vars$x$key)) y_name <- names(vars$y$key)[[loc]] x_name <- paste0("x$", x_name) y_name <- paste0("y$", y_name) x_type <- vec_ptype_full(cnd$x) y_type <- vec_ptype_full(cnd$y) stop_join( message = c( glue("Can't join `{x_name}` with `{y_name}` due to incompatible types."), i = glue("`{x_name}` is a <{x_type}>."), i = glue("`{y_name}` is a <{y_type}>.") ), class = "dplyr_error_join_incompatible_type", call = call ) } dplyr/R/vctrs.R0000644000176200001440000000316015137161765013105 0ustar liggesuserscommon_handler <- function(name) { function(cnd) { bullets <- c( glue("`{name}` must return compatible vectors across groups."), i = cnd_bullet_combine_details(cnd$x, cnd$x_arg), i = cnd_bullet_combine_details(cnd$y, cnd$y_arg) ) abort(bullets, class = "dplyr:::error_incompatible_combine") } } dplyr_vec_cast_common <- function(chunks, name) { withCallingHandlers( vec_cast_common(!!!chunks), error = common_handler(name) ) } dplyr_vec_ptype_common <- function(chunks, name) { withCallingHandlers( vec_ptype_common(!!!chunks), error = common_handler(name) ) } # Version of `vec_size_common()` that takes a list. # Useful for delaying `!!!` when used within an `expr()` call. dplyr_list_size_common <- function( x, ..., size = NULL, absent = 0L, call = caller_env() ) { check_dots_empty0(...) vec_size_common(!!!x, .size = size, .absent = absent, .call = call) } # Version of `vec_recycle_common()` that takes a list. # Useful for delaying `!!!` when used within an `expr()` call. dplyr_list_recycle_common <- function( x, ..., size = NULL, call = caller_env() ) { check_dots_empty0(...) vec_recycle_common(!!!x, .size = size, .call = call) } dplyr_list_pall <- function( x, ..., missing = NA, size = NULL, error_call = caller_env() ) { check_dots_empty0(...) vec_pall(!!!x, .missing = missing, .size = size, .error_call = error_call) } dplyr_list_pany <- function( x, ..., missing = NA, size = NULL, error_call = caller_env() ) { check_dots_empty0(...) vec_pany(!!!x, .missing = missing, .size = size, .error_call = error_call) } dplyr/R/coalesce.R0000644000176200001440000001327015106134104013504 0ustar liggesusers#' Find the first non-missing element #' #' Given a set of vectors, `coalesce()` finds the first non-missing value at #' each position. It's inspired by the SQL `COALESCE` function which does the #' same thing for SQL `NULL`s. #' #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> #' #' One or more vectors. These will be #' [recycled][vctrs::theory-faq-recycling] against each other, and will be #' cast to their common type. #' #' @param .ptype An optional prototype declaring the desired output type. If #' supplied, this overrides the common type of the vectors in `...`. #' #' @param .size An optional size declaring the desired output size. If supplied, #' this overrides the common size of the vectors in `...`. #' #' @return A vector with the same type and size as the common type and common #' size of the vectors in `...`. #' #' @seealso #' #' - [na_if()] to replace a specified value with `NA`. #' #' - [replace_values()] for making arbitrary replacements by value. #' #' - [replace_when()] for making arbitrary replacements using logical #' conditions. #' #' @export #' @examples #' # Replace missing values with a single value #' x <- sample(c(1:5, NA, NA, NA)) #' coalesce(x, 0L) #' #' # Or replace missing values with the corresponding non-missing value in #' # another vector #' x <- c(1, 2, NA, NA, 5, NA) #' y <- c(NA, NA, 3, 4, 5, NA) #' coalesce(x, y) #' #' # For cases like these where your replacement is a single value or a single #' # vector, `replace_values()` works just as well #' replace_values(x, NA ~ 0) #' coalesce(x, 0) #' #' replace_values(x, NA ~ y) #' coalesce(x, y) #' #' # `coalesce()` really shines when you have >2 vectors to coalesce with #' z <- c(NA, 2, 3, 4, 5, 6) #' coalesce(x, y, z) #' #' # If you're looking to replace values with `NA`, rather than replacing `NA` #' # with a value, then use `replace_values()` #' x <- c(0, -1, 5, -99, 8) #' replace_values(x, c(-1, -99) ~ NA) #' #' # The equivalent to a missing value in a list is `NULL` #' coalesce(list(1, 2, NULL, NA), list(0)) #' #' # Supply lists of vectors by splicing them into dots #' vecs <- list( #' c(1, 2, NA, NA, 5), #' c(NA, NA, 3, 4, 5) #' ) #' coalesce(!!!vecs) coalesce <- function(..., .ptype = NULL, .size = NULL) { args <- list2(...) if (length(args) == 0L) { abort("`...` can't be empty.") } if (vec_all_missing(args)) { abort("`...` must contain at least 1 non-`NULL` value.") } # We do vector, type, and size checks up front before dropping any `NULL` # values or extracting out a `default` to ensure that any errors report # the correct index list_check_all_vectors(args, allow_null = TRUE, arg = "") .ptype <- vec_ptype_common(!!!args, .ptype = .ptype) args <- vec_cast_common(!!!args, .to = .ptype) if (is_null(.size)) { .size <- vec_size_common(!!!args) } else { # Check recyclability, but delay actual recycling list_check_all_recyclable(args, .size, allow_null = TRUE, arg = "") } # From this point on we don't expect any errors args <- convert_from_coalesce_to_case_when(args, .size) values <- args$values default <- args$default conditions <- map(values, function(value) { !vec_detect_missing(value) }) vec_case_when( conditions = conditions, values = values, default = default, ptype = .ptype, size = .size ) } # Goal is to convert from `...` of `coalesce()` to `values` and `default` # of `vec_case_when()` # # Recognize that these are equivalent: # # ``` # coalesce(x, y) # case_when(!vec_detect_missing(x) ~ x, !vec_detect_missing(y) ~ y) # # coalesce(x, y_with_no_missings) # case_when(!vec_detect_missing(x) ~ x, .default = y_with_no_missings) # # coalesce(x, NULL, y, 0) # case_when(!vec_detect_missing(x) ~ x, !vec_detect_missing(y) ~ y, .default = 0) # ``` # # Note how the last element can be used as `default` if it doesn't contain any # missing values. This is a very nice optimization since `vec_case_when()` # doesn't need to recycle that value, and efficiently computes its output # locations! # # Note how `NULL`s are dropped during the conversion. convert_from_coalesce_to_case_when <- function(args, size) { if (vec_any_missing(args)) { # Drop `NULL` args <- vec_slice(args, vec_detect_complete(args)) } args_size <- length(args) if (args_size == 0L) { abort("Checked for at least 1 non-`NULL` value earlier", .internal = TRUE) } # Try to promote the `last` element of `args` to `default` # # For the 99% case of `coalesce(x, 0)`, this: # - Avoids recycling `0` to size `size`. # - Avoids computing `!vec_detect_missing()` on that recycled `0`. # # Can only do this if the `last` element doesn't contain missing values # due to how names are handled. We don't want to take the name from any `NA` # element, which is what would happen if we promoted the whole `y` vector here # to `default`. # # ``` # x <- c(a = NA, b = 2) # y <- c(c = NA, d = 4) # # coalesce(x, y) # # Want c(NA, b = 2) # # Not c(c = NA, b = 2) # # # Compare to # case_when(!vec_detect_missing(x) ~ x, !vec_detect_missing(y) ~ y) # case_when(!vec_detect_missing(x) ~ x, .default = y) # ``` last <- args[[args_size]] if (vec_any_missing(last)) { default <- NULL } else { default <- last args <- args[-args_size] } # Most of the time this recycle is a no-op. Two cases where it isn't: # - `coalesce(x, 0, 1)`, where `1` becomes `default` but we still have a # scalar `0`. # - `coalesce(x, NA)`, where `NA` can't be promoted, so we have a scalar `NA`. args <- vec_recycle_common(!!!args, .size = size) list(values = args, default = default) } vec_all_missing <- function(x) { if (!vec_any_missing(x)) { return(FALSE) } sum(vec_detect_missing(x)) == vec_size(x) } dplyr/R/reexport-pillar.R0000644000176200001440000000174715106134104015065 0ustar liggesusers#' Get a glimpse of your data #' #' @description #' `glimpse()` is like a transposed version of `print()`: #' columns run down the page, and data runs across. #' This makes it possible to see every column in a data frame. #' It's a little like [str()] applied to a data frame #' but it tries to show you as much data as possible. #' (And it always shows the underlying data, even when applied #' to a remote data source.) #' #' `glimpse()` is provided by the pillar package, and re-exported #' by dplyr. See [pillar::glimpse()] for more details. #' #' @return x original x is (invisibly) returned, allowing `glimpse()` to be #' used within a data pipeline. #' @examples #' glimpse(mtcars) #' #' # Note that original x is (invisibly) returned, allowing `glimpse()` to be #' # used within a pipeline. #' mtcars |> #' glimpse() |> #' select(1:3) #' #' glimpse(starwars) #' @importFrom pillar glimpse #' @export #' @name glimpse glimpse #' @importFrom pillar type_sum #' @export pillar::type_sum dplyr/R/zzz.R0000644000176200001440000000270515106134104012564 0ustar liggesusers.onLoad <- function(libname, pkgname) { ns_dplyr <- ns_env(pkgname) .Call(dplyr_init_library, ns_dplyr, ns_env("vctrs"), ns_env("rlang")) # TODO: For `arrange()`, `group_by()`, `with_order()`, and `nth()` until vctrs # changes `vec_order()` to the new ordering algorithm, at which point we # should switch from `vec_order_radix()` to `vec_order()` so vctrs can remove # it. env_bind( .env = ns_dplyr, vec_order_radix = import_vctrs("vec_order_radix") ) run_on_load() invisible() } .onAttach <- function(libname, pkgname) { setHook(packageEvent("plyr", "attach"), function(...) { packageStartupMessage(rule()) packageStartupMessage( "You have loaded plyr after dplyr - this is likely ", "to cause problems.\nIf you need functions from both plyr and dplyr, ", "please load plyr first, then dplyr:\nlibrary(plyr); library(dplyr)" ) packageStartupMessage(rule()) }) } .onDetach <- function(libpath) { setHook(packageEvent("plyr", "attach"), NULL, "replace") } import_vctrs <- function(name, optional = FALSE) { import_from(name, "vctrs", optional = optional) } import_from <- function(name, package, optional = FALSE) { ns <- getNamespace(package) if (!exists(name, mode = "function", envir = ns, inherits = FALSE)) { if (optional) { return(NULL) } abort(sprintf("No such '%s' function: `%s()`.", package, name)) } get(name, mode = "function", envir = ns, inherits = FALSE) } dplyr/R/filter.R0000644000176200001440000003253015137161765013234 0ustar liggesusers#' Keep or drop rows that match a condition #' #' @description #' These functions are used to subset a data frame, applying the expressions in #' `...` to determine which rows should be kept (for `filter()`) or dropped ( #' for `filter_out()`). #' #' Multiple conditions can be supplied separated by a comma. These will be #' combined with the `&` operator. To combine comma separated conditions using #' `|` instead, wrap them in [when_any()]. #' #' Both `filter()` and `filter_out()` treat `NA` like `FALSE`. This subtle #' behavior can impact how you write your conditions when missing values are #' involved. See the section on `Missing values` for important details and #' examples. #' #' @inheritParams arrange #' @inheritParams args_by #' #' @param ... <[`data-masking`][rlang::args_data_masking]> Expressions that #' return a logical vector, defined in terms of the variables in `.data`. If #' multiple expressions are included, they are combined with the `&` operator. #' To combine expressions using `|` instead, wrap them in [when_any()]. Only #' rows for which all expressions evaluate to `TRUE` are kept (for `filter()`) #' or dropped (for `filter_out()`). #' #' @param .preserve Relevant when the `.data` input is grouped. If `.preserve = #' FALSE` (the default), the grouping structure is recalculated based on the #' resulting data, otherwise the grouping is kept as is. #' #' @returns #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Rows are a subset of the input, but appear in the same order. #' * Columns are not modified. #' * The number of groups may be reduced (if `.preserve` is not `TRUE`). #' * Data frame attributes are preserved. #' #' @section Missing values: #' #' Both `filter()` and `filter_out()` treat `NA` like `FALSE`. This results in #' the following behavior: #' #' - `filter()` _drops_ both `NA` and `FALSE`. #' #' - `filter_out()` _keeps_ both `NA` and `FALSE`. #' #' This means that `filter(data, ) + filter_out(data, )` #' captures every row within `data` exactly once. #' #' The `NA` handling of these functions has been designed to match your #' _intent_. When your intent is to keep rows, use `filter()`. When your intent #' is to drop rows, use `filter_out()`. #' #' For example, if your goal with this `cars` data is to "drop rows where the #' `class` is suv", then you might write this in one of two ways: #' #' ```{r} #' cars <- tibble(class = c("suv", NA, "coupe")) #' cars #' ``` #' #' ```{r} #' cars |> filter(class != "suv") #' ``` #' #' ```{r} #' cars |> filter_out(class == "suv") #' ``` #' #' Note how `filter()` drops the `NA` rows even though our goal was only to drop #' `"suv"` rows, but `filter_out()` matches our intuition. #' #' To generate the correct result with `filter()`, you'd need to use: #' #' ```{r} #' cars |> filter(class != "suv" | is.na(class)) #' ``` #' #' This quickly gets unwieldy when multiple conditions are involved. #' #' In general, if you find yourself: #' #' - Using "negative" operators like `!=` or `!` #' - Adding in `NA` handling like `| is.na(col)` or `& !is.na(col)` #' #' then you should consider if swapping to the other filtering variant would #' make your conditions simpler. #' #' ## Comparison to base subsetting #' #' Base subsetting with `[` doesn't treat `NA` like `TRUE` or `FALSE`. Instead, #' it generates a fully missing row, which is different from how both `filter()` #' and `filter_out()` work. #' #' ```{r} #' cars <- tibble(class = c("suv", NA, "coupe"), mpg = c(10, 12, 14)) #' cars #' ``` #' #' ```{r} #' cars[cars$class == "suv",] #' #' cars |> filter(class == "suv") #' ``` #' #' @section Useful filter functions: #' #' There are many functions and operators that are useful when constructing the #' expressions used to filter the data: #' #' * [`==`], [`>`], [`>=`] etc #' * [`&`], [`|`], [`!`], [xor()] #' * [is.na()] #' * [between()], [near()] #' * [when_any()], [when_all()] #' #' @section Grouped tibbles: #' #' Because filtering expressions are computed within groups, they may yield #' different results on grouped tibbles. This will be the case as soon as an #' aggregating, lagging, or ranking function is involved. Compare this ungrouped #' filtering: #' #' ``` #' starwars |> filter(mass > mean(mass, na.rm = TRUE)) #' ``` #' #' With the grouped equivalent: #' #' ``` #' starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) #' ``` #' #' In the ungrouped version, `filter()` compares the value of `mass` in each row #' to the global average (taken over the whole data set), keeping only the rows #' with `mass` greater than this global average. In contrast, the grouped #' version calculates the average mass separately for each `gender` group, and #' keeps rows with `mass` greater than the relevant within-gender average. #' #' @section Methods: #' #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. #' #' @family single table verbs #' @name filter #' #' @examples #' # Filtering for one criterion #' filter(starwars, species == "Human") #' #' # Filtering for multiple criteria within a single logical expression #' filter(starwars, hair_color == "none" & eye_color == "black") #' filter(starwars, hair_color == "none" | eye_color == "black") #' #' # Multiple comma separated expressions are combined using `&` #' starwars |> filter(hair_color == "none", eye_color == "black") #' #' # To combine comma separated expressions using `|` instead, use `when_any()` #' starwars |> filter(when_any(hair_color == "none", eye_color == "black")) #' #' # Filtering out to drop rows #' filter_out(starwars, hair_color == "none") #' #' # When filtering out, it can be useful to first interactively filter for the #' # rows you want to drop, just to double check that you've written the #' # conditions correctly. Then, just change `filter()` to `filter_out()`. #' filter(starwars, mass > 1000, eye_color == "orange") #' filter_out(starwars, mass > 1000, eye_color == "orange") #' #' # The filtering operation may yield different results on grouped #' # tibbles because the expressions are computed within groups. #' # #' # The following keeps rows where `mass` is greater than the #' # global average: #' starwars |> filter(mass > mean(mass, na.rm = TRUE)) #' #' # Whereas this keeps rows with `mass` greater than the per `gender` #' # average: #' starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) #' #' # If you find yourself trying to use a `filter()` to drop rows, then #' # you should consider if switching to `filter_out()` can simplify your #' # conditions. For example, to drop blond individuals, you might try: #' starwars |> filter(hair_color != "blond") #' #' # But this also drops rows with an `NA` hair color! To retain those: #' starwars |> filter(hair_color != "blond" | is.na(hair_color)) #' #' # But explicit `NA` handling like this can quickly get unwieldy, especially #' # with multiple conditions. Since your intent was to specify rows to drop #' # rather than rows to keep, use `filter_out()`. This also removes the need #' # for any explicit `NA` handling. #' starwars |> filter_out(hair_color == "blond") #' #' # To refer to column names that are stored as strings, use the `.data` #' # pronoun: #' vars <- c("mass", "height") #' cond <- c(80, 150) #' starwars |> #' filter( #' .data[[vars[[1]]]] > cond[[1]], #' .data[[vars[[2]]]] > cond[[2]] #' ) #' # Learn more in ?rlang::args_data_masking NULL #' @rdname filter #' @export filter <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_by_typo(...) check_not_both_by_and_preserve({{ .by }}, .preserve) UseMethod("filter") } #' @rdname filter #' @export filter_out <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_by_typo(...) check_not_both_by_and_preserve({{ .by }}, .preserve) UseMethod("filter_out") } #' @export filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { filter_impl( .data = .data, ..., .by = {{ .by }}, .preserve = .preserve, .verb = "filter" ) } #' @export filter_out.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { filter_impl( .data = .data, ..., .by = {{ .by }}, .preserve = .preserve, .verb = "filter_out" ) } filter_impl <- function( .data, ..., .by, .preserve, .invert, .verb, .error_call = caller_env(), .user_env = caller_env(2) ) { dots <- dplyr_quosures(...) check_filter(dots, error_call = .error_call) by <- compute_by( by = {{ .by }}, data = .data, by_arg = ".by", data_arg = ".data", error_call = .error_call ) loc <- filter_rows( data = .data, dots = dots, by = by, verb = .verb, error_call = .error_call, user_env = .user_env ) dplyr_row_slice(.data, loc, preserve = .preserve) } filter_rows <- function( data, dots, by, verb, error_call = caller_env(), user_env = caller_env(2) ) { error_call <- dplyr_error_call(error_call) mask <- DataMask$new(data, by, verb, error_call = error_call) on.exit(mask$forget(), add = TRUE) # 1:1 mapping between `dots` and `dots_expanded` dots_expanded <- filter_expand(dots, mask = mask, error_call = error_call) invert <- verb == "filter_out" filter_eval( dots = dots, dots_expanded = dots_expanded, invert = invert, mask = mask, error_call = error_call, user_env = user_env ) } check_filter <- function(dots, error_call = caller_env()) { named <- have_name(dots) for (i in which(named)) { quo <- dots[[i]] # only allow named logical vectors, anything else # is suspicious expr <- quo_get_expr(quo) if (!is.logical(expr)) { name <- names(dots)[i] bullets <- c( "We detected a named input.", i = glue("This usually means that you've used `=` instead of `==`."), i = glue("Did you mean `{name} == {as_label(expr)}`?") ) abort(bullets, call = error_call) } } } filter_expand <- function(dots, mask, error_call = caller_env()) { env_filter <- env() filter_expand_one <- function(dot, index) { env_filter$current_expression <- index dot <- expand_pick(dot, mask) expand_if_across(dot) } local_error_context(dots, i = 0L, mask = mask) dots <- withCallingHandlers( imap(unname(dots), filter_expand_one), error = function(cnd) { poke_error_context(dots, env_filter$current_expression, mask = mask) abort(cnd_bullet_header("expand"), call = error_call, parent = cnd) } ) new_quosures(dots) } # We evaluate `dots_expanded` but report errors relative to `dots` so that # we show "In argument: `if_any(c(x, y), is.na)`" rather than its expanded form. # This works because `dots` and `dots_expanded` have a 1:1 mapping. filter_eval <- function( dots, dots_expanded, invert, mask, error_call = caller_env(), user_env = caller_env(2) ) { env_filter <- env() warnings_state <- env(warnings = list()) # For condition handlers i <- NULL env_bind_active( current_env(), "i" = function() env_filter$current_expression ) warning_handler <- dplyr_warning_handler( state = warnings_state, mask = mask, error_call = error_call ) out <- withCallingHandlers( mask$eval_all_filter(dots_expanded, invert, env_filter), error = dplyr_error_handler( dots = dots, mask = mask, bullets = filter_bullets, error_call = error_call ), warning = function(cnd) { local_error_context(dots, i, mask) warning_handler(cnd) }, `dplyr:::signal_filter_one_column_matrix` = function(e) { warn_filter_one_column_matrix(env = error_call, user_env = user_env) } ) signal_warnings(warnings_state, error_call) out } filter_bullets <- function(cnd, ...) { UseMethod("filter_bullets") } #' @export `filter_bullets.dplyr:::filter_incompatible_type` <- function(cnd, ...) { index <- cnd$dplyr_error_data$index result <- cnd$dplyr_error_data$result bullets <- cli::format_inline( "`..{index}` must be a logical vector, not {obj_type_friendly(result)}." ) if (is.data.frame(result)) { # Provide some extra advice for people who try and use `across()` inside # of `filter()` bullets <- c( bullets, i = cli::format_inline( "If you used {.fn across} to generate this data frame, please use {.fn if_any} or {.fn if_all} instead." ) ) } bullets } #' @export `filter_bullets.dplyr:::filter_incompatible_size` <- function(cnd, ...) { index <- cnd$dplyr_error_data$index expected_size <- cnd$dplyr_error_data$expected_size size <- cnd$dplyr_error_data$size glue("`..{index}` must be of size {or_1(expected_size)}, not size {size}.") } warn_filter_one_column_matrix <- function(env, user_env) { lifecycle::deprecate_warn( when = "1.1.0", what = I("Using one column matrices in `filter()` or `filter_out()`"), with = I("one dimensional logical vectors"), env = env, user_env = user_env, always = TRUE, id = "dplyr-filter-one-column-matrix" ) } check_not_both_by_and_preserve <- function( .by, .preserve, error_call = caller_env() ) { if (!quo_is_null(enquo(.by)) && !is_false(.preserve)) { abort("Can't supply both `.by` and `.preserve`.", call = error_call) } invisible(NULL) } dplyr/R/reexport-magrittr.R0000644000176200001440000000006714366556340015446 0ustar liggesusers#' @importFrom magrittr %>% #' @export magrittr::`%>%` dplyr/R/src.R0000644000176200001440000000265214366556340012540 0ustar liggesusers#' Create a "src" object #' #' `src()` is the standard constructor for srcs and `is.src()` tests. #' #' @keywords internal #' @export #' @param subclass name of subclass. "src" is an abstract base class, so you #' must supply this value. `src_` is automatically prepended to the #' class name #' @param ... fields used by object. #' #' These dots are evaluated with [explicit splicing][rlang::dots_list]. #' @param x object to test for "src"-ness. src <- function(subclass, ...) { subclass <- paste0("src_", subclass) structure(dots_list(...), class = c(subclass, "src")) } #' @rdname src #' @export is.src <- function(x) inherits(x, "src") #' @export print.src <- function(x, ...) { cat(format(x, ...), "\n", sep = "") } #' List all tbls provided by a source. #' #' This is a generic method which individual src's will provide methods for. #' Most methods will not be documented because it's usually pretty obvious what #' possible results will be. #' #' @param x a data src. #' @param ... other arguments passed on to the individual methods. #' @export #' @keywords internal src_tbls <- function(x, ...) { UseMethod("src_tbls") } #' Figure out if two sources are the same (or two tbl have the same source) #' #' @param x,y src or tbls to test #' @return a logical flag #' @export #' @keywords internal same_src <- function(x, y) { UseMethod("same_src") } #' @export same_src.data.frame <- function(x, y) { is.data.frame(y) } dplyr/R/across.R0000644000176200001440000007561115137161765013250 0ustar liggesusers#' Apply a function (or functions) across multiple columns #' #' @description #' `across()` makes it easy to apply the same transformation to multiple #' columns, allowing you to use [select()] semantics inside in "data-masking" #' functions like [summarise()] and [mutate()]. See `vignette("colwise")` for #' more details. #' #' `if_any()` and `if_all()` apply the same #' predicate function to a selection of columns and combine the #' results into a single logical vector: `if_any()` is `TRUE` when #' the predicate is `TRUE` for *any* of the selected columns, `if_all()` #' is `TRUE` when the predicate is `TRUE` for *all* selected columns. #' #' If you just need to select columns without applying a transformation to each #' of them, then you probably want to use [pick()] instead. #' #' `across()` supersedes the family of "scoped variants" like #' `summarise_at()`, `summarise_if()`, and `summarise_all()`. #' #' @details #' When there are no selected columns: #' #' - `if_any()` will return `FALSE`, consistent with the behavior of #' `any()` when called without inputs. #' - `if_all()` will return `TRUE`, consistent with the behavior of #' `all()` when called without inputs. #' #' @param .cols <[`tidy-select`][dplyr_tidy_select]> Columns to transform. #' You can't select grouping columns because they are already automatically #' handled by the verb (i.e. [summarise()] or [mutate()]). #' @param .fns Functions to apply to each of the selected columns. #' Possible values are: #' #' - A function, e.g. `mean`. #' - A purrr-style lambda, e.g. `~ mean(.x, na.rm = TRUE)` #' - A named list of functions or lambdas, e.g. #' `list(mean = mean, n_miss = ~ sum(is.na(.x))`. Each function is applied #' to each column, and the output is named by combining the function name #' and the column name using the glue specification in `.names`. #' #' Within these functions you can use [cur_column()] and [cur_group()] #' to access the current column and grouping keys respectively. #' @param ... `r lifecycle::badge("deprecated")` #' #' Additional arguments for the function calls in `.fns` are no longer #' accepted in `...` because it's not clear when they should be evaluated: #' once per `across()` or once per group? Instead supply additional arguments #' directly in `.fns` by using a lambda. For example, instead of #' `across(a:b, mean, na.rm = TRUE)` write #' `across(a:b, ~ mean(.x, na.rm = TRUE))`. #' @param .names A glue specification that describes how to name the output #' columns. This can use `{.col}` to stand for the selected column name, and #' `{.fn}` to stand for the name of the function being applied. The default #' (`NULL`) is equivalent to `"{.col}"` for the single function case and #' `"{.col}_{.fn}"` for the case where a list is used for `.fns`. #' @param .unpack `r lifecycle::badge("experimental")` #' #' Optionally [unpack][tidyr::unpack()] data frames returned by functions in #' `.fns`, which expands the df-columns out into individual columns, retaining #' the number of rows in the data frame. #' #' - If `FALSE`, the default, no unpacking is done. #' - If `TRUE`, unpacking is done with a default glue specification of #' `"{outer}_{inner}"`. #' - Otherwise, a single glue specification can be supplied to describe how to #' name the unpacked columns. This can use `{outer}` to refer to the name #' originally generated by `.names`, and `{inner}` to refer to the names of #' the data frame you are unpacking. #' #' @returns #' `across()` typically returns a tibble with one column for each column in #' `.cols` and each function in `.fns`. If `.unpack` is used, more columns may #' be returned depending on how the results of `.fns` are unpacked. #' #' `if_any()` and `if_all()` return a logical vector. #' #' @section Timing of evaluation: #' R code in dplyr verbs is generally evaluated once per group. #' Inside `across()` however, code is evaluated once for each #' combination of columns and groups. If the evaluation timing is #' important, for example if you're generating random variables, think #' about when it should happen and place your code in consequence. #' #' ```{r} #' gdf <- #' tibble(g = c(1, 1, 2, 3), v1 = 10:13, v2 = 20:23) |> #' group_by(g) #' #' set.seed(1) #' #' # Outside: 1 normal variate #' n <- rnorm(1) #' gdf |> mutate(across(v1:v2, ~ .x + n)) #' #' # Inside a verb: 3 normal variates (ngroup) #' gdf |> mutate(n = rnorm(1), across(v1:v2, ~ .x + n)) #' #' # Inside `across()`: 6 normal variates (ncol * ngroup) #' gdf |> mutate(across(v1:v2, ~ .x + rnorm(1))) #' ``` #' #' @examples #' # For better printing #' iris <- as_tibble(iris) #' #' # across() ----------------------------------------------------------------- #' # Using everything() to apply the same function to all columns #' iris |> #' mutate(across(everything(), as.character)) #' #' # Different ways to select the same set of columns #' # See for details #' iris |> #' mutate(across(c(Sepal.Length, Sepal.Width), round)) #' iris |> #' mutate(across(c(1, 2), round)) #' iris |> #' mutate(across(1:Sepal.Width, round)) #' iris |> #' mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round)) #' #' # Using an external vector of names #' cols <- c("Sepal.Length", "Petal.Width") #' iris |> #' mutate(across(all_of(cols), round)) #' #' # If the external vector is named, the output columns will be named according #' # to those names #' names(cols) <- tolower(cols) #' iris |> #' mutate(across(all_of(cols), round)) #' #' # A purrr-style formula #' iris |> #' group_by(Species) |> #' summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE))) #' #' # A named list of functions #' iris |> #' group_by(Species) |> #' summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd))) #' #' # Use the .names argument to control the output names #' iris |> #' group_by(Species) |> #' summarise(across(starts_with("Sepal"), mean, .names = "mean_{.col}")) #' #' iris |> #' group_by(Species) |> #' summarise( #' across( #' starts_with("Sepal"), #' list(mean = mean, sd = sd), #' .names = "{.col}.{.fn}" #' ) #' ) #' #' # If a named external vector is used for column selection, .names will use #' # those names when constructing the output names #' iris |> #' group_by(Species) |> #' summarise(across(all_of(cols), mean, .names = "mean_{.col}")) #' #' # When the list is not named, .fn is replaced by the function's position #' iris |> #' group_by(Species) |> #' summarise( #' across(starts_with("Sepal"), list(mean, sd), .names = "{.col}.fn{.fn}") #' ) #' #' # When the functions in .fns return a data frame, you typically get a #' # "packed" data frame back #' quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { #' tibble(quantile = probs, value = quantile(x, probs)) #' } #' #' iris |> #' reframe(across(starts_with("Sepal"), quantile_df)) #' #' # Use .unpack to automatically expand these packed data frames into their #' # individual columns #' iris |> #' reframe(across(starts_with("Sepal"), quantile_df, .unpack = TRUE)) #' #' # .unpack can utilize a glue specification if you don't like the defaults #' iris |> #' reframe( #' across(starts_with("Sepal"), quantile_df, .unpack = "{outer}.{inner}") #' ) #' #' # This is also useful inside mutate(), for example, with a multi-lag helper #' multilag <- function(x, lags = 1:3) { #' names(lags) <- as.character(lags) #' purrr::map_dfr(lags, lag, x = x) #' } #' #' iris |> #' group_by(Species) |> #' mutate(across(starts_with("Sepal"), multilag, .unpack = TRUE)) |> #' select(Species, starts_with("Sepal")) #' #' # if_any() and if_all() ---------------------------------------------------- #' iris |> #' filter(if_any(ends_with("Width"), ~ . > 4)) #' iris |> #' filter_out(if_any(ends_with("Width"), ~ . > 4)) #' #' iris |> #' filter(if_all(ends_with("Width"), ~ . > 2)) #' iris |> #' filter_out(if_all(ends_with("Width"), ~ . > 2)) #' #' @export #' @seealso [c_across()] for a function that returns a vector across <- function(.cols, .fns, ..., .names = NULL, .unpack = FALSE) { mask <- peek_mask() caller_env <- caller_env() across_if_fn <- context_peek_bare("across_if_fn") %||% "across" error_call <- context_peek_bare("across_frame") %||% current_env() .cols <- enquo(.cols) fns_quo <- enquo(.fns) fns_quo_env <- quo_get_env(fns_quo) if (quo_is_missing(.cols)) { across_missing_cols_deprecate_warn() .cols <- quo_set_expr(.cols, expr(everything())) } if (is_missing(.fns)) { # Silent restoration to old defaults of `.fns` for now. # TODO: Escalate this to formal deprecation. .fns <- NULL # Catch if dots are non-empty with no `.fns` supplied. # Mainly catches typos, e.g. `.funs` (#6638). check_dots_empty0(...) } else { .fns <- quo_eval_fns(fns_quo, mask = fns_quo_env, error_call = error_call) } if (!is_bool(.unpack) && !is_string(.unpack)) { stop_input_type(.unpack, "`TRUE`, `FALSE`, or a single string") } if (is_string(.unpack)) { unpack_spec <- .unpack .unpack <- TRUE } else { unpack_spec <- "{outer}_{inner}" } setup <- across_setup( cols = !!.cols, fns = .fns, names = .names, .caller_env = caller_env, mask = mask, error_call = error_call, across_if_fn = across_if_fn ) if (!missing(...)) { details <- c( "Supply arguments directly to `.fns` through an anonymous function instead.", "", " " = "# Previously", " " = "across(a:b, mean, na.rm = TRUE)", "", " " = "# Now", " " = "across(a:b, \\(x) mean(x, na.rm = TRUE))" ) lifecycle::deprecate_warn( when = "1.1.0", what = "across(...)", details = details, id = "dplyr-across-fns-dots" ) } vars <- setup$vars if (length(vars) == 0L) { return(dplyr_new_tibble(list(), size = 1L)) } fns <- setup$fns names <- setup$names fns <- map(fns, function(fn) uninline(fn, fns_quo_env)) if (!length(fns)) { # TODO: Deprecate and remove the `.fns = NULL` path in favor of `pick()` data <- mask$pick_current(vars) if (is.null(names)) { return(data) } else { return(set_names(data, names)) } } data <- mask$current_cols(vars) n_cols <- length(data) n_fns <- length(fns) seq_n_cols <- seq_len(n_cols) seq_fns <- seq_len(n_fns) k <- 1L out <- vector("list", n_cols * n_fns) # Reset `cur_column()` info on exit old_var <- context_peek_bare("column") on.exit(context_poke("column", old_var), add = TRUE) # Loop in such an order that all functions are applied # to a single column before moving on to the next column withCallingHandlers( for (i in seq_n_cols) { var <- vars[[i]] col <- data[[i]] context_poke("column", var) for (j in seq_fns) { fn <- fns[[j]] out[[k]] <- fn(col, ...) k <- k + 1L } }, error = function(cnd) { bullets <- c( glue("Can't compute column `{names[k]}`.") ) abort(bullets, call = error_call, parent = cnd) } ) size <- vec_size_common(!!!out) out <- vec_recycle_common(!!!out, .size = size) names(out) <- names out <- dplyr_new_tibble(out, size = size) if (.unpack) { out <- df_unpack(out, unpack_spec, caller_env) } out } #' @rdname across #' @export if_any <- function(.cols, .fns, ..., .names = NULL) { context_local("across_if_fn", "if_any") context_local("across_frame", current_env()) df <- across({{ .cols }}, .fns, ..., .names = .names) x <- dplyr_new_list(df) size <- vec_size(df) dplyr_list_pany(x, size = size) } #' @rdname across #' @export if_all <- function(.cols, .fns, ..., .names = NULL) { context_local("across_if_fn", "if_all") context_local("across_frame", current_env()) df <- across({{ .cols }}, .fns, ..., .names = .names) x <- dplyr_new_list(df) size <- vec_size(df) dplyr_list_pall(x, size = size) } #' Combine values from multiple columns #' #' @description #' `c_across()` is designed to work with [rowwise()] to make it easy to #' perform row-wise aggregations. It has two differences from `c()`: #' #' * It uses tidy select semantics so you can easily select multiple variables. #' See `vignette("rowwise")` for more details. #' #' * It uses [vctrs::vec_c()] in order to give safer outputs. #' #' @inheritParams across #' @seealso [across()] for a function that returns a tibble. #' @export #' @examples #' df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4)) #' df |> #' rowwise() |> #' mutate( #' sum = sum(c_across(w:z)), #' sd = sd(c_across(w:z)) #' ) c_across <- function(cols) { mask <- peek_mask() cols <- enquo(cols) if (quo_is_missing(cols)) { c_across_missing_cols_deprecate_warn() cols <- quo_set_expr(cols, expr(everything())) } vars <- c_across_setup(!!cols, mask = mask) cols <- mask$current_cols(vars) vec_c(!!!cols, .name_spec = zap()) } across_glue_mask <- function(.col, .fn, .caller_env) { glue_mask <- env(.caller_env, .col = .col, .fn = .fn) # TODO: we can make these bindings louder later env_bind_active( glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn ) glue_mask } across_setup <- function( cols, fns, names, .caller_env, mask, error_call = caller_env(), across_if_fn = "across" ) { cols <- enquo(cols) # `across()` is evaluated in a data mask so we need to remove the # mask layer from the quosure environment (#5460) cols <- quo_set_env_to_data_mask_top(cols) # TODO: call eval_select with a calling handler to intercept # classed error, after https://github.com/r-lib/tidyselect/issues/233 if (is.null(fns) && quo_is_call(cols, "~")) { bullets <- c( "Must supply a column selection.", i = glue( "You most likely meant: `{across_if_fn}(everything(), {as_label(cols)})`." ), i = "The first argument `.cols` selects a set of columns.", i = "The second argument `.fns` operates on each selected columns." ) abort(bullets, call = error_call) } data <- mask$get_current_data(groups = FALSE) vars <- tidyselect::eval_select( cols, data = data, error_call = error_call ) names_vars <- names(vars) vars <- names(data)[vars] if (is.null(fns)) { # TODO: Eventually deprecate and remove the `.fns = NULL` path in favor of `pick()` if (!is.null(names)) { glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1") names <- vec_as_names( glue(names, .envir = glue_mask), repair = "check_unique", call = error_call ) } else { names <- names_vars } value <- list(vars = vars, fns = fns, names = names) return(value) } # apply `.names` smart default if (is.function(fns)) { names <- names %||% "{.col}" fns <- list("1" = fns) } else { names <- names %||% "{.col}_{.fn}" } if (!is.list(fns)) { abort("Expected a list.", .internal = TRUE) } # make sure fns has names, use number to replace unnamed if (is.null(names(fns))) { names_fns <- seq_along(fns) } else { names_fns <- names(fns) empties <- which(names_fns == "") if (length(empties)) { names_fns[empties] <- empties } } glue_mask <- across_glue_mask( .caller_env, .col = rep(names_vars, each = length(fns)), .fn = rep(names_fns, length(vars)) ) names <- vec_as_names( glue(names, .envir = glue_mask), repair = "check_unique", call = error_call ) list( vars = vars, fns = fns, names = names ) } uninline <- function(fn, env) { # Reset environment of inlinable lambdas which are set to the empty # env sentinel if (identical(get_env(fn), empty_env())) { set_env(fn, env) } else { fn } } # FIXME: This pattern should be encapsulated by rlang data_mask_top <- function(env, recursive = FALSE, inherit = FALSE) { while (env_has(env, ".__tidyeval_data_mask__.", inherit = inherit)) { env <- env_parent(env_get(env, ".top_env", inherit = inherit)) if (!recursive) { return(env) } } env } quo_set_env_to_data_mask_top <- function(quo) { env <- quo_get_env(quo) env <- data_mask_top(env, recursive = FALSE, inherit = FALSE) quo_set_env(quo, env) } c_across_setup <- function(cols, mask, error_call = caller_env()) { cols <- enquo(cols) # `c_across()` is evaluated in a data mask so we need to remove the # mask layer from the quosure environments (same as `across()`) (#5460, #6522) cols <- quo_set_env_to_data_mask_top(cols) data <- mask$get_current_data(groups = FALSE) vars <- tidyselect::eval_select( expr = cols, data = data, allow_rename = FALSE, error_call = error_call ) value <- names(vars) value } new_dplyr_quosure <- function(quo, ...) { attr(quo, "dplyr:::data") <- list2(...) quo } dplyr_quosure_name <- function(quo_data) { if (quo_data$is_named) { # `name` is a user-supplied or known character string quo_data$name } else { # `name` is a quosure that must be auto-named with_no_rlang_infix_labeling(as_label(quo_data$name)) } } dplyr_quosures <- function(...) { # We're using quos() instead of enquos() here for speed, because we're not defusing named arguments -- # only the ellipsis is converted to quosures, there are no further arguments. quosures <- quos(..., .ignore_empty = "all") names <- names2(quosures) for (i in seq_along(quosures)) { quosure <- quosures[[i]] name <- names[[i]] is_named <- (name != "") if (!is_named) { # Will be auto-named by `dplyr_quosure_name()` only as needed name <- quosure } quosures[[i]] <- new_dplyr_quosure( quo = quosure, name = name, is_named = is_named, index = i ) } quosures } # Expand an `if_any()` or `if_all()` call # # Always guaranteed to be 1 quosure in, 1 quosure out, unlike `expand_across()`. # # For the dplyr backend, the main reason we expand at all is to evaluate # tidyselection exactly once (rather than once per group), because tidyselection # is rather slow. # # At one point we believed `if_any()` and `if_all()` could be implemented as # "pure expansion" that would run before dispatching to other backends, like # dbplyr. In theory this could expand to a chain of `&` and `|` operations that # dbplyr would already know how to translate (so dbplyr itself would not have to # know how to implement `if_any()` and `if_all()`), but in practice we need more # error checking than what `x & y & z` gets us, so we actually expand to a # vctrs-backed implementation since the "pure expansion" ideas have never played # out. expand_if_across <- function(quo) { if (quo_is_call(quo, "if_any", ns = c("", "dplyr"))) { variant <- "any" } else if (quo_is_call(quo, "if_all", ns = c("", "dplyr"))) { variant <- "all" } else { # Refuse to expand return(quo) } # `definition` is the same between the two for the purposes of `match.call()` definition <- if_any call <- match.call( definition = definition, call = quo_get_expr(quo), expand.dots = FALSE, envir = quo_get_env(quo) ) if (!is_null(call$...)) { # Refuse to expand return(quo) } if (variant == "any") { if_fn <- "if_any" dplyr_fn <- "dplyr_list_pany" } else { if_fn <- "if_all" dplyr_fn <- "dplyr_list_pall" } # `expand_across()` will always expand at this point given that we bailed on # `...` usage early on, which is the only case that would stop expansion. # # Set frame here for backtrace truncation. But override error call via # `local_error_call()` so it refers to the function we're expanding, e.g. # `if_any()` and not `expand_if_across()`. context_local("across_if_fn", if_fn) context_local("across_frame", current_env()) local_error_call(call(if_fn)) call[[1]] <- quote(across) quos <- expand_across(quo_set_expr(quo, call)) expr <- expr({ x <- list(!!!quos) ns <- asNamespace("dplyr") # In the evaluation path, `across()` automatically recycles to common size, # so we must here as well for compatibility. `across()` also returns a 0 # col, 1 row data frame in the case of no inputs so that it will recycle to # the group size, which we also do here. size <- ns[["dplyr_list_size_common"]](x, absent = 1L, call = call(!!if_fn)) x <- ns[["dplyr_list_recycle_common"]](x, size = size, call = call(!!if_fn)) ns[[!!dplyr_fn]](x, size = size, error_call = call(!!if_fn)) }) new_quosure(expr, env = baseenv()) } expand_across <- function(quo) { quo_data <- attr(quo, "dplyr:::data") if (!quo_is_call(quo, "across", ns = c("", "dplyr")) || quo_data$is_named) { return(list(quo)) } across_if_fn <- context_peek_bare("across_if_fn") %||% "across" # Set error call to frame for backtrace truncation, but override # call with the relevant function we're doing the expansion for error_call <- context_peek_bare("across_frame") %||% current_env() local_error_call(call(across_if_fn)) # Expand dots in lexical env env <- quo_get_env(quo) expr <- match.call( definition = across, call = quo_get_expr(quo), expand.dots = FALSE, envir = env ) # Abort expansion if there are any expression supplied because dots # must be evaluated once per group in the data mask. Expanding the # `across()` call would lead to either `n_group * n_col` evaluations # if dots are delayed or only 1 evaluation if they are eagerly # evaluated. if (!is_null(expr$...)) { return(list(quo)) } dplyr_mask <- peek_mask() mask <- dplyr_mask$get_rlang_mask() if (".unpack" %in% names(expr)) { # We're expanding expressions but we do need some actual values ahead of # time. We evaluate those in the mask to simulate masked evaluation of an # `across()` call within a verb like `mutate()`. `.names` and `.fns` are # also evaluated this way below. unpack <- eval_tidy(expr$.unpack, mask, env = env) } else { unpack <- FALSE } # Abort expansion if unpacking as expansion makes named expressions and we # need the expressions to remain unnamed if (!is_false(unpack)) { return(list(quo)) } # Differentiate between missing and null (`match.call()` doesn't # expand default argument) if (".cols" %in% names(expr)) { cols <- expr$.cols } else { across_missing_cols_deprecate_warn() cols <- expr(everything()) } cols <- as_quosure(cols, env) if (".fns" %in% names(expr)) { fns <- as_quosure(expr$.fns, env) fns <- quo_eval_fns(fns, mask = mask, error_call = error_call) } else { # In the missing case, silently restore the old default of `NULL`. # TODO: Escalate this to formal deprecation. fns <- NULL } setup <- across_setup( !!cols, fns = fns, names = eval_tidy(expr$.names, mask, env = env), .caller_env = env, mask = dplyr_mask, error_call = error_call, across_if_fn = across_if_fn ) vars <- setup$vars # Empty expansion if (length(vars) == 0L) { return(list()) } fns <- setup$fns names <- setup$names %||% vars # No functions, so just return a list of symbols if (is.null(fns)) { # TODO: Deprecate and remove the `.fns = NULL` path in favor of `pick()` exprs <- pmap(list(vars, names, seq_along(vars)), function(var, name, k) { quo <- new_quosure(sym(var), empty_env()) quo <- new_dplyr_quosure( quo, name = name, is_named = TRUE, index = c(quo_data$index, k), column = var ) }) names(exprs) <- names return(exprs) } n_vars <- length(vars) n_fns <- length(fns) seq_vars <- seq_len(n_vars) seq_fns <- seq_len(n_fns) exprs <- new_list(n_vars * n_fns, names = names) k <- 1L for (i in seq_vars) { var <- vars[[i]] for (j in seq_fns) { fn_call <- as_across_fn_call(fns[[j]], var, env, mask) name <- names[[k]] exprs[[k]] <- new_dplyr_quosure( fn_call, name = name, is_named = TRUE, index = c(quo_data$index, k), column = var ) k <- k + 1L } } exprs } as_across_fn_call <- function(fn, var, env, mask) { if (is_inlinable_lambda(fn)) { # Transform inlinable lambdas to simple quosured calls arg <- names(formals(fn))[[1]] expr <- body(fn) expr <- expr_substitute(expr, sym(arg), sym(var)) new_quosure(expr, env) } else { # Non-inlinable elements are wrapped in a quosured call. It's # important that these are set to their original quosure # environment (passed as `env`) because we change non-inlinable # lambdas to inherit from the data mask in order to make them # maskable. By wrapping them in a quosured call that inherits from # the original quosure environment that wrapped the expanded # `across()` call, we cause `eval_tidy()` to chains this # environment to the top of the data mask, thereby preserving the # lexical environment of the lambda when it is evaluated. new_quosure(call2(fn, sym(var)), env) } } # The environment of functions that are safe to inline has been set to # the empty env sentinel is_inlinable_lambda <- function(x) { is_function(x) && identical(fn_env(x), empty_env()) } across_missing_cols_deprecate_warn <- function() { across_if_fn <- context_peek_bare("across_if_fn") %||% "across" # Passing the correct `user_env` through `expand_across()` to here is # complicated, so instead we force the global environment. This means users # won't ever see the "deprecated feature was likely used in the {pkg}" # message, but the warning will still fire and that is more important. user_env <- global_env() lifecycle::deprecate_warn( when = "1.1.0", what = I(glue("Using `{across_if_fn}()` without supplying `.cols`")), details = "Please supply `.cols` instead.", user_env = user_env, always = TRUE, id = "dplyr-across-missing-cols" ) } c_across_missing_cols_deprecate_warn <- function(user_env = caller_env(2)) { lifecycle::deprecate_warn( when = "1.1.0", what = I("Using `c_across()` without supplying `cols`"), details = "Please supply `cols` instead.", user_env = user_env, always = TRUE, id = "dplyr-c-across-missing-cols" ) } df_unpack <- function(x, spec, caller_env, error_call = caller_env()) { size <- vec_size(x) out <- dplyr_new_list(x) names <- names(out) loc <- which(map_lgl(out, is.data.frame)) cols <- out[loc] col_names <- names[loc] out[loc] <- map2( .x = cols, .y = col_names, .f = apply_unpack_spec, spec = spec, caller_env = caller_env ) # Signal to `df_list()` that these columns should be unpacked names[loc] <- "" names(out) <- names out <- df_list(!!!out, .size = size, .name_repair = "minimal") out <- dplyr_new_tibble(out, size = size) vec_as_names(names(out), repair = "check_unique", call = error_call) out } apply_unpack_spec <- function(col, outer, spec, caller_env) { inner <- names(col) outer <- vec_rep(outer, times = length(inner)) mask <- env(caller_env, outer = outer, inner = inner) inner <- glue(spec, .envir = mask) inner <- as.character(inner) names(col) <- inner col } # Evaluate the quosure of the `.fns` argument # # We detect and mark inlinable lambdas here. By lambda we mean either # a `~` or `function` call that is directly supplied to # `across()`. Lambdas haven't been evaluated yet and don't carry an # environment. # # Inlinable lambdas are eventually expanded in the surrounding call. # To distinguish inlinable lambdas from non-inlinable ones, we set # their environments to the empty env. # # There are cases where we can't inline, for instance lambdas that are # passed additional arguments through `...`. We still want these # non-inlinable lambdas to be maskable so that they can refer to # data-mask columns. So we set them (a) in the evaluation case, to # their original quosure environment which is the data mask, or (b) in # the expansion case, to the uninitialised data mask. # # @value | >. Inlinable lambdas are set to the # empty env. quo_eval_fns <- function(quo, mask, error_call = caller_env()) { # In the evaluation path (as opposed to expansion), the quosure # inherits from the data mask. We set the environment to the data # mask top (the original quosure environment) so that we don't # evaluate the function expressions in the mask. This prevents # masking a function symbol (e.g. `mean`) by a column of the same # name. quo <- quo_set_env_to_data_mask_top(quo) # The following strange scheme is a work around to reconciliate two # contradictory goals. We want to evaluate outside the mask so that # data mask columns are not confused with functions (#6545). # However at the same time we want non-inlinable lambdas (inlinable # ones are dealt with above) to be maskable so they can refer to # data mask columns. So we evaluate outside the mask, in a data-less # quosure mask that handles quosures. Then, in `validate()`, we # detect lambdas that inherit from this quosure mask and set their # environment to the data mask. sentinel_env <- empty_env() out <- eval_tidy(quo({ sentinel_env <<- current_env() !!quo })) validate <- function(x) { if (is_formula(x) || is_function(x)) { # If the function or formula inherits from the data-less quosure # mask, we have a lambda that was directly supplied and # evaluated here. We inline it if possible. if (identical(get_env(x), sentinel_env)) { if (is_inlinable_function(x)) { return(set_env(x, empty_env())) } if (is_inlinable_formula(x)) { x <- expr_substitute(x, quote(.), quote(.x)) fn <- new_function(pairlist2(.x = ), f_rhs(x), empty_env()) return(fn) } # Can't inline the lambda. We set its environment to the data # mask so it can still refer to columns. x <- set_env(x, mask) } as_function(x, arg = ".fns", call = error_call) } else { abort( "`.fns` must be a function, a formula, or a list of functions/formulas.", call = error_call ) } } if (obj_is_list(out)) { map(out, function(elt) validate(elt)) } else { validate(out) } } is_inlinable_function <- function(x) { if (!is_function(x)) { return(FALSE) } fmls <- formals(x) # Don't inline if there are additional arguments even if they have # defaults or are passed through `...` if (length(fmls) != 1) { return(FALSE) } # Don't inline lambdas that call `return()` at the moment a few # packages do things like `across(1, function(x) # return(x))`. Whereas `eval()` sets a return point, `eval_tidy()` # doesn't which causes `return()` to throw an error. if ("return" %in% all.names(body(x))) { return(FALSE) } TRUE } is_inlinable_formula <- function(x) { if (!is_formula(x, lhs = FALSE)) { return(FALSE) } # Don't inline if there are additional arguments passed through `...` nms <- all.names(x) unsupported_arg_rx <- "\\.\\.[0-9]|\\.y" if (any(grepl(unsupported_arg_rx, nms))) { return(FALSE) } # Don't inline lambdas that call `return()` at the moment, see above if ("return" %in% nms) { return(FALSE) } TRUE } dplyr/R/group-trim.R0000644000176200001440000000237215106134104014034 0ustar liggesusers#' Trim grouping structure #' #' @description #' `r lifecycle::badge("experimental")` #' Drop unused levels of all factors that are used as grouping variables, #' then recalculates the grouping structure. #' #' `group_trim()` is particularly useful after a [filter()] that is intended #' to select a subset of groups. #' #' @param .tbl A [grouped data frame][grouped_df()] #' @param .drop See [group_by()] #' @return A [grouped data frame][grouped_df()] #' @export #' @family grouping functions #' @examples #' iris |> #' group_by(Species) |> #' filter(Species == "setosa", .preserve = TRUE) |> #' group_trim() group_trim <- function(.tbl, .drop = group_by_drop_default(.tbl)) { lifecycle::signal_stage("experimental", "group_trim()") UseMethod("group_trim") } #' @export group_trim.data.frame <- function(.tbl, .drop = group_by_drop_default(.tbl)) { .tbl } #' @export group_trim.grouped_df <- function(.tbl, .drop = group_by_drop_default(.tbl)) { vars <- group_vars(.tbl) ungrouped <- ungroup(.tbl) # names of the factors that should be droplevels()'d fgroups <- names(select_if(select_at(ungrouped, vars), is.factor)) # drop levels dropped <- mutate_at(ungrouped, fgroups, droplevels) # regroup group_by_at(dropped, vars, .drop = .drop) } dplyr/R/reexport-tibble.R0000644000176200001440000000071214366556340015053 0ustar liggesusers#' @importFrom tibble data_frame #' @export tibble::data_frame #' @importFrom tibble as_data_frame #' @export tibble::as_data_frame #' @importFrom tibble lst #' @export tibble::lst #' @importFrom tibble add_row #' @export tibble::add_row #' @importFrom tibble tribble #' @export tibble::tribble #' @importFrom tibble tibble #' @export tibble::tibble #' @importFrom tibble as_tibble #' @export tibble::as_tibble #' @importFrom tibble view tibble::view dplyr/R/colwise-filter.R0000644000176200001440000001265015106134104014657 0ustar liggesusers#' Filter within a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [if_all()] or [if_any()] in an existing verb. See `vignette("colwise")` for #' details. #' #' These [scoped] filtering verbs apply a predicate expression to a #' selection of variables. The predicate expression should be quoted #' with [all_vars()] or [any_vars()] and should mention the pronoun #' `.` to refer to variables. #' #' @inheritParams scoped #' @param .vars_predicate A quoted predicate expression as returned by #' [all_vars()] or [any_vars()]. #' #' Can also be a function or purrr-like formula. In this case, the #' intersection of the results is taken by default and there's #' currently no way to request the union. #' @param .preserve when `FALSE` (the default), the grouping structure #' is recalculated based on the resulting data, otherwise it is kept as is. #' @export #' #' @section Grouping variables: #' #' The grouping variables that are part of the selection are taken #' into account to determine filtered rows. #' #' @keywords internal #' @examples #' # While filter() accepts expressions with specific variables, the #' # scoped filter verbs take an expression with the pronoun `.` and #' # replicate it over all variables. This expression should be quoted #' # with all_vars() or any_vars(): #' all_vars(is.na(.)) #' any_vars(is.na(.)) #' #' #' # You can take the intersection of the replicated expressions: #' filter_all(mtcars, all_vars(. > 150)) #' # -> #' filter(mtcars, if_all(everything(), ~ .x > 150)) #' #' # Or the union: #' filter_all(mtcars, any_vars(. > 150)) #' # -> #' filter(mtcars, if_any(everything(), ~ . > 150)) #' #' #' # You can vary the selection of columns on which to apply the #' # predicate. filter_at() takes a vars() specification: #' filter_at(mtcars, vars(starts_with("d")), any_vars((. %% 2) == 0)) #' # -> #' filter(mtcars, if_any(starts_with("d"), ~ (.x %% 2) == 0)) #' #' # And filter_if() selects variables with a predicate function: #' filter_if(mtcars, ~ all(floor(.) == .), all_vars(. != 0)) #' # -> #' is_int <- function(x) all(floor(x) == x) #' filter(mtcars, if_all(where(is_int), ~ .x != 0)) filter_all <- function(.tbl, .vars_predicate, .preserve = FALSE) { lifecycle::signal_stage("superseded", "filter_all()") syms <- syms(tbl_vars(.tbl)) pred <- apply_filter_syms(.vars_predicate, syms, .tbl) filter(.tbl, !!pred, .preserve = .preserve) } #' @rdname filter_all #' @export filter_if <- function(.tbl, .predicate, .vars_predicate, .preserve = FALSE) { lifecycle::signal_stage("superseded", "filter_if()") syms <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE) pred <- apply_filter_syms(.vars_predicate, syms, .tbl) filter(.tbl, !!pred, .preserve = .preserve) } #' @rdname filter_all #' @export filter_at <- function(.tbl, .vars, .vars_predicate, .preserve = FALSE) { lifecycle::signal_stage("superseded", "filter_at()") syms <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE) pred <- apply_filter_syms(.vars_predicate, syms, .tbl) filter(.tbl, !!pred, .preserve = .preserve) } apply_filter_syms <- function(pred, syms, tbl, error_call = caller_env()) { if (is_empty(syms)) { msg <- glue("`.predicate` must match at least one column.") abort(msg, call = error_call) } joiner <- all_exprs if (inherits_any(pred, c("all_vars", "any_vars"))) { if (inherits(pred, "any_vars")) { joiner <- any_exprs } pred <- map(syms, function(sym) expr_substitute(pred, quote(.), sym)) } else if (is_bare_formula(pred) || is_function(pred)) { pred <- as_function(pred) pred <- map(syms, function(sym) call2(pred, sym)) } else { msg <- glue( "`.vars_predicate` must be a function or a call to `all_vars()` or `any_vars()`, not {obj_type_friendly(pred)}." ) abort(msg, call = error_call) } joiner(!!!pred) } ## Return the union or intersection of predicate expressions. ## ## `all_exprs()` and `any_exprs()` take predicate expressions and join them ## into a single predicate. They assume vectorised expressions by ## default and join them with `&` or `|`. Note that this will also ## work with scalar predicates, but if you want to be explicit you can ## set `.vectorised` to `FALSE` to join by `&&` or `||`. ## ## @param ... Predicate expressions. ## @param .vectorised If `TRUE`, predicates are joined with `&` or ## `|`. Otherwise, they are joined with `&&` or `||`. ## @return A [quosure][rlang::quo]. ## @export ## @examples ## all_exprs(cyl > 3, am == 1) ## any_exprs(cyl > 3, am == 1) ## any_exprs(cyl > 3, am == 1, .vectorised = FALSE) all_exprs <- function(..., .vectorised = TRUE) { op <- if (.vectorised) quote(`&`) else quote(`&&`) quo_reduce(..., .op = op) } ## @rdname all_exprs ## @export any_exprs <- function(..., .vectorised = TRUE) { op <- if (.vectorised) quote(`|`) else quote(`||`) quo_reduce(..., .op = op) } ## @param .op Can be a function or a quoted name of a function. If a ## quoted name, the default environment is the [base ## environment][rlang::base_env] unless you supply a ## [quosure][rlang::quo]. quo_reduce <- function(..., .op) { stopifnot(is_symbol(.op) || is_function(.op)) dots <- enquos(...) if (length(dots) == 1) { return(dots[[1]]) } op_quo <- as_quosure(.op, base_env()) op <- quo_get_expr(op_quo) expr <- reduce(dots, function(x, y) expr((!!op)((!!x), (!!y)))) new_quosure(expr, quo_get_env(op_quo)) } dplyr/R/desc.R0000644000176200001440000000065115106134104012643 0ustar liggesusers#' Descending order #' #' Transform a vector into a format that will be sorted in descending order. #' This is useful within [arrange()]. #' #' @param x vector to transform #' @export #' @examples #' desc(1:10) #' desc(factor(letters)) #' #' first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years") #' desc(first_day) #' #' starwars |> arrange(desc(mass)) desc <- function(x) { obj_check_vector(x) -xtfrm(x) } dplyr/R/colwise-mutate.R0000644000176200001440000003552015106134104014672 0ustar liggesusers#' Summarise multiple columns #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' The [scoped] variants of [summarise()] make it easy to apply the same #' transformation to multiple variables. #' There are three variants. #' * `summarise_all()` affects every variable #' * `summarise_at()` affects variables selected with a character vector or #' vars() #' * `summarise_if()` affects variables selected with a predicate function #' #' @inheritParams scoped #' @param .cols This argument has been renamed to `.vars` to fit #' dplyr's terminology and is deprecated. #' @return A data frame. By default, the newly created columns have the shortest #' names needed to uniquely identify the output. To force inclusion of a name, #' even when not needed, name the input (see examples for details). #' @seealso [The other scoped verbs][scoped], [vars()] #' #' @section Grouping variables: #' #' If applied on a grouped tibble, these operations are *not* applied #' to the grouping variables. The behaviour depends on whether the #' selection is **implicit** (`all` and `if` selections) or #' **explicit** (`at` selections). #' #' * Grouping variables covered by explicit selections in #' `summarise_at()` are always an error. Add `-group_cols()` to the #' [vars()] selection to avoid this: #' #' ``` #' data |> #' summarise_at(vars(-group_cols(), ...), myoperation) #' ``` #' #' Or remove `group_vars()` from the character vector of column names: #' #' ``` #' nms <- setdiff(nms, group_vars(data)) #' data |> summarise_at(nms, myoperation) #' ``` #' #' * Grouping variables covered by implicit selections are silently #' ignored by `summarise_all()` and `summarise_if()`. #' #' @section Naming: #' #' The names of the new columns are derived from the names of the #' input variables and the names of the functions. #' #' - if there is only one unnamed function (i.e. if `.funs` is an unnamed list #' of length one), #' the names of the input variables are used to name the new columns; #' #' - for `_at` functions, if there is only one unnamed variable (i.e., #' if `.vars` is of the form `vars(a_single_column)`) and `.funs` has length #' greater than one, #' the names of the functions are used to name the new columns; #' #' - otherwise, the new names are created by #' concatenating the names of the input variables and the names of the #' functions, separated with an underscore `"_"`. #' #' The `.funs` argument can be a named or unnamed list. #' If a function is unnamed and the name cannot be derived automatically, #' a name of the form "fn#" is used. #' Similarly, [vars()] accepts named and unnamed arguments. #' If a variable in `.vars` is named, a new column by that name will be created. #' #' Name collisions in the new columns are disambiguated using a unique suffix. #' #' @examples #' # The _at() variants directly support strings: #' starwars |> #' summarise_at(c("height", "mass"), mean, na.rm = TRUE) #' # -> #' starwars |> summarise(across(c("height", "mass"), ~ mean(.x, na.rm = TRUE))) #' #' # You can also supply selection helpers to _at() functions but you have #' # to quote them with vars(): #' starwars |> #' summarise_at(vars(height:mass), mean, na.rm = TRUE) #' # -> #' starwars |> #' summarise(across(height:mass, ~ mean(.x, na.rm = TRUE))) #' #' # The _if() variants apply a predicate function (a function that #' # returns TRUE or FALSE) to determine the relevant subset of #' # columns. Here we apply mean() to the numeric columns: #' starwars |> #' summarise_if(is.numeric, mean, na.rm = TRUE) #' starwars |> #' summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) #' #' by_species <- iris |> #' group_by(Species) #' #' # If you want to apply multiple transformations, pass a list of #' # functions. When there are multiple functions, they create new #' # variables instead of modifying the variables in place: #' by_species |> #' summarise_all(list(min, max)) #' # -> #' by_species |> #' summarise(across(everything(), list(min = min, max = max))) #' @export #' @keywords internal summarise_all <- function(.tbl, .funs, ...) { lifecycle::signal_stage("superseded", "summarise_all()") funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), ..., .caller = "summarise_all" ) summarise(.tbl, !!!funs) } #' @rdname summarise_all #' @export summarise_if <- function(.tbl, .predicate, .funs, ...) { lifecycle::signal_stage("superseded", "summarise_if()") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), ..., .caller = "summarise_if" ) summarise(.tbl, !!!funs) } #' @rdname summarise_all #' @export summarise_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) { lifecycle::signal_stage("superseded", "summarise_at()") .vars <- check_dot_cols(.vars, .cols) funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), ..., .caller = "summarise_at" ) summarise(.tbl, !!!funs) } #' @rdname summarise_all #' @export summarize_all <- summarise_all #' @rdname summarise_all #' @export summarize_if <- summarise_if #' @rdname summarise_all #' @export summarize_at <- summarise_at #' Mutate multiple columns #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' The [scoped] variants of [mutate()] and [transmute()] make it easy to apply #' the same transformation to multiple variables. There are three variants: #' * _all affects every variable #' * _at affects variables selected with a character vector or vars() #' * _if affects variables selected with a predicate function: #' #' @inheritParams scoped #' @inheritParams summarise_all #' @return A data frame. By default, the newly created columns have the shortest #' names needed to uniquely identify the output. To force inclusion of a name, #' even when not needed, name the input (see examples for details). #' @seealso [The other scoped verbs][scoped], [vars()] #' #' @section Grouping variables: #' #' If applied on a grouped tibble, these operations are *not* applied #' to the grouping variables. The behaviour depends on whether the #' selection is **implicit** (`all` and `if` selections) or #' **explicit** (`at` selections). #' #' * Grouping variables covered by explicit selections in #' `mutate_at()` and `transmute_at()` are always an error. Add #' `-group_cols()` to the [vars()] selection to avoid this: #' #' ``` #' data |> mutate_at(vars(-group_cols(), ...), myoperation) #' ``` #' #' Or remove `group_vars()` from the character vector of column names: #' #' ``` #' nms <- setdiff(nms, group_vars(data)) #' data |> mutate_at(vars, myoperation) #' ``` #' #' * Grouping variables covered by implicit selections are ignored by #' `mutate_all()`, `transmute_all()`, `mutate_if()`, and #' `transmute_if()`. #' #' @inheritSection summarise_all Naming #' #' @examples #' iris <- as_tibble(iris) #' #' # All variants can be passed functions and additional arguments, #' # purrr-style. The _at() variants directly support strings. Here #' # we'll scale the variables `height` and `mass`: #' scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm) #' starwars |> mutate_at(c("height", "mass"), scale2) #' # -> #' starwars |> mutate(across(c("height", "mass"), scale2)) #' #' # You can pass additional arguments to the function: #' starwars |> mutate_at(c("height", "mass"), scale2, na.rm = TRUE) #' starwars |> mutate_at(c("height", "mass"), ~scale2(., na.rm = TRUE)) #' # -> #' starwars |> mutate(across(c("height", "mass"), ~ scale2(.x, na.rm = TRUE))) #' #' # You can also supply selection helpers to _at() functions but you have #' # to quote them with vars(): #' iris |> mutate_at(vars(matches("Sepal")), log) #' iris |> mutate(across(matches("Sepal"), log)) #' #' # The _if() variants apply a predicate function (a function that #' # returns TRUE or FALSE) to determine the relevant subset of #' # columns. Here we divide all the numeric columns by 100: #' starwars |> mutate_if(is.numeric, scale2, na.rm = TRUE) #' starwars |> mutate(across(where(is.numeric), ~ scale2(.x, na.rm = TRUE))) #' #' # mutate_if() is particularly useful for transforming variables from #' # one type to another #' iris |> mutate_if(is.factor, as.character) #' iris |> mutate_if(is.double, as.integer) #' # -> #' iris |> mutate(across(where(is.factor), as.character)) #' iris |> mutate(across(where(is.double), as.integer)) #' #' # Multiple transformations ---------------------------------------- #' #' # If you want to apply multiple transformations, pass a list of #' # functions. When there are multiple functions, they create new #' # variables instead of modifying the variables in place: #' iris |> mutate_if(is.numeric, list(scale2, log)) #' iris |> mutate_if(is.numeric, list(~scale2(.), ~log(.))) #' iris |> mutate_if(is.numeric, list(scale = scale2, log = log)) #' # -> #' iris |> #' as_tibble() |> #' mutate(across(where(is.numeric), list(scale = scale2, log = log))) #' #' # When there's only one function in the list, it modifies existing #' # variables in place. Give it a name to instead create new variables: #' iris |> mutate_if(is.numeric, list(scale2)) #' iris |> mutate_if(is.numeric, list(scale = scale2)) #' @export #' @keywords internal mutate_all <- function(.tbl, .funs, ...) { lifecycle::signal_stage("superseded", "mutate_all()") check_grouped(.tbl, "mutate", "all", alt = TRUE) funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), ..., .caller = "mutate_all" ) mutate(.tbl, !!!funs) } #' @rdname mutate_all #' @export mutate_if <- function(.tbl, .predicate, .funs, ...) { lifecycle::signal_stage("superseded", "mutate_if()") check_grouped(.tbl, "mutate", "if") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), ..., .caller = "mutate_if" ) mutate(.tbl, !!!funs) } #' @rdname mutate_all #' @export mutate_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) { lifecycle::signal_stage("superseded", "mutate_at()") .vars <- check_dot_cols(.vars, .cols) funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "mutate_at" ) mutate(.tbl, !!!funs) } #' @rdname mutate_all #' @export transmute_all <- function(.tbl, .funs, ...) { lifecycle::signal_stage("superseded", "transmute_all()") check_grouped(.tbl, "transmute", "all", alt = TRUE) funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), ..., .caller = "transmute_all" ) transmute(.tbl, !!!funs) } #' @rdname mutate_all #' @export transmute_if <- function(.tbl, .predicate, .funs, ...) { lifecycle::signal_stage("superseded", "transmute_if()") check_grouped(.tbl, "transmute", "if") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), ..., .caller = "transmute_if" ) transmute(.tbl, !!!funs) } #' @rdname mutate_all #' @export transmute_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) { lifecycle::signal_stage("superseded", "transmute_at()") .vars <- check_dot_cols(.vars, .cols) funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "transmute_at" ) transmute(.tbl, !!!funs) } # Helpers ----------------------------------------------------------------- manip_all <- function( .tbl, .funs, .quo, .env, ..., .include_group_vars = FALSE, .caller, error_call = caller_env() ) { if (.include_group_vars) { syms <- syms(tbl_vars(.tbl)) } else { syms <- syms(tbl_nongroup_vars(.tbl)) } funs <- as_fun_list( .funs, .env, ..., .caller = .caller, error_call = error_call, .user_env = caller_env(2) ) manip_apply_syms(funs, syms, .tbl) } manip_if <- function( .tbl, .predicate, .funs, .quo, .env, ..., .include_group_vars = FALSE, .caller, error_call = caller_env() ) { vars <- tbl_if_syms( .tbl, .predicate, .env, .include_group_vars = .include_group_vars, error_call = error_call ) funs <- as_fun_list( .funs, .env, ..., .caller = .caller, error_call = error_call, .user_env = caller_env(2) ) manip_apply_syms(funs, vars, .tbl) } manip_at <- function( .tbl, .vars, .funs, .quo, .env, ..., .include_group_vars = FALSE, .caller, error_call = caller_env() ) { syms <- tbl_at_syms( .tbl, .vars, .include_group_vars = .include_group_vars, error_call = error_call ) funs <- as_fun_list( .funs, .env, ..., .caller = .caller, error_call = error_call, .user_env = caller_env(2) ) manip_apply_syms(funs, syms, .tbl) } check_grouped <- function(tbl, verb, suffix, alt = FALSE) { if (is_grouped_df(tbl)) { if (alt) { alt_line <- sprintf( "Use `%s_at(df, vars(-group_cols()), myoperation)` to silence the message.", verb ) } else { alt_line <- chr() } inform(c( sprintf( "`%s_%s()` ignored the following grouping variables:", verb, suffix ), set_names(fmt_cols(group_vars(tbl)), "*"), "i" = alt_line )) } } check_dot_cols <- function(vars, cols) { if (is_null(cols)) { vars } else { inform("`.cols` has been renamed and is deprecated, please use `.vars`") if (missing(vars)) cols else vars } } manip_apply_syms <- function(funs, syms, tbl) { out <- vector("list", length(syms) * length(funs)) dim(out) <- c(length(syms), length(funs)) syms_position <- match(as.character(syms), tbl_vars(tbl)) for (i in seq_along(syms)) { pos <- syms_position[i] for (j in seq_along(funs)) { fun <- funs[[j]] if (is_quosure(fun)) { out[[i, j]] <- expr_substitute(funs[[j]], quote(.), syms[[i]]) } else { out[[i, j]] <- call2(funs[[j]], syms[[i]]) } attr(out[[i, j]], "position") <- pos } } dim(out) <- NULL # Use symbols as default names unnamed <- !have_name(syms) names(syms)[unnamed] <- map_chr(syms[unnamed], as_string) if (length(funs) == 1 && !attr(funs, "have_name")) { names(out) <- names(syms) } else { nms <- names(funs) %||% rep("", length(funs)) is_fun <- nms == "" | nms == "" nms[is_fun] <- paste0("fn", seq_len(sum(is_fun))) nms <- unique_names(nms, quiet = TRUE) names(funs) <- nms if (length(syms) == 1 && all(unnamed)) { names(out) <- names(funs) } else { syms_names <- ifelse(unnamed, map_chr(syms, as_string), names(syms)) grid <- expand.grid(var = syms_names, call = names(funs)) names(out) <- paste(grid$var, grid$call, sep = "_") } } out } dplyr/R/n-distinct.R0000644000176200001440000000260314366556340014021 0ustar liggesusers#' Count unique combinations #' #' `n_distinct()` counts the number of unique/distinct combinations in a set #' of one or more vectors. It's a faster and more concise equivalent to #' `nrow(unique(data.frame(...)))`. #' #' @param ... Unnamed vectors. If multiple vectors are supplied, then they should #' have the same length. #' @param na.rm If `TRUE`, exclude missing observations from the count. #' If there are multiple vectors in `...`, an observation will #' be excluded if _any_ of the values are missing. #' @returns A single number. #' @export #' @examples #' x <- c(1, 1, 2, 2, 2) #' n_distinct(x) #' #' y <- c(3, 3, NA, 3, 3) #' n_distinct(y) #' n_distinct(y, na.rm = TRUE) #' #' # Pairs (1, 3), (2, 3), and (2, NA) are distinct #' n_distinct(x, y) #' #' # (2, NA) is dropped, leaving 2 distinct combinations #' n_distinct(x, y, na.rm = TRUE) #' #' # Also works with data frames #' n_distinct(data.frame(x, y)) n_distinct <- function(..., na.rm = FALSE) { if (missing(...)) { cli::cli_abort("{.arg ...} is absent, but must be supplied.") } check_dots_unnamed() data <- df_list( ..., .unpack = FALSE, .name_repair = "minimal", .error_call = current_env() ) data <- new_data_frame(data) if (isTRUE(na.rm)) { # Drop observation if *any* missing complete <- vec_detect_complete(data) data <- vec_slice(data, complete) } vec_unique_count(data) } dplyr/R/defunct-lazyeval.R0000644000176200001440000000655115137161765015230 0ustar liggesusers#' Defunct standard evaluation functions #' #' @description #' `r lifecycle::badge("defunct")` #' #' dplyr used to offer twin versions of each verb suffixed with an #' underscore. These versions had standard evaluation (SE) semantics: #' rather than taking arguments by code, like NSE verbs, they took #' arguments by value. Their purpose was to make it possible to #' program with dplyr. However, dplyr now uses tidy evaluation #' semantics. NSE verbs still capture their arguments, but you can now #' unquote parts of these arguments. This offers full programmability #' with NSE verbs. Thus, the underscored versions are now superfluous. #' #' Unquoting triggers immediate evaluation of its operand and inlines #' the result within the captured expression. This result can be a #' value or an expression to be evaluated later with the rest of the #' argument. See `vignette("programming")` for more information. #' #' @name defunct-lazyeval #' @keywords internal NULL lazy_defunct <- function(fun, hint = TRUE) { lifecycle::deprecate_stop( when = "0.7.0", what = paste0(fun, "_()"), with = paste0(fun, "()"), details = if (hint) "See vignette('programming') for more help" ) } #' @rdname defunct-lazyeval #' @export add_count_ <- function(x, vars, wt = NULL, sort = FALSE) { lazy_defunct("add_count") } #' @rdname defunct-lazyeval #' @export add_tally_ <- function(x, wt, sort = FALSE) { lazy_defunct("add_tally") } #' @export #' @rdname defunct-lazyeval arrange_ <- function(.data, ..., .dots = list()) { lazy_defunct("arrange") } #' @export #' @rdname defunct-lazyeval count_ <- function( x, vars, wt = NULL, sort = FALSE, .drop = group_by_drop_default(x) ) { lazy_defunct("count") } #' @export #' @rdname defunct-lazyeval distinct_ <- function(.data, ..., .dots, .keep_all = FALSE) { lazy_defunct("distinct") } #' @export #' @rdname defunct-lazyeval do_ <- function(.data, ..., .dots = list()) { lazy_defunct("do") } #' @export #' @rdname defunct-lazyeval filter_ <- function(.data, ..., .dots = list()) { lazy_defunct("filter") } #' @export #' @rdname defunct-lazyeval funs_ <- function(dots, args = list(), env = base_env()) { lazy_defunct("funs") } #' @export #' @rdname defunct-lazyeval group_by_ <- function(.data, ..., .dots = list(), add = FALSE) { lazy_defunct("group_by") } #' @export #' @rdname defunct-lazyeval group_indices_ <- function(.data, ..., .dots = list()) { lazy_defunct("group_indices", hint = FALSE) } #' @export #' @rdname defunct-lazyeval mutate_ <- function(.data, ..., .dots = list()) { lazy_defunct("mutate") } #' @rdname defunct-lazyeval #' @export tally_ <- function(x, wt, sort = FALSE) { lazy_defunct("tally") } #' @rdname defunct-lazyeval #' @export transmute_ <- function(.data, ..., .dots = list()) { lazy_defunct("transmute") } #' @rdname defunct-lazyeval #' @export rename_ <- function(.data, ..., .dots = list()) { lazy_defunct("rename", hint = FALSE) } #' @export #' @rdname defunct-lazyeval select_ <- function(.data, ..., .dots = list()) { lazy_defunct("select", hint = FALSE) } #' @export #' @rdname defunct-lazyeval slice_ <- function(.data, ..., .dots = list()) { lazy_defunct("slice", hint = FALSE) } #' @export #' @rdname defunct-lazyeval summarise_ <- function(.data, ..., .dots = list()) { lazy_defunct("summarise", hint = FALSE) } #' @rdname defunct-lazyeval #' @export summarize_ <- summarise_ dplyr/R/join-rows.R0000644000176200001440000003265515106134104013665 0ustar liggesusersjoin_rows <- function( x_key, y_key, ..., type = c("inner", "left", "right", "full", "semi", "anti", "nest"), na_matches = "na", condition = "==", filter = "none", cross = FALSE, multiple = "all", unmatched = "drop", relationship = NULL, error_call = caller_env(), user_env = caller_env() ) { check_dots_empty0(...) type <- arg_match0( arg = type, values = c("inner", "left", "right", "full", "semi", "anti", "nest"), error_call = error_call ) unmatched <- check_unmatched(unmatched, type, error_call = error_call) x_unmatched <- unmatched$x y_unmatched <- unmatched$y if (cross) { # TODO: Remove this section when `by = character()` is defunct # Rather than matching on key values, match on a proxy where every x value # matches every y value. This purposefully does not propagate missings, as # missing values aren't considered in a cross-join. x_key <- vec_rep(1L, times = vec_size(x_key)) y_key <- vec_rep(1L, times = vec_size(y_key)) condition <- "==" filter <- "none" } if (is_null(relationship)) { relationship <- compute_join_relationship( type, condition, cross, user_env = user_env ) } else { relationship <- check_join_relationship( relationship, error_call = error_call ) } incomplete <- standardise_join_incomplete(type, na_matches, x_unmatched) no_match <- standardise_join_no_match(type, x_unmatched) remaining <- standardise_join_remaining(type, y_unmatched) matches <- dplyr_locate_matches( needles = x_key, haystack = y_key, condition = condition, filter = filter, incomplete = incomplete, no_match = no_match, remaining = remaining, multiple = multiple, relationship = relationship, needles_arg = "x", haystack_arg = "y", error_call = error_call ) list(x = matches$needles, y = matches$haystack) } dplyr_locate_matches <- function( needles, haystack, ..., condition = "==", filter = "none", incomplete = "compare", no_match = NA_integer_, remaining = "drop", multiple = "all", relationship = "none", needles_arg = "", haystack_arg = "", error_call = caller_env() ) { check_dots_empty0(...) withCallingHandlers( vctrs::vec_locate_matches( needles = needles, haystack = haystack, condition = condition, filter = filter, incomplete = incomplete, no_match = no_match, remaining = remaining, multiple = multiple, relationship = relationship, needles_arg = needles_arg, haystack_arg = haystack_arg, nan_distinct = TRUE ), vctrs_error_incompatible_type = function(cnd) { abort("`join_cast_common()` should have handled this.", .internal = TRUE) }, vctrs_error_matches_overflow = function(cnd) { rethrow_error_join_matches_overflow(cnd, error_call) }, vctrs_error_matches_nothing = function(cnd) { rethrow_error_join_matches_nothing(cnd, error_call) }, vctrs_error_matches_incomplete = function(cnd) { rethrow_error_join_matches_incomplete(cnd, error_call) }, vctrs_error_matches_remaining = function(cnd) { rethrow_error_join_matches_remaining(cnd, error_call) }, vctrs_error_matches_relationship_one_to_one = function(cnd) { rethrow_error_join_relationship_one_to_one(cnd, error_call) }, vctrs_error_matches_relationship_one_to_many = function(cnd) { rethrow_error_join_relationship_one_to_many(cnd, error_call) }, vctrs_error_matches_relationship_many_to_one = function(cnd) { rethrow_error_join_relationship_many_to_one(cnd, error_call) }, vctrs_warning_matches_relationship_many_to_many = function(cnd) { rethrow_warning_join_relationship_many_to_many(cnd, error_call) }, vctrs_error_matches_multiple = function(cnd) { rethrow_error_join_matches_multiple(cnd, error_call) }, vctrs_warning_matches_multiple = function(cnd) { rethrow_warning_join_matches_multiple(cnd, error_call) } ) } rethrow_error_join_matches_overflow <- function(cnd, call) { size <- cnd$size stop_join( message = c( "This join would result in more rows than dplyr can handle.", i = glue( "{size} rows would be returned. ", "2147483647 rows is the maximum number allowed." ), i = paste0( "Double check your join keys. This error commonly occurs due to a ", "missing join key, or an improperly specified join condition." ) ), class = "dplyr_error_join_matches_overflow", call = call ) } rethrow_error_join_matches_nothing <- function(cnd, call) { i <- cnd$i stop_join( message = c( "Each row of `x` must have a match in `y`.", i = glue("Row {i} of `x` does not have a match.") ), class = "dplyr_error_join_matches_nothing", call = call ) } rethrow_error_join_matches_incomplete <- function(cnd, call) { # Only occurs with `na_matches = "never", unmatched = "error"` for # right and inner joins, and is a signal that `x` has unmatched incompletes # that would result in dropped rows. So really this is a matched-nothing case. rethrow_error_join_matches_nothing(cnd, call) } rethrow_error_join_matches_remaining <- function(cnd, call) { i <- cnd$i stop_join( message = c( "Each row of `y` must be matched by `x`.", i = glue("Row {i} of `y` was not matched.") ), class = "dplyr_error_join_matches_remaining", call = call ) } rethrow_error_join_relationship_one_to_one <- function(cnd, call) { i <- cnd$i which <- cnd$which if (which == "needles") { x_name <- "x" y_name <- "y" } else { x_name <- "y" y_name <- "x" } stop_join_matches_multiple( i = i, x_name = x_name, y_name = y_name, class = "dplyr_error_join_relationship_one_to_one", call = call ) } rethrow_error_join_relationship_one_to_many <- function(cnd, call) { stop_join_matches_multiple( i = cnd$i, x_name = "y", y_name = "x", class = "dplyr_error_join_relationship_one_to_many", call = call ) } rethrow_error_join_relationship_many_to_one <- function(cnd, call) { stop_join_matches_multiple( i = cnd$i, x_name = "x", y_name = "y", class = "dplyr_error_join_relationship_many_to_one", call = call ) } rethrow_warning_join_relationship_many_to_many <- function(cnd, call) { i <- cnd$i j <- cnd$j warn_join( message = c( "Detected an unexpected many-to-many relationship between `x` and `y`.", i = glue("Row {i} of `x` matches multiple rows in `y`."), i = glue("Row {j} of `y` matches multiple rows in `x`."), i = paste0( "If a many-to-many relationship is expected, ", "set `relationship = \"many-to-many\"` to silence this warning." ) ), class = "dplyr_warning_join_relationship_many_to_many", call = call ) # Cancel `cnd` maybe_restart("muffleWarning") } rethrow_error_join_matches_multiple <- function(cnd, call) { stop_join_matches_multiple( i = cnd$i, x_name = "x", y_name = "y", class = "dplyr_error_join_matches_multiple", call = call ) } rethrow_warning_join_matches_multiple <- function(cnd, call) { i <- cnd$i warn_join( message = c( glue("Each row in `x` is expected to match at most 1 row in `y`."), i = glue("Row {i} of `x` matches multiple rows.") ), class = "dplyr_warning_join_matches_multiple", call = call ) # Cancel `cnd` maybe_restart("muffleWarning") } stop_join_matches_multiple <- function(i, x_name, y_name, class, call) { stop_join( message = c( glue("Each row in `{x_name}` must match at most 1 row in `{y_name}`."), i = glue("Row {i} of `{x_name}` matches multiple rows in `{y_name}`.") ), class = class, call = call ) } stop_join <- function(message = NULL, class = NULL, ..., call = caller_env()) { stop_dplyr( message = message, class = c(class, "dplyr_error_join"), ..., call = call ) } warn_join <- function(message = NULL, class = NULL, ...) { warn_dplyr(message = message, class = c(class, "dplyr_warning_join"), ...) } stop_dplyr <- function(message = NULL, class = NULL, ..., call = caller_env()) { abort(message = message, class = c(class, "dplyr_error"), ..., call = call) } warn_dplyr <- function(message = NULL, class = NULL, ...) { warn(message = message, class = c(class, "dplyr_warning"), ...) } check_unmatched <- function(unmatched, type, error_call = caller_env()) { # Inner joins check both `x` and `y` for unmatched keys, so `unmatched` is # allowed to be a character vector of size 2 in that case to check `x` and `y` # independently inner <- type == "inner" n_unmatched <- length(unmatched) if (n_unmatched == 1L || (n_unmatched == 2L && inner)) { arg_match( arg = unmatched, values = c("drop", "error"), multiple = TRUE, error_arg = "unmatched", error_call = error_call ) } else if (inner) { cli::cli_abort( "{.arg unmatched} must be length 1 or 2, not {n_unmatched}.", call = error_call ) } else { cli::cli_abort( "{.arg unmatched} must be length 1, not {n_unmatched}.", call = error_call ) } if (n_unmatched == 1L) { list(x = unmatched, y = unmatched) } else { list(x = unmatched[[1L]], y = unmatched[[2L]]) } } standardise_join_incomplete <- function(type, na_matches, x_unmatched) { if (na_matches == "na") { # Comparing missings in incomplete observations overrides the other arguments "compare" } else if (x_unmatched == "error" && (type == "right" || type == "inner")) { # Ensure that `x` can't drop rows when `na_matches = "never"` "error" } else if (type == "inner" || type == "right" || type == "semi") { # With these joins and `na_matches = "never"`, drop missings from `x` "drop" } else if (type == "nest") { # Nest join is special and returns `0` which will be sliced out later 0L } else { # Otherwise we are keeping all keys from `x` NA_integer_ } } standardise_join_no_match <- function(type, x_unmatched) { if (x_unmatched == "error" && (type == "right" || type == "inner")) { # Ensure that `x` can't drop rows "error" } else if (type == "inner" || type == "right" || type == "semi") { # With these joins, unmatched keys in `x` get dropped "drop" } else if (type == "nest") { # Nest join is special and returns `0` which will be sliced out later 0L } else { # Otherwise we are keeping all keys from `x` NA_integer_ } } standardise_join_remaining <- function(type, y_unmatched) { if ( y_unmatched == "error" && (type == "left" || type == "inner" || type == "nest") ) { # Ensure that `y` can't drop rows "error" } else if (type == "right" || type == "full") { # With these joins, unmatched keys in `y` are kept NA_integer_ } else { # Otherwise we drop unmatched keys in `y` "drop" } } compute_join_relationship <- function( type, condition, cross, user_env = caller_env(2) ) { if (type == "nest") { # Not unreasonable to see a many-to-many relationship here, but it can't # result in a Cartesian explosion in the result so we don't check for it return("none") } if (type %in% c("semi", "anti")) { # Impossible to generate a many-to-many relationship here because we set # `multiple = "any"` return("none") } if (cross) { # TODO: Remove when `by = character()` is defunct # Cross-joins always result in many-to-many relationships return("none") } any_inequality <- any(condition != "==") if (any_inequality) { # We only check for a many-to-many relationship when doing an equality join, # because that is where it is typically unexpected. # - Inequality and overlap joins often generate many-to-many relationships # by nature # - Rolling joins are a little trickier, but we've decided that not warning # is probably easier to explain. `relationship = "many-to-one"` can always # be used explicitly as needed. return("none") } if (!is_direct(user_env)) { # Indirect calls don't warn, because the caller is unlikely to have access # to `relationship` to silence it return("none") } "warn-many-to-many" } check_join_relationship <- function(relationship, error_call = caller_env()) { arg_match0( arg = relationship, values = c("one-to-one", "one-to-many", "many-to-one", "many-to-many"), error_call = error_call ) } # ------------------------------------------------------------------------------ # TODO: Use upstream function when exported from rlang # `lifecycle:::is_direct()` is_direct <- function(env) { env_inherits_global(env) || from_testthat(env) } env_inherits_global <- function(env) { # `topenv(emptyenv())` returns the global env. Return `FALSE` in # that case to allow passing the empty env when the # soft-deprecation should not be promoted to deprecation based on # the caller environment. if (is_reference(env, empty_env())) { return(FALSE) } is_reference(topenv(env), global_env()) } # TRUE if we are in unit tests and the package being tested is the # same as the package that called from_testthat <- function(env) { tested_package <- Sys.getenv("TESTTHAT_PKG") if (!nzchar(tested_package)) { return(FALSE) } top <- topenv(env) if (!is_namespace(top)) { return(FALSE) } # Test for environment names rather than reference/contents because # testthat clones the namespace identical(ns_env_name(top), tested_package) } dplyr/R/data-bands.R0000644000176200001440000000141313663216626013740 0ustar liggesusers#' Band membership #' #' These data sets describe band members of the Beatles and Rolling Stones. They #' are toy data sets that can be displayed in their entirety on a slide (e.g. to #' demonstrate a join). #' #' `band_instruments` and `band_instruments2` contain the same data but use #' different column names for the first column of the data set. #' `band_instruments` uses `name`, which matches the name of the key column of #' `band_members`; `band_instruments2` uses `artist`, which does not. #' #' @format Each is a tibble with two variables and three observations #' @examples #' band_members #' band_instruments #' band_instruments2 "band_members" #' @rdname band_members #' @format NULL "band_instruments" #' @rdname band_members #' @format NULL "band_instruments2" dplyr/R/case-when.R0000644000176200001440000004550115137161765013623 0ustar liggesusers#' A general vectorised if-else #' #' @description #' `case_when()` and `replace_when()` are two forms of vectorized [if_else()]. #' They work by evaluating each case sequentially and using the first match for #' each element to determine the corresponding value in the output vector. #' #' - Use `case_when()` when creating an entirely new vector. #' #' - Use `replace_when()` when partially updating an existing vector. #' #' If you are just replacing a few values within an existing vector, then #' `replace_when()` is always a better choice because it is type stable, size #' stable, pipes better, and better expresses intent. #' #' A major difference between the two functions is what happens when no cases #' match: #' #' - `case_when()` falls through to a `.default` as a final "else" statement. #' #' - `replace_when()` retains the original values from `x`. #' #' See `vignette("recoding-replacing")` for more examples. #' #' @param x A vector. #' #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> A sequence of two-sided #' formulas. The left hand side (LHS) determines which values match this case. #' The right hand side (RHS) provides the replacement value. #' #' For `case_when()`: #' #' - The LHS inputs must be logical vectors. For backwards compatibility, #' scalars are [recycled][vctrs::theory-faq-recycling], but we no longer #' recommend supplying scalars. #' #' - The RHS inputs will be [cast][vctrs::theory-faq-coercion] to their common #' type, and will be [recycled][vctrs::theory-faq-recycling] to the common #' size of the LHS inputs. #' #' For `replace_when()`: #' #' - The LHS inputs must be logical vectors the same size as `x`. #' #' - The RHS inputs will be [cast][vctrs::theory-faq-coercion] to the type of #' `x` and [recycled][vctrs::theory-faq-recycling] to the size of `x`. #' #' `NULL` inputs are ignored. #' #' @param .default The value used when all of the LHS inputs return either #' `FALSE` or `NA`. #' #' - If `NULL`, the default, a missing value will be used. #' #' - If provided, `.default` will follow the same type and size rules as the #' RHS inputs. #' #' `NA` values in the LHS conditions are treated like `FALSE`, meaning that #' the result at those locations will be assigned the `.default` value. To #' handle missing values in the conditions differently, you must explicitly #' catch them with another condition before they fall through to the #' `.default`. This typically involves some variation of `is.na(x) ~ value` #' tailored to your usage of `case_when()`. #' #' @param .unmatched Handling of unmatched locations. #' #' One of: #' #' - `"default"` to use `.default` in unmatched locations. #' #' - `"error"` to error when there are unmatched locations. #' #' @param .ptype An optional prototype declaring the desired output type. If #' supplied, this overrides the common type of the RHS inputs. #' #' @param .size An optional size declaring the desired output size. If supplied, #' this overrides the common size computed from the LHS inputs. #' #' @returns #' For `case_when()`, a new vector where the size is the common size of the LHS #' inputs, the type is the common type of the RHS inputs, and the names #' correspond to the names of the RHS elements used in the result. #' #' For `replace_when()`, an updated version of `x`, with the same size, type, #' and names as `x`. #' #' @seealso [recode_values()], [vctrs::vec_case_when()] #' #' @name case-and-replace-when #' #' @examples #' x <- 1:70 #' case_when( #' x %% 35 == 0 ~ "fizz buzz", #' x %% 5 == 0 ~ "fizz", #' x %% 7 == 0 ~ "buzz", #' .default = as.character(x) #' ) #' #' # Like an if statement, the arguments are evaluated in order, so you must #' # proceed from the most specific to the most general. This won't work: #' case_when( #' x %% 5 == 0 ~ "fizz", #' x %% 7 == 0 ~ "buzz", #' x %% 35 == 0 ~ "fizz buzz", #' .default = as.character(x) #' ) #' #' # If none of the cases match and no `.default` is supplied, NA is used: #' case_when( #' x %% 35 == 0 ~ "fizz buzz", #' x %% 5 == 0 ~ "fizz", #' x %% 7 == 0 ~ "buzz" #' ) #' #' # Note that `NA` values on the LHS are treated like `FALSE` and will be #' # assigned the `.default` value. You must handle them explicitly if you #' # want to use a different value. The exact way to handle missing values is #' # dependent on the set of LHS conditions you use. #' x[2:4] <- NA_real_ #' case_when( #' x %% 35 == 0 ~ "fizz buzz", #' x %% 5 == 0 ~ "fizz", #' x %% 7 == 0 ~ "buzz", #' is.na(x) ~ "nope", #' .default = as.character(x) #' ) #' #' # `case_when()` is not a replacement for basic if/else control flow. When #' # you have a single scalar condition, using if/else is faster, simpler to #' # reason about, and is lazy on the branch that isn't run. For example, this #' # seems to work: #' x <- "value" #' case_when(is.character(x) ~ x, .default = "not-a-character") #' #' # Until `x` is a non-character type #' x <- 1 #' try(case_when(is.character(x) ~ x, .default = "not-a-character")) #' #' # Instead, you should use if/else #' if (is.character(x)) { #' y <- x #' } else { #' y <- "not-a-character" #' } #' y #' #' # If you believe that you've covered every possible case, then set #' # `.unmatched = "error"` rather than supplying a `.default`. This adds an #' # extra layer of safety to `case_when()` and is particularly useful when you #' # have a series of complex expressions! #' set.seed(123) #' x <- sample(50) #' #' # Oops, we forgot to handle `50` #' try(case_when( #' x < 10 ~ "ten", #' x < 20 ~ "twenty", #' x < 30 ~ "thirty", #' x < 40 ~ "forty", #' x < 50 ~ "fifty", #' .unmatched = "error" #' )) #' #' case_when( #' x < 10 ~ "ten", #' x < 20 ~ "twenty", #' x < 30 ~ "thirty", #' x < 40 ~ "forty", #' x <= 50 ~ "fifty", #' .unmatched = "error" #' ) #' #' # Note that `NA` is considered unmatched and must be handled with its own #' # explicit case, even if that case just propagates the missing value! #' x[c(2, 5)] <- NA #' #' case_when( #' x < 10 ~ "ten", #' x < 20 ~ "twenty", #' x < 30 ~ "thirty", #' x < 40 ~ "forty", #' x <= 50 ~ "fifty", #' is.na(x) ~ NA, #' .unmatched = "error" #' ) #' #' # `replace_when()` is useful when you're updating an existing vector, #' # rather than creating an entirely new one. Note the so-far unused "puppy" #' # factor level: #' pets <- tibble( #' name = c("Max", "Bella", "Chuck", "Luna", "Cooper"), #' type = factor( #' c("dog", "dog", "cat", "dog", "cat"), #' levels = c("dog", "cat", "puppy") #' ), #' age = c(1, 3, 5, 2, 4) #' ) #' #' # We can replace some values with `"puppy"` based on arbitrary conditions. #' # Even though we are using a character `"puppy"` value, `replace_when()` will #' # automatically cast it to the factor type of `type` for us. #' pets |> #' mutate( #' type = replace_when(type, type == "dog" & age <= 2 ~ "puppy") #' ) #' #' # Compare that with this `case_when()` call, which loses the factor class. #' # It's always better to use `replace_when()` when updating a few values in #' # an existing vector! #' pets |> #' mutate( #' type = case_when(type == "dog" & age <= 2 ~ "puppy", .default = type) #' ) #' #' # `case_when()` and `replace_when()` evaluate all RHS expressions, and then #' # construct their result by extracting the selected (via the LHS expressions) #' # parts. For example, `NaN`s are produced here because `sqrt(y)` is evaluated #' # on all of `y`, not just where `y >= 0`. #' y <- seq(-2, 2, by = .5) #' replace_when(y, y >= 0 ~ sqrt(y)) #' #' # These functions are particularly useful inside `mutate()` when you want to #' # create a new variable that relies on a complex combination of existing #' # variables #' starwars |> #' select(name:mass, gender, species) |> #' mutate( #' type = case_when( #' height > 200 | mass > 200 ~ "large", #' species == "Droid" ~ "robot", #' .default = "other" #' ) #' ) #' #' # `case_when()` is not a tidy eval function. If you'd like to reuse #' # the same patterns, extract the `case_when()` call into a normal #' # function: #' case_character_type <- function(height, mass, species) { #' case_when( #' height > 200 | mass > 200 ~ "large", #' species == "Droid" ~ "robot", #' .default = "other" #' ) #' } #' #' case_character_type(150, 250, "Droid") #' case_character_type(150, 150, "Droid") #' #' # Such functions can be used inside `mutate()` as well: #' starwars |> #' mutate(type = case_character_type(height, mass, species)) |> #' pull(type) #' #' # `case_when()` ignores `NULL` inputs. This is useful when you'd #' # like to use a pattern only under certain conditions. Here we'll #' # take advantage of the fact that `if` returns `NULL` when there is #' # no `else` clause: #' case_character_type <- function(height, mass, species, robots = TRUE) { #' case_when( #' height > 200 | mass > 200 ~ "large", #' if (robots) species == "Droid" ~ "robot", #' .default = "other" #' ) #' } #' #' starwars |> #' mutate(type = case_character_type(height, mass, species, robots = FALSE)) |> #' pull(type) #' #' # `replace_when()` can also be used in combination with `pick()` to #' # conditionally mutate rows within multiple columns using a single condition. #' # Here `replace_when()` returns a data frame with new `species` and `name` #' # columns, which `mutate()` then automatically unpacks. #' starwars |> #' select(homeworld, species, name) |> #' mutate(replace_when( #' pick(species, name), #' homeworld == "Tatooine" ~ tibble( #' species = "Tatooinese", #' name = paste(name, "(Tatooine)") #' ) #' )) NULL #' @rdname case-and-replace-when #' @export case_when <- function( ..., .default = NULL, .unmatched = "default", .ptype = NULL, .size = NULL ) { args <- eval_formulas(..., allow_empty_dots = FALSE) conditions <- args$lhs values <- args$rhs .size <- case_when_size_common( conditions = conditions, values = values, size = .size ) # Only recycle `conditions`. Expect that `vec_case_when()` requires all # `conditions` to be the same size, but can efficiently recycle `values` at # the C level without extra allocations. conditions <- vec_recycle_common(!!!conditions, .size = .size) vec_case_when( conditions = conditions, values = values, default = .default, unmatched = .unmatched, ptype = .ptype, size = .size, conditions_arg = "", values_arg = "", default_arg = ".default", error_call = current_env() ) } #' @rdname case-and-replace-when #' @export replace_when <- function(x, ...) { check_dots_unnamed() args <- eval_formulas(..., allow_empty_dots = TRUE) conditions <- args$lhs values <- args$rhs vec_replace_when( x = x, conditions = conditions, values = values, x_arg = "x", conditions_arg = "", values_arg = "", error_call = current_env() ) } # Size common computation for `case_when()` # # `case_when()`'s formula interface historically finds the common size of ALL # inputs. This is not good, ideally it would force all LHS inputs to have the # same size (with no recycling), and then recycle all RHS inputs to that size # inferred from the LHS. That is how `vec_case_when()` works. # # We can't change this easily for two reasons: # # - `TRUE ~` must continue to work for legacy reasons, so at the very least all # LHS inputs must be recycled against each other. We are okay with this. # # - Many packages (60+) use `case_when()` with scalar LHSs but vector RHSs, # requiring that all inputs by recycled against each other. This usage should # be replaced with a series of if statements. This is a highly inefficient use # of `case_when()` because each scalar LHS has to be recycled to the size # determined from the RHS, which is a big waste of memory and time. This # behavior can also allow real bugs to slip through silently (#7082), which is # bad. To combat this case, we specially detect this and throw a deprecation # warning. # # There are four cases to consider: # # 1. `size_conditions == 1, size_values == 1` # # Fine, use size 1 # # 2. `size_conditions == 1, size_values != 1` # # Use `size_values` for historical reasons, but warn against this. This is # people doing off-label usage of `case_when()` when they should be using a # series of if statements. # # 3. `size_conditions != 1, size_values == 1` # # Fine, use `size_conditions` # # 4. `size_conditions != 1, size_values != 1` # # If `size_conditions == size_values`, good to go, else throw an error by # recalling `vec_size_common()` with all inputs. case_when_size_common <- function( conditions, values, size, ..., user_env = caller_env(2), error_call = caller_env() ) { # These error if there are any size incompatibilites within LHS and RHS inputs, # but not across LHS and RHS inputs size_conditions <- vec_size_common( !!!conditions, .size = size, .absent = -1L, .call = error_call ) size_values <- vec_size_common( !!!values, .size = size, .absent = -1L, .call = error_call ) # Annoying handling of the edge case of all `NULL` elements in the formulas. # `vec_case_when()` will error on these and require vector inputs, but we have # to get to it first (#7794). if (size_conditions == -1L || size_values == -1L) { if (size_conditions != -1L) { # All `values` are `NULL`, but at least 1 `conditions` is non-`NULL` return(size_conditions) } else if (size_values != -1L) { # All `conditions` are `NULL`, but at least 1 `values` is non-`NULL` return(size_values) } else { # All `conditions` and `values` are `NULL` return(0L) } } if (size_conditions == 1L && size_values == 1L) { return(1L) } if (size_conditions == 1L && size_values != 1L) { warn_case_when_scalar_lhs_vector_rhs( env = error_call, user_env = user_env ) return(size_values) } if (size_conditions != 1L && size_values == 1L) { return(size_conditions) } if (size_conditions != 1L && size_values != 1L) { if (size_conditions == size_values) { return(size_conditions) } # Errors vec_size_common( !!!conditions, !!!values, .size = size, .call = error_call ) abort("`vec_size_common()` should have errored.", .internal = TRUE) } abort("All cases should have been covered.", .internal = TRUE) } warn_case_when_scalar_lhs_vector_rhs <- function( env, user_env ) { what <- I( "Calling `case_when()` with size 1 LHS inputs and size >1 RHS inputs" ) details <- no_cli_wrapping(paste( sep = "\n", "This `case_when()` statement can result in subtle silent bugs and is very inefficient.", "", " Please use a series of if statements instead:", "", " ```", " # Previously", " case_when(scalar_lhs1 ~ rhs1, scalar_lhs2 ~ rhs2, .default = default)", "", " # Now", " if (scalar_lhs1) {", " rhs1", " } else if (scalar_lhs2) {", " rhs2", " } else {", " default", " }", " ```" )) lifecycle::deprecate_soft( when = "1.2.0", what = what, details = details, env = env, user_env = user_env, id = "dplyr-case-when-scalar-lhs-vector-rhs" ) } # Suppress cli wrapping https://cli.r-lib.org/reference/inline-markup.html#wrapping no_cli_wrapping <- function(x) { x <- gsub(" ", "\u00a0", x, fixed = TRUE) x <- gsub("\n", "\f", x, fixed = TRUE) x } eval_formulas <- function( ..., allow_empty_dots, dots_env = current_env(), user_env = caller_env(2), error_call = caller_env() ) { dots <- list2(...) # Store index computed before dropping `NULL` so error indices are correct indices <- seq_along(dots) # Drop `NULL`s if (vec_any_missing(dots)) { not_missing <- !vec_detect_missing(dots) dots <- vec_slice(dots, not_missing) indices <- vec_slice(indices, not_missing) } dots_size <- length(dots) dots_seq <- seq_len(dots_size) if (!allow_empty_dots && dots_size == 0L) { abort("`...` can't be empty.", call = error_call) } pairs <- map2( .x = dots, .y = indices, .f = function(dot, index) { validate_and_split_formula( dot = dot, index = index, dots_env = dots_env, user_env = user_env, error_call = error_call ) } ) lhs <- vector("list", dots_size) rhs <- vector("list", dots_size) env_error_info <- new_environment() # Using 1 call to `withCallingHandlers()` that wraps all `eval_tidy()` # evaluations to avoid repeated handler setup (#6674) withCallingHandlers( for (i in dots_seq) { env_error_info[["index"]] <- indices[[i]] pair <- pairs[[i]] env_error_info[["side"]] <- "left" elt_lhs <- eval_tidy(pair$lhs, env = user_env) env_error_info[["side"]] <- "right" elt_rhs <- eval_tidy(pair$rhs, env = user_env) if (!is.null(elt_lhs)) { lhs[[i]] <- elt_lhs } if (!is.null(elt_rhs)) { rhs[[i]] <- elt_rhs } }, error = function(cnd) { message <- glue::glue_data( env_error_info, "Failed to evaluate the {side}-hand side of formula {index}." ) abort(message, parent = cnd, call = error_call) } ) # TODO: Ideally we'd name the lhs/rhs values with their `as_label()`-ed # expressions. But `as_label()` is much too slow for that to be useful in # a grouped `mutate()`. We need a way to add ALTREP lazy names that only get # materialized on demand (i.e. on error). Until then, we fall back to the # positional names (like `..1` or `..3`) with info about left/right (#6674). # # # Add the expressions as names for `lhs` and `rhs` for nice errors. # # These names also get passed on to the underlying vctrs backend. # lhs_names <- map(quos_pairs, function(pair) pair$lhs) # lhs_names <- map_chr(lhs_names, as_label) # names(lhs) <- lhs_names # # rhs_names <- map(quos_pairs, function(pair) pair$rhs) # rhs_names <- map_chr(rhs_names, as_label) # names(rhs) <- rhs_names if (dots_size > 0L) { names(lhs) <- paste0("..", indices, " (left)") names(rhs) <- paste0("..", indices, " (right)") } list( lhs = lhs, rhs = rhs ) } validate_and_split_formula <- function( dot, index, dots_env, user_env, error_call ) { if (is_quosure(dot)) { # We specially handle quosures, assuming they hold formulas user_env <- quo_get_env(dot) dot <- quo_get_expr(dot) } if (!is_formula(dot, lhs = TRUE)) { arg <- substitute(...(), dots_env)[[index]] arg <- glue::backtick(as_label(arg)) if (is_formula(dot)) { type <- "a two-sided formula, not a one-sided formula" } else { type <- glue("a two-sided formula, not {obj_type_friendly(dot)}") } message <- glue("Case {index} ({arg}) must be {type}.") abort(message, call = error_call) } # Formula might be unevaluated, e.g. if it's been quosured env <- f_env(dot) %||% user_env list( lhs = new_quosure(f_lhs(dot), env), rhs = new_quosure(f_rhs(dot), env) ) } dplyr/R/funs.R0000644000176200001440000000650615106134104012705 0ustar liggesusers#' Detect where values fall in a specified range #' #' This is a shortcut for `x >= left & x <= right`, implemented for local #' vectors and translated to the appropriate SQL for remote tables. #' #' @details #' `x`, `left`, and `right` are all cast to their common type before the #' comparison is made. Use the `ptype` argument to specify the type manually. #' #' @inheritParams rlang::args_dots_empty #' #' @param x A vector #' @param left,right Boundary values. Both `left` and `right` are recycled to #' the size of `x`. #' @param ptype An optional prototype giving the desired output type. The #' default is to compute the common type of `x`, `left`, and `right` using #' [vctrs::vec_cast_common()]. #' #' @returns #' A logical vector the same size as `x` with a type determined by `ptype`. #' #' @seealso #' [join_by()] if you are looking for documentation for the `between()` overlap #' join helper. #' #' @export #' @examples #' between(1:12, 7, 9) #' #' x <- rnorm(1e2) #' x[between(x, -1, 1)] #' #' # On a tibble using `filter()` #' filter(starwars, between(height, 100, 150)) #' #' # Using the `ptype` argument with ordered factors, where otherwise everything #' # is cast to the common type of character before the comparison #' x <- ordered( #' c("low", "medium", "high", "medium"), #' levels = c("low", "medium", "high") #' ) #' between(x, "medium", "high") #' between(x, "medium", "high", ptype = x) between <- function(x, left, right, ..., ptype = NULL) { check_dots_empty0(...) args <- list(x = x, left = left, right = right) # Common type of all inputs args <- vec_cast_common(!!!args, .to = ptype) x <- args$x args$x <- NULL # Recycle to size of `x` args <- vec_recycle_common(!!!args, .size = vec_size(x)) left <- args$left right <- args$right left <- vec_compare(x, left) left <- left >= 0L right <- vec_compare(x, right) right <- right <= 0L left & right } #' Cumulative versions of any, all, and mean #' #' dplyr provides `cumall()`, `cumany()`, and `cummean()` to complete R's set #' of cumulative functions. #' #' @section Cumulative logical functions: #' #' These are particularly useful in conjunction with `filter()`: #' #' * `cumall(x)`: all cases until the first `FALSE`. #' * `cumall(!x)`: all cases until the first `TRUE`. #' * `cumany(x)`: all cases after the first `TRUE`. #' * `cumany(!x)`: all cases after the first `FALSE`. #' #' @param x For `cumall()` and `cumany()`, a logical vector; for #' `cummean()` an integer or numeric vector. #' @return A vector the same length as `x`. #' @examples #' # `cummean()` returns a numeric/integer vector of the same length #' # as the input vector. #' x <- c(1, 3, 5, 2, 2) #' cummean(x) #' cumsum(x) / seq_along(x) #' #' # `cumall()` and `cumany()` return logicals #' cumall(x < 5) #' cumany(x == 3) #' #' # `cumall()` vs. `cumany()` #' df <- data.frame( #' date = as.Date("2020-01-01") + 0:6, #' balance = c(100, 50, 25, -25, -50, 30, 120) #' ) #' # all rows after first overdraft #' df |> filter(cumany(balance < 0)) #' # all rows until first overdraft #' df |> filter(cumall(!(balance < 0))) #' #' @export cumall <- function(x) { .Call(`dplyr_cumall`, as.logical(x)) } #' @rdname cumall #' @export cumany <- function(x) { .Call(`dplyr_cumany`, as.logical(x)) } #' @rdname cumall #' @export cummean <- function(x) { .Call(`dplyr_cummean`, as.numeric(x)) } dplyr/R/defunct.R0000644000176200001440000000373515106134104013363 0ustar liggesusers#' Defunct functions #' #' @description #' `r lifecycle::badge("defunct")` #' #' These functions were deprecated for at least two years before being #' made defunct. If there's a known replacement, calling the function #' will tell you about it. #' #' @keywords internal #' @name defunct NULL #' @usage # Deprecated in 1.0.0 ------------------------------------- #' @name defunct NULL #' @export #' @rdname defunct combine <- function(...) { lifecycle::deprecate_stop("1.0.0", "combine()", "vctrs::vec_c()") } #' @export #' @rdname defunct src_mysql <- function( dbname, host = NULL, port = 0L, username = "root", password = "", ... ) { lifecycle::deprecate_stop( "1.0.0", "src_mysql()", details = "Please use `tbl()` directly with a database connection" ) } #' @export #' @rdname defunct src_postgres <- function( dbname = NULL, host = NULL, port = NULL, user = NULL, password = NULL, ... ) { lifecycle::deprecate_stop( "1.0.0", "src_postgres()", details = "Please use `tbl()` directly with a database connection" ) } #' @export #' @rdname defunct src_sqlite <- function(path, create = FALSE) { lifecycle::deprecate_stop( "1.0.0", "src_sqlite()", details = "Please use `tbl()` directly with a database connection" ) } #' @export #' @rdname defunct src_local <- function(tbl, pkg = NULL, env = NULL) { lifecycle::deprecate_stop("1.0.0", "src_local()") } #' @export #' @rdname defunct src_df <- function(pkg = NULL, env = NULL) { lifecycle::deprecate_stop("1.0.0", "src_df()") } #' @export #' @rdname defunct tbl_df <- function(data) { lifecycle::deprecate_stop("1.0.0", "tbl_df()", "tibble::as_tibble()") } #' @export #' @rdname defunct as.tbl <- function(x, ...) { lifecycle::deprecate_stop("1.0.0", "as.tbl()", "tibble::as_tibble()") } #' @export #' @rdname defunct add_rownames <- function(df, var = "rowname") { lifecycle::deprecate_stop( "1.0.0", "add_rownames()", "tibble::rownames_to_column()" ) } dplyr/R/distinct.R0000644000176200001440000001043615106134104013550 0ustar liggesusers#' Keep distinct/unique rows #' #' Keep only unique/distinct rows from a data frame. This is similar #' to [unique.data.frame()] but considerably faster. #' #' @inheritParams arrange #' @param ... <[`data-masking`][rlang::args_data_masking]> Optional variables to #' use when determining uniqueness. If there are multiple rows for a given #' combination of inputs, only the first row will be preserved. If omitted, #' will use all variables in the data frame. #' @param .keep_all If `TRUE`, keep all variables in `.data`. #' If a combination of `...` is not distinct, this keeps the #' first row of values. #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Rows are a subset of the input but appear in the same order. #' * Columns are not modified if `...` is empty or `.keep_all` is `TRUE`. #' Otherwise, `distinct()` first calls `mutate()` to create new columns. #' * Groups are not modified. #' * Data frame attributes are preserved. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("distinct")}. #' @export #' @examples #' df <- tibble( #' x = sample(10, 100, rep = TRUE), #' y = sample(10, 100, rep = TRUE) #' ) #' nrow(df) #' nrow(distinct(df)) #' nrow(distinct(df, x, y)) #' #' distinct(df, x) #' distinct(df, y) #' #' # You can choose to keep all other variables as well #' distinct(df, x, .keep_all = TRUE) #' distinct(df, y, .keep_all = TRUE) #' #' # You can also use distinct on computed variables #' distinct(df, diff = abs(x - y)) #' #' # Use `pick()` to select columns with tidy-select #' distinct(starwars, pick(contains("color"))) #' #' # Grouping ------------------------------------------------- #' #' df <- tibble( #' g = c(1, 1, 2, 2, 2), #' x = c(1, 1, 2, 1, 2), #' y = c(3, 2, 1, 3, 1) #' ) #' df <- df |> group_by(g) #' #' # With grouped data frames, distinctness is computed within each group #' df |> distinct(x) #' #' # When `...` are omitted, `distinct()` still computes distinctness using #' # all variables in the data frame #' df |> distinct() distinct <- function(.data, ..., .keep_all = FALSE) { UseMethod("distinct") } # Same basic philosophy as group_by_prepare(): lazy_dots comes in, list of data and # vars (character vector) comes out. #' @rdname group_by_prepare #' @export distinct_prepare <- function( .data, vars, group_vars = character(), .keep_all = FALSE, caller_env = caller_env(2), error_call = caller_env() ) { stopifnot(is_quosures(vars), is.character(group_vars)) # If no input, keep all variables if (length(vars) == 0) { return(list( data = .data, vars = seq_along(.data), keep = seq_along(.data) )) } # If any calls, use mutate to add new columns, then distinct on those computed_columns <- add_computed_columns(.data, vars, error_call = error_call) .data <- computed_columns$data distinct_vars <- computed_columns$added_names # Once we've done the mutate, we no longer need lazy objects, and # can instead just use their names missing_vars <- setdiff(distinct_vars, names(.data)) if (length(missing_vars) > 0) { bullets <- c( "Must use existing variables.", set_names( glue("`{missing_vars}` not found in `.data`."), rep("x", length(missing_vars)) ) ) abort(bullets, call = error_call) } # Only keep unique vars distinct_vars <- unique(distinct_vars) # Missing grouping variables are added to the front new_vars <- c(setdiff(group_vars, distinct_vars), distinct_vars) if (.keep_all) { keep <- seq_along(.data) } else { keep <- new_vars } list(data = .data, vars = new_vars, keep = keep) } #' @export distinct.data.frame <- function(.data, ..., .keep_all = FALSE) { prep <- distinct_prepare( .data, vars = enquos(...), group_vars = group_vars(.data), .keep_all = .keep_all, caller_env = caller_env() ) out <- prep$data cols <- dplyr_col_select(out, prep$vars) loc <- vec_unique_loc(cols) out <- dplyr_col_select(out, prep$keep) dplyr_row_slice(out, loc) } dplyr/R/group-split.R0000644000176200001440000000672315106134104014220 0ustar liggesusers#' Split data frame by groups #' #' @description #' `r lifecycle::badge("experimental")` #' #' [group_split()] works like [base::split()] but: #' #' - It uses the grouping structure from [group_by()] and therefore is subject #' to the data mask #' #' - It does not name the elements of the list based on the grouping as this #' only works well for a single character grouping variable. Instead, #' use [group_keys()] to access a data frame that defines the groups. #' #' `group_split()` is primarily designed to work with grouped data frames. #' You can pass `...` to group and split an ungrouped data frame, but this #' is generally not very useful as you want have easy access to the group #' metadata. #' #' @section Lifecycle: #' `group_split()` is not stable because you can achieve very similar results by #' manipulating the nested column returned from #' [`tidyr::nest(.by =)`][tidyr::nest()]. That also retains the group keys all #' within a single data structure. `group_split()` may be deprecated in the #' future. #' #' @param .tbl A tbl. #' @param ... If `.tbl` is an ungrouped data frame, a grouping specification, #' forwarded to [group_by()]. #' @param .keep Should the grouping columns be kept? #' @returns A list of tibbles. Each tibble contains the rows of `.tbl` for the #' associated group and all the columns, including the grouping variables. #' Note that this returns a [list_of][vctrs::list_of()] which is slightly #' stricter than a simple list but is useful for representing lists where #' every element has the same type. #' @keywords internal #' @family grouping functions #' @export #' @examples #' ir <- iris |> group_by(Species) #' #' group_split(ir) #' group_keys(ir) group_split <- function(.tbl, ..., .keep = TRUE) { lifecycle::signal_stage("experimental", "group_split()") UseMethod("group_split") } #' @export group_split.data.frame <- function( .tbl, ..., .keep = TRUE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_split(keep = )", "group_split(.keep = )" ) } data <- group_by(.tbl, ...) group_split_impl(data, .keep = .keep) } #' @export group_split.rowwise_df <- function( .tbl, ..., .keep = TRUE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_split(keep = )", "group_split(.keep = )" ) } if (dots_n(...)) { warn_ignores_dots( "group_split", "rowwise_df", "as_tibble() |> group_split(...)" ) } if (!missing(.keep)) { warn(".keep is ignored in group_split()") } group_split_impl(.tbl, .keep = TRUE) } #' @export group_split.grouped_df <- function( .tbl, ..., .keep = TRUE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_split(keep = )", "group_split(.keep = )" ) } if (dots_n(...)) { warn_ignores_dots( "group_split", "grouped_df", "group_by(..., .add = TRUE) |> group_split()" ) } group_split_impl(.tbl, .keep = .keep) } group_split_impl <- function(data, .keep) { out <- ungroup(data) indices <- group_rows(data) if (!isTRUE(.keep)) { remove <- group_vars(data) .keep <- names(out) .keep <- setdiff(.keep, remove) out <- out[.keep] } dplyr_chop(out, indices) } dplyr_chop <- function(data, indices) { out <- map(indices, dplyr_row_slice, data = data) out <- new_list_of(out, ptype = vec_ptype(data)) out } dplyr/R/top-n.R0000644000176200001440000000411215106134104012756 0ustar liggesusers#' Select top (or bottom) n rows (by value) #' #' @description #' `r lifecycle::badge("superseded")` #' `top_n()` has been superseded in favour of [slice_min()]/[slice_max()]. #' While it will not be deprecated in the near future, retirement means #' that we will only perform critical bug fixes, so we recommend moving to the #' newer alternatives. #' #' `top_n()` was superseded because the name was fundamentally confusing as #' it returned what you might reasonably consider to be the _bottom_ #' rows. Additionally, the `wt` variable had a confusing name, and strange #' default (the last column in the data frame). Unfortunately we could not #' see an easy way to fix the existing `top_n()` function without breaking #' existing code, so we created a new alternative. #' #' @param x A data frame. #' @param n Number of rows to return for `top_n()`, fraction of rows to #' return for `top_frac()`. If `n` is positive, selects the top rows. #' If negative, selects the bottom rows. #' If `x` is grouped, this is the number (or fraction) of rows per group. #' Will include more rows if there are ties. #' @param wt (Optional). The variable to use for ordering. If not #' specified, defaults to the last variable in the tbl. #' @keywords internal #' @export #' @examples #' df <- data.frame(x = c(6, 4, 1, 10, 3, 1, 1)) #' #' df |> top_n(2) # highest values #' df |> top_n(-2) # lowest values #' # now use #' df |> slice_max(x, n = 2) #' df |> slice_min(x, n = 2) #' #' # top_frac() -> prop argument of slice_min()/slice_max() #' df |> top_frac(.5) #' # -> #' df |> slice_max(x, prop = 0.5) top_n <- function(x, n, wt) { lifecycle::signal_stage("superseded", "top_n()") wt <- enquo(wt) if (quo_is_missing(wt)) { vars <- tbl_vars(x) wt_name <- vars[length(vars)] inform(glue("Selecting by ", wt_name)) wt <- sym(wt_name) } filter(x, top_n_rank({{ n }}, !!wt)) } top_n_rank <- function(n, wt) { if (n > 0) { min_rank(desc(wt)) <= n } else { min_rank(wt) <= abs(n) } } #' @export #' @rdname top_n top_frac <- function(x, n, wt) { top_n(x, {{ n }} * n(), {{ wt }}) } dplyr/R/tbl.R0000644000176200001440000000410215106134104012501 0ustar liggesusers#' Create a table from a data source #' #' This is a generic method that dispatches based on the first argument. #' #' @param src A data source #' @param ... Other arguments passed on to the individual methods #' @export tbl <- function(src, ...) { UseMethod("tbl") } #' Create a "tbl" object #' #' `tbl()` is the standard constructor for tbls. `is.tbl()` tests. #' #' @keywords internal #' @export #' @param subclass name of subclass. "tbl" is an abstract base class, so you #' must supply this value. `tbl_` is automatically prepended to the #' class name #' @param ... For `tbl()`, other fields used by class. make_tbl <- function(subclass, ...) { subclass <- paste0("tbl_", subclass) structure(list(...), class = c(subclass, "tbl")) } #' @rdname tbl #' @param x Any object #' @export is.tbl <- function(x) inherits(x, "tbl") tbl_vars_dispatch <- function(x) { UseMethod("tbl_vars") } new_sel_vars <- function(vars, group_vars) { structure( vars, groups = group_vars, class = c("dplyr_sel_vars", "character") ) } #' List variables provided by a tbl. #' #' `tbl_vars()` returns all variables while `tbl_nongroup_vars()` #' returns only non-grouping variables. The `groups` attribute #' of the object returned by `tbl_vars()` is a character vector of the #' grouping columns. #' #' @export #' @param x A tbl object #' @seealso [group_vars()] for a function that returns grouping #' variables. #' @keywords internal tbl_vars <- function(x) { return(new_sel_vars(tbl_vars_dispatch(x), group_vars(x))) # For roxygen and static analysis UseMethod("tbl_vars") } #' @export tbl_vars.data.frame <- function(x) { names(x) } #' @rdname tbl_vars #' @export tbl_nongroup_vars <- function(x) { setdiff(tbl_vars(x), group_vars(x)) } is_sel_vars <- function(x) { inherits(x, "dplyr_sel_vars") } #' @export print.dplyr_sel_vars <- function(x, ...) { cat("\n") print(unstructure(x)) groups <- attr(x, "groups") if (length(groups)) { cat("Groups:\n") print(groups) } invisible(x) } unstructure <- function(x) { attributes(x) <- NULL x } dplyr/R/rename.R0000644000176200001440000000660215106134104013176 0ustar liggesusers#' Rename columns #' #' `rename()` changes the names of individual variables using #' `new_name = old_name` syntax; `rename_with()` renames columns using a #' function. #' #' @inheritParams arrange #' @param ... #' For `rename()`: <[`tidy-select`][dplyr_tidy_select]> Use #' `new_name = old_name` to rename selected variables. #' #' For `rename_with()`: additional arguments passed onto `.fn`. #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Rows are not affected. #' * Column names are changed; column order is preserved. #' * Data frame attributes are preserved. #' * Groups are updated to reflect new names. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rename")}. #' @family single table verbs #' @export #' @examples #' iris <- as_tibble(iris) # so it prints a little nicer #' rename(iris, petal_length = Petal.Length) #' #' # Rename using a named vector and `all_of()` #' lookup <- c(pl = "Petal.Length", sl = "Sepal.Length") #' rename(iris, all_of(lookup)) #' #' # If your named vector might contain names that don't exist in the data, #' # use `any_of()` instead #' lookup <- c(lookup, new = "unknown") #' try(rename(iris, all_of(lookup))) #' rename(iris, any_of(lookup)) #' #' rename_with(iris, toupper) #' rename_with(iris, toupper, starts_with("Petal")) #' rename_with(iris, ~ tolower(gsub(".", "_", .x, fixed = TRUE))) #' #' @examplesIf getRversion() > "4.0.1" #' # If your renaming function uses `paste0()`, make sure to set #' # `recycle0 = TRUE` to ensure that empty selections are recycled correctly #' try(rename_with( #' iris, #' ~ paste0("prefix_", .x), #' starts_with("nonexistent") #' )) #' #' rename_with( #' iris, #' ~ paste0("prefix_", .x, recycle0 = TRUE), #' starts_with("nonexistent") #' ) #' @export rename <- function(.data, ...) { UseMethod("rename") } #' @export rename.data.frame <- function(.data, ...) { loc <- tidyselect::eval_rename(expr(c(...)), .data) # eval_rename() only returns changes names <- names(.data) names[loc] <- names(loc) set_names(.data, names) } #' @export #' @rdname rename #' @param .fn A function used to transform the selected `.cols`. Should #' return a character vector the same length as the input. #' @param .cols <[`tidy-select`][dplyr_tidy_select]> Columns to rename; #' defaults to all columns. rename_with <- function(.data, .fn, .cols = everything(), ...) { UseMethod("rename_with") } #' @export rename_with.data.frame <- function(.data, .fn, .cols = everything(), ...) { .fn <- as_function(.fn) cols <- tidyselect::eval_select(enquo(.cols), .data, allow_rename = FALSE) names <- names(.data) sel <- vec_slice(names, cols) new <- .fn(sel, ...) if (!is_character(new)) { cli::cli_abort( "{.arg .fn} must return a character vector, not {.obj_type_friendly {new}}." ) } if (length(new) != length(sel)) { cli::cli_abort( "{.arg .fn} must return a vector of length {length(sel)}, not {length(new)}." ) } names <- vec_assign(names, cols, new) names <- vec_as_names(names, repair = "check_unique") set_names(.data, names) } dplyr/R/when.R0000644000176200001440000001123315137161765012705 0ustar liggesusers#' Elementwise `any()` and `all()` #' #' @description #' These functions are variants of [any()] and [all()] that work elementwise #' across multiple inputs. You can also think of these functions as generalizing #' [`|`] and [`&`] to any number of inputs, rather than just two, for example: #' #' - `when_any(x, y, z)` is equivalent to `x | y | z`. #' #' - `when_all(x, y, z)` is equivalent to `x & y & z`. #' #' `when_any()` is particularly useful within [filter()] and [filter_out()] to #' specify comma separated conditions combined with `|` rather than `&`. #' #' @details #' `when_any()` and `when_all()` are "parallel" versions of [any()] and [all()] #' in the same way that [pmin()] and [pmax()] are "parallel" versions of [min()] #' and [max()]. #' #' @param ... Logical vectors of equal size. #' #' @param na_rm Missing value handling: #' #' - If `FALSE`, missing values are propagated according to the same rules as #' `|` and `&`. #' #' - If `TRUE`, missing values are removed from the elementwise computation. #' #' @param size An optional output size. Only useful to specify if it is possible #' for `...` to be empty, with no inputs provided. #' #' @name when-any-all #' #' @seealso [base::any()], [base::all()], [cumany()], [cumall()], #' [base::pmin()], [base::pmax()] #' #' @examples #' x <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, NA, NA, NA) #' y <- c(TRUE, FALSE, NA, TRUE, FALSE, NA, TRUE, FALSE, NA) #' #' # `any()` and `all()` summarise down to 1 value #' any(x, y) #' all(x, y) #' #' # `when_any()` and `when_all()` work element by element across all inputs #' # at the same time. Their defaults are equivalent to calling `|` or `&`. #' when_any(x, y) #' x | y #' #' when_all(x, y) #' x & y #' #' # `na_rm = TRUE` is useful when you'd like to force these functions to #' # return only `TRUE` or `FALSE`. This argument does so by removing any `NA` #' # from the elementwise computation entirely. #' tibble( #' x = x, #' y = y, #' any_propagate = when_any(x, y), #' any_remove = when_any(x, y, na_rm = TRUE), #' all_propagate = when_all(x, y), #' all_remove = when_all(x, y, na_rm = TRUE) #' ) #' #' # --------------------------------------------------------------------------- #' # With `filter()` and `filter_out()` #' #' # `when_any()` is particularly useful inside of `filter()` and #' # `filter_out()` as a way to combine comma separated conditions with `|` #' # instead of with `&`. #' #' countries <- tibble( #' name = c("US", "CA", "PR", "RU", "US", NA, "CA", "PR", "RU"), #' score = c(200, 100, 150, NA, 50, 100, 300, 250, 120) #' ) #' countries #' #' # Find rows where any of the following are true: #' # - "US" and "CA" have a score between 200-300 #' # - "PR" and "RU" have a score between 100-200 #' countries |> #' filter( #' (name %in% c("US", "CA") & between(score, 200, 300)) | #' (name %in% c("PR", "RU") & between(score, 100, 200)) #' ) #' #' # With `when_any()`, you drop the explicit `|`, the extra `()`, and your #' # conditions are all indented to the same level #' countries |> #' filter(when_any( #' name %in% c("US", "CA") & between(score, 200, 300), #' name %in% c("PR", "RU") & between(score, 100, 200) #' )) #' #' # To drop these rows instead, use `filter_out()` #' countries |> #' filter_out(when_any( #' name %in% c("US", "CA") & between(score, 200, 300), #' name %in% c("PR", "RU") & between(score, 100, 200) #' )) #' #' # --------------------------------------------------------------------------- #' # Programming with `when_any()` and `when_all()` #' #' # The `size` argument is useful for making these functions size stable when #' # you aren't sure how many inputs you're going to receive #' size <- length(x) #' #' # Two inputs #' inputs <- list(x, y) #' when_all(!!!inputs, size = size) #' #' # One input #' inputs <- list(x) #' when_all(!!!inputs, size = size) #' #' # Zero inputs (without `size`, this would return `logical()`) #' inputs <- list() #' when_all(!!!inputs, size = size) #' #' # When no inputs are provided, these functions are consistent with `any()` #' # and `all()` #' any() #' when_any(size = 1) #' #' all() #' when_all(size = 1) NULL #' @rdname when-any-all #' @export when_any <- function(..., na_rm = FALSE, size = NULL) { check_dots_unnamed() check_bool(na_rm) missing <- if (na_rm) FALSE else NA vec_pany( ..., .missing = missing, .size = size, .arg = "", .error_call = current_env() ) } #' @rdname when-any-all #' @export when_all <- function(..., na_rm = FALSE, size = NULL) { check_dots_unnamed() check_bool(na_rm) missing <- if (na_rm) TRUE else NA vec_pall( ..., .missing = missing, .size = size, .arg = "", .error_call = current_env() ) } dplyr/R/doc-methods.R0000644000176200001440000000524415106134104014136 0ustar liggesusers# Adapted from sloop methods_generic <- function(x) { # Return early if generic not defined in global environment. This happens # when the documentation is read before the package is attached, or when # previewing development documentation from RStudio, since it renders the # files in a separate session. if (!"package:dplyr" %in% search()) { return(data.frame()) } info <- eval(expr(attr(utils::methods(!!x), "info")), envir = globalenv()) info <- tibble::as_tibble(info, rownames = "method") generic_esc <- gsub("([.\\[])", "\\\\\\1", x) info$class <- gsub(paste0("^", generic_esc, "[.,]"), "", info$method) info$class <- gsub("-method$", "", info$class) info$source <- gsub(paste0(" for ", generic_esc), "", info$from) # Find package methods <- map2( info$generic, info$class, utils::getS3method, optional = TRUE, envir = globalenv() ) envs <- map(methods, environment) info$package <- map_chr(envs, environmentName) # Find help topic, if it exists info$topic <- help_topic(info$method, info$package) # Don't link to self info$topic[info$topic == x] <- NA # Remove spurious matches in base packages like select.list or slice.index base_packages <- c( "base", "compiler", "datasets", "graphics", "grDevices", "grid", "methods", "parallel", "splines", "stats", "stats4", "tcltk", "tools", "utils" ) info <- info[!info$package %in% base_packages, ] info[c("generic", "class", "package", "topic", "visible", "source", "isS4")] } methods_rd <- function(x) { methods <- tryCatch(methods_generic(x), error = function(e) data.frame()) if (nrow(methods) == 0) { return("no methods found") } methods <- methods[order(methods$package, methods$class), , drop = FALSE] topics <- unname(split(methods, methods$package)) by_package <- vapply( topics, function(x) { links <- topic_links(x$class, x$package, x$topic) paste0(x$package[[1]], " (", paste0(links, collapse = ", "), ")") }, character(1) ) paste0(by_package, collapse = ", ") } topic_links <- function(class, package, topic) { ifelse( is.na(topic), paste0("\\code{", class, "}"), paste0("\\code{\\link[", package, ":", topic, "]{", class, "}}") ) } help_topic <- function(x, pkg) { find_one <- function(topic, pkg) { if (identical(pkg, "")) { return(NA_character_) } path <- system.file("help", "aliases.rds", package = pkg) if (!file.exists(path)) { return(NA_character_) } aliases <- readRDS(path) if (!topic %in% names(aliases)) { return(NA_character_) } aliases[[topic]] } map2_chr(x, pkg, find_one) } dplyr/R/utils.R0000644000176200001440000000764215137161765013115 0ustar liggesuserscommas <- function(...) paste0(..., collapse = ", ") compact_null <- function(x) { Filter(function(elt) !is.null(elt), x) } paste_line <- function(...) { paste(chr(...), collapse = "\n") } vec_paste0 <- function(...) { args <- vec_recycle_common(...) exec(paste0, !!!args) } # Until vctrs::new_data_frame() forwards row names automatically dplyr_new_data_frame <- function( x = data.frame(), n = NULL, ..., row.names = NULL, class = NULL ) { row.names <- row.names %||% .row_names_info(x, type = 0L) new_data_frame( x, n = n, ..., row.names = row.names, class = class ) } # Strips a list-like vector down to just names dplyr_new_list <- function(x) { if (!is_list(x)) { abort("`x` must be a VECSXP.", .internal = TRUE) } names <- names(x) if (is.null(names)) { attributes(x) <- NULL } else { attributes(x) <- list(names = names) } x } dplyr_new_tibble <- function(x, size) { # ~9x faster than `tibble::new_tibble()` for internal usage new_data_frame(x = x, n = size, class = c("tbl_df", "tbl")) } #' @param x A list #' @param fn An optional function of 1 argument to be applied to each list #' element of `x`. This allows you to further refine what elements should be #' flattened. `fn` should return a single `TRUE` or `FALSE`. #' @param recursive Should `list_flatten()` be applied recursively? If `TRUE`, #' it will continue to apply `list_flatten()` as long as at least one element #' of `x` was flattened in the previous iteration. #' @noRd list_flatten <- function(x, ..., fn = NULL, recursive = FALSE) { check_dots_empty0(...) obj_check_list(x) x <- unclass(x) loc <- map_lgl(x, obj_is_list) if (!is_null(fn)) { loc[loc] <- map_lgl(x[loc], fn) } not_loc <- !loc names <- names(x) if (!is_null(names)) { # Always prefer inner names, even if inner elements are actually unnamed. # This is what `rlang::flatten_if()` did, with a warning. We could also # use `name_spec` and `name_repair` for a more complete solution. names[loc] <- "" names(x) <- names } x[loc] <- map(x[loc], unclass) x[not_loc] <- map(x[not_loc], list) out <- list_unchop(x, ptype = list()) if (recursive && any(loc)) { out <- list_flatten(out, fn = fn, recursive = TRUE) } out } maybe_restart <- function(restart) { if (!is_null(findRestart(restart))) { invokeRestart(restart) } } expr_substitute <- function(expr, old, new) { expr <- duplicate(expr) switch( typeof(expr), language = node_walk_replace(node_cdr(expr), old, new), symbol = if (identical(expr, old)) return(new) ) expr } node_walk_replace <- function(node, old, new) { while (!is_null(node)) { switch( typeof(node_car(node)), language = if ( !is_call(node_car(node), c("~", "function")) || is_call(node_car(node), "~", n = 2) ) { node_walk_replace(node_cdar(node), old, new) }, symbol = if (identical(node_car(node), old)) { node_poke_car(node, new) } ) node <- node_cdr(node) } } cli_collapse <- function(x, last = " and ", sep2 = " and ") { cli::cli_vec(x, style = list("vec-last" = last, "vec-sep2" = sep2)) } # A version of `cli::format_inline()` that formats each string individually. # Useful for constructing error bullets ahead of time. cli_format_each_inline <- function( ..., .envir = parent.frame(), collapse = TRUE, keep_whitespace = TRUE ) { map_chr(c(...), function(x) { cli::format_inline( x, .envir = .envir, collapse = collapse, keep_whitespace = keep_whitespace ) }) } with_no_rlang_infix_labeling <- function(expr) { # TODO: Temporary patch for a slowdown seen with `rlang::as_label()` and infix # operators. A real solution likely involves lazy ALTREP vectors (#6681). # https://github.com/r-lib/rlang/commit/33db700d556b0b85a1fe78e14a53f95ac9248004 with_options("rlang:::use_as_label_infix" = FALSE, expr) } dplyr/R/sets.R0000644000176200001440000001373315106134104012710 0ustar liggesusers#' Set operations #' #' @description #' Perform set operations using the rows of a data frame. #' #' * `intersect(x, y)` finds all rows in both `x` and `y`. #' * `union(x, y)` finds all rows in either `x` or `y`, excluding duplicates. #' * `union_all(x, y)` finds all rows in either `x` or `y`, including duplicates. #' * `setdiff(x, y)` finds all rows in `x` that aren't in `y`. #' * `symdiff(x, y)` computes the symmetric difference, i.e. all rows in #' `x` that aren't in `y` and all rows in `y` that aren't in `x`. #' * `setequal(x, y)` returns `TRUE` if `x` and `y` contain the same rows #' (ignoring order). #' #' Note that `intersect()`, `union()`, `setdiff()`, and `symdiff()` remove #' duplicates in `x` and `y`. #' #' # Base functions #' `intersect()`, `union()`, `setdiff()`, and `setequal()` override the base #' functions of the same name in order to make them generic. The existing #' behaviour for vectors is preserved by providing default methods that call #' the base functions. #' #' @param x,y Pair of compatible data frames. A pair of data frames is #' compatible if they have the same column names (possibly in different #' orders) and compatible types. #' @inheritParams rlang::args_dots_empty #' @name setops #' @examples #' df1 <- tibble(x = 1:3) #' df2 <- tibble(x = 3:5) #' #' intersect(df1, df2) #' union(df1, df2) #' union_all(df1, df2) #' setdiff(df1, df2) #' setdiff(df2, df1) #' symdiff(df1, df2) #' #' setequal(df1, df2) #' setequal(df1, df1[3:1, ]) #' #' # Note that the following functions remove pre-existing duplicates: #' df1 <- tibble(x = c(1:3, 3, 3)) #' df2 <- tibble(x = c(3:5, 5)) #' #' intersect(df1, df2) #' union(df1, df2) #' setdiff(df1, df2) #' symdiff(df1, df2) NULL #' @name setops #' @aliases intersect #' @usage intersect(x, y, ...) #' @importFrom generics intersect #' @export intersect NULL #' @name setops #' @aliases union #' @usage union(x, y, ...) #' @importFrom generics union #' @export union NULL #' @rdname setops #' @export union_all <- function(x, y, ...) UseMethod("union_all") #' @export union_all.default <- function(x, y, ...) { check_dots_empty() vec_c(x, y) } #' @name setops #' @aliases setdiff #' @usage setdiff(x, y, ...) #' @importFrom generics setdiff #' @export setdiff NULL #' @name setops #' @aliases setequal #' @usage setequal(x, y, ...) #' @importFrom generics setequal #' @export setequal NULL #' @rdname setops #' @export symdiff <- function(x, y, ...) { UseMethod("symdiff") } #' @export symdiff.default <- function(x, y, ...) { check_dots_empty() # Default is defined in terms of base R methods setdiff(union(x, y), intersect(x, y)) } #' @export intersect.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) out <- vec_set_intersect(x, y, error_call = current_env()) dplyr_reconstruct(out, x) } #' @export union.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) out <- vec_set_union(x, y, error_call = current_env()) dplyr_reconstruct(out, x) } #' @export union_all.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) out <- vec_rbind(x, y) dplyr_reconstruct(out, x) } #' @export setdiff.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) out <- vec_set_difference(x, y, error_call = current_env()) dplyr_reconstruct(out, x) } #' @export setequal.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) cast <- vec_cast_common(x = x, y = y) all(vec_in(cast$x, cast$y)) && all(vec_in(cast$y, cast$x)) } #' @export symdiff.data.frame <- function(x, y, ...) { check_dots_empty() check_compatible(x, y) out <- vec_set_symmetric_difference(x, y, error_call = current_env()) dplyr_reconstruct(out, x) } # Helpers ----------------------------------------------------------------- is_compatible <- function(x, y, ignore_col_order = TRUE, convert = TRUE) { if (!is.data.frame(y)) { return("`y` must be a data frame.") } nc <- df_n_col(x) if (nc != df_n_col(y)) { return( c(x = glue("Different number of columns: {nc} vs {df_n_col(y)}.")) ) } names_x <- names(x) names_y <- names(y) names_y_not_in_x <- setdiff(names_y, names_x) names_x_not_in_y <- setdiff(names_x, names_y) if (length(names_y_not_in_x) == 0L && length(names_x_not_in_y) == 0L) { # check if same order if (!isTRUE(ignore_col_order)) { if (!identical(names_x, names_y)) { return(c(x = "Same column names, but different order.")) } } } else { # names are not the same, explain why msg <- c() if (length(names_y_not_in_x)) { wrong <- glue_collapse(glue('`{names_y_not_in_x}`'), sep = ", ") msg <- c( msg, x = glue("Cols in `y` but not `x`: {wrong}.") ) } if (length(names_x_not_in_y)) { wrong <- glue_collapse(glue('`{names_x_not_in_y}`'), sep = ", ") msg <- c( msg, x = glue("Cols in `x` but not `y`: {wrong}.") ) } return(msg) } msg <- c() for (name in names_x) { x_i <- x[[name]] y_i <- y[[name]] if (convert) { tryCatch( vec_ptype2(x_i, y_i), error = function(e) { msg <<- c( msg, x = glue( "Incompatible types for column `{name}`: {vec_ptype_full(x_i)} vs {vec_ptype_full(y_i)}." ) ) } ) } else { if (!identical(vec_ptype(x_i), vec_ptype(y_i))) { msg <- c( msg, x = glue( "Different types for column `{name}`: {vec_ptype_full(x_i)} vs {vec_ptype_full(y_i)}." ) ) } } } if (length(msg)) { return(msg) } TRUE } check_compatible <- function( x, y, ignore_col_order = TRUE, convert = TRUE, error_call = caller_env() ) { compat <- is_compatible( x, y, ignore_col_order = ignore_col_order, convert = convert ) if (isTRUE(compat)) { return() } abort(c("`x` and `y` are not compatible.", compat), call = error_call) } dplyr/R/join-by.R0000644000176200001440000007601115106134104013277 0ustar liggesusers#' Join specifications #' #' `join_by()` constructs a specification that describes how to join two tables #' using a small domain specific language. The result can be supplied as the #' `by` argument to any of the join functions (such as [left_join()]). #' #' # Join types #' #' The following types of joins are supported by dplyr: #' - Equality joins #' - Inequality joins #' - Rolling joins #' - Overlap joins #' - Cross joins #' #' Equality, inequality, rolling, and overlap joins are discussed in more detail #' below. Cross joins are implemented through [cross_join()]. #' #' ## Equality joins #' #' Equality joins require keys to be equal between one or more pairs of columns, #' and are the most common type of join. To construct an equality join using #' `join_by()`, supply two column names to join with separated by `==`. #' Alternatively, supplying a single name will be interpreted as an equality #' join between two columns of the same name. For example, `join_by(x)` is #' equivalent to `join_by(x == x)`. #' #' ## Inequality joins #' #' Inequality joins match on an inequality, such as `>`, `>=`, `<`, or `<=`, and #' are common in time series analysis and genomics. To construct an inequality #' join using `join_by()`, supply two column names separated by one of the above #' mentioned inequalities. #' #' Note that inequality joins will match a single row in `x` to a potentially #' large number of rows in `y`. Be extra careful when constructing inequality #' join specifications! #' #' ## Rolling joins #' #' Rolling joins are a variant of inequality joins that limit the results #' returned from an inequality join condition. They are useful for "rolling" the #' closest match forward/backwards when there isn't an exact match. To construct #' a rolling join, wrap an inequality with `closest()`. #' #' - `closest(expr)` #' #' `expr` must be an inequality involving one of: `>`, `>=`, `<`, or `<=`. #' #' For example, `closest(x >= y)` is interpreted as: For each value in `x`, #' find the closest value in `y` that is less than or equal to that `x` value. #' #' `closest()` will always use the left-hand table (`x`) as the primary table, #' and the right-hand table (`y`) as the one to find the closest match in, #' regardless of how the inequality is specified. For example, #' `closest(y$a >= x$b)` will always be interpreted as `closest(x$b <= y$a)`. #' #' ## Overlap joins #' #' Overlap joins are a special case of inequality joins involving one or two #' columns from the left-hand table _overlapping_ a range defined by two columns #' from the right-hand table. There are three helpers that `join_by()` #' recognizes to assist with constructing overlap joins, all of which can be #' constructed from simpler inequalities. #' #' - `between(x, y_lower, y_upper, ..., bounds = "[]")` #' #' For each value in `x`, this finds everywhere that value falls between #' `[y_lower, y_upper]`. Equivalent to `x >= y_lower, x <= y_upper` by #' default. #' #' `bounds` can be one of \code{"[]"}, \code{"[)"}, \code{"(]"}, or #' \code{"()"} to alter the inclusiveness of the lower and upper bounds. This #' changes whether `>=` or `>` and `<=` or `<` are used to build the #' inequalities shown above. #' #' Dots are for future extensions and must be empty. #' #' - `within(x_lower, x_upper, y_lower, y_upper)` #' #' For each range in `[x_lower, x_upper]`, this finds everywhere that range #' falls completely within `[y_lower, y_upper]`. Equivalent to `x_lower >= #' y_lower, x_upper <= y_upper`. #' #' The inequalities used to build `within()` are the same regardless of the #' inclusiveness of the supplied ranges. #' #' - `overlaps(x_lower, x_upper, y_lower, y_upper, ..., bounds = "[]")` #' #' For each range in `[x_lower, x_upper]`, this finds everywhere that range #' overlaps `[y_lower, y_upper]` in any capacity. Equivalent to `x_lower <= #' y_upper, x_upper >= y_lower` by default. #' #' `bounds` can be one of \code{"[]"}, \code{"[)"}, \code{"(]"}, or #' \code{"()"} to alter the inclusiveness of the lower and upper bounds. #' \code{"[]"} uses `<=` and `>=`, but the 3 other options use `<` and `>` #' and generate the exact same inequalities. #' #' Dots are for future extensions and must be empty. #' #' These conditions assume that the ranges are well-formed and non-empty, i.e. #' `x_lower <= x_upper` when bounds are treated as \code{"[]"}, and #' `x_lower < x_upper` otherwise. #' #' # Column referencing #' #' When specifying join conditions, `join_by()` assumes that column names on the #' left-hand side of the condition refer to the left-hand table (`x`), and names #' on the right-hand side of the condition refer to the right-hand table (`y`). #' Occasionally, it is clearer to be able to specify a right-hand table name on #' the left-hand side of the condition, and vice versa. To support this, column #' names can be prefixed by `x$` or `y$` to explicitly specify which table they #' come from. #' #' @param ... Expressions specifying the join. #' #' Each expression should consist of one of the following: #' #' - Equality condition: `==` #' - Inequality conditions: `>=`, `>`, `<=`, or `<` #' - Rolling helper: `closest()` #' - Overlap helpers: `between()`, `within()`, or `overlaps()` #' #' Other expressions are not supported. If you need to perform a join on #' a computed variable, e.g. `join_by(sales_date - 40 >= promo_date)`, #' you'll need to precompute and store it in a separate column. #' #' Column names should be specified as quoted or unquoted names. By default, #' the name on the left-hand side of a join condition refers to the left-hand #' table, unless overridden by explicitly prefixing the column name with #' either `x$` or `y$`. #' #' If a single column name is provided without any join conditions, it is #' interpreted as if that column name was duplicated on each side of `==`, #' i.e. `x` is interpreted as `x == x`. #' #' @aliases closest overlaps within #' #' @export #' @examples #' sales <- tibble( #' id = c(1L, 1L, 1L, 2L, 2L), #' sale_date = as.Date(c("2018-12-31", "2019-01-02", "2019-01-05", "2019-01-04", "2019-01-01")) #' ) #' sales #' #' promos <- tibble( #' id = c(1L, 1L, 2L), #' promo_date = as.Date(c("2019-01-01", "2019-01-05", "2019-01-02")) #' ) #' promos #' #' # Match `id` to `id`, and `sale_date` to `promo_date` #' by <- join_by(id, sale_date == promo_date) #' left_join(sales, promos, by) #' #' # For each `sale_date` within a particular `id`, #' # find all `promo_date`s that occurred before that particular sale #' by <- join_by(id, sale_date >= promo_date) #' left_join(sales, promos, by) #' #' # For each `sale_date` within a particular `id`, #' # find only the closest `promo_date` that occurred before that sale #' by <- join_by(id, closest(sale_date >= promo_date)) #' left_join(sales, promos, by) #' #' # If you want to disallow exact matching in rolling joins, use `>` rather #' # than `>=`. Note that the promo on `2019-01-05` is no longer considered the #' # closest match for the sale on the same date. #' by <- join_by(id, closest(sale_date > promo_date)) #' left_join(sales, promos, by) #' #' # Same as before, but also require that the promo had to occur at most 1 #' # day before the sale was made. We'll use a full join to see that id 2's #' # promo on `2019-01-02` is no longer matched to the sale on `2019-01-04`. #' sales <- mutate(sales, sale_date_lower = sale_date - 1) #' by <- join_by(id, closest(sale_date >= promo_date), sale_date_lower <= promo_date) #' full_join(sales, promos, by) #' #' # --------------------------------------------------------------------------- #' #' segments <- tibble( #' segment_id = 1:4, #' chromosome = c("chr1", "chr2", "chr2", "chr1"), #' start = c(140, 210, 380, 230), #' end = c(150, 240, 415, 280) #' ) #' segments #' #' reference <- tibble( #' reference_id = 1:4, #' chromosome = c("chr1", "chr1", "chr2", "chr2"), #' start = c(100, 200, 300, 415), #' end = c(150, 250, 399, 450) #' ) #' reference #' #' # Find every time a segment `start` falls between the reference #' # `[start, end]` range. #' by <- join_by(chromosome, between(start, start, end)) #' full_join(segments, reference, by) #' #' # If you wanted the reference columns first, supply `reference` as `x` #' # and `segments` as `y`, then explicitly refer to their columns using `x$` #' # and `y$`. #' by <- join_by(chromosome, between(y$start, x$start, x$end)) #' full_join(reference, segments, by) #' #' # Find every time a segment falls completely within a reference. #' # Sometimes using `x$` and `y$` makes your intentions clearer, even if they #' # match the default behavior. #' by <- join_by(chromosome, within(x$start, x$end, y$start, y$end)) #' inner_join(segments, reference, by) #' #' # Find every time a segment overlaps a reference in any way. #' by <- join_by(chromosome, overlaps(x$start, x$end, y$start, y$end)) #' full_join(segments, reference, by) #' #' # It is common to have right-open ranges with bounds like `[)`, which would #' # mean an end value of `415` would no longer overlap a start value of `415`. #' # Setting `bounds` allows you to compute overlaps with those kinds of ranges. #' by <- join_by(chromosome, overlaps(x$start, x$end, y$start, y$end, bounds = "[)")) #' full_join(segments, reference, by) join_by <- function(...) { # `join_by()` works off pure expressions with no evaluation in the user's # environment, but we want to allow `{{ }}` to make it easier to program with. # The best way to do this is to capture quosures with `enquos()`, and then # immediately squash them recursively into expressions with `quo_squash()`. exprs <- enquos(..., .named = NULL) exprs <- map(exprs, quo_squash) n <- length(exprs) if (n == 0L) { abort(c( "Must supply at least one expression.", i = "If you want a cross join, use `cross_join()`." )) } if (!is_null(names(exprs))) { abort(c( "Can't name join expressions.", i = "Did you use `=` instead of `==`?" )) } error_call <- environment() bys <- vector("list", length = n) for (i in seq_len(n)) { bys[[i]] <- parse_join_by_expr(exprs[[i]], i, error_call = error_call) } # `between()`, `overlaps()`, and `within()` parse into >1 binary conditions x <- flat_map_chr(bys, function(by) by$x) y <- flat_map_chr(bys, function(by) by$y) filter <- flat_map_chr(bys, function(by) by$filter) condition <- flat_map_chr(bys, function(by) by$condition) new_join_by( exprs = exprs, condition = condition, filter = filter, x = x, y = y ) } #' @export print.dplyr_join_by <- function(x, ...) { out <- map_chr(x$exprs, expr_deparse) out <- glue_collapse(glue("- {out}"), sep = "\n") cat("Join By:\n") cat(out) invisible(x) } new_join_by <- function( exprs = list(), condition = character(), filter = character(), x = character(), y = character() ) { out <- list( exprs = exprs, condition = condition, filter = filter, x = x, y = y ) structure(out, class = "dplyr_join_by") } flat_map_chr <- function(x, fn) { list_unchop(map(x, fn), ptype = character()) } # ------------------------------------------------------------------------------ # Internal generic as_join_by <- function(x, error_call = caller_env()) { UseMethod("as_join_by") } #' @export as_join_by.default <- function(x, error_call = caller_env()) { message <- glue(paste0( "`by` must be a (named) character vector, list, `join_by()` result, ", "or NULL, not {obj_type_friendly(x)}." )) abort(message, call = error_call) } #' @export as_join_by.dplyr_join_by <- function(x, error_call = caller_env()) { x } #' @export as_join_by.character <- function(x, error_call = caller_env()) { x_names <- names(x) %||% x y_names <- unname(x) # If x partially named, assume unnamed are the same in both tables x_names[x_names == ""] <- y_names[x_names == ""] finalise_equi_join_by(x_names, y_names) } #' @export as_join_by.list <- function(x, error_call = caller_env()) { # TODO: check lengths x_names <- x[["x"]] y_names <- x[["y"]] if (!is_character(x_names)) { abort("`by$x` must evaluate to a character vector.") } if (!is_character(y_names)) { abort("`by$y` must evaluate to a character vector.") } finalise_equi_join_by(x_names, y_names) } finalise_equi_join_by <- function(x_names, y_names) { n <- length(x_names) if (n == 0L) { abort( "Backwards compatible support for cross joins should have been caught earlier.", .internal = TRUE ) } exprs <- map2(x_names, y_names, function(x, y) expr(!!x == !!y)) condition <- vec_rep("==", times = n) filter <- vec_rep("none", times = n) new_join_by( exprs = exprs, condition = condition, filter = filter, x = x_names, y = y_names ) } # ------------------------------------------------------------------------------ join_by_common <- function(x_names, y_names, ..., error_call = caller_env()) { check_dots_empty0(...) by <- intersect(x_names, y_names) if (length(by) == 0) { message <- c( "`by` must be supplied when `x` and `y` have no common variables.", i = "Use `cross_join()` to perform a cross-join." ) abort(message, call = error_call) } by_names <- tick_if_needed(by) by_names <- glue_collapse(by_names, sep = ", ") inform(glue("Joining with `by = join_by({by_names})`")) finalise_equi_join_by(by, by) } # ------------------------------------------------------------------------------ # In the parsing implementation below, note that all `binding_*()` functions # should maintain a function signature that exactly matches what is documented # in `?join_by`, as these get bound directly to their corresponding function # name, i.e. `binding_join_by_between()` is bound to `between()`. This is why # these functions don't have an `error_call` argument. parse_join_by_expr <- function(expr, i, error_call) { if (is_missing(expr)) { message <- c( "Expressions can't be missing.", x = glue("Expression {i} is missing.") ) abort(message, call = error_call) } if (length(expr) == 0L) { message <- c( "Expressions can't be empty.", x = glue("Expression {i} is empty.") ) abort(message, call = error_call) } if (is_symbol_or_string(expr)) { expr <- expr(!!expr == !!expr) } if (!is_call(expr)) { message <- c( "Each element of `...` must be a single column name or a join by expression.", x = glue("Element {i} is not a name and not an expression.") ) abort(message, call = error_call) } if (is_call(expr, ns = "dplyr")) { # Normalize by removing the `dplyr::` expr[[1]] <- sym(call_name(expr)) } op <- expr[[1]] if (!is_symbol(op)) { if (is_call(op, name = "::")) { stop_invalid_namespaced_expression(expr, i, error_call) } else { stop_invalid_top_expression(expr, i, error_call) } } op <- as_string(op) switch( op, "==" = , ">=" = , ">" = , "<=" = , "<" = parse_join_by_binary(expr, i, error_call), "between" = parse_join_by_between(expr, i, error_call), "within" = parse_join_by_within(expr, i, error_call), "overlaps" = parse_join_by_overlaps(expr, i, error_call), "closest" = parse_join_by_closest(expr, i, error_call), "$" = stop_invalid_dollar_sign(expr, i, error_call), stop_invalid_top_expression(expr, i, error_call) ) } stop_invalid_dollar_sign <- function(expr, i, call) { message <- c( "Can't use `$` when specifying a single column name.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = call) } stop_invalid_top_expression <- function(expr, i, call) { options <- c( "==", ">=", ">", "<=", "<", "closest()", "between()", "overlaps()", "within()" ) options <- glue::backtick(options) options <- glue_collapse(options, sep = ", ", last = ", or ") message <- c( glue("Expressions must use one of: {options}."), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = call) } stop_invalid_namespaced_expression <- function(expr, i, call) { message <- c( glue("Expressions can only be namespace prefixed with `dplyr::`."), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = call) } parse_join_by_name <- function(expr, i, default_side, error_call) { if (is_symbol_or_string(expr)) { name <- as_string(expr) return(list(name = name, side = default_side)) } if (is_call(expr, name = "$")) { return(parse_join_by_dollar(expr, i, error_call)) } message <- c( paste0( "Expressions can't contain computed columns, ", "and can only reference columns by name or by explicitly specifying ", "a side, like `x$col` or `y$col`." ), i = glue("Expression {i} contains {err_expr(expr)}.") ) abort(message, call = error_call) } parse_join_by_dollar <- function(expr, i, error_call) { args <- eval_join_by_dollar(expr, error_call) side <- args$side if (!is_symbol_or_string(side)) { message <- c( "The left-hand side of a `$` expression must be a symbol or string.", i = glue("Expression {i} contains {err_expr(expr)}.") ) abort(message, call = error_call) } side <- as_string(side) sides <- c("x", "y") if (!side %in% sides) { message <- c( "The left-hand side of a `$` expression must be either `x$` or `y$`.", i = glue("Expression {i} contains {err_expr(expr)}.") ) abort(message, call = error_call) } name <- args$name if (!is_symbol_or_string(name)) { message <- c( "The right-hand side of a `$` expression must be a symbol or string.", i = glue("Expression {i} contains {err_expr(expr)}.") ) abort(message, call = error_call) } name <- as_string(name) list(name = name, side = side) } eval_join_by_dollar <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_poke(env, "$", binding_join_by_dollar) eval_tidy(expr, env = env) } binding_join_by_dollar <- function(x, name) { error_call <- caller_env() x <- enexpr(x) name <- enexpr(name) check_missing_arg(x, "x", "$", error_call, binary_op = TRUE) check_missing_arg(name, "name", "$", error_call, binary_op = TRUE) list(side = x, name = name) } parse_join_by_binary <- function(expr, i, error_call) { args <- eval_join_by_binary(expr, error_call) condition <- args$condition lhs <- args$lhs rhs <- args$rhs lhs <- parse_join_by_name(lhs, i, default_side = "x", error_call = error_call) rhs <- parse_join_by_name(rhs, i, default_side = "y", error_call = error_call) if (lhs$side == rhs$side) { message <- c( "The left and right-hand sides of a binary expression must reference different tables.", i = glue("Expression {i} contains {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs$side == "x") { x <- lhs$name y <- rhs$name } else { # Must reverse the op lookup <- c("==" = "==", ">=" = "<=", ">" = "<", "<=" = ">=", "<" = ">") condition <- lookup[[condition]] x <- rhs$name y <- lhs$name } list( x = x, y = y, condition = condition, filter = "none" ) } eval_join_by_binary <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_bind( env, `==` = binding_join_by_equality, `>` = binding_join_by_greater_than, `>=` = binding_join_by_greater_than_or_equal, `<` = binding_join_by_less_than, `<=` = binding_join_by_less_than_or_equal ) eval_tidy(expr, env = env) } binding_join_by_binary <- function(condition, error_call, x, y) { x <- enexpr(x) y <- enexpr(y) check_missing_arg(x, "x", condition, error_call, binary_op = TRUE) check_missing_arg(y, "y", condition, error_call, binary_op = TRUE) list(condition = condition, lhs = x, rhs = y) } binding_join_by_equality <- function(x, y) { binding_join_by_binary("==", caller_env(), !!enexpr(x), !!enexpr(y)) } binding_join_by_greater_than <- function(x, y) { binding_join_by_binary(">", caller_env(), !!enexpr(x), !!enexpr(y)) } binding_join_by_greater_than_or_equal <- function(x, y) { binding_join_by_binary(">=", caller_env(), !!enexpr(x), !!enexpr(y)) } binding_join_by_less_than <- function(x, y) { binding_join_by_binary("<", caller_env(), !!enexpr(x), !!enexpr(y)) } binding_join_by_less_than_or_equal <- function(x, y) { binding_join_by_binary("<=", caller_env(), !!enexpr(x), !!enexpr(y)) } parse_join_by_closest <- function(expr, i, error_call) { args <- eval_join_by_closest(expr, error_call) expr_binary <- args$expr if (!is_call(expr_binary)) { message <- c( "The first argument of `closest()` must be an expression.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } op <- as_string(expr_binary[[1]]) out <- switch( op, ">=" = , ">" = , "<=" = , "<" = parse_join_by_binary(expr_binary, i, error_call), "==" = stop_join_by_closest_equal_expression(expr, i, error_call), stop_join_by_closest_invalid_expression(expr, i, error_call) ) filter <- switch( out$condition, ">=" = "max", ">" = "max", "<=" = "min", "<" = "min", abort("Unexpected `closest()` `condition`.", .internal = TRUE) ) out$filter <- filter out } eval_join_by_closest <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_poke(env, "closest", binding_join_by_closest) eval_tidy(expr, env = env) } binding_join_by_closest <- function(expr) { error_call <- caller_env() expr <- enexpr(expr) check_missing_arg(expr, "expr", "closest", error_call) list(expr = expr) } stop_join_by_closest_equal_expression <- function(expr, i, error_call) { # `closest(x == y)` doesn't make any sense, # even if vctrs can technically handle it. message <- c( "The expression used in `closest()` can't use `==`.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } stop_join_by_closest_invalid_expression <- function(expr, i, error_call) { options <- c(">=", ">", "<=", "<") options <- glue::backtick(options) options <- glue_collapse(options, sep = ", ", last = ", or ") message <- c( glue("The expression used in `closest()` must use one of: {options}."), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } parse_join_by_between <- function(expr, i, error_call) { args <- eval_join_by_between(expr, error_call) lhs <- parse_join_by_name(args$lhs, i, "x", error_call) rhs_lower <- parse_join_by_name(args$rhs_lower, i, "y", error_call) rhs_upper <- parse_join_by_name(args$rhs_upper, i, "y", error_call) bounds <- args$bounds if (rhs_lower$side != rhs_upper$side) { message <- c( "Expressions containing `between()` must reference the same table for the lower and upper bounds.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs$side == rhs_lower$side) { message <- c( "Expressions containing `between()` can't all reference the same table.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs$side == "x") { x <- c(lhs$name, lhs$name) y <- c(rhs_lower$name, rhs_upper$name) condition <- switch( bounds, "[]" = c(">=", "<="), "[)" = c(">=", "<"), "(]" = c(">", "<="), "()" = c(">", "<") ) } else { x <- c(rhs_lower$name, rhs_upper$name) y <- c(lhs$name, lhs$name) condition <- switch( bounds, "[]" = c("<=", ">="), "[)" = c("<=", ">"), "(]" = c("<", ">="), "()" = c("<", ">") ) } filter <- c("none", "none") list( x = x, y = y, condition = condition, filter = filter ) } eval_join_by_between <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_poke(env, "between", binding_join_by_between) eval_tidy(expr, env = env) } binding_join_by_between <- function(x, y_lower, y_upper, ..., bounds = "[]") { error_call <- caller_env() check_join_by_dots_empty(..., fn = "between", call = error_call) x <- enexpr(x) y_lower <- enexpr(y_lower) y_upper <- enexpr(y_upper) check_missing_arg(x, "x", "between", error_call) check_missing_arg(y_lower, "y_lower", "between", error_call) check_missing_arg(y_upper, "y_upper", "between", error_call) bounds <- check_bounds(bounds, call = error_call) list(lhs = x, rhs_lower = y_lower, rhs_upper = y_upper, bounds = bounds) } parse_join_by_within <- function(expr, i, error_call) { args <- eval_join_by_within(expr, error_call) lhs_lower <- parse_join_by_name(args$lhs_lower, i, "x", error_call) lhs_upper <- parse_join_by_name(args$lhs_upper, i, "x", error_call) rhs_lower <- parse_join_by_name(args$rhs_lower, i, "y", error_call) rhs_upper <- parse_join_by_name(args$rhs_upper, i, "y", error_call) if (lhs_lower$side != lhs_upper$side) { message <- c( paste0( "Expressions containing `within()` must reference ", "the same table for the left-hand side lower and upper bounds." ), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (rhs_lower$side != rhs_upper$side) { message <- c( paste0( "Expressions containing `within()` must reference ", "the same table for the right-hand side lower and upper bounds." ), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs_lower$side == rhs_lower$side) { message <- c( "Expressions containing `within()` can't all reference the same table.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs_lower$side == "x") { x <- c(lhs_lower$name, lhs_upper$name) y <- c(rhs_lower$name, rhs_upper$name) condition <- c(">=", "<=") } else { x <- c(rhs_lower$name, rhs_upper$name) y <- c(lhs_lower$name, lhs_upper$name) condition <- c("<=", ">=") } filter <- c("none", "none") list( x = x, y = y, condition = condition, filter = filter ) } eval_join_by_within <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_poke(env, "within", binding_join_by_within) eval_tidy(expr, env = env) } binding_join_by_within <- function(x_lower, x_upper, y_lower, y_upper) { error_call <- caller_env() x_lower <- enexpr(x_lower) x_upper <- enexpr(x_upper) y_lower <- enexpr(y_lower) y_upper <- enexpr(y_upper) check_missing_arg(x_lower, "x_lower", "within", error_call) check_missing_arg(x_upper, "x_upper", "within", error_call) check_missing_arg(y_lower, "y_lower", "within", error_call) check_missing_arg(y_upper, "y_upper", "within", error_call) list( lhs_lower = x_lower, lhs_upper = x_upper, rhs_lower = y_lower, rhs_upper = y_upper ) } parse_join_by_overlaps <- function(expr, i, error_call) { args <- eval_join_by_overlaps(expr, error_call) lhs_lower <- parse_join_by_name(args$lhs_lower, i, "x", error_call) lhs_upper <- parse_join_by_name(args$lhs_upper, i, "x", error_call) rhs_lower <- parse_join_by_name(args$rhs_lower, i, "y", error_call) rhs_upper <- parse_join_by_name(args$rhs_upper, i, "y", error_call) bounds <- args$bounds if (lhs_lower$side != lhs_upper$side) { message <- c( paste0( "Expressions containing `overlaps()` must reference ", "the same table for the left-hand side lower and upper bounds." ), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (rhs_lower$side != rhs_upper$side) { message <- c( paste0( "Expressions containing `overlaps()` must reference ", "the same table for the right-hand side lower and upper bounds." ), i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } if (lhs_lower$side == rhs_lower$side) { message <- c( "Expressions containing `overlaps()` can't all reference the same table.", i = glue("Expression {i} is {err_expr(expr)}.") ) abort(message, call = error_call) } # 3 of the `bounds` have the exact same behavior, but the argument name is # consistent with `between(bounds =)` and easier to remember and interpret # than exposing `closed` directly (#6504). # - `[]` uses `<=` and `>=` # - All other conditions use `<` and `>` due to the presence of a `(` or `)` closed <- switch( bounds, "[]" = TRUE, "[)" = FALSE, "(]" = FALSE, "()" = FALSE, abort("Unknown `bounds`.", .internal = TRUE) ) if (lhs_lower$side == "x") { x <- c(lhs_lower$name, lhs_upper$name) y <- c(rhs_upper$name, rhs_lower$name) if (closed) { condition <- c("<=", ">=") } else { condition <- c("<", ">") } } else { x <- c(rhs_upper$name, rhs_lower$name) y <- c(lhs_lower$name, lhs_upper$name) if (closed) { condition <- c(">=", "<=") } else { condition <- c(">", "<") } } filter <- c("none", "none") list( x = x, y = y, condition = condition, filter = filter ) } eval_join_by_overlaps <- function(expr, error_call) { env <- new_environment() local_error_call(error_call, frame = env) env_poke(env, "overlaps", binding_join_by_overlaps) eval_tidy(expr, env = env) } binding_join_by_overlaps <- function( x_lower, x_upper, y_lower, y_upper, ..., bounds = "[]" ) { error_call <- caller_env() check_join_by_dots_empty(..., fn = "overlaps", call = error_call) x_lower <- enexpr(x_lower) x_upper <- enexpr(x_upper) y_lower <- enexpr(y_lower) y_upper <- enexpr(y_upper) check_missing_arg(x_lower, "x_lower", "overlaps", error_call) check_missing_arg(x_upper, "x_upper", "overlaps", error_call) check_missing_arg(y_lower, "y_lower", "overlaps", error_call) check_missing_arg(y_upper, "y_upper", "overlaps", error_call) bounds <- check_bounds(bounds, call = error_call) list( lhs_lower = x_lower, lhs_upper = x_upper, rhs_lower = y_lower, rhs_upper = y_upper, bounds = bounds ) } check_bounds <- function(bounds, call) { arg_match0( bounds, values = c("[]", "[)", "(]", "()"), error_call = call ) } check_join_by_dots_empty <- function(..., fn, call) { if (dots_n(...) == 0L) { return() } fn <- glue::backtick(glue("{fn}()")) message <- c( "`...` must be empty.", i = glue("Non-empty dots were detected inside {fn}.") ) abort(message, call = call) } check_missing_arg <- function( arg, arg_name, fn_name, error_call, ..., binary_op = FALSE ) { check_dots_empty0(...) if (!is_missing(arg)) { return(invisible()) } if (!binary_op) { fn_name <- glue("{fn_name}()") } arg_name <- glue::backtick(arg_name) fn_name <- glue::backtick(fn_name) message <- c( glue("Expressions using {fn_name} can't contain missing arguments."), x = glue("Argument {arg_name} is missing.") ) abort(message, call = error_call) } is_symbol_or_string <- function(x) { is_symbol(x) || is_string(x) } err_expr <- function(expr) { expr <- expr_deparse(expr) expr <- glue::backtick(expr) expr } dplyr/R/colwise-funs.R0000644000176200001440000000736415137161765014374 0ustar liggesusersas_fun_list <- function( .funs, .env, ..., .caller, .caller_arg = "...", error_call = caller_env(), .user_env = caller_env(2) ) { args <- list2(...) force(.user_env) if (is_fun_list(.funs)) { if (!is_empty(args)) { .funs[] <- map(.funs, call_modify, !!!args) } return(.funs) } if (is_list(.funs) && length(.funs) > 1) { .funs <- auto_name_formulas(.funs) } if (!is_character(.funs) && !is_list(.funs)) { .funs <- list(.funs) } if (is_character(.funs) && is_null(names(.funs)) && length(.funs) != 1L) { names(.funs) <- .funs } funs <- map(.funs, function(.x) { if (is_formula(.x)) { if (is_quosure(.x)) { what <- paste0( "dplyr::", .caller, "(", .caller_arg, " = ", "'can\\'t contain quosures')" ) # Only used in superceded functions, so advancing to `deprecate_stop()` # doesn't feel worth it lifecycle::deprecate_warn( "0.8.3", what, details = "Please use a one-sided formula, a function, or a function name.", always = TRUE, env = .env, user_env = .user_env, id = "dplyr-funs-quosures" ) .x <- new_formula(NULL, quo_squash(.x), quo_get_env(.x)) } .x <- as_inlined_function(.x, env = .env) } else { if (is_character(.x)) { .x <- get(.x, .env, mode = "function") } else if (!is_function(.x)) { msg <- "`.funs` must be a one sided formula, a function, or a function name." abort(msg, call = error_call) } if (length(args)) { .x <- new_quosure( call2(.x, quote(.), !!!args), env = .env ) } } .x }) attr(funs, "have_name") <- any(names2(funs) != "") funs } auto_name_formulas <- function(funs) { where <- !have_name(funs) & map_lgl(funs, function(x) is_bare_formula(x) && is_call(f_rhs(x))) names(funs)[where] <- map_chr( funs[where], function(x) as_label(f_rhs(x)[[1]]) ) funs } as_fun <- function(.x, .env, .args, error_call = caller_env()) { quo <- as_quosure(.x, .env) # For legacy reasons, we support strings. Those are enclosed in the # empty environment and need to be switched to the caller environment. quo <- quo_set_env(quo, fun_env(quo, .env)) expr <- quo_get_expr(quo) if (is_call(expr, c("function", "~"))) { top_level <- as_string(expr[[1]]) msg <- glue( "`{quo_text(expr)}` must be a function name (quoted or unquoted) or an unquoted call, not `{top_level}`." ) abort(msg, call = error_call) } if (is_call(expr) && !is_call(expr, c("::", ":::"))) { expr <- call_modify(expr, !!!.args) } else { expr <- call2(expr, quote(.), !!!.args) } set_expr(quo, expr) } quo_as_function <- function(quo) { new_function(exprs(. = ), quo_get_expr(quo), quo_get_env(quo)) } fun_env <- function(quo, default_env) { env <- quo_get_env(quo) if (is_null(env) || identical(env, empty_env())) { default_env } else { env } } is_fun_list <- function(x) { inherits(x, "fun_list") } #' @export `[.fun_list` <- function(x, i) { structure(NextMethod(), class = "fun_list", has_names = attr(x, "has_names")) } #' @export print.fun_list <- function(x, ..., width = getOption("width")) { cat("\n") names <- format(names(x)) code <- map_chr(x, function(x) { deparse_trunc(quo_get_expr(x), width - 2 - nchar(names[1])) }) cat(paste0("$ ", names, ": ", code, collapse = "\n")) cat("\n") invisible(x) } deparse_trunc <- function(x, width = getOption("width")) { text <- deparse(x, width.cutoff = width) if (length(text) == 1 && nchar(text) < width) { return(text) } paste0(substr(text[1], 1, width - 3), "...") } dplyr/R/group-data.R0000644000176200001440000001014715137161765014012 0ustar liggesusers#' Grouping metadata #' #' @description #' This collection of functions accesses data about grouped data frames in #' various ways: #' #' * `group_data()` returns a data frame that defines the grouping structure. #' The columns give the values of the grouping variables. The last column, #' always called `.rows`, is a list of integer vectors that gives the #' location of the rows in each group. #' #' * `group_keys()` returns a data frame describing the groups. #' #' * `group_rows()` returns a list of integer vectors giving the rows that #' each group contains. #' #' * `group_indices()` returns an integer vector the same length as `.data` #' that gives the group that each row belongs to. #' #' * `group_vars()` gives names of grouping variables as character vector. #' #' * `groups()` gives the names of the grouping variables as a list of symbols. #' #' * `group_size()` gives the size of each group. #' #' * `n_groups()` gives the total number of groups. #' #' See [context] for equivalent functions that return values for the _current_ #' group. #' @param .data,.tbl,x A data frame or extension (like a tibble or grouped #' tibble). #' @param ... Unused. #' @keywords internal #' @examples #' df <- tibble(x = c(1,1,2,2)) #' group_vars(df) #' group_rows(df) #' group_data(df) #' group_indices(df) #' #' gf <- group_by(df, x) #' group_vars(gf) #' group_rows(gf) #' group_data(gf) #' group_indices(gf) #' @export group_data <- function(.data) { UseMethod("group_data") } #' @export group_data.data.frame <- function(.data) { size <- nrow(.data) out <- seq_len(size) out <- new_list_of(list(out), ptype = integer()) out <- list(.rows = out) out <- new_data_frame(out, n = 1L) out } #' @export group_data.tbl_df <- function(.data) { out <- NextMethod() out <- dplyr_new_tibble(out, size = 1L) out } #' @export group_data.rowwise_df <- function(.data) { attr(.data, "groups") } #' @export group_data.grouped_df <- function(.data) { error_call <- current_env() withCallingHandlers( validate_grouped_df(.data), error = function(cnd) { msg <- glue("`.data` must be a valid object.") abort(msg, parent = cnd, call = error_call) } ) attr(.data, "groups") } # ------------------------------------------------------------------------- #' @rdname group_data #' @export group_keys <- function(.tbl, ...) { UseMethod("group_keys") } #' @export group_keys.data.frame <- function(.tbl, ...) { if (dots_n(...) > 0) { lifecycle::deprecate_stop( "1.0.0", "group_keys(... = )", details = "Please `group_by()` first" ) } out <- group_data(.tbl) group_keys0(out) } group_keys0 <- function(x) { # Compute keys directly from `group_data()` results .Call(`dplyr_group_keys`, x) } #' @rdname group_data #' @export group_rows <- function(.data) { group_data(.data)[[".rows"]] } #' @export #' @rdname group_data group_indices <- function(.data, ...) { if (nargs() == 0) { lifecycle::deprecate_warn( "1.0.0", I("`group_indices()` with no arguments"), "cur_group_id()", always = TRUE, id = "dplyr-group-indices-no-arguments" ) return(cur_group_id()) } UseMethod("group_indices") } #' @export group_indices.data.frame <- function(.data, ...) { if (dots_n(...) > 0) { lifecycle::deprecate_stop( "1.0.0", "group_indices(... = )", details = "Please `group_by()` first" ) .data <- group_by(.data, ...) } .Call(`dplyr_group_indices`, .data, group_rows(.data)) } #' @export #' @rdname group_data group_vars <- function(x) { UseMethod("group_vars") } #' @export group_vars.data.frame <- function(x) { setdiff(names(group_data(x)), ".rows") } #' @export #' @rdname group_data groups <- function(x) { UseMethod("groups") } #' @export groups.data.frame <- function(x) { syms(group_vars(x)) } #' @export #' @rdname group_data group_size <- function(x) UseMethod("group_size") #' @export group_size.data.frame <- function(x) { lengths(group_rows(x)) } #' @export #' @rdname group_data n_groups <- function(x) UseMethod("n_groups") #' @export n_groups.data.frame <- function(x) { nrow(group_data(x)) } dplyr/R/deprec-do.R0000644000176200001440000001614015106134104013567 0ustar liggesusers#' Do anything #' #' @description #' `r lifecycle::badge("superseded")` #' #' `do()` is superseded as of dplyr 1.0.0, because its syntax never really #' felt like it belonged with the rest of dplyr. It's replaced by a combination #' of [reframe()] (which can produce multiple rows and multiple columns), #' [nest_by()] (which creates a [rowwise] tibble of nested data), #' and [pick()] (which allows you to access the data for the "current" group). #' #' @param .data a tbl #' @param ... Expressions to apply to each group. If named, results will be #' stored in a new column. If unnamed, must return a data frame. You can #' use `.` to refer to the current group. You can not mix named and #' unnamed arguments. #' @keywords internal #' @export #' @examples #' # do() with unnamed arguments becomes reframe() or summarise() #' # . becomes pick() #' by_cyl <- mtcars |> group_by(cyl) #' by_cyl |> do(head(., 2)) #' # -> #' by_cyl |> reframe(head(pick(everything()), 2)) #' by_cyl |> slice_head(n = 2) #' #' # Can refer to variables directly #' by_cyl |> do(mean = mean(.$vs)) #' # -> #' by_cyl |> summarise(mean = mean(vs)) #' #' # do() with named arguments becomes nest_by() + mutate() & list() #' models <- by_cyl |> do(mod = lm(mpg ~ disp, data = .)) #' # -> #' models <- mtcars |> #' nest_by(cyl) |> #' mutate(mod = list(lm(mpg ~ disp, data = data))) #' models |> summarise(rsq = summary(mod)$r.squared) #' #' # use broom to turn models into data #' models |> do(data.frame( #' var = names(coef(.$mod)), #' coef(summary(.$mod))) #' ) #' @examplesIf requireNamespace("broom", quietly = TRUE) #' # -> #' models |> reframe(broom::tidy(mod)) do <- function(.data, ...) { lifecycle::signal_stage("superseded", "do()") UseMethod("do") } #' @export do.NULL <- function(.data, ...) { NULL } #' @export do.grouped_df <- function(.data, ...) { index <- group_rows(.data) labels <- select(group_data(.data), -last_col()) attr(labels, ".drop") <- NULL # Create ungroup version of data frame suitable for subsetting group_data <- ungroup(.data) args <- enquos(...) named <- named_args(args) mask <- new_data_mask(new_environment()) n <- length(index) m <- length(args) # Special case for zero-group/zero-row input if (n == 0) { if (named) { out <- rep_len(list(list()), length(args)) out <- set_names(out, names(args)) out <- label_output_list(labels, out, groups(.data)) } else { env_bind_do_pronouns(mask, group_data) out <- eval_tidy(args[[1]], mask) out <- out[0, , drop = FALSE] out <- label_output_dataframe( labels, list(list(out)), group_vars(.data), group_by_drop_default(.data) ) } return(out) } # Add pronouns with active bindings that resolve to the current # subset. `_i` is found in environment of this function because of # usual scoping rules. group_slice <- function(value) { if (missing(value)) { group_data[index[[`_i`]], , drop = FALSE] } else { group_data[index[[`_i`]], ] <<- value } } env_bind_do_pronouns(mask, group_slice) out <- replicate(m, vector("list", n), simplify = FALSE) names(out) <- names(args) p <- Progress$new(n * m, min_time = 2) for (`_i` in seq_len(n)) { for (j in seq_len(m)) { out[[j]][`_i`] <- list(eval_tidy(args[[j]], mask)) p$tick()$print() } } if (!named) { label_output_dataframe( labels, out, group_vars(.data), group_by_drop_default(.data) ) } else { label_output_list(labels, out, group_vars(.data)) } } #' @export do.data.frame <- function(.data, ...) { args <- enquos(...) named <- named_args(args) # Create custom data mask with `.` pronoun mask <- new_data_mask(new_environment()) env_bind_do_pronouns(mask, .data) if (!named) { out <- eval_tidy(args[[1]], mask) if (!inherits(out, "data.frame")) { msg <- glue("Result must be a data frame, not {fmt_classes(out)}.") abort(msg) } } else { out <- map(args, function(arg) list(eval_tidy(arg, mask))) names(out) <- names(args) out <- tibble::as_tibble(out, .name_repair = "minimal") } out } # Helper functions ------------------------------------------------------------- env_bind_do_pronouns <- function(env, data) { if (is_function(data)) { bind <- env_bind_active } else { bind <- env_bind } # Use `:=` for `.` to avoid partial matching with `.env` bind(env, "." := data, .data = data) } label_output_dataframe <- function( labels, out, groups, .drop, error_call = caller_env() ) { data_frame <- vapply(out[[1]], is.data.frame, logical(1)) if (any(!data_frame)) { msg <- glue( "Results {bad} must be data frames, not {first_bad_class}.", bad = fmt_comma(which(!data_frame)), first_bad_class = fmt_classes(out[[1]][[which.min(data_frame)]]) ) abort(msg, call = error_call) } rows <- vapply(out[[1]], nrow, numeric(1)) out <- bind_rows(out[[1]]) if (!is.null(labels)) { # Remove any common columns from labels labels <- labels[setdiff(names(labels), names(out))] # Repeat each row to match data labels <- labels[rep(seq_len(nrow(labels)), rows), , drop = FALSE] rownames(labels) <- NULL grouped_df(bind_cols(labels, out), groups, .drop) } else { rowwise(out) } } label_output_list <- function(labels, out, groups) { if (!is.null(labels)) { labels[names(out)] <- out rowwise(labels) } else { class(out) <- "data.frame" attr(out, "row.names") <- .set_row_names(length(out[[1]])) rowwise(out) } } named_args <- function(args, error_call = caller_env()) { # Arguments must either be all named or all unnamed. named <- sum(names2(args) != "") if (!(named == 0 || named == length(args))) { msg <- "Arguments must either be all named or all unnamed." abort(msg, call = error_call) } if (named == 0 && length(args) > 1) { msg <- glue("Can only supply one unnamed argument, not {length(args)}.") abort(msg, call = error_call) } named != 0 } #' @export do.rowwise_df <- function(.data, ...) { # Create ungroup version of data frame suitable for subsetting group_data <- ungroup(.data) args <- enquos(...) named <- named_args(args) # Create new environment, inheriting from parent, with an active binding # for . that resolves to the current subset. `_i` is found in environment # of this function because of usual scoping rules. mask <- new_data_mask(new_environment()) current_row <- function() lapply(group_data[`_i`, , drop = FALSE], "[[", 1) env_bind_do_pronouns(mask, current_row) n <- nrow(.data) m <- length(args) out <- replicate(m, vector("list", n), simplify = FALSE) names(out) <- names(args) p <- rlang::with_options( lifecycle_verbosity = "quiet", progress_estimated(n * m, min_time = 2) ) for (`_i` in seq_len(n)) { for (j in seq_len(m)) { out[[j]][`_i`] <- list(eval_tidy(args[[j]], mask)) p$tick()$print() } } if (!named) { label_output_dataframe( NULL, out, groups(.data), group_by_drop_default(.data) ) } else { label_output_list(NULL, out, groups(.data)) } } dplyr/R/near.R0000644000176200001440000000066213663216626012674 0ustar liggesusers#' Compare two numeric vectors #' #' This is a safe way of comparing if two vectors of floating point numbers #' are (pairwise) equal. This is safer than using `==`, because it has #' a built in tolerance #' #' @param x,y Numeric vectors to compare #' @param tol Tolerance of comparison. #' @export #' @examples #' sqrt(2) ^ 2 == 2 #' near(sqrt(2) ^ 2, 2) near <- function(x, y, tol = .Machine$double.eps^0.5) { abs(x - y) < tol } dplyr/R/ts.R0000644000176200001440000000034014266276767012402 0ustar liggesusers#' @export filter.ts <- function(.data, ...) { bullets <- c( "Incompatible data source.", x = '`.data` is a object, not a data source.', i = "Did you want to use `stats::filter()`?" ) abort(bullets) } dplyr/R/group-map.R0000644000176200001440000001514415106134104013637 0ustar liggesusersas_group_map_function <- function(.f, error_call = caller_env()) { .f <- rlang::as_function(.f) if (length(form <- formals(.f)) < 2 && !"..." %in% names(form)) { bullets <- c( "`.f` must accept at least two arguments.", i = "You can use `...` to absorb unused components." ) abort(bullets, call = error_call) } .f } #' Apply a function to each group #' #' @description #' `r lifecycle::badge("experimental")` #' #' `group_map()`, `group_modify()` and `group_walk()` are purrr-style functions that can #' be used to iterate on grouped tibbles. #' #' @details #' Use `group_modify()` when `summarize()` is too limited, in terms of what you need #' to do and return for each group. `group_modify()` is good for "data frame in, data frame out". #' If that is too limited, you need to use a [nested][group_nest()] or [split][group_split()] workflow. #' `group_modify()` is an evolution of [do()], if you have used that before. #' #' Each conceptual group of the data frame is exposed to the function `.f` with two pieces of information: #' #' - The subset of the data for the group, exposed as `.x`. #' - The key, a tibble with exactly one row and columns for each grouping variable, exposed as `.y`. #' #' For completeness, `group_modify()`, `group_map` and `group_walk()` also work on #' ungrouped data frames, in that case the function is applied to the #' entire data frame (exposed as `.x`), and `.y` is a one row tibble with no #' column, consistently with [group_keys()]. #' #' @family grouping functions #' #' @param .data A grouped tibble #' @param .f A function or formula to apply to each group. #' #' If a __function__, it is used as is. It should have at least 2 formal arguments. #' #' If a __formula__, e.g. `~ head(.x)`, it is converted to a function. #' #' In the formula, you can use #' #' - `.` or `.x` to refer to the subset of rows of `.tbl` #' for the given group #' #' - `.y` to refer to the key, a one row tibble with one column per grouping variable #' that identifies the group #' #' @param ... Additional arguments passed on to `.f` #' @param .keep are the grouping variables kept in `.x` #' #' @return #' - `group_modify()` returns a grouped tibble. In that case `.f` must return a data frame. #' - `group_map()` returns a list of results from calling `.f` on each group. #' - `group_walk()` calls `.f` for side effects and returns the input `.tbl`, invisibly. #' #' @examples #' #' # return a list #' mtcars |> #' group_by(cyl) |> #' group_map(~ head(.x, 2L)) #' #' # return a tibble grouped by `cyl` with 2 rows per group #' # the grouping data is recalculated #' mtcars |> #' group_by(cyl) |> #' group_modify(~ head(.x, 2L)) #' @examplesIf requireNamespace("broom", quietly = TRUE) #' #' # a list of tibbles #' iris |> #' group_by(Species) |> #' group_map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))) #' #' # a restructured grouped tibble #' iris |> #' group_by(Species) |> #' group_modify(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))) #' @examples #' #' # a list of vectors #' iris |> #' group_by(Species) |> #' group_map(~ quantile(.x$Petal.Length, probs = c(0.25, 0.5, 0.75))) #' #' # to use group_modify() the lambda must return a data frame #' iris |> #' group_by(Species) |> #' group_modify(~ { #' quantile(.x$Petal.Length, probs = c(0.25, 0.5, 0.75)) |> #' tibble::enframe(name = "prob", value = "quantile") #' }) #' #' iris |> #' group_by(Species) |> #' group_modify(~ { #' .x |> #' purrr::map_dfc(fivenum) |> #' mutate(nms = c("min", "Q1", "median", "Q3", "max")) #' }) #' #' # group_walk() is for side effects #' dir.create(temp <- tempfile()) #' iris |> #' group_by(Species) |> #' group_walk(~ write.csv(.x, file = file.path(temp, paste0(.y$Species, ".csv")))) #' list.files(temp, pattern = "csv$") #' unlink(temp, recursive = TRUE) #' #' # group_modify() and ungrouped data frames #' mtcars |> #' group_modify(~ head(.x, 2L)) #' #' @export group_map <- function(.data, .f, ..., .keep = FALSE) { lifecycle::signal_stage("experimental", "group_map()") UseMethod("group_map") } #' @export group_map.data.frame <- function( .data, .f, ..., .keep = FALSE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_map(keep = )", "group_map(.keep = )" ) } .f <- as_group_map_function(.f) # call the function on each group chunks <- if (is_grouped_df(.data)) { group_split(.data, .keep = isTRUE(.keep)) } else { group_split(.data) } keys <- group_keys(.data) group_keys <- map(seq_len(nrow(keys)), function(i) keys[i, , drop = FALSE]) if (length(chunks)) { map2(chunks, group_keys, .f, ...) } else { # calling .f with .x and .y set to prototypes structure( list(), ptype = .f(attr(chunks, "ptype"), keys[integer(0L), ], ...) ) } } #' @rdname group_map #' @export group_modify <- function(.data, .f, ..., .keep = FALSE) { lifecycle::signal_stage("experimental", "group_map()") UseMethod("group_modify") } #' @export group_modify.data.frame <- function( .data, .f, ..., .keep = FALSE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_modify(keep = )", "group_modify(.keep = )" ) } .f <- as_group_map_function(.f) .f(.data, group_keys(.data), ...) } #' @export group_modify.grouped_df <- function( .data, .f, ..., .keep = FALSE, keep = deprecated() ) { if (!missing(keep)) { lifecycle::deprecate_stop( "1.0.0", "group_modify(keep = )", "group_modify(.keep = )" ) } tbl_group_vars <- group_vars(.data) .f <- as_group_map_function(.f) error_call <- current_env() fun <- function(.x, .y) { res <- .f(.x, .y, ...) if (!inherits(res, "data.frame")) { abort("The result of `.f` must be a data frame.", call = error_call) } if (any(bad <- names(res) %in% tbl_group_vars)) { msg <- glue( "The returned data frame cannot contain the original grouping variables: {names}.", names = paste(names(res)[bad], collapse = ", ") ) abort(msg, call = error_call) } bind_cols(.y[rep(1L, nrow(res)), , drop = FALSE], res) } chunks <- group_map(.data, fun, .keep = .keep) res <- if (length(chunks) > 0L) { bind_rows(!!!chunks) } else { attr(chunks, "ptype") } grouped_df(res, group_vars(.data), group_by_drop_default(.data)) } #' @export #' @rdname group_map group_walk <- function(.data, .f, ..., .keep = FALSE) { lifecycle::signal_stage("experimental", "group_walk()") group_map(.data, .f, ..., .keep = .keep) invisible(.data) } dplyr/R/recode.R0000644000176200001440000003203515106134104013167 0ustar liggesusers#' Recode values #' #' @description #' `r lifecycle::badge("superseded")` #' #' `recode()` is superseded in favor of [recode_values()] and #' [replace_values()], which are more general and have a much better interface. #' `recode_factor()` is also superseded, however, its direct replacement is not #' currently available but will eventually live in #' [forcats](https://forcats.tidyverse.org/). For creating new variables based #' on logical vectors, use [if_else()]. For even more complicated criteria, use #' [case_when()]. #' #' `recode()` is a vectorised version of [switch()]: you can replace numeric #' values based on their position or their name, and character or factor values #' only by their name. This is an S3 generic: dplyr provides methods for #' numeric, character, and factors. You can use `recode()` directly with #' factors; it will preserve the existing order of levels while changing the #' values. Alternatively, you can use `recode_factor()`, which will change the #' order of levels to match the order of replacements. #' #' @param .x A vector to modify #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Replacements. For character and factor `.x`, these should be named #' and replacement is based only on their name. For numeric `.x`, these can be #' named or not. If not named, the replacement is done based on position i.e. #' `.x` represents positions to look for in replacements. See examples. #' #' When named, the argument names should be the current values to be replaced, and the #' argument values should be the new (replacement) values. #' #' All replacements must be the same type, and must have either #' length one or the same length as `.x`. #' @param .default If supplied, all values not otherwise matched will #' be given this value. If not supplied and if the replacements are #' the same type as the original values in `.x`, unmatched #' values are not changed. If not supplied and if the replacements #' are not compatible, unmatched values are replaced with `NA`. #' #' `.default` must be either length 1 or the same length as #' `.x`. #' @param .missing If supplied, any missing values in `.x` will be #' replaced by this value. Must be either length 1 or the same length as #' `.x`. #' @param .ordered If `TRUE`, `recode_factor()` creates an #' ordered factor. #' @return A vector the same length as `.x`, and the same type as #' the first of `...`, `.default`, or `.missing`. #' `recode_factor()` returns a factor whose levels are in the same order as #' in `...`. The levels in `.default` and `.missing` come last. #' @seealso [recode_values()] #' @export #' @examples #' set.seed(1234) #' #' x <- sample(c("a", "b", "c"), 10, replace = TRUE) #' #' # `recode()` is superseded by `recode_values()` and `replace_values()` #' #' # If you are fully recoding a vector use `recode_values()` #' recode(x, a = "Apple", b = "Banana", .default = NA_character_) #' recode_values(x, "a" ~ "Apple", "b" ~ "Banana") #' #' # With a default #' recode(x, a = "Apple", b = "Banana", .default = "unknown") #' recode_values(x, "a" ~ "Apple", "b" ~ "Banana", default = "unknown") #' #' # If you are partially updating a vector and want to retain the original #' # vector's values in locations you don't make a replacement, use #' # `replace_values()` #' recode(x, a = "Apple", b = "Banana") #' replace_values(x, "a" ~ "Apple", "b" ~ "Banana") #' #' # `replace_values()` is easier to use with numeric vectors, because you don't #' # need to turn the numeric values into names #' y <- c(1:4, NA) #' recode(y, `2` = 20L, `4` = 40L) #' replace_values(y, 2 ~ 20L, 4 ~ 40L) #' #' # `recode()` is particularly confusing because it tries to handle both #' # full recodings to new vector types and partial updating of an existing #' # vector. With the above example, using doubles (20) rather than integers #' # (20L) results in a warning from `recode()`, because it thinks you are #' # doing a full recode and missed a case. `replace_values()` is type stable #' # on `y` and will instead coerce the double values to integer. #' recode(y, `2` = 20, `4` = 40) #' replace_values(y, 2 ~ 20, 4 ~ 40) #' #' # This also makes `replace_values()` much safer. If you provide #' # incompatible types, it will error. #' recode(y, `2` = "20", `4` = "40") #' try(replace_values(y, 2 ~ "20", 4 ~ "40")) #' #' # If you were trying to fully recode the vector and want a different output #' # type, use `recode_values()` #' recode_values(y, 2 ~ "20", 4 ~ "40") #' #' # And if you want to ensure you don't miss a case, use `unmatched`, which #' # errors rather than warns #' try(recode_values(y, 2 ~ "20", 4 ~ "40", unmatched = "error")) #' #' # --------------------------------------------------------------------------- #' # Lookup tables #' #' # If you were splicing an external lookup vector into `recode()`, you can #' # instead use the `from` and `to` arguments of `recode_values()` #' x <- c("a", "b", "a", "c", "d", "c") #' #' lookup <- c( #' "a" = "A", #' "b" = "B", #' "c" = "C", #' "d" = "D" #' ) #' #' recode(x, !!!lookup) #' recode_values(x, from = names(lookup), to = unname(lookup)) #' #' # `recode_values()` is much more flexible here because the lookup table #' # isn't restricted to just character values. We recommend using `tribble()` #' # to build your lookup tables. #' lookup <- tribble( #' ~from, ~to, #' "a", 1, #' "b", 2, #' "c", 3, #' "d", 4 #' ) #' #' recode_values(x, from = lookup$from, to = lookup$to) #' #' # --------------------------------------------------------------------------- #' # Factors #' #' # The factor method of `recode()` can generally be replaced with #' # `forcats::fct_recode()` #' x <- factor(c("a", "b", "c")) #' recode(x, a = "Apple") #' # forcats::fct_recode(x, "Apple" = "a") #' #' # `recode_factor()` does not currently have a direct replacement, but we #' # plan to add one to forcats. In the meantime, use a lookup table that #' # recodes every case, and then convert the `to` column to a factor. If you #' # define your lookup table in your preferred level order, then the conversion #' # to factor is straightforward! #' y <- c(3, 4, 1, 2, 4, NA) #' #' recode_factor( #' y, #' `1` = "a", #' `2` = "b", #' `3` = "c", #' `4` = "d", #' .missing = "M" #' ) #' #' lookup <- tribble( #' ~from, ~to, #' 1, "a", #' 2, "b", #' 3, "c", #' 4, "d", #' NA, "M" #' ) #' # `factor()` generates levels by sorting the unique values of `to`, which we #' # don't want, so we supply `levels = to` directly. Alternatively, use #' # `forcats::fct(to)`, which generates levels in order of appearance. #' lookup <- mutate(lookup, to = factor(to, levels = to)) #' #' recode_values(y, from = lookup$from, to = lookup$to) recode <- function(.x, ..., .default = NULL, .missing = NULL) { # Superseded in dplyr 1.1.0 lifecycle::signal_stage("superseded", "recode()", "recode_values()") UseMethod("recode") } #' @export recode.numeric <- function(.x, ..., .default = NULL, .missing = NULL) { values <- list2(...) nms <- have_name(values) if (all(nms)) { vals <- as.double(names(values)) } else if (all(!nms)) { vals <- seq_along(values) } else { msg <- "Either all values must be named, or none must be named." abort(msg) } n <- length(.x) template <- find_template(values, .default, .missing) out <- template[rep(NA_integer_, n)] replaced <- rep(FALSE, n) for (i in seq_along(values)) { out <- replace_with(out, .x == vals[i], values[[i]], paste0("Vector ", i)) replaced[.x == vals[i]] <- TRUE } .default <- validate_recode_default(.default, .x, out, replaced) out <- replace_with(out, !replaced & !is.na(.x), .default, "`.default`") out <- replace_with(out, is.na(.x), .missing, "`.missing`") out } #' @export recode.character <- function(.x, ..., .default = NULL, .missing = NULL) { .x <- as.character(.x) values <- list2(...) if (!all(have_name(values))) { bad <- which(!have_name(values)) + 1 msg <- glue("{fmt_pos_args(bad)} must be named.") abort(msg) } n <- length(.x) template <- find_template(values, .default, .missing) out <- template[rep(NA_integer_, n)] replaced <- rep(FALSE, n) for (nm in names(values)) { out <- replace_with(out, .x == nm, values[[nm]], paste0("`", nm, "`")) replaced[.x == nm] <- TRUE } .default <- validate_recode_default(.default, .x, out, replaced) out <- replace_with(out, !replaced & !is.na(.x), .default, "`.default`") out <- replace_with(out, is.na(.x), .missing, "`.missing`") out } #' @export recode.factor <- function(.x, ..., .default = NULL, .missing = NULL) { values <- list2(...) if (length(values) == 0) { abort("No replacements provided.") } if (!all(have_name(values))) { bad <- which(!have_name(values)) + 1 msg <- glue("{fmt_pos_args(bad)} must be named.") abort(msg) } if (!is.null(.missing)) { msg <- glue("`.missing` is not supported for factors.") abort(msg) } n <- length(levels(.x)) template <- find_template(values, .default, .missing) out <- template[rep(NA_integer_, n)] replaced <- rep(FALSE, n) for (nm in names(values)) { out <- replace_with( out, levels(.x) == nm, values[[nm]], paste0("`", nm, "`") ) replaced[levels(.x) == nm] <- TRUE } .default <- validate_recode_default(.default, .x, out, replaced) out <- replace_with(out, !replaced, .default, "`.default`") if (is.character(out)) { levels(.x) <- out .x } else { out[as.integer(.x)] } } find_template <- function( values, .default = NULL, .missing = NULL, error_call = caller_env() ) { x <- compact(c(values, .default, .missing)) if (length(x) == 0) { abort("No replacements provided.", call = error_call) } x[[1]] } validate_recode_default <- function(default, x, out, replaced) { default <- recode_default(x, default, out) if (is.null(default) && sum(replaced & !is.na(x)) < length(out[!is.na(x)])) { bullets <- c( "Unreplaced values treated as NA as `.x` is not compatible. ", "Please specify replacements exhaustively or supply `.default`." ) warn(bullets) } default } recode_default <- function(x, default, out) { UseMethod("recode_default") } #' @export recode_default.default <- function(x, default, out) { same_type <- identical(typeof(x), typeof(out)) if (is.null(default) && same_type) { x } else { default } } #' @export recode_default.factor <- function(x, default, out) { if (is.null(default)) { if ((is.character(out) || is.factor(out)) && is.factor(x)) { levels(x) } else { out[NA_integer_] } } else { default } } #' @rdname recode #' @export recode_factor <- function( .x, ..., .default = NULL, .missing = NULL, .ordered = FALSE ) { # Superseded in dplyr 1.1.0 lifecycle::signal_stage("superseded", "recode_factor()", "recode_values()") values <- list2(...) recoded <- recode(.x, !!!values, .default = .default, .missing = .missing) all_levels <- unique(c( values, recode_default(.x, .default, recoded), .missing )) recoded_levels <- if (is.factor(recoded)) levels(recoded) else unique(recoded) levels <- intersect(all_levels, recoded_levels) factor(recoded, levels, ordered = .ordered) } # ------------------------------------------------------------------------------ # Helpers replace_with <- function( x, i, val, name, reason = NULL, error_call = caller_env() ) { if (is.null(val)) { return(x) } check_length(val, x, name, reason, error_call = error_call) check_type(val, x, name, error_call = error_call) check_class(val, x, name, error_call = error_call) i[is.na(i)] <- FALSE if (length(val) == 1L) { x[i] <- val } else { x[i] <- val[i] } x } fmt_check_length_val <- function(length_x, n, header, reason = NULL) { if (all(length_x %in% c(1L, n))) { return() } if (is.null(reason)) { reason <- "" } else { reason <- glue(" ({reason})") } if (n == 1) { glue("{header} must be length 1{reason}, not {commas(length_x)}.") } else { glue("{header} must be length {n}{reason} or one, not {commas(length_x)}.") } } check_length_val <- function( length_x, n, header, reason = NULL, error_call = caller_env() ) { msg <- fmt_check_length_val(length_x, n, header, reason) if (length(msg)) { abort(msg, call = error_call) } } check_length <- function( x, template, header, reason = NULL, error_call = caller_env() ) { check_length_val( length(x), length(template), header, reason, error_call = error_call ) } check_type <- function(x, template, header, error_call = caller_env()) { if (identical(typeof(x), typeof(template))) { return() } msg <- glue( "{header} must be {obj_type_friendly(template)}, not {obj_type_friendly(x)}." ) abort(msg, call = error_call) } check_class <- function(x, template, header, error_call = caller_env()) { if (!is.object(x)) { return() } if (identical(class(x), class(template))) { return() } exp_classes <- fmt_classes(template) out_classes <- fmt_classes(x) msg <- glue( "{header} must have class `{exp_classes}`, not class `{out_classes}`." ) abort(msg, call = error_call) } dplyr/R/select-helpers.R0000644000176200001440000000433615106134104014650 0ustar liggesusers#' Select grouping variables #' #' This selection helpers matches grouping variables. It can be used #' in [select()] or [vars()] selections. #' #' @param data For advanced use only. The default `NULL` automatically #' finds the "current" data frames. #' @param vars `r lifecycle::badge("defunct")` #' @seealso [groups()] and [group_vars()] for retrieving the grouping #' variables outside selection contexts. #' #' @examples #' gdf <- iris |> group_by(Species) #' gdf |> select(group_cols()) #' #' # Remove the grouping variables from mutate selections: #' gdf |> mutate_at(vars(-group_cols()), `/`, 100) #' # -> No longer necessary with across() #' gdf |> mutate(across(everything(), ~ . / 100)) #' @export group_cols <- function(vars = NULL, data = NULL) { # So group_cols() continues to work in _at() helpers. data <- data %||% tryCatch(tidyselect::peek_data(), error = function(e) NULL) if (!is.null(data)) { match(group_vars(data), tbl_vars(data)) } else { group_cols_legacy(vars) } } group_cols_legacy <- function(vars = NULL) { if (!is.null(vars)) { lifecycle::deprecate_stop( "1.0.0", "group_cols(vars = )", details = "Use `data` with entire dataframe instead" ) } vars <- tidyselect::peek_vars() if (is_sel_vars(vars)) { matches <- match(vars %@% groups, vars) if (anyNA(matches)) { abort("Can't find the grouping variables.") } matches } else { int() } } # Alias required for help links in downstream packages #' @aliases select_helpers #' @importFrom tidyselect contains #' @export tidyselect::contains #' @importFrom tidyselect ends_with #' @export tidyselect::ends_with #' @importFrom tidyselect everything #' @export tidyselect::everything #' @importFrom tidyselect matches #' @export tidyselect::matches #' @importFrom tidyselect num_range #' @export tidyselect::num_range #' @importFrom tidyselect one_of #' @export tidyselect::one_of #' @importFrom tidyselect starts_with #' @export tidyselect::starts_with #' @importFrom tidyselect last_col #' @export tidyselect::last_col #' @importFrom tidyselect any_of #' @export tidyselect::any_of #' @importFrom tidyselect all_of #' @export tidyselect::all_of #' @importFrom tidyselect where #' @export tidyselect::where dplyr/R/case-match.R0000644000176200001440000001531215137161765013753 0ustar liggesusers#' A general vectorised `switch()` #' #' @description #' `r lifecycle::badge("deprecated")` #' #' `case_match()` is deprecated. Please use [recode_values()] and #' [replace_values()] instead, which are more powerful, have more intuitive #' names, and have better safety. In addition to the familiar two-sided formula #' interface, these functions also have `from` and `to` arguments which allow #' you to incorporate a lookup table into the recoding process. #' #' This function allows you to vectorise multiple [switch()] statements. Each #' case is evaluated sequentially and the first match for each element #' determines the corresponding value in the output vector. If no cases match, #' the `.default` is used. #' #' @param .x A vector to match against. #' #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> A sequence of two-sided #' formulas: `old_values ~ new_value`. The right hand side (RHS) determines #' the output value for all values of `.x` that match the left hand side #' (LHS). #' #' The LHS must evaluate to the same type of vector as `.x`. It can be any #' length, allowing you to map multiple `.x` values to the same RHS value. #' If a value is repeated in the LHS, i.e. a value in `.x` matches to #' multiple cases, the first match is used. #' #' The RHS inputs will be coerced to their common type. Each RHS input will be #' [recycled][vctrs::theory-faq-recycling] to the size of `.x`. #' #' @param .default The value used when values in `.x` aren't matched by any of #' the LHS inputs. If `NULL`, the default, a missing value will be used. #' #' `.default` is [recycled][vctrs::theory-faq-recycling] to the size of #' `.x`. #' #' @param .ptype An optional prototype declaring the desired output type. If #' not supplied, the output type will be taken from the common type of #' all RHS inputs and `.default`. #' #' @return #' A vector with the same size as `.x` and the same type as the common type of #' the RHS inputs and `.default` (if not overridden by `.ptype`). #' #' @keywords internal #' #' @export #' @examples #' # `case_match()` is deprecated and has been replaced by `recode_values()` and #' # `replace_values()` #' #' x <- c("a", "b", "a", "d", "b", NA, "c", "e") #' #' # `recode_values()` is a 1:1 replacement for `case_match()` #' case_match( #' x, #' "a" ~ 1, #' "b" ~ 2, #' "c" ~ 3, #' "d" ~ 4 #' ) #' recode_values( #' x, #' "a" ~ 1, #' "b" ~ 2, #' "c" ~ 3, #' "d" ~ 4 #' ) #' #' # `recode_values()` has an additional `unmatched` argument to help you catch #' # missed mappings #' try(recode_values( #' x, #' "a" ~ 1, #' "b" ~ 2, #' "c" ~ 3, #' "d" ~ 4, #' unmatched = "error" #' )) #' #' # `recode_values()` also has additional `from` and `to` arguments, which are #' # useful when your lookup table is defined elsewhere (for example, it could #' # be read in from a CSV file). This is very difficult to do with #' # `case_match()`! #' lookup <- tribble( #' ~from, ~to, #' "a", 1, #' "b", 2, #' "c", 3, #' "d", 4 #' ) #' #' recode_values(x, from = lookup$from, to = lookup$to) #' #' # Both `case_match()` and `recode_values()` work with more than just #' # character inputs: #' y <- as.integer(c(1, 2, 1, 3, 1, NA, 2, 4)) #' #' case_match( #' y, #' c(1, 3) ~ "odd", #' c(2, 4) ~ "even", #' .default = "missing" #' ) #' recode_values( #' y, #' c(1, 3) ~ "odd", #' c(2, 4) ~ "even", #' default = "missing" #' ) #' #' # Or with a lookup table #' lookup <- tribble( #' ~from, ~to, #' c(1, 3), "odd", #' c(2, 4), "even" #' ) #' recode_values(y, from = lookup$from, to = lookup$to, default = "missing") #' #' # `replace_values()` is a convenient way to replace selected values, leaving #' # everything else as is. It's similar to `case_match(y, .default = y)`. #' replace_values(y, NA ~ 0) #' case_match(y, NA ~ 0, .default = y) #' #' # Notably, `replace_values()` is type stable, which means that `y` can't #' # change types out from under you, unlike with `case_match()`! #' typeof(y) #' typeof(replace_values(y, NA ~ 0)) #' typeof(case_match(y, NA ~ 0, .default = y)) #' #' # We believe that `replace_values()` better expresses intent when doing a #' # partial replacement. Compare these two `mutate()` calls, each with the #' # goals of: #' # - Replace missings in `hair_color` #' # - Replace some of the `species` #' starwars |> #' mutate( #' hair_color = case_match(hair_color, NA ~ "unknown", .default = hair_color), #' species = case_match( #' species, #' "Human" ~ "Humanoid", #' "Droid" ~ "Robot", #' c("Wookiee", "Ewok") ~ "Hairy", #' .default = species #' ), #' .keep = "used" #' ) #' #' updates <- tribble( #' ~from, ~to, #' "Human", "Humanoid", #' "Droid", "Robot", #' c("Wookiee", "Ewok"), "Hairy" #' ) #' #' starwars |> #' mutate( #' hair_color = replace_values(hair_color, NA ~ "unknown"), #' species = replace_values(species, from = updates$from, to = updates$to), #' .keep = "used" #' ) case_match <- function(.x, ..., .default = NULL, .ptype = NULL) { lifecycle::deprecate_soft( when = "1.2.0", what = "case_match()", with = "recode_values()", id = "dplyr-case-match" ) # Matching historical behavior of `case_match()`, which was to work like # `case_when()` and not allow empty `...`. Newer `replace_when()` and # `replace_values()` are a no-op for this case, but we deprecated # `case_match()` at that time so it never moved to the new behavior. allow_empty_dots <- FALSE args <- eval_formulas( ..., allow_empty_dots = allow_empty_dots ) haystacks <- args$lhs values <- args$rhs vec_case_match( needles = .x, haystacks = haystacks, values = values, needles_arg = ".x", haystacks_arg = "", values_arg = "", default = .default, default_arg = ".default", ptype = .ptype, call = current_env() ) } vec_case_match <- function( needles, haystacks, values, ..., needles_arg = "needles", haystacks_arg = "haystacks", values_arg = "values", default = NULL, default_arg = "default", ptype = NULL, call = current_env() ) { check_dots_empty0(...) obj_check_vector(needles, arg = needles_arg, call = call) obj_check_list(haystacks, arg = haystacks_arg, call = call) list_check_all_vectors(haystacks, arg = haystacks_arg, call = call) haystacks <- vec_cast_common( !!!haystacks, .to = needles, .arg = haystacks_arg, .call = call ) conditions <- map(haystacks, vec_in, needles = needles) size <- vec_size(needles) vec_case_when( conditions = conditions, values = values, default = default, ptype = ptype, size = size, conditions_arg = "", values_arg = values_arg, default_arg = default_arg, error_call = call ) } dplyr/R/select.R0000644000176200001440000000600315106134104013201 0ustar liggesusers#' Keep or drop columns using their names and types #' #' @description #' #' Select (and optionally rename) variables in a data frame, using a concise #' mini-language that makes it easy to refer to variables based on their name #' (e.g. `a:f` selects all columns from `a` on the left to `f` on the #' right) or type (e.g. `where(is.numeric)` selects all numeric columns). #' #' ## Overview of selection features #' #' ```{r, child = "man/rmd/overview.Rmd"} #' ``` #' #' @inheritParams arrange #' @param ... <[`tidy-select`][dplyr_tidy_select]> One or more unquoted #' expressions separated by commas. Variable names can be used as if they #' were positions in the data frame, so expressions like `x:y` can #' be used to select a range of variables. #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Rows are not affected. #' * Output columns are a subset of input columns, potentially with a different #' order. Columns will be renamed if `new_name = old_name` form is used. #' * Data frame attributes are preserved. #' * Groups are maintained; you can't select off grouping variables. #' #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("select")}. #' #' @section Examples: #' #' ```{r, echo = FALSE, results = "asis"} #' result <- rlang::with_options( #' knitr::knit_child("man/rmd/select.Rmd"), #' tibble.print_min = 4, #' tibble.max_extra_cols = 8, #' pillar.min_title_chars = 20, #' digits = 2 #' ) #' cat(result, sep = "\n") #' ``` #' #' @family single table verbs #' @export select <- function(.data, ...) { UseMethod("select") } #' @export select.list <- function(.data, ...) { abort("`select()` doesn't handle lists.") } #' @export select.data.frame <- function(.data, ...) { error_call <- dplyr_error_call() loc <- tidyselect::eval_select( expr(c(...)), data = .data, error_call = error_call ) loc <- ensure_group_vars(loc, .data, notify = TRUE) out <- dplyr_col_select(.data, loc) out <- set_names(out, names(loc)) out } # Helpers ----------------------------------------------------------------- ensure_group_vars <- function(loc, data, notify = TRUE) { group_loc <- match(group_vars(data), names(data)) missing <- setdiff(group_loc, loc) if (length(missing) > 0) { vars <- names(data)[missing] added_group_loc <- set_names(missing, vars) # don't add grouping variables with same name as new variable (#5841) added_group_loc <- added_group_loc[!vars %in% names(loc)] if (length(added_group_loc) > 0 && notify) { inform(glue( "Adding missing grouping variables: ", paste0("`", names(added_group_loc), "`", collapse = ", ") )) } loc <- c(added_group_loc, loc) } loc } dplyr/R/arrange.R0000644000176200001440000002173215106134104013347 0ustar liggesusers#' Order rows using column values #' #' @description #' `arrange()` orders the rows of a data frame by the values of selected #' columns. #' #' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you #' need to explicitly mention grouping variables (or use `.by_group = TRUE`) #' in order to group by them, and functions of variables are evaluated #' once per data frame, not once per group. #' #' @details #' ## Missing values #' Unlike base sorting with `sort()`, `NA` are: #' * always sorted to the end for local data, even when wrapped with `desc()`. #' * treated differently for remote data, depending on the backend. #' #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * All rows appear in the output, but (usually) in a different place. #' * Columns are not modified. #' * Groups are not modified. #' * Data frame attributes are preserved. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("arrange")}. #' @export #' @param .data A data frame, data frame extension (e.g. a tibble), or a #' lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for #' more details. #' @param ... <[`data-masking`][rlang::args_data_masking]> Variables, or #' functions of variables. Use [desc()] to sort a variable in descending #' order. #' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to #' grouped data frames only. #' @param .locale The locale to sort character vectors in. #' #' - If `NULL`, the default, uses the `"C"` locale unless the deprecated #' `dplyr.legacy_locale` global option escape hatch is active. See the #' [dplyr-locale] help page for more details. #' #' - If a single string from [stringi::stri_locale_list()] is supplied, then #' this will be used as the locale to sort with. For example, `"en"` will #' sort with the American English locale. This requires the stringi package. #' #' - If `"C"` is supplied, then character vectors will always be sorted in the #' C locale. This does not require stringi and is often much faster than #' supplying a locale identifier. #' #' The C locale is not the same as English locales, such as `"en"`, #' particularly when it comes to data containing a mix of upper and lower case #' letters. This is explained in more detail on the [locale][dplyr-locale] #' help page under the `Default locale` section. #' @family single table verbs #' @examples #' arrange(mtcars, cyl, disp) #' arrange(mtcars, desc(disp)) #' #' # grouped arrange ignores groups #' by_cyl <- mtcars |> group_by(cyl) #' by_cyl |> arrange(desc(wt)) #' # Unless you specifically ask: #' by_cyl |> arrange(desc(wt), .by_group = TRUE) #' #' # use embracing when wrapping in a function; #' # see ?rlang::args_data_masking for more details #' tidy_eval_arrange <- function(.data, var) { #' .data |> #' arrange({{ var }}) #' } #' tidy_eval_arrange(mtcars, mpg) #' #' # Use `across()` or `pick()` to select columns with tidy-select #' iris |> arrange(pick(starts_with("Sepal"))) #' iris |> arrange(across(starts_with("Sepal"), desc)) arrange <- function(.data, ..., .by_group = FALSE) { UseMethod("arrange") } #' @rdname arrange #' @export arrange.data.frame <- function(.data, ..., .by_group = FALSE, .locale = NULL) { dots <- enquos(...) if (.by_group) { dots <- c(quos(!!!groups(.data)), dots) } loc <- arrange_rows(.data, dots = dots, locale = .locale) dplyr_row_slice(.data, loc) } # Helpers ----------------------------------------------------------------- arrange_rows <- function(data, dots, locale, error_call = caller_env()) { dplyr_local_error_call(error_call) size <- nrow(data) # `arrange()` implementation always uses bare data frames data <- new_data_frame(data, n = size) # Strip out calls to desc() replacing with direction argument is_desc_call <- function(x) { quo_is_call(x, "desc", ns = c("", "dplyr")) } directions <- map_chr(dots, function(dot) { if (is_desc_call(dot)) "desc" else "asc" }) dots <- map(dots, function(dot) { if (is_desc_call(dot)) { expr <- quo_get_expr(dot) if (!has_length(expr, 2L)) { abort( "`desc()` must be called with exactly one argument.", call = error_call ) } dot <- new_quosure(expr[[2]], quo_get_env(dot)) } dot }) n_dots <- length(dots) seq_dots <- seq_len(n_dots) cols <- vector("list", length = n_dots) names(cols) <- as.character(seq_dots) for (i in seq_dots) { name <- vec_paste0("..", i) dot <- dots[[i]] # Evaluate each `dot` on the original data separately, rather than # evaluating all at once. We want to avoid the "sequential evaluation" # feature of `mutate()` where the 2nd expression can access results of the # 1st (#6495). elt <- mutate(data, "{name}" := !!dot, .keep = "none") elt <- elt[[name]] if (is.null(elt)) { # In this case, `dot` evaluated to `NULL` for "column removal" so # `elt[[name]]` won't exist, but we don't want to shorten `cols`. next } cols[[i]] <- elt } if (vec_any_missing(cols)) { # Drop `NULL`s that could result from `dot` evaluating to `NULL` above not_missing <- !vec_detect_missing(cols) cols <- vec_slice(cols, not_missing) directions <- vec_slice(directions, not_missing) } data <- new_data_frame(cols, n = size) if (is.null(locale) && dplyr_legacy_locale()) { # Temporary legacy support for respecting the system locale. # Only applied when `.locale` is `NULL` and `dplyr.legacy_locale` is set. # Matches legacy `group_by()` ordering. out <- dplyr_order_legacy(data = data, direction = directions) return(out) } na_values <- if_else(directions == "desc", "smallest", "largest") chr_proxy_collate <- locale_to_chr_proxy_collate( locale = locale, error_call = error_call ) vec_order_radix( x = data, direction = directions, na_value = na_values, chr_proxy_collate = chr_proxy_collate ) } locale_to_chr_proxy_collate <- function( locale, ..., has_stringi = has_minimum_stringi(), error_call = caller_env() ) { check_dots_empty0(...) if (is.null(locale) || is_string(locale, string = "C")) { return(NULL) } if (is_character(locale)) { if (!is_string(locale)) { abort( "If `.locale` is a character vector, it must be a single string.", call = error_call ) } if (!has_stringi) { abort( "stringi >=1.5.3 is required to arrange in a different locale.", call = error_call ) } if (!locale %in% stringi::stri_locale_list()) { abort( "`.locale` must be one of the locales within `stringi::stri_locale_list()`.", call = error_call ) } return(sort_key_generator(locale)) } abort("`.locale` must be a string or `NULL`.", call = error_call) } sort_key_generator <- function(locale) { function(x) { stringi::stri_sort_key(x, locale = locale) } } # ------------------------------------------------------------------------------ dplyr_order_legacy <- function(data, direction = "asc") { if (df_n_col(data) == 0L) { # Work around `order(!!!list())` returning `NULL` return(seq_len(nrow(data))) } proxies <- map2(data, direction, dplyr_proxy_order_legacy) proxies <- unname(proxies) inject(order(!!!proxies)) } dplyr_proxy_order_legacy <- function(x, direction) { # `order()` doesn't have a vectorized `decreasing` argument for most values of # `method` ("radix" is an exception). So we need to apply this by column ahead # of time. We have to apply `vec_proxy_order()` by column too, rather than on # the original data frame, because it flattens df-cols and we can lose track # of where to apply `direction`. x <- vec_proxy_order(x) if (is.data.frame(x)) { if (any(map_lgl(x, is.data.frame))) { abort( "All data frame columns should have been flattened by now.", .internal = TRUE ) } # Special handling for data frame proxies (either from df-cols or from # vector classes with df proxies, like rcrds), which `order()` can't handle. # We have to replace the df proxy with a single vector that orders the same # way, so we use a dense rank that utilizes the system locale. unique <- vec_unique(x) order <- dplyr_order_legacy(unique, direction) sorted_unique <- vec_slice(unique, order) out <- vec_match(x, sorted_unique) return(out) } if ( !is_character(x) && !is_logical(x) && !is_integer(x) && !is_double(x) && !is_complex(x) ) { abort("Invalid type returned by `vec_proxy_order()`.", .internal = TRUE) } if (is.object(x)) { x <- unstructure(x) } if (direction == "desc") { x <- desc(x) } x } dplyr/R/sample.R0000644000176200001440000001153015106134104013204 0ustar liggesusers#' Sample n rows from a table #' #' @description #' `r lifecycle::badge("superseded")` #' `sample_n()` and `sample_frac()` have been superseded in favour of #' [slice_sample()]. While they will not be deprecated in the near future, #' retirement means that we will only perform critical bug fixes, so we recommend #' moving to the newer alternative. #' #' These functions were superseded because we realised it was more convenient to #' have two mutually exclusive arguments to one function, rather than two #' separate functions. This also made it to clean up a few other smaller #' design issues with `sample_n()`/`sample_frac`: #' #' * The connection to `slice()` was not obvious. #' * The name of the first argument, `tbl`, is inconsistent with other #' single table verbs which use `.data`. #' * The `size` argument uses tidy evaluation, which is surprising and #' undocumented. #' * It was easier to remove the deprecated `.env` argument. #' * `...` was in a suboptimal position. #' #' @keywords internal #' @param tbl A data.frame. #' @param size <[`tidy-select`][dplyr_tidy_select]> #' For `sample_n()`, the number of rows to select. #' For `sample_frac()`, the fraction of rows to select. #' If `tbl` is grouped, `size` applies to each group. #' @param replace Sample with or without replacement? #' @param weight <[`tidy-select`][dplyr_tidy_select]> Sampling weights. #' This must evaluate to a vector of non-negative numbers the same length as #' the input. Weights are automatically standardised to sum to 1. #' @param .env DEPRECATED. #' @param ... ignored #' @examples #' df <- tibble(x = 1:5, w = c(0.1, 0.1, 0.1, 2, 2)) #' #' # sample_n() -> slice_sample() ---------------------------------------------- #' # Was: #' sample_n(df, 3) #' sample_n(df, 10, replace = TRUE) #' sample_n(df, 3, weight = w) #' #' # Now: #' slice_sample(df, n = 3) #' slice_sample(df, n = 10, replace = TRUE) #' slice_sample(df, n = 3, weight_by = w) #' #' # Note that sample_n() would error if n was bigger than the group size #' # slice_sample() will just use the available rows for consistency with #' # the other slice helpers like slice_head() #' try(sample_n(df, 10)) #' slice_sample(df, n = 10) #' #' # sample_frac() -> slice_sample() ------------------------------------------- #' # Was: #' sample_frac(df, 0.25) #' sample_frac(df, 2, replace = TRUE) #' #' # Now: #' slice_sample(df, prop = 0.25) #' slice_sample(df, prop = 2, replace = TRUE) #' @export sample_n <- function( tbl, size, replace = FALSE, weight = NULL, .env = NULL, ... ) { lifecycle::signal_stage("superseded", "sample_n()") UseMethod("sample_n") } #' @export sample_n.default <- function( tbl, size, replace = FALSE, weight = NULL, .env = parent.frame(), ... ) { msg <- glue("`tbl` must be a data frame, not {obj_type_friendly(tbl)}.") abort(msg) } #' @export sample_n.data.frame <- function( tbl, size, replace = FALSE, weight = NULL, .env = NULL, ... ) { if (!is_null(.env)) { inform( "`sample_n()` argument `.env` is deprecated and no longer has any effect." ) } size <- enquo(size) weight <- enquo(weight) dplyr_local_error_call() slice( tbl, local({ size <- check_size(!!size, n(), replace = replace) sample.int(n(), size, replace = replace, prob = !!weight) }) ) } #' @rdname sample_n #' @export sample_frac <- function( tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ... ) { lifecycle::signal_stage("superseded", "sample_frac()") UseMethod("sample_frac") } #' @export sample_frac.default <- function( tbl, size = 1, replace = FALSE, weight = NULL, .env = parent.frame(), ... ) { msg <- glue("`tbl` must be a data frame, not {obj_type_friendly(tbl)}.") abort(msg) } #' @export sample_frac.data.frame <- function( tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ... ) { if (!is_null(.env)) { inform("`.env` is deprecated and no longer has any effect") } size <- enquo(size) weight <- enquo(weight) dplyr_local_error_call() slice( tbl, local({ size <- round(n() * check_frac(!!size, replace = replace)) sample.int(n(), size, replace = replace, prob = !!weight) }) ) } # Helper functions ------------------------------------------------------------- check_size <- function(size, n, replace = FALSE) { if (size <= n || replace) { return(invisible(size)) } bullets <- c( glue("`size` must be less than or equal to {n} (size of data)."), i = "set `replace = TRUE` to use sampling with replacement." ) abort(bullets, call = NULL) } check_frac <- function(size, replace = FALSE) { if (size <= 1 || replace) { return(invisible(size)) } bullets <- c( glue("`size` of sampled fraction must be less or equal to one."), i = "set `replace = TRUE` to use sampling with replacement." ) abort(bullets, call = NULL) } dplyr/R/join.R0000644000176200001440000006651515137161765012720 0ustar liggesusers#' Mutating joins #' #' @description #' Mutating joins add columns from `y` to `x`, matching observations based on #' the keys. There are four mutating joins: the inner join, and the three outer #' joins. #' #' ## Inner join #' #' An `inner_join()` only keeps observations from `x` that have a matching key #' in `y`. #' #' The most important property of an inner join is that unmatched rows in either #' input are not included in the result. This means that generally inner joins #' are not appropriate in most analyses, because it is too easy to lose #' observations. #' #' ## Outer joins #' #' The three outer joins keep observations that appear in at least one of the #' data frames: #' #' * A `left_join()` keeps all observations in `x`. #' #' * A `right_join()` keeps all observations in `y`. #' #' * A `full_join()` keeps all observations in `x` and `y`. #' #' @section Many-to-many relationships: #' #' By default, dplyr guards against many-to-many relationships in equality joins #' by throwing a warning. These occur when both of the following are true: #' #' - A row in `x` matches multiple rows in `y`. #' - A row in `y` matches multiple rows in `x`. #' #' This is typically surprising, as most joins involve a relationship of #' one-to-one, one-to-many, or many-to-one, and is often the result of an #' improperly specified join. Many-to-many relationships are particularly #' problematic because they can result in a Cartesian explosion of the number of #' rows returned from the join. #' #' If a many-to-many relationship is expected, silence this warning by #' explicitly setting `relationship = "many-to-many"`. #' #' In production code, it is best to preemptively set `relationship` to whatever #' relationship you expect to exist between the keys of `x` and `y`, as this #' forces an error to occur immediately if the data doesn't align with your #' expectations. #' #' Inequality joins typically result in many-to-many relationships by nature, so #' they don't warn on them by default, but you should still take extra care when #' specifying an inequality join, because they also have the capability to #' return a large number of rows. #' #' Rolling joins don't warn on many-to-many relationships either, but many #' rolling joins follow a many-to-one relationship, so it is often useful to #' set `relationship = "many-to-one"` to enforce this. #' #' Note that in SQL, most database providers won't let you specify a #' many-to-many relationship between two tables, instead requiring that you #' create a third _junction table_ that results in two one-to-many relationships #' instead. #' #' @return #' An object of the same type as `x` (including the same groups). The order of #' the rows and columns of `x` is preserved as much as possible. The output has #' the following properties: #' #' * The rows are affect by the join type. #' * `inner_join()` returns matched `x` rows. #' * `left_join()` returns all `x` rows. #' * `right_join()` returns matched of `x` rows, followed by unmatched `y` rows. #' * `full_join()` returns all `x` rows, followed by unmatched `y` rows. #' * Output columns include all columns from `x` and all non-key columns from #' `y`. If `keep = TRUE`, the key columns from `y` are included as well. #' * If non-key columns in `x` and `y` have the same name, `suffix`es are added #' to disambiguate. If `keep = TRUE` and key columns in `x` and `y` have #' the same name, `suffix`es are added to disambiguate these as well. #' * If `keep = FALSE`, output columns included in `by` are coerced to their #' common type between `x` and `y`. #' @section Methods: #' These functions are **generic**s, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' #' * `inner_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. #' * `left_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. #' * `right_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. #' * `full_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. #' @param x,y A pair of data frames, data frame extensions (e.g. a tibble), or #' lazy data frames (e.g. from dbplyr or dtplyr). See *Methods*, below, for #' more details. #' @param by A join specification created with [join_by()], or a character #' vector of variables to join by. #' #' If `NULL`, the default, `*_join()` will perform a natural join, using all #' variables in common across `x` and `y`. A message lists the variables so #' that you can check they're correct; suppress the message by supplying `by` #' explicitly. #' #' To join on different variables between `x` and `y`, use a [join_by()] #' specification. For example, `join_by(a == b)` will match `x$a` to `y$b`. #' #' To join by multiple variables, use a [join_by()] specification with #' multiple expressions. For example, `join_by(a == b, c == d)` will match #' `x$a` to `y$b` and `x$c` to `y$d`. If the column names are the same between #' `x` and `y`, you can shorten this by listing only the variable names, like #' `join_by(a, c)`. #' #' [join_by()] can also be used to perform inequality, rolling, and overlap #' joins. See the documentation at [?join_by][join_by()] for details on #' these types of joins. #' #' For simple equality joins, you can alternatively specify a character vector #' of variable names to join by. For example, `by = c("a", "b")` joins `x$a` #' to `y$a` and `x$b` to `y$b`. If variable names differ between `x` and `y`, #' use a named character vector like `by = c("x_a" = "y_a", "x_b" = "y_b")`. #' #' To perform a cross-join, generating all combinations of `x` and `y`, see #' [cross_join()]. #' @param copy If `x` and `y` are not from the same data source, #' and `copy` is `TRUE`, then `y` will be copied into the #' same src as `x`. This allows you to join tables across srcs, but #' it is a potentially expensive operation so you must opt into it. #' @param suffix If there are non-joined duplicate variables in `x` and #' `y`, these suffixes will be added to the output to disambiguate them. #' Should be a character vector of length 2. #' @param keep Should the join keys from both `x` and `y` be preserved in the #' output? #' - If `NULL`, the default, joins on equality retain only the keys from `x`, #' while joins on inequality retain the keys from both inputs. #' - If `TRUE`, all keys from both inputs are retained. #' - If `FALSE`, only keys from `x` are retained. For right and full joins, #' the data in key columns corresponding to rows that only exist in `y` are #' merged into the key columns from `x`. Can't be used when joining on #' inequality conditions. #' @param ... Other parameters passed onto methods. #' @param na_matches Should two `NA` or two `NaN` values match? #' - `"na"`, the default, treats two `NA` or two `NaN` values as equal, like #' `%in%`, [match()], and [merge()]. #' - `"never"` treats two `NA` or two `NaN` values as different, and will #' never match them together or to any other values. This is similar to joins #' for database sources and to `base::merge(incomparables = NA)`. #' @param multiple Handling of rows in `x` with multiple matches in `y`. #' For each row of `x`: #' - `"all"`, the default, returns every match detected in `y`. This is the #' same behavior as SQL. #' - `"any"` returns one match detected in `y`, with no guarantees on which #' match will be returned. It is often faster than `"first"` and `"last"` #' if you just need to detect if there is at least one match. #' - `"first"` returns the first match detected in `y`. #' - `"last"` returns the last match detected in `y`. #' @param unmatched How should unmatched keys that would result in dropped rows #' be handled? #' - `"drop"` drops unmatched keys from the result. #' - `"error"` throws an error if unmatched keys are detected. #' #' `unmatched` is intended to protect you from accidentally dropping rows #' during a join. It only checks for unmatched keys in the input that could #' potentially drop rows. #' - For left joins, it checks `y`. #' - For right joins, it checks `x`. #' - For inner joins, it checks both `x` and `y`. In this case, `unmatched` is #' also allowed to be a character vector of length 2 to specify the behavior #' for `x` and `y` independently. #' @param relationship Handling of the expected relationship between the keys of #' `x` and `y`. If the expectations chosen from the list below are #' invalidated, an error is thrown. #' #' - `NULL`, the default, doesn't expect there to be any relationship between #' `x` and `y`. However, for equality joins it will check for a many-to-many #' relationship (which is typically unexpected) and will warn if one occurs, #' encouraging you to either take a closer look at your inputs or make this #' relationship explicit by specifying `"many-to-many"`. #' #' See the _Many-to-many relationships_ section for more details. #' #' - `"one-to-one"` expects: #' - Each row in `x` matches at most 1 row in `y`. #' - Each row in `y` matches at most 1 row in `x`. #' #' - `"one-to-many"` expects: #' - Each row in `y` matches at most 1 row in `x`. #' #' - `"many-to-one"` expects: #' - Each row in `x` matches at most 1 row in `y`. #' #' - `"many-to-many"` doesn't perform any relationship checks, but is provided #' to allow you to be explicit about this relationship if you know it #' exists. #' #' `relationship` doesn't handle cases where there are zero matches. For that, #' see `unmatched`. #' @family joins #' @examples #' band_members |> inner_join(band_instruments) #' band_members |> left_join(band_instruments) #' band_members |> right_join(band_instruments) #' band_members |> full_join(band_instruments) #' #' # To suppress the message about joining variables, supply `by` #' band_members |> inner_join(band_instruments, by = join_by(name)) #' # This is good practice in production code #' #' # Use an equality expression if the join variables have different names #' band_members |> full_join(band_instruments2, by = join_by(name == artist)) #' # By default, the join keys from `x` and `y` are coalesced in the output; use #' # `keep = TRUE` to keep the join keys from both `x` and `y` #' band_members |> #' full_join(band_instruments2, by = join_by(name == artist), keep = TRUE) #' #' # If a row in `x` matches multiple rows in `y`, all the rows in `y` will be #' # returned once for each matching row in `x`. #' df1 <- tibble(x = 1:3) #' df2 <- tibble(x = c(1, 1, 2), y = c("first", "second", "third")) #' df1 |> left_join(df2) #' #' # If a row in `y` also matches multiple rows in `x`, this is known as a #' # many-to-many relationship, which is typically a result of an improperly #' # specified join or some kind of messy data. In this case, a warning is #' # thrown by default: #' df3 <- tibble(x = c(1, 1, 1, 3)) #' df3 |> left_join(df2) #' #' # In the rare case where a many-to-many relationship is expected, set #' # `relationship = "many-to-many"` to silence this warning #' df3 |> left_join(df2, relationship = "many-to-many") #' #' # Use `join_by()` with a condition other than `==` to perform an inequality #' # join. Here we match on every instance where `df1$x > df2$x`. #' df1 |> left_join(df2, join_by(x > x)) #' #' # By default, NAs match other NAs so that there are two #' # rows in the output of this join: #' df1 <- data.frame(x = c(1, NA), y = 2) #' df2 <- data.frame(x = c(1, NA), z = 3) #' left_join(df1, df2) #' #' # You can optionally request that NAs don't match, giving a #' # a result that more closely resembles SQL joins #' left_join(df1, df2, na_matches = "never") #' @aliases join join.data.frame #' @name mutate-joins NULL #' @export #' @rdname mutate-joins inner_join <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) { UseMethod("inner_join") } #' @export #' @rdname mutate-joins inner_join.data.frame <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( x = x, y = y, by = by, type = "inner", suffix = suffix, na_matches = na_matches, keep = keep, multiple = multiple, unmatched = unmatched, relationship = relationship, user_env = caller_env() ) } #' @export #' @rdname mutate-joins left_join <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) { UseMethod("left_join") } #' @export #' @rdname mutate-joins left_join.data.frame <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( x = x, y = y, by = by, type = "left", suffix = suffix, na_matches = na_matches, keep = keep, multiple = multiple, unmatched = unmatched, relationship = relationship, user_env = caller_env() ) } #' @export #' @rdname mutate-joins right_join <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) { UseMethod("right_join") } #' @export #' @rdname mutate-joins right_join.data.frame <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( x = x, y = y, by = by, type = "right", suffix = suffix, na_matches = na_matches, keep = keep, multiple = multiple, unmatched = unmatched, relationship = relationship, user_env = caller_env() ) } #' @export #' @rdname mutate-joins full_join <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) { UseMethod("full_join") } #' @export #' @rdname mutate-joins full_join.data.frame <- function( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", relationship = NULL ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( x = x, y = y, by = by, type = "full", suffix = suffix, na_matches = na_matches, keep = keep, multiple = multiple, # All keys from both inputs are retained. Erroring never makes sense. unmatched = "drop", relationship = relationship, user_env = caller_env() ) } #' Filtering joins #' #' @description #' Filtering joins filter rows from `x` based on the presence or absence #' of matches in `y`: #' #' * `semi_join()` returns all rows from `x` with a match in `y`. #' * `anti_join()` returns all rows from `x` with**out** a match in `y`. #' #' @param x,y A pair of data frames, data frame extensions (e.g. a tibble), or #' lazy data frames (e.g. from dbplyr or dtplyr). See *Methods*, below, for #' more details. #' @inheritParams left_join #' @return #' An object of the same type as `x`. The output has the following properties: #' #' * Rows are a subset of the input, but appear in the same order. #' * Columns are not modified. #' * Data frame attributes are preserved. #' * Groups are taken from `x`. The number of groups may be reduced. #' @section Methods: #' These function are **generic**s, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' #' * `semi_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("semi_join")}. #' * `anti_join()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("anti_join")}. #' @family joins #' @examples #' # "Filtering" joins keep cases from the LHS #' band_members |> semi_join(band_instruments) #' band_members |> anti_join(band_instruments) #' #' # To suppress the message about joining variables, supply `by` #' band_members |> semi_join(band_instruments, by = join_by(name)) #' # This is good practice in production code #' @name filter-joins NULL #' @export #' @rdname filter-joins semi_join <- function(x, y, by = NULL, copy = FALSE, ...) { UseMethod("semi_join") } #' @export #' @rdname filter-joins semi_join.data.frame <- function( x, y, by = NULL, copy = FALSE, ..., na_matches = c("na", "never") ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_filter( x, y, by = by, type = "semi", na_matches = na_matches, user_env = caller_env() ) } #' @export #' @rdname filter-joins anti_join <- function(x, y, by = NULL, copy = FALSE, ...) { UseMethod("anti_join") } #' @export #' @rdname filter-joins anti_join.data.frame <- function( x, y, by = NULL, copy = FALSE, ..., na_matches = c("na", "never") ) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_filter( x, y, by = by, type = "anti", na_matches = na_matches, user_env = caller_env() ) } #' Nest join #' #' A nest join leaves `x` almost unchanged, except that it adds a new #' list-column, where each element contains the rows from `y` that match the #' corresponding row in `x`. #' #' # Relationship to other joins #' #' You can recreate many other joins from the result of a nest join: #' #' * [inner_join()] is a `nest_join()` plus [tidyr::unnest()]. #' * [left_join()] is a `nest_join()` plus `tidyr::unnest(keep_empty = TRUE)`. #' * [semi_join()] is a `nest_join()` plus a `filter()` where you check #' that every element of data has at least one row. #' * [anti_join()] is a `nest_join()` plus a `filter()` where you check that every #' element has zero rows. #' #' @param name The name of the list-column created by the join. If `NULL`, #' the default, the name of `y` is used. #' @param keep Should the new list-column contain join keys? The default #' will preserve the join keys for inequality joins. #' @return #' The output: #' * Is same type as `x` (including having the same groups). #' * Has exactly the same number of rows as `x`. #' * Contains all the columns of `x` in the same order with the same values. #' They are only modified (slightly) if `keep = FALSE`, when columns listed #' in `by` will be coerced to their common type across `x` and `y`. #' * Gains one new column called `{name}` on the far right, a list column #' containing data frames the same type as `y`. #' @section Methods: #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("nest_join")}. #' @inheritParams left_join #' @family joins #' @export #' @examples #' df1 <- tibble(x = 1:3) #' df2 <- tibble(x = c(2, 3, 3), y = c("a", "b", "c")) #' #' out <- nest_join(df1, df2) #' out #' out$df2 nest_join <- function( x, y, by = NULL, copy = FALSE, keep = NULL, name = NULL, ... ) { UseMethod("nest_join") } #' @export #' @rdname nest_join nest_join.data.frame <- function( x, y, by = NULL, copy = FALSE, keep = NULL, name = NULL, ..., na_matches = c("na", "never"), unmatched = "drop" ) { check_dots_empty0(...) check_keep(keep) na_matches <- check_na_matches(na_matches) if (is.null(name)) { name <- as_label(enexpr(y)) } else { check_string(name) } x_names <- tbl_vars(x) y_names <- tbl_vars(y) if (is_cross_by(by)) { warn_join_cross_by() by <- new_join_by() cross <- TRUE } else { cross <- FALSE } if (is_null(by)) { by <- join_by_common(x_names, y_names) } else { by <- as_join_by(by) } vars <- join_cols(x_names, y_names, by = by, suffix = c("", ""), keep = keep) y <- auto_copy(x, y, copy = copy) x_in <- as_tibble(x, .name_repair = "minimal") y_in <- as_tibble(y, .name_repair = "minimal") x_key <- set_names(x_in[vars$x$key], names(vars$x$key)) y_key <- set_names(y_in[vars$y$key], names(vars$x$key)) args <- join_cast_common(x_key, y_key, vars) x_key <- args$x y_key <- args$y condition <- by$condition filter <- by$filter # We always want to retain all of the matches. We never experience a Cartesian # explosion because `nrow(x) == nrow(out)` is an invariant of `nest_join()`, # and the whole point of `nest_join()` is to nest all of the matches for that # row of `x` (#6392). multiple <- "all" # Will be set to `"none"` in `join_rows()`. Because we can't have a Cartesian # explosion, we don't care about many-to-many relationships. relationship <- NULL rows <- join_rows( x_key = x_key, y_key = y_key, type = "nest", na_matches = na_matches, condition = condition, filter = filter, cross = cross, multiple = multiple, unmatched = unmatched, relationship = relationship, user_env = caller_env() ) y_loc <- vec_split(rows$y, rows$x)$val out <- set_names(x_in[vars$x$out], names(vars$x$out)) # Modify all columns in one step so that we only need to re-group once new_cols <- vec_cast(out[names(x_key)], x_key) y_out <- set_names(y_in[vars$y$out], names(vars$y$out)) y_out <- map(y_loc, vec_slice, x = y_out) y_out <- map(y_out, dplyr_reconstruct, template = y) new_cols[[name]] <- y_out out <- dplyr_col_modify(out, new_cols) dplyr_reconstruct(out, x) } # helpers ----------------------------------------------------------------- join_mutate <- function( x, y, by, type, ..., suffix = c(".x", ".y"), na_matches = "na", keep = NULL, multiple = "all", unmatched = "drop", relationship = NULL, error_call = caller_env(), user_env = caller_env() ) { check_dots_empty0(...) na_matches <- check_na_matches( na_matches, error_call = error_call ) check_keep(keep, error_call = error_call) x_names <- tbl_vars(x) y_names <- tbl_vars(y) if (is_cross_by(by)) { warn_join_cross_by(env = error_call, user_env = user_env) by <- new_join_by() cross <- TRUE } else { cross <- FALSE } if (is_null(by)) { by <- join_by_common(x_names, y_names, error_call = error_call) } else { by <- as_join_by(by, error_call = error_call) } vars <- join_cols( x_names = x_names, y_names = y_names, by = by, suffix = suffix, keep = keep, error_call = error_call ) x_in <- as_tibble(x, .name_repair = "minimal") y_in <- as_tibble(y, .name_repair = "minimal") x_key <- set_names(x_in[vars$x$key], names(vars$x$key)) y_key <- set_names(y_in[vars$y$key], names(vars$x$key)) args <- join_cast_common(x_key, y_key, vars, error_call = error_call) x_key <- args$x y_key <- args$y condition <- by$condition filter <- by$filter rows <- join_rows( x_key = x_key, y_key = y_key, type = type, na_matches = na_matches, condition = condition, filter = filter, cross = cross, multiple = multiple, unmatched = unmatched, relationship = relationship, error_call = error_call, user_env = user_env ) x_slicer <- rows$x y_slicer <- rows$y x_out <- set_names(x_in[vars$x$out], names(vars$x$out)) y_out <- set_names(y_in[vars$y$out], names(vars$y$out)) out <- vec_slice(x_out, x_slicer) out[names(y_out)] <- vec_slice(y_out, y_slicer) if (!is_true(keep)) { if (is_null(keep)) { merge <- by$x[by$condition == "=="] } else if (is_false(keep)) { # Won't ever contain non-equi conditions merge <- by$x } # Keys have already been cast to the common type x_merge <- x_key[merge] out[merge] <- vec_cast( x = out[merge], to = x_merge, call = error_call ) if ((type == "right" || type == "full") && anyNA(x_slicer)) { y_merge <- y_key[merge] new_rows <- which(is.na(x_slicer)) y_replacer <- y_slicer[new_rows] out[new_rows, merge] <- vec_slice(y_merge, y_replacer) } } dplyr_reconstruct(out, x) } join_filter <- function( x, y, by, type, ..., na_matches = c("na", "never"), error_call = caller_env(), user_env = caller_env() ) { check_dots_empty0(...) na_matches <- check_na_matches( na_matches, error_call = error_call ) x_names <- tbl_vars(x) y_names <- tbl_vars(y) if (is_cross_by(by)) { warn_join_cross_by(env = error_call, user_env = user_env) by <- new_join_by() cross <- TRUE } else { cross <- FALSE } if (is_null(by)) { by <- join_by_common(x_names, y_names, error_call = error_call) } else { by <- as_join_by(by, error_call = error_call) } vars <- join_cols(x_names, y_names, by = by, error_call = error_call) x_in <- as_tibble(x, .name_repair = "minimal") y_in <- as_tibble(y, .name_repair = "minimal") x_key <- set_names(x_in[vars$x$key], names(vars$x$key)) y_key <- set_names(y_in[vars$y$key], names(vars$x$key)) args <- join_cast_common(x_key, y_key, vars, error_call = error_call) x_key <- args$x y_key <- args$y condition <- by$condition filter <- by$filter # We only care about whether or not any matches exist multiple <- "any" # Will be set to `"none"` in `join_rows()`. Because `multiple = "any"`, that # means many-to-many relationships aren't possible. relationship <- NULL # Since we are actually testing the presence of matches, it doesn't make # sense to ever error on unmatched values. unmatched <- "drop" rows <- join_rows( x_key = x_key, y_key = y_key, type = type, na_matches = na_matches, condition = condition, filter = filter, cross = cross, multiple = multiple, unmatched = unmatched, relationship = relationship, error_call = error_call, user_env = user_env ) if (type == "semi") { # Unmatched needles and propagated missing needles will already be dropped idx <- rows$x } else { # Treat both unmatched needles and propagated missing needles as no-match no_match <- is.na(rows$y) idx <- rows$x[no_match] } dplyr_row_slice(x, idx) } check_na_matches <- function(na_matches, ..., error_call = caller_env()) { arg_match0( arg = na_matches, values = c("na", "never"), error_call = error_call ) } check_keep <- function(keep, error_call = caller_env()) { if (!is_bool(keep) && !is.null(keep)) { abort( glue( "`keep` must be `TRUE`, `FALSE`, or `NULL`, not {obj_type_friendly(keep)}." ), call = error_call ) } } is_cross_by <- function(x) { if (is_character(x, n = 0L)) { # `character()` or `named character()` return(TRUE) } if ( is_list(x, n = 2L) && is_character(x[["x"]], n = 0L) && is_character(x[["y"]], n = 0L) ) { # `list(x = character(), y = character())` # (possibly with named empty character elements) return(TRUE) } FALSE } warn_join_cross_by <- function(env = caller_env(), user_env = caller_env(2)) { # Also remove `join_rows(cross =)` and `is_cross_by()` once we remove support # for this lifecycle::deprecate_warn( when = "1.1.0", what = I("Using `by = character()` to perform a cross join"), with = "cross_join()", env = env, user_env = user_env, id = "dplyr-by-for-cross-join" ) } dplyr/R/all-equal.R0000644000176200001440000000712715137161765013630 0ustar liggesusers#' Flexible equality comparison for data frames #' #' @description #' `r lifecycle::badge("deprecated")` #' #' `all_equal()` allows you to compare data frames, optionally ignoring #' row and column names. It is deprecated as of dplyr 1.1.0, because it #' makes it too easy to ignore important differences. #' #' @param target,current Two data frames to compare. #' @param ignore_col_order Should order of columns be ignored? #' @param ignore_row_order Should order of rows be ignored? #' @param convert Should similar classes be converted? Currently this will #' convert factor to character and integer to double. #' @param ... Ignored. Needed for compatibility with `all.equal()`. #' @return `TRUE` if equal, otherwise a character vector describing #' the reasons why they're not equal. Use [isTRUE()] if using the #' result in an `if` expression. #' @export #' @keywords internal #' @examples #' scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))] #' #' # `all_equal()` ignored row and column ordering by default, #' # but we now feel that that makes it too easy to make mistakes #' mtcars2 <- scramble(mtcars) #' all_equal(mtcars, mtcars2) #' #' # Instead, be explicit about the row and column ordering #' all.equal( #' mtcars, #' mtcars2[rownames(mtcars), names(mtcars)] #' ) all_equal <- function( target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, ... ) { lifecycle::deprecate_warn( "1.1.0", "all_equal()", "all.equal()", details = "And manually order the rows/cols as needed", always = TRUE, id = "dplyr-all-equal" ) equal_data_frame( target, current, ignore_col_order = ignore_col_order, ignore_row_order = ignore_row_order, convert = convert ) } equal_data_frame <- function( x, y, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE ) { compat <- is_compatible( x, y, ignore_col_order = ignore_col_order, convert = convert ) if (!isTRUE(compat)) { # revert the bulleting from is_compatible() return(glue_collapse(compat, sep = "\n")) } nrows_x <- nrow(x) nrows_y <- nrow(y) if (nrows_x != nrows_y) { return("Different number of rows.") } if (df_n_col(x) == 0L) { return(TRUE) } # suppressMessages({ x <- as_tibble(x, .name_repair = "universal") y <- as_tibble(y, .name_repair = "universal") # }) x_split <- dplyr_locate_sorted_groups(x) y_split <- dplyr_locate_sorted_groups(y[, names(x), drop = FALSE]) # keys must be identical msg <- "" if (any(wrong <- !vec_in(x_split$key, y_split$key))) { rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L])) msg <- paste0( msg, "- Rows in x but not in y: ", glue_collapse(rows, sep = ", "), "\n" ) } if (any(wrong <- !vec_in(y_split$key, x_split$key))) { rows <- sort(map_int(y_split$loc[which(wrong)], function(.x) .x[1L])) msg <- paste0( msg, "- Rows in y but not in x: ", glue_collapse(rows, sep = ", "), "\n" ) } if (msg != "") { return(msg) } # keys are identical, check that rows occur the same number of times if (any(wrong <- lengths(x_split$loc) != lengths(y_split$loc))) { rows <- sort(map_int(x_split$loc[which(wrong)], function(.x) .x[1L])) return(paste0( "- Rows with difference occurrences in x and y: ", glue_collapse(rows, sep = ", "), "\n" )) } # then if we care about row order, the id need to be identical if (!ignore_row_order && !all(vec_equal(x_split$loc, y_split$loc))) { return("Same row values, but different order") } TRUE } dplyr/R/compute-collect.R0000644000176200001440000000423015106134104015021 0ustar liggesusers#' Force computation of a database query #' #' @description #' `compute()` stores results in a remote temporary table. #' `collect()` retrieves data into a local tibble. #' `collapse()` is slightly different: it doesn't force computation, but #' instead forces generation of the SQL query. This is sometimes needed to work #' around bugs in dplyr's SQL generation. #' #' All functions preserve grouping and ordering. #' #' @section Methods: #' These functions are **generics**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' * `compute()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("compute")} #' * `collect()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("collect")} #' * `collapse()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("collapse")} #' #' @param x A data frame, data frame extension (e.g. a tibble), or a lazy #' data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for more #' details. #' @param ... Arguments passed on to methods #' @seealso [copy_to()], the opposite of `collect()`: it takes a local data #' frame and uploads it to the remote source. #' @export #' @examplesIf requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE) #' mtcars2 <- dbplyr::src_memdb() |> #' copy_to(mtcars, name = "mtcars2-cc", overwrite = TRUE) #' #' remote <- mtcars2 |> #' filter(cyl == 8) |> #' select(mpg:drat) #' #' # Compute query and save in remote table #' compute(remote) #' #' # Compute query bring back to this session #' collect(remote) #' #' # Creates a fresh query based on the generated SQL #' collapse(remote) compute <- function(x, ...) { UseMethod("compute") } #' @export #' @rdname compute collect <- function(x, ...) { UseMethod("collect") } #' @export #' @rdname compute collapse <- function(x, ...) { UseMethod("collapse") } #' @export collect.data.frame <- function(x, ...) { x } #' @export compute.data.frame <- function(x, ...) { x } #' @export collapse.data.frame <- function(x, ...) { x } dplyr/R/if-else.R0000644000176200001440000000501015137161765013264 0ustar liggesusers#' Vectorised if-else #' #' `if_else()` is a vectorized [if-else][if]. Compared to the base R equivalent, #' [ifelse()], this function allows you to handle missing values in the #' `condition` with `missing` and always takes `true`, `false`, and `missing` #' into account when determining what the output type should be. #' #' @inheritParams rlang::args_dots_empty #' #' @param condition A logical vector #' #' @param true,false Vectors to use for `TRUE` and `FALSE` values of #' `condition`. #' #' Both `true` and `false` will be [recycled][vctrs::theory-faq-recycling] #' to the size of `condition`. #' #' `true`, `false`, and `missing` (if used) will be cast to their common type. #' #' @param missing If not `NULL`, will be used as the value for `NA` values of #' `condition`. Follows the same size and type rules as `true` and `false`. #' #' @param ptype An optional prototype declaring the desired output type. If #' supplied, this overrides the common type of `true`, `false`, and `missing`. #' #' @param size `r lifecycle::badge("deprecated")` #' #' Output size is always taken from `condition`. #' #' @return #' A vector with the same size as `condition` and the same type as the common #' type of `true`, `false`, and `missing`. #' #' Where `condition` is `TRUE`, the matching values from `true`, where it is #' `FALSE`, the matching values from `false`, and where it is `NA`, the matching #' values from `missing`, if provided, otherwise a missing value will be used. #' #' @seealso [vctrs::vec_if_else()] #' #' @export #' @examples #' x <- c(-5:5, NA) #' if_else(x < 0, NA, x) #' #' # Explicitly handle `NA` values in the `condition` with `missing` #' if_else(x < 0, "negative", "positive", missing = "missing") #' #' # Unlike `ifelse()`, `if_else()` preserves types #' x <- factor(sample(letters[1:5], 10, replace = TRUE)) #' ifelse(x %in% c("a", "b", "c"), x, NA) #' if_else(x %in% c("a", "b", "c"), x, NA) #' #' # `if_else()` is often useful for creating new columns inside of `mutate()` #' starwars |> #' mutate(category = if_else(height < 100, "short", "tall"), .keep = "used") if_else <- function( condition, true, false, missing = NULL, ..., ptype = NULL, size = deprecated() ) { check_dots_empty0(...) if (!is_missing(size)) { lifecycle::deprecate_warn( when = "1.2.0", what = "if_else(size = )", id = "dplyr-if-else-size" ) } vec_if_else( condition = condition, true = true, false = false, missing = missing, ptype = ptype, error_call = current_env() ) } dplyr/R/rows.R0000644000176200001440000004145315106134104012724 0ustar liggesusers#' Manipulate individual rows #' #' @description #' #' These functions provide a framework for modifying rows in a table using a #' second table of data. The two tables are matched `by` a set of key variables #' whose values typically uniquely identify each row. The functions are inspired #' by SQL's `INSERT`, `UPDATE`, and `DELETE`, and can optionally modify #' `in_place` for selected backends. #' #' * `rows_insert()` adds new rows (like `INSERT`). By default, key values in #' `y` must not exist in `x`. #' * `rows_append()` works like `rows_insert()` but ignores keys. #' * `rows_update()` modifies existing rows (like `UPDATE`). Key values in `y` #' must be unique, and, by default, key values in `y` must exist in `x`. #' * `rows_patch()` works like `rows_update()` but only overwrites `NA` values. #' * `rows_upsert()` inserts or updates depending on whether or not the #' key value in `y` already exists in `x`. Key values in `y` must be unique. #' * `rows_delete()` deletes rows (like `DELETE`). By default, key values in `y` #' must exist in `x`. #' #' @section Methods: #' These function are **generic**s, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' #' * `rows_insert()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_insert")}. #' * `rows_append()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_append")}. #' * `rows_update()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_update")}. #' * `rows_patch()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_patch")}. #' * `rows_upsert()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_upsert")}. #' * `rows_delete()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_delete")}. #' #' @inheritParams left_join #' @param x,y A pair of data frames or data frame extensions (e.g. a tibble). #' `y` must have the same columns of `x` or a subset. #' @param by An unnamed character vector giving the key columns. The key columns #' must exist in both `x` and `y`. Keys typically uniquely identify each row, #' but this is only enforced for the key values of `y` when `rows_update()`, #' `rows_patch()`, or `rows_upsert()` are used. #' #' By default, we use the first column in `y`, since the first column is #' a reasonable place to put an identifier variable. #' @param in_place Should `x` be modified in place? This argument is only #' relevant for mutable backends (e.g. databases, data.tables). #' #' When `TRUE`, a modified version of `x` is returned invisibly; #' when `FALSE`, a new object representing the resulting changes is returned. #' @param conflict For `rows_insert()`, how should keys in `y` that conflict #' with keys in `x` be handled? A conflict arises if there is a key in `y` #' that already exists in `x`. #' #' One of: #' - `"error"`, the default, will error if there are any keys in `y` that #' conflict with keys in `x`. #' - `"ignore"` will ignore rows in `y` with keys that conflict with keys in #' `x`. #' @param unmatched For `rows_update()`, `rows_patch()`, and `rows_delete()`, #' how should keys in `y` that are unmatched by the keys in `x` be handled? #' #' One of: #' - `"error"`, the default, will error if there are any keys in `y` that #' are unmatched by the keys in `x`. #' - `"ignore"` will ignore rows in `y` with keys that are unmatched by the #' keys in `x`. #' @returns #' An object of the same type as `x`. The order of the rows and columns of `x` #' is preserved as much as possible. The output has the following properties: #' #' * `rows_update()` and `rows_patch()` preserve the number of rows; #' `rows_insert()`, `rows_append()`, and `rows_upsert()` return all existing #' rows and potentially new rows; `rows_delete()` returns a subset of the #' rows. #' * Columns are not added, removed, or relocated, though the data may be #' updated. #' * Groups are taken from `x`. #' * Data frame attributes are taken from `x`. #' #' If `in_place = TRUE`, the result will be returned invisibly. #' @name rows #' @examples #' data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) #' data #' #' # Insert #' rows_insert(data, tibble(a = 4, b = "z")) #' #' # By default, if a key in `y` matches a key in `x`, then it can't be inserted #' # and will throw an error. Alternatively, you can ignore rows in `y` #' # containing keys that conflict with keys in `x` with `conflict = "ignore"`, #' # or you can use `rows_append()` to ignore keys entirely. #' try(rows_insert(data, tibble(a = 3, b = "z"))) #' rows_insert(data, tibble(a = 3, b = "z"), conflict = "ignore") #' rows_append(data, tibble(a = 3, b = "z")) #' #' # Update #' rows_update(data, tibble(a = 2:3, b = "z")) #' rows_update(data, tibble(b = "z", a = 2:3), by = "a") #' #' # Variants: patch and upsert #' rows_patch(data, tibble(a = 2:3, b = "z")) #' rows_upsert(data, tibble(a = 2:4, b = "z")) #' #' # Delete and truncate #' rows_delete(data, tibble(a = 2:3)) #' rows_delete(data, tibble(a = 2:3, b = "b")) #' #' # By default, for update, patch, and delete it is an error if a key in `y` #' # doesn't exist in `x`. You can ignore rows in `y` that have unmatched keys #' # with `unmatched = "ignore"`. #' y <- tibble(a = 3:4, b = "z") #' try(rows_update(data, y, by = "a")) #' rows_update(data, y, by = "a", unmatched = "ignore") #' rows_patch(data, y, by = "a", unmatched = "ignore") #' rows_delete(data, y, by = "a", unmatched = "ignore") NULL #' @rdname rows #' @export rows_insert <- function( x, y, by = NULL, ..., conflict = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { UseMethod("rows_insert") } #' @export rows_insert.data.frame <- function( x, y, by = NULL, ..., conflict = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) rows_check_x_contains_y(x, y) rows_check_contains_by(x, by, "x") rows_check_contains_by(y, by, "y") y <- rows_cast_y(y, x) x_key <- dplyr_col_select(x, by) y_key <- dplyr_col_select(y, by) keep <- rows_check_y_conflict(x_key, y_key, conflict) if (!is.null(keep)) { y <- dplyr_row_slice(y, keep) } rows_bind(x, y) } #' @rdname rows #' @export rows_append <- function(x, y, ..., copy = FALSE, in_place = FALSE) { UseMethod("rows_append") } #' @export rows_append.data.frame <- function(x, y, ..., copy = FALSE, in_place = FALSE) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) rows_check_x_contains_y(x, y) y <- rows_cast_y(y, x) rows_bind(x, y) } #' @rdname rows #' @export rows_update <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { UseMethod("rows_update", x) } #' @export rows_update.data.frame <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) rows_check_x_contains_y(x, y) rows_check_contains_by(x, by, "x") rows_check_contains_by(y, by, "y") x_key <- dplyr_col_select(x, by) y_key <- dplyr_col_select(y, by) rows_check_unique(y_key, "y") args <- vec_cast_common(x = x_key, y = y_key) x_key <- args$x y_key <- args$y values_names <- setdiff(names(y), names(y_key)) x_values <- dplyr_col_select(x, values_names) y_values <- dplyr_col_select(y, values_names) y_values <- rows_cast_y(y_values, x_values) keep <- rows_check_y_unmatched(x_key, y_key, unmatched) if (!is.null(keep)) { y_key <- dplyr_row_slice(y_key, keep) y_values <- dplyr_row_slice(y_values, keep) } loc <- vec_match(x_key, y_key) match <- !is.na(loc) y_loc <- loc[match] x_loc <- which(match) y_values <- dplyr_row_slice(y_values, y_loc) x_values <- vec_assign(x_values, x_loc, y_values) x_values <- dplyr_new_list(x_values) x <- dplyr_col_modify(x, x_values) x } #' @rdname rows #' @export rows_patch <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { UseMethod("rows_patch", x) } #' @export rows_patch.data.frame <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) rows_check_x_contains_y(x, y) rows_check_contains_by(x, by, "x") rows_check_contains_by(y, by, "y") x_key <- dplyr_col_select(x, by) y_key <- dplyr_col_select(y, by) rows_check_unique(y_key, "y") args <- vec_cast_common(x = x_key, y = y_key) x_key <- args$x y_key <- args$y values_names <- setdiff(names(y), names(y_key)) x_values <- dplyr_col_select(x, values_names) y_values <- dplyr_col_select(y, values_names) y_values <- rows_cast_y(y_values, x_values) keep <- rows_check_y_unmatched(x_key, y_key, unmatched) if (!is.null(keep)) { y_key <- dplyr_row_slice(y_key, keep) y_values <- dplyr_row_slice(y_values, keep) } loc <- vec_match(x_key, y_key) match <- !is.na(loc) y_loc <- loc[match] x_loc <- which(match) x_slice <- dplyr_row_slice(x_values, x_loc) x_slice <- dplyr_new_list(x_slice) y_slice <- dplyr_row_slice(y_values, y_loc) y_slice <- dplyr_new_list(y_slice) x_patched <- map2(x_slice, y_slice, coalesce) x_patched <- new_data_frame(x_patched, n = length(x_loc)) x_values <- vec_assign(x_values, x_loc, x_patched) x_values <- dplyr_new_list(x_values) x <- dplyr_col_modify(x, x_values) x } #' @rdname rows #' @export rows_upsert <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { UseMethod("rows_upsert", x) } #' @export rows_upsert.data.frame <- function( x, y, by = NULL, ..., copy = FALSE, in_place = FALSE ) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) rows_check_x_contains_y(x, y) rows_check_contains_by(x, by, "x") rows_check_contains_by(y, by, "y") x_key <- dplyr_col_select(x, by) y_key <- dplyr_col_select(y, by) rows_check_unique(y_key, "y") args <- vec_cast_common(x = x_key, y = y_key) x_key <- args$x y_key <- args$y values_names <- setdiff(names(y), names(y_key)) x_values <- dplyr_col_select(x, values_names) y_values <- dplyr_col_select(y, values_names) y_values <- rows_cast_y(y_values, x_values) loc <- vec_match(x_key, y_key) match <- !is.na(loc) y_loc <- loc[match] x_loc <- which(match) # Update y_values <- dplyr_row_slice(y_values, y_loc) x_values <- vec_assign(x_values, x_loc, y_values) x_values <- dplyr_new_list(x_values) x <- dplyr_col_modify(x, x_values) # Insert y_size <- vec_size(y_key) y_extra <- vec_as_location_invert(y_loc, y_size) y <- dplyr_row_slice(y, y_extra) y <- rows_cast_y(y, x) x <- rows_bind(x, y) x } #' @rdname rows #' @export rows_delete <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { UseMethod("rows_delete", x) } #' @export rows_delete.data.frame <- function( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) { check_dots_empty() rows_df_in_place(in_place) y <- auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) rows_check_contains_by(x, by, "x") rows_check_contains_by(y, by, "y") x_key <- dplyr_col_select(x, by) y_key <- dplyr_col_select(y, by) args <- vec_cast_common(x = x_key, y = y_key) x_key <- args$x y_key <- args$y keep <- rows_check_y_unmatched(x_key, y_key, unmatched) if (!is.null(keep)) { y_key <- dplyr_row_slice(y_key, keep) } extra <- setdiff(names(y), names(y_key)) if (!is_empty(extra)) { message <- glue( "Ignoring extra `y` columns: ", commas(tick_if_needed(extra)) ) inform( message, class = c("dplyr_message_delete_extra_cols", "dplyr_message") ) } loc <- vec_match(x_key, y_key) unmatched <- is.na(loc) x_loc <- which(unmatched) dplyr_row_slice(x, x_loc) } # helpers ----------------------------------------------------------------- rows_check_by <- function(by, y, ..., error_call = caller_env()) { check_dots_empty() if (is.null(by)) { if (df_n_col(y) == 0L) { abort("`y` must have at least one column.", call = error_call) } by <- names(y)[[1]] inform( message = glue("Matching, by = \"{by}\""), class = c("dplyr_message_matching_by", "dplyr_message") ) } if (!is.character(by)) { abort("`by` must be a character vector.", call = error_call) } if (is_empty(by)) { abort("`by` must specify at least 1 column.", call = error_call) } if (!all(names2(by) == "")) { abort("`by` must be unnamed.", call = error_call) } by } rows_check_x_contains_y <- function(x, y, ..., error_call = caller_env()) { check_dots_empty() bad <- setdiff(names(y), names(x)) if (!is_empty(bad)) { bad <- err_vars(bad) message <- c( "All columns in `y` must exist in `x`.", i = glue("The following columns only exist in `y`: {bad}.") ) abort(message, call = error_call) } invisible() } rows_cast_y <- function(y, x, ..., call = caller_env()) { vec_cast(x = y, to = x, x_arg = "y", to_arg = "x", call = call) } rows_check_contains_by <- function(x, by, arg, ..., error_call = caller_env()) { check_dots_empty() missing <- setdiff(by, names(x)) if (is_empty(missing)) { return(invisible()) } missing <- err_vars(missing) message <- c( "All columns specified through `by` must exist in `x` and `y`.", i = glue("The following columns are missing from `{arg}`: {missing}.") ) abort(message, call = error_call) } rows_check_unique <- function(x, arg, ..., error_call = caller_env()) { check_dots_empty() if (!vec_duplicate_any(x)) { return(invisible()) } duplicated <- vec_duplicate_detect(x) duplicated <- which(duplicated) duplicated <- err_locs(duplicated) message <- c( glue("`{arg}` key values must be unique."), i = glue("The following rows contain duplicate key values: {duplicated}.") ) abort(message, call = error_call) } rows_check_y_conflict <- function( x_key, y_key, conflict, ..., error_call = caller_env() ) { check_dots_empty() conflict <- rows_check_conflict(conflict, error_call = error_call) keep <- NULL rows_matched <- vec_in(y_key, x_key) if (any(rows_matched)) { if (conflict == "error") { rows_matched <- which(rows_matched) rows_matched <- err_locs(rows_matched) message <- c( "`y` can't contain keys that already exist in `x`.", i = glue( "The following rows in `y` have keys that already exist in `x`: {rows_matched}." ), i = "Use `conflict = \"ignore\"` if you want to ignore these `y` rows." ) abort(message, call = error_call) } else if (conflict == "ignore") { keep <- which(!rows_matched) } else { abort("Unknown `conflict` value.", .internal = TRUE) } } keep } rows_check_y_unmatched <- function( x_key, y_key, unmatched, ..., error_call = caller_env() ) { check_dots_empty() unmatched <- rows_check_unmatched(unmatched, error_call = error_call) keep <- NULL rows_unmatched <- !vec_in(y_key, x_key) if (any(rows_unmatched)) { if (unmatched == "error") { rows_unmatched <- which(rows_unmatched) rows_unmatched <- err_locs(rows_unmatched) message <- c( "`y` must contain keys that already exist in `x`.", i = glue( "The following rows in `y` have keys that don't exist in `x`: {rows_unmatched}." ), i = "Use `unmatched = \"ignore\"` if you want to ignore these `y` rows." ) abort(message, call = error_call) } else if (unmatched == "ignore") { keep <- which(!rows_unmatched) } else { abort("Unknown `unmatched` value.", .internal = TRUE) } } keep } rows_check_conflict <- function(conflict, ..., error_call = caller_env()) { check_dots_empty0(...) arg_match0( arg = conflict, values = c("error", "ignore"), error_call = error_call ) } rows_check_unmatched <- function(unmatched, ..., error_call = caller_env()) { check_dots_empty0(...) arg_match0( arg = unmatched, values = c("error", "ignore"), error_call = error_call ) } rows_df_in_place <- function(in_place, error_call = caller_env()) { if (is_true(in_place)) { msg <- "Data frames only support `in_place = FALSE`." abort(msg, call = error_call) } } rows_bind <- function(x, y) { dplyr_reconstruct(vctrs::vec_rbind(x, y), x) } vec_as_location_invert <- function(i, n) { if (is_empty(i)) { seq_len(n) } else { vec_as_location(-i, n) } } dplyr/R/utils-tidy-eval.R0000644000176200001440000000274214406415372014777 0ustar liggesusers#' Other tidy eval tools #' #' @description #' These tidy eval functions are no longer for normal usage, but are still #' exported from dplyr for backward compatibility. #' See [`?rlang::args_data_masking`][rlang::args_data_masking] and #' `vignette("programming")` for the latest recommendations. #' #' * [expr()][rlang::expr] #' * [enquo()][rlang::enquo] #' * [enquos()][rlang::enquos] #' * [sym()][rlang::sym] #' * [syms()][rlang::syms] #' * [as_label()][rlang::as_label] #' * [quo()][rlang::quo] #' * [quos()][rlang::quos] #' * [quo_name()][rlang::quo_name] #' * [ensym()][rlang::ensym] #' * [ensyms()][rlang::ensyms] #' * [enexpr()][rlang::enexpr] #' * [enexprs()][rlang::enexprs] #' #' @keywords internal #' @name tidyeval-compat #' @aliases .data expr enquo enquos sym syms as_label #' @export .data expr enquo enquos sym syms as_label #' @aliases quo quos quo_name ensym ensyms enexpr enexprs #' @export quo quos quo_name ensym ensyms enexpr enexprs NULL # Retaining a redirect for the old `dplyr_data_masking` help page, because many # package authors end up linking to this through inherited documentation, and # removing the topic from here results in a check warning in their package. It # should be possible to remove this once enough packages have re-documented with # dplyr 1.1.1 installed and sent a new release to CRAN. #' Data-masking #' #' This page is now located at #' [`?rlang::args_data_masking`][rlang::args_data_masking]. #' #' @keywords internal #' @name dplyr_data_masking NULL dplyr/R/data-starwars.R0000644000176200001440000000211015016155021014473 0ustar liggesusers#' Starwars characters #' #' The original data, from SWAPI, the Star Wars API, , has been revised #' to reflect additional research into gender and sex determinations of characters. #' #' @format A tibble with 87 rows and 14 variables: #' \describe{ #' \item{name}{Name of the character} #' \item{height}{Height (cm)} #' \item{mass}{Weight (kg)} #' \item{hair_color,skin_color,eye_color}{Hair, skin, and eye colors} #' \item{birth_year}{Year born (BBY = Before Battle of Yavin)} #' \item{sex}{The biological sex of the character, namely male, female, hermaphroditic, or none (as in the case for Droids).} #' \item{gender}{The gender role or gender identity of the character as determined by their personality or the way they were programmed (as in the case for Droids).} #' \item{homeworld}{Name of homeworld} #' \item{species}{Name of species} #' \item{films}{List of films the character appeared in} #' \item{vehicles}{List of vehicles the character has piloted} #' \item{starships}{List of starships the character has piloted} #' } #' @examples #' starwars "starwars" dplyr/R/slice.R0000644000176200001440000004262115137161765013050 0ustar liggesusers#' Subset rows using their positions #' #' @description #' `slice()` lets you index rows by their (integer) locations. It allows you #' to select, remove, and duplicate rows. It is accompanied by a number of #' helpers for common use cases: #' #' * `slice_head()` and `slice_tail()` select the first or last rows. #' * `slice_sample()` randomly selects rows. #' * `slice_min()` and `slice_max()` select rows with the smallest or largest #' values of a variable. #' #' If `.data` is a [grouped_df], the operation will be performed on each group, #' so that (e.g.) `slice_head(df, n = 5)` will select the first five rows in #' each group. #' #' @details #' Slice does not work with relational databases because they have no #' intrinsic notion of row order. If you want to perform the equivalent #' operation, use [filter()] and [row_number()]. #' #' For `slice_sample()`, note that the weights provided in `weight_by` are #' passed through to the `prob` argument of [base::sample.int()]. This means #' they cannot be used to reconstruct summary statistics from the underlying #' population. See [this discussion](https://stats.stackexchange.com/q/639211/) #' for more details. #' #' @family single table verbs #' @inheritParams args_by #' @inheritParams arrange #' @inheritParams filter #' @param ... For `slice()`: <[`data-masking`][rlang::args_data_masking]> #' Integer row values. #' #' Provide either positive values to keep, or negative values to drop. #' The values provided must be either all positive or all negative. #' Indices beyond the number of rows in the input are silently ignored. #' #' For `slice_*()`, these arguments are passed on to methods. #' #' @param n,prop Provide either `n`, the number of rows, or `prop`, the #' proportion of rows to select. If neither are supplied, `n = 1` will be #' used. If `n` is greater than the number of rows in the group #' (or `prop > 1`), the result will be silently truncated to the group size. #' `prop` will be rounded towards zero to generate an integer number of #' rows. #' #' A negative value of `n` or `prop` will be subtracted from the group #' size. For example, `n = -2` with a group of 5 rows will select 5 - 2 = 3 #' rows; `prop = -0.25` with 8 rows will select 8 * (1 - 0.25) = 6 rows. #' @return #' An object of the same type as `.data`. The output has the following #' properties: #' #' * Each row may appear 0, 1, or many times in the output. #' * Columns are not modified. #' * Groups are not modified. #' * Data frame attributes are preserved. #' @section Methods: #' These function are **generic**s, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. #' #' Methods available in currently loaded packages: #' #' * `slice()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. #' * `slice_head()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. #' * `slice_tail()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. #' * `slice_min()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. #' * `slice_max()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. #' * `slice_sample()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. #' @export #' @examples #' # Similar to head(mtcars, 1): #' mtcars |> slice(1L) #' # Similar to tail(mtcars, 1): #' mtcars |> slice(n()) #' mtcars |> slice(5:n()) #' # Rows can be dropped with negative indices: #' slice(mtcars, -(1:4)) #' #' # First and last rows based on existing order #' mtcars |> slice_head(n = 5) #' mtcars |> slice_tail(n = 5) #' #' # Rows with minimum and maximum values of a variable #' mtcars |> slice_min(mpg, n = 5) #' mtcars |> slice_max(mpg, n = 5) #' #' # slice_min() and slice_max() may return more rows than requested #' # in the presence of ties. #' mtcars |> slice_min(cyl, n = 1) #' # Use with_ties = FALSE to return exactly n matches #' mtcars |> slice_min(cyl, n = 1, with_ties = FALSE) #' # Or use additional variables to break the tie: #' mtcars |> slice_min(tibble(cyl, mpg), n = 1) #' #' # slice_sample() allows you to random select with or without replacement #' mtcars |> slice_sample(n = 5) #' mtcars |> slice_sample(n = 5, replace = TRUE) #' #' # slice_sample() can be used to shuffle rows with `prop = 1` #' mtcars |> slice_sample(prop = 1) #' #' # You can optionally weight by a variable - this code weights by the #' # physical weight of the cars, so heavy cars are more likely to get #' # selected. #' mtcars |> slice_sample(weight_by = wt, n = 5) #' #' # Group wise operation ---------------------------------------- #' df <- tibble( #' group = rep(c("a", "b", "c"), c(1, 2, 4)), #' x = runif(7) #' ) #' #' # All slice helpers operate per group, silently truncating to the group #' # size, so the following code works without error #' df |> group_by(group) |> slice_head(n = 2) #' #' # When specifying the proportion of rows to include non-integer sizes #' # are rounded down, so group a gets 0 rows #' df |> group_by(group) |> slice_head(prop = 0.5) #' #' # Filter equivalents -------------------------------------------- #' # slice() expressions can often be written to use `filter()` and #' # `row_number()`, which can also be translated to SQL. For many databases, #' # you'll need to supply an explicit variable to use to compute the row number. #' filter(mtcars, row_number() == 1L) #' filter(mtcars, row_number() == n()) #' filter(mtcars, between(row_number(), 5, n())) slice <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_by_typo(...) by <- enquo(.by) if (!quo_is_null(by) && !is_false(.preserve)) { abort("Can't supply both `.by` and `.preserve`.") } UseMethod("slice") } #' @export slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_dots_unnamed() dots <- enquos(...) by <- compute_by( by = {{ .by }}, data = .data, by_arg = the$slice_by_arg, data_arg = ".data" ) loc <- slice_rows(.data, dots, by) dplyr_row_slice(.data, loc, preserve = .preserve) } #' @export #' @rdname slice slice_head <- function(.data, ..., n, prop, by = NULL) { check_dot_by_typo(...) check_slice_unnamed_n_prop(..., n = n, prop = prop) UseMethod("slice_head") } #' @export slice_head.data.frame <- function(.data, ..., n, prop, by = NULL) { check_dots_empty0(...) size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(1, size(n)) } dplyr_local_error_call() dplyr_local_slice_by_arg("by") slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export #' @rdname slice slice_tail <- function(.data, ..., n, prop, by = NULL) { check_dot_by_typo(...) check_slice_unnamed_n_prop(..., n = n, prop = prop) UseMethod("slice_tail") } #' @export slice_tail.data.frame <- function(.data, ..., n, prop, by = NULL) { check_dots_empty0(...) size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(n - size(n) + 1, n) } dplyr_local_error_call() dplyr_local_slice_by_arg("by") slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export #' @rdname slice #' @param order_by <[`data-masking`][rlang::args_data_masking]> Variable or #' function of variables to order by. To order by multiple variables, wrap #' them in a data frame or tibble. #' @param with_ties Should ties be kept together? The default, `TRUE`, #' may return more rows than you request. Use `FALSE` to ignore ties, #' and return the first `n` rows. #' @param na_rm Should missing values in `order_by` be removed from the result? #' If `FALSE`, `NA` values are sorted to the end (like in [arrange()]), so #' they will only be included if there are insufficient non-missing values to #' reach `n`/`prop`. slice_min <- function( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) { check_required(order_by) check_dot_by_typo(...) check_slice_unnamed_n_prop(..., n = n, prop = prop) check_bool(with_ties) check_bool(na_rm) UseMethod("slice_min") } #' @export slice_min.data.frame <- function( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) { check_dots_empty0(...) size <- get_slice_size(n = n, prop = prop) dplyr_local_error_call() dplyr_local_slice_by_arg("by") slice( .data, .by = {{ by }}, local({ n <- dplyr::n() order_by <- {{ order_by }} vec_check_size(order_by, size = n) slice_rank_idx( order_by, size(n), direction = "asc", with_ties = !!with_ties, na_rm = !!na_rm ) }) ) } #' @export #' @rdname slice slice_max <- function( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) { check_required(order_by) check_dot_by_typo(...) check_slice_unnamed_n_prop(..., n = n, prop = prop) check_bool(with_ties) check_bool(na_rm) UseMethod("slice_max") } #' @export slice_max.data.frame <- function( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) { check_dots_empty0(...) size <- get_slice_size(n = n, prop = prop) dplyr_local_error_call() dplyr_local_slice_by_arg("by") slice( .data, .by = {{ by }}, local({ n <- dplyr::n() order_by <- {{ order_by }} vec_check_size(order_by, size = n) slice_rank_idx( order_by, size(n), direction = "desc", with_ties = !!with_ties, na_rm = !!na_rm ) }) ) } #' @export #' @rdname slice #' @param replace Should sampling be performed with (`TRUE`) or without #' (`FALSE`, the default) replacement. #' @param weight_by <[`data-masking`][rlang::args_data_masking]> Sampling #' weights. This must evaluate to a vector of non-negative numbers the same #' length as the input. Weights are automatically standardised to sum to 1. #' See the `Details` section for more technical details regarding these #' weights. slice_sample <- function( .data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE ) { check_dot_by_typo(...) check_slice_unnamed_n_prop(..., n = n, prop = prop) check_bool(replace) UseMethod("slice_sample") } #' @export slice_sample.data.frame <- function( .data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE ) { check_dots_empty0(...) size <- get_slice_size(n = n, prop = prop, allow_outsize = replace) dplyr_local_error_call() dplyr_local_slice_by_arg("by") slice( .data, .by = {{ by }}, local({ weight_by <- {{ weight_by }} n <- dplyr::n() if (!is.null(weight_by)) { vec_check_size(weight_by, size = n) } sample_int(n, size(n), replace = !!replace, wt = weight_by) }) ) } # helpers ----------------------------------------------------------------- slice_rows <- function( data, dots, by, error_call = caller_env(), user_env = caller_env(2) ) { error_call <- dplyr_error_call(error_call) mask <- DataMask$new(data, by, "slice", error_call = error_call) on.exit(mask$forget(), add = TRUE) chunks <- slice_eval(mask, dots, error_call = error_call, user_env = user_env) slice_indices <- slice_combine( chunks, dots, mask = mask, error_call = error_call ) vec_c(!!!slice_indices, .ptype = integer()) } is_slice_call <- function(error_call) { is_slice <- TRUE if (is_environment(error_call) && !identical(error_call$.Generic, "slice")) { is_slice <- FALSE } is_slice } slice_eval <- function( mask, dots, error_call = caller_env(), user_env = caller_env(2) ) { index <- 0L impl <- function(...) { n <- ...length() out <- vector("list", n) for (i in seq_len(n)) { index <<- i slice_idx <- ...elt(i) if (is.matrix(slice_idx) && mat_n_col(slice_idx) == 1) { lifecycle::deprecate_warn( when = "1.1.0", what = I("Slicing with a 1-column matrix"), env = error_call, user_env = user_env, always = TRUE, id = "dplyr-slice-one-column-matrix" ) slice_idx <- slice_idx[, 1] } out[[i]] <- vec_as_subscript( slice_idx, logical = "error", character = "error", arg = as_label(dots[[i]]), call = NULL # error always chained to slice() ) } index <<- 0L vec_c(!!!out, .ptype = integer()) } withCallingHandlers( mask$eval_all(quo(impl(!!!dots))), error = function(cnd) { if (inherits(cnd, "vctrs_error_subscript")) { action <- "process" } else { action <- "compute" } if (index && is_slice_call(error_call)) { local_error_context(dots, index, mask = mask) header <- cnd_bullet_header(action) } else { header <- glue("Can't {action} indices.") } bullets <- c(header, i = cnd_bullet_cur_group_label()) abort(bullets, call = error_call, parent = cnd) } ) } slice_combine <- function(chunks, dots, mask, error_call = caller_env()) { rows <- mask$get_rows() slice_indices <- new_list(length(rows)) withCallingHandlers( for (group in seq_along(rows)) { current_rows <- rows[[group]] loc <- num_as_location( i = chunks[[group]], n = length(current_rows), zero = "remove", oob = "remove", missing = "remove", arg = as_label(dots[[group]]), call = NULL # error always chained to slice() ) grp_loc <- current_rows[loc] grp_loc <- grp_loc[!is.na(grp_loc)] slice_indices[[group]] <- grp_loc }, error = function(cnd) { mask$set_current_group(group) bullets <- c( "Can't compute indices.", i = cnd_bullet_cur_group_label() ) abort(bullets, call = error_call, parent = cnd) } ) slice_indices } check_constant <- function(x, name, error_call = caller_env()) { withCallingHandlers(force(x), error = function(e) { bullets <- c( glue("`{name}` must be a constant.") ) abort(bullets, parent = e, call = error_call) }) } check_slice_unnamed_n_prop <- function( ..., n, prop, error_call = caller_env() ) { if (!missing(n) || !missing(prop)) { return(invisible()) } # Special case to capture e.g. `slice_head(2)` # Capture dots as quosures so that we can label dots <- enquos(...) if (length(dots) == 1L && names2(dots)[[1L]] == "") { slice_call <- frame_call(frame = error_call)[[1]] slice_call <- as_label(slice_call) bullets <- c( "`n` must be explicitly named.", i = glue("Did you mean `{slice_call}(n = {as_label(dots[[1]])})`?") ) abort(bullets, call = error_call) } invisible() } check_slice_n_prop <- function(n, prop, error_call = caller_env()) { if (missing(n) && missing(prop)) { list(type = "n", n = 1L) } else if (!missing(n) && missing(prop)) { n <- check_constant(n, "n", error_call = error_call) if (!is_integerish(n, n = 1) || is.na(n)) { abort( glue("`n` must be a round number, not {obj_type_friendly(n)}."), call = error_call ) } list(type = "n", n = n) } else if (!missing(prop) && missing(n)) { prop <- check_constant(prop, "prop", error_call = error_call) if (!is.numeric(prop) || length(prop) != 1 || is.na(prop)) { abort( glue("`prop` must be a number, not {obj_type_friendly(prop)}."), call = error_call ) } list(type = "prop", prop = prop) } else { abort("Must supply `n` or `prop`, but not both.", call = error_call) } } # Always returns an integer between 0 and the group size get_slice_size <- function( n, prop, allow_outsize = FALSE, error_call = caller_env() ) { slice_input <- check_slice_n_prop(n, prop, error_call = error_call) if (slice_input$type == "n") { if (slice_input$n >= 0) { if (allow_outsize) { body <- expr(!!floor(slice_input$n)) } else { body <- expr(clamp(0, !!floor(slice_input$n), n)) } } else { body <- expr(clamp(0, ceiling(n + !!slice_input$n), n)) } } else if (slice_input$type == "prop") { if (slice_input$prop >= 0) { if (allow_outsize) { body <- expr(floor(!!slice_input$prop * n)) } else { body <- expr(clamp(0, floor(!!slice_input$prop * n), n)) } } else { body <- expr(clamp(0, ceiling(n + !!slice_input$prop * n), n)) } } new_function(pairlist2(n = ), body) } clamp <- function(min, x, max) { if (x < min) { min } else if (x > max) { max } else { x } } sample_int <- function(n, size, replace = FALSE, wt = NULL) { if (size == 0L) { integer(0) } else { sample.int(n, size, prob = wt, replace = replace) } } slice_rank_idx <- function( order_by, size, with_ties = TRUE, direction = c("asc", "desc"), na_rm = FALSE, call = caller_env() ) { direction <- arg_match0( arg = direction, values = c("asc", "desc"), error_call = call ) # puts missing values at the end na_value <- if (direction == "asc") "largest" else "smallest" ties <- if (with_ties) "min" else "sequential" ranks <- vec_rank( x = order_by, ties = ties, direction = direction, na_value = na_value ) keep <- ranks <= size if (na_rm) { keep[!vec_detect_complete(order_by)] <- FALSE } which <- which(keep) which[order(ranks[which])] } on_load({ # Default used by `slice()` the$slice_by_arg <- ".by" }) dplyr_local_slice_by_arg <- function(by_arg, frame = caller_env()) { local_bindings(slice_by_arg = by_arg, .env = the, .frame = frame) } dplyr/R/error.R0000644000176200001440000000124214366556340013074 0ustar liggesusers# ngettext() does extra work, this function is a simpler version ntext <- function(n, msg1, msg2) { if (n == 1) msg1 else msg2 } fmt_pos_args <- function(x) { args <- ntext(length(x), "Argument", "Arguments") glue("{args} {fmt_comma(x)}") } fmt_cols <- function(x) { cols <- ntext(length(x), "Column", "Columns") glue("{cols} {fmt_obj(x)}") } fmt_obj <- function(x) { fmt_comma(fmt_obj1(x)) } fmt_obj1 <- function(x) { paste0("`", x, "`") } fmt_classes <- function(x) { paste(class(x), collapse = "/") } fmt_comma <- function(..., .max = 6) { x <- paste0(...) if (length(x) > .max) { length(x) <- .max x[[.max]] <- "..." } commas(x) } dplyr/R/defunct-each.R0000644000176200001440000000202215137161765014266 0ustar liggesusers#' Defunct functions for working with multiple columns #' #' @description #' `r lifecycle::badge("defunct")` #' #' `mutate_each()` and `summarise_each()` are deprecated in favour of #' the new [across()] function that works within `summarise()` and `mutate()`. #' #' @name defunct-each #' @keywords internal NULL each_defunct <- function(fun) { lifecycle::deprecate_stop( when = "0.7.0", what = fun, with = "across()" ) } #' @export #' @rdname defunct-each summarise_each <- function(tbl, funs, ...) { each_defunct("summarise_each()") } #' @export #' @rdname defunct-each summarise_each_ <- function(tbl, funs, vars) { each_defunct("summarise_each_()") } #' @export #' @rdname defunct-each mutate_each <- function(tbl, funs, ...) { each_defunct("mutate_each()") } #' @export #' @rdname defunct-each mutate_each_ <- function(tbl, funs, vars) { each_defunct("mutate_each_()") } #' @export #' @rdname defunct-each summarize_each <- summarise_each #' @export #' @rdname defunct-each summarize_each_ <- summarise_each_ dplyr/R/bind-cols.R0000644000176200001440000000324215106134104013576 0ustar liggesusers#' Bind multiple data frames by column #' #' @description #' Bind any number of data frames by column, making a wider result. #' This is similar to `do.call(cbind, dfs)`. #' #' Where possible prefer using a [join][left_join] to combine multiple #' data frames. `bind_cols()` binds the rows in order in which they appear #' so it is easy to create meaningless results without realising it. #' #' @param ... Data frames to combine. Each argument can either be a data frame, #' a list that could be a data frame, or a list of data frames. #' Inputs are [recycled][vctrs::theory-faq-recycling] to the same length, #' then matched by position. #' @param .name_repair One of `"unique"`, `"universal"`, or #' `"check_unique"`. See [vctrs::vec_as_names()] for the meaning of these #' options. #' @returns A data frame the same type as the first element of `...`. #' @export #' @examples #' df1 <- tibble(x = 1:3) #' df2 <- tibble(y = 3:1) #' bind_cols(df1, df2) #' #' # Row sizes must be compatible when column-binding #' try(bind_cols(tibble(x = 1:3), tibble(y = 1:2))) bind_cols <- function( ..., .name_repair = c("unique", "universal", "check_unique", "minimal") ) { dots <- list2(...) dots <- list_flatten(dots, recursive = TRUE) dots <- discard(dots, is.null) # Strip names off of data frame components so that vec_cbind() unpacks them names2(dots)[map_lgl(dots, is.data.frame)] <- "" out <- vec_cbind( !!!dots, .name_repair = .name_repair, .error_call = current_env() ) if (!any(map_lgl(dots, is.data.frame))) { out <- as_tibble(out) } if (length(dots) && is.data.frame(first <- dots[[1L]])) { out <- dplyr_reconstruct(out, first) } out } dplyr/R/colwise-select.R0000644000176200001440000001141615106134104014650 0ustar liggesusers#' Select and rename a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' `rename_if()`, `rename_at()`, and `rename_all()` have been superseded by #' `rename_with()`. The matching select statements have been superseded by the #' combination of a `select()` + `rename_with()`. Any predicate functions passed #' as arguments to `select()` or `rename_with()` must be wrapped in [where()]. #' #' These functions were superseded because `mutate_if()` and friends were #' superseded by `across()`. `select_if()` and `rename_if()` already use tidy #' selection so they can't be replaced by `across()` and instead we need a new #' function. #' #' @inheritParams scoped #' @keywords internal #' @param .funs A function `fun`, a purrr style lambda `~ fun(.)` or a list of either form. #' @examples #' mtcars <- as_tibble(mtcars) # for nicer printing #' #' mtcars |> rename_all(toupper) #' # -> #' mtcars |> rename_with(toupper) #' #' # NB: the transformation comes first in rename_with #' is_whole <- function(x) all(floor(x) == x) #' mtcars |> rename_if(is_whole, toupper) #' # -> #' mtcars |> rename_with(toupper, where(is_whole)) #' #' mtcars |> rename_at(vars(mpg:hp), toupper) #' # -> #' mtcars |> rename_with(toupper, mpg:hp) #' #' # You now must select() and then rename #' #' mtcars |> select_all(toupper) #' # -> #' mtcars |> rename_with(toupper) #' #' # Selection drops unselected variables: #' mtcars |> select_if(is_whole, toupper) #' # -> #' mtcars |> select(where(is_whole)) |> rename_with(toupper) #' #' mtcars |> select_at(vars(-contains("ar"), starts_with("c")), toupper) #' # -> #' mtcars |> #' select(!contains("ar") | starts_with("c")) |> #' rename_with(toupper) #' @export select_all <- function(.tbl, .funs = list(), ...) { lifecycle::signal_stage("superseded", "select_all()") funs <- as_fun_list(.funs, caller_env(), ..., .caller = "select_all") vars <- tbl_vars(.tbl) syms <- vars_select_syms(vars, funs, .tbl) select(.tbl, !!!syms) } #' @rdname select_all #' @export rename_all <- function(.tbl, .funs = list(), ...) { lifecycle::signal_stage("superseded", "rename_with()") funs <- as_fun_list(.funs, caller_env(), ..., .caller = "rename_all") vars <- tbl_vars(.tbl) syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE) rename(.tbl, !!!syms) } #' @rdname select_all #' @export select_if <- function(.tbl, .predicate, .funs = list(), ...) { funs <- as_fun_list(.funs, caller_env(), ..., .caller = "select_if") if (!is_logical(.predicate)) { .predicate <- as_fun_list( .predicate, caller_env(), .caller = "select_if", .caller_arg = ".predicate" ) } vars <- tbl_if_vars( .tbl, .predicate, caller_env(), .include_group_vars = TRUE ) syms <- vars_select_syms(vars, funs, .tbl) select(.tbl, !!!syms) } #' @rdname select_all #' @export rename_if <- function(.tbl, .predicate, .funs = list(), ...) { funs <- as_fun_list(.funs, caller_env(), ..., .caller = "rename_if") if (!is_logical(.predicate)) { .predicate <- as_fun_list( .predicate, caller_env(), .caller = "rename_if", .caller_arg = ".predicate" ) } vars <- tbl_if_vars( .tbl, .predicate, caller_env(), .include_group_vars = TRUE ) syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE) rename(.tbl, !!!syms) } #' @rdname select_all #' @export select_at <- function(.tbl, .vars, .funs = list(), ...) { vars <- tbl_at_vars(.tbl, .vars, .include_group_vars = TRUE) funs <- as_fun_list(.funs, caller_env(), ..., .caller = "select_at") syms <- vars_select_syms(vars, funs, .tbl) select(.tbl, !!!syms) } #' @rdname select_all #' @export rename_at <- function(.tbl, .vars, .funs = list(), ...) { vars <- tbl_at_vars(.tbl, .vars, .include_group_vars = TRUE) funs <- as_fun_list(.funs, caller_env(), ..., .caller = "rename_at") syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE) rename(.tbl, !!!syms) } vars_select_syms <- function( vars, funs, tbl, strict = FALSE, error_call = caller_env() ) { if (length(funs) > 1) { msg <- glue( "`.funs` must contain one renaming function, not {length(funs)}." ) abort(msg, call = error_call) } else if (length(funs) == 1) { fun <- funs[[1]] if (is_quosure(fun)) { fun <- quo_as_function(fun) } syms <- if (length(vars)) { set_names(syms(vars), fun(as.character(vars))) } else { set_names(syms(vars)) } } else if (!strict) { syms <- syms(vars) } else { msg <- glue("`.funs` must specify a renaming function.") abort(msg, call = error_call) } group_vars <- group_vars(tbl) group_syms <- syms(group_vars) has_group_sym <- group_syms %in% syms new_group_syms <- set_names( group_syms[!has_group_sym], group_vars[!has_group_sym] ) c(new_group_syms, syms) } dplyr/R/import-standalone-purrr.R0000644000176200001440000001265514406402754016557 0ustar liggesusers# Standalone file: do not edit by hand # Source: # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-purrr.R # last-updated: 2022-06-07 # license: https://unlicense.org # --- # # This file provides a minimal shim to provide a purrr-like API on top of # base R functions. They are not drop-in replacements but allow a similar style # of programming. # # ## Changelog # # 2022-06-07: # * `transpose()` is now more consistent with purrr when inner names # are not congruent (#1346). # # 2021-12-15: # * `transpose()` now supports empty lists. # # 2021-05-21: # * Fixed "object `x` not found" error in `imap()` (@mgirlich) # # 2020-04-14: # * Removed `pluck*()` functions # * Removed `*_cpl()` functions # * Used `as_function()` to allow use of `~` # * Used `.` prefix for helpers # # nocov start map <- function(.x, .f, ...) { .f <- as_function(.f, env = global_env()) lapply(.x, .f, ...) } walk <- function(.x, .f, ...) { map(.x, .f, ...) invisible(.x) } map_lgl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, logical(1), ...) } map_int <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, integer(1), ...) } map_dbl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, double(1), ...) } map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } .rlang_purrr_map_mold <- function(.x, .f, .mold, ...) { .f <- as_function(.f, env = global_env()) out <- vapply(.x, .f, .mold, ..., USE.NAMES = FALSE) names(out) <- names(.x) out } map2 <- function(.x, .y, .f, ...) { .f <- as_function(.f, env = global_env()) out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) if (length(out) == length(.x)) { set_names(out, names(.x)) } else { set_names(out, NULL) } } map2_lgl <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "logical") } map2_int <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "integer") } map2_dbl <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "double") } map2_chr <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "character") } imap <- function(.x, .f, ...) { map2(.x, names(.x) %||% seq_along(.x), .f, ...) } pmap <- function(.l, .f, ...) { .f <- as.function(.f) args <- .rlang_purrr_args_recycle(.l) do.call("mapply", c( FUN = list(quote(.f)), args, MoreArgs = quote(list(...)), SIMPLIFY = FALSE, USE.NAMES = FALSE )) } .rlang_purrr_args_recycle <- function(args) { lengths <- map_int(args, length) n <- max(lengths) stopifnot(all(lengths == 1L | lengths == n)) to_recycle <- lengths == 1L args[to_recycle] <- map(args[to_recycle], function(x) rep.int(x, n)) args } keep <- function(.x, .f, ...) { .x[.rlang_purrr_probe(.x, .f, ...)] } discard <- function(.x, .p, ...) { sel <- .rlang_purrr_probe(.x, .p, ...) .x[is.na(sel) | !sel] } map_if <- function(.x, .p, .f, ...) { matches <- .rlang_purrr_probe(.x, .p) .x[matches] <- map(.x[matches], .f, ...) .x } .rlang_purrr_probe <- function(.x, .p, ...) { if (is_logical(.p)) { stopifnot(length(.p) == length(.x)) .p } else { .p <- as_function(.p, env = global_env()) map_lgl(.x, .p, ...) } } compact <- function(.x) { Filter(length, .x) } transpose <- function(.l) { if (!length(.l)) { return(.l) } inner_names <- names(.l[[1]]) if (is.null(inner_names)) { fields <- seq_along(.l[[1]]) } else { fields <- set_names(inner_names) .l <- map(.l, function(x) { if (is.null(names(x))) { set_names(x, inner_names) } else { x } }) } # This way missing fields are subsetted as `NULL` instead of causing # an error .l <- map(.l, as.list) map(fields, function(i) { map(.l, .subset2, i) }) } every <- function(.x, .p, ...) { .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (!rlang::is_true(.p(.x[[i]], ...))) return(FALSE) } TRUE } some <- function(.x, .p, ...) { .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (rlang::is_true(.p(.x[[i]], ...))) return(TRUE) } FALSE } negate <- function(.p) { .p <- as_function(.p, env = global_env()) function(...) !.p(...) } reduce <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) Reduce(f, .x, init = .init) } reduce_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE) } accumulate <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) Reduce(f, .x, init = .init, accumulate = TRUE) } accumulate_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE, accumulate = TRUE) } detect <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(.x[[i]]) } } NULL } detect_index <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(i) } } 0L } .rlang_purrr_index <- function(x, right = FALSE) { idx <- seq_along(x) if (right) { idx <- rev(idx) } idx } # nocov end dplyr/R/deprec-funs.R0000644000176200001440000000447415106134104014147 0ustar liggesusers#' Create a list of function calls #' #' @description #' `r lifecycle::badge("deprecated")` #' #' `funs()` is deprecated; please use `list()` instead. We deprecated this #' function because it provided a unique way of specifying anonymous functions, #' rather than adopting the conventions used by purrr and other packages #' in the tidyverse. #' #' @param ... <[`data-masking`][rlang::args_data_masking]> A list of functions #' specified by: #' #' - Their name, `"mean"` #' - The function itself, `mean` #' - A call to the function with `.` as a dummy argument, #' `mean(., na.rm = TRUE)` #' #' The following notations are **not** supported, see examples: #' #' - An anonymous function, `function(x) mean(x, na.rm = TRUE)` #' - An anonymous function in \pkg{purrr} notation, `~mean(., na.rm = TRUE)` #' #' @param .args,args A named list of additional arguments to be added to all #' function calls. As `funs()` is being deprecated, use other methods to #' supply arguments: `...` argument in [scoped verbs][summarise_at()] or make #' own functions with [purrr::partial()]. #' @export #' @keywords internal #' @examples #' funs("mean", mean(., na.rm = TRUE)) #' # -> #' list(mean = mean, mean = ~ mean(.x, na.rm = TRUE)) #' #' funs(m1 = mean, m2 = "mean", m3 = mean(., na.rm = TRUE)) #' # -> #' list(m1 = mean, m2 = "mean", m3 = ~ mean(.x, na.rm = TRUE)) funs <- function(..., .args = list()) { lifecycle::deprecate_warn( "0.8.0", "funs()", always = TRUE, details = paste_line( "Please use a list of either functions or lambdas: ", "", " # Simple named list: ", " list(mean = mean, median = median)", "", " # Auto named with `tibble::lst()`: ", " tibble::lst(mean, median)", "", " # Using lambdas", " list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))" ) ) dots <- enquos(...) default_env <- caller_env() error_call <- current_env() funs <- map(dots, function(quo) { as_fun(quo, default_env, .args, error_call = error_call) }) new_funs(funs) } new_funs <- function(funs) { attr(funs, "have_name") <- any(names2(funs) != "") # Workaround until rlang:::label() is exported temp <- map(funs, function(fn) node_car(quo_get_expr(fn))) temp <- exprs_auto_name(temp) names(funs) <- names(temp) class(funs) <- "fun_list" funs } dplyr/R/recode-values.R0000644000176200001440000003163015137161765014505 0ustar liggesusers#' Recode and replace values #' #' @description #' `recode_values()` and `replace_values()` provide two ways to map old values #' to new values. They work by matching values against `x` and using the first #' match to determine the corresponding value in the output vector. You can also #' think of these functions as a way to use a lookup table to recode a vector. #' #' - Use `recode_values()` when creating an entirely new vector. #' #' - Use `replace_values()` when partially updating an existing vector. #' #' If you are just replacing a few values within an existing vector, then #' `replace_values()` is always a better choice because it is type stable and #' better expresses intent. #' #' A major difference between the two functions is what happens when no cases #' match: #' #' - `recode_values()` falls through to a `default`. #' #' - `replace_values()` retains the original values from `x`. #' #' These functions have two mutually exclusive ways to use them: #' #' - A formula-based approach, i.e. `recode_values(x, from1 ~ to1, from2 ~ #' to2)`, similar to [case_when()], which is useful when you have a small #' number of cases. #' #' - A vector-based approach, i.e. `recode_values(x, from = from, to = to)`, #' which is useful when you have a pre-built lookup table (which may come #' from an external source, like a CSV file). #' #' See `vignette("recoding-replacing")` for more examples. #' #' @param x A vector. #' #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> A sequence of two-sided #' formulas. The left hand side (LHS) determines which values match this case. #' The right hand side (RHS) provides the replacement value. #' #' - The LHS inputs can be any size, but will be #' [cast][vctrs::theory-faq-coercion] to the type of `x`. #' #' - The RHS inputs will be [recycled][vctrs::theory-faq-recycling] to the #' same size as `x`. For `recode_values()` they will be #' [cast][vctrs::theory-faq-coercion] to their common type, and for #' `replace_values()` they will be [cast][vctrs::theory-faq-coercion] to the #' type of `x`. #' #' `NULL` inputs are ignored. #' #' Mutually exclusive with `from` and `to`. #' #' @param from Values to look up in `x` and map to values in `to`. #' #' Typically this is a single vector of any size that is #' [cast][vctrs::theory-faq-coercion] to the type of `x`. For more advanced #' usage, this can be a list of vectors of any size each of which are #' [cast][vctrs::theory-faq-coercion] to the type of `x`. #' #' Mutually exclusive with `...`. #' #' @param to Values that `from` map to. #' #' Typically this is a single vector that is #' [recycled][vctrs::theory-faq-recycling] to the size of `from`. For more #' advanced usage, this can be a list of vectors each of which are #' [recycled][vctrs::theory-faq-recycling] to the size of `x`. #' #' Mutually exclusive with `...`. #' #' @param default Default value to use when there is a value present in `x` #' that is unmatched by a value in `from`. #' #' By default, a missing value is used as the default value. #' #' If supplied, will be [recycled][vctrs::theory-faq-recycling] to the size of #' `x`. #' #' Can only be set when `unmatched = "default"`. #' #' @param unmatched Handling of unmatched locations. #' #' One of: #' #' - `"default"` to use `default` in unmatched locations. #' #' - `"error"` to error when there are unmatched locations. #' #' @param ptype An optional override for the output type, which is usually #' computed as the common type of `to` and `default`. #' #' @returns #' A vector the same size as `x`. #' #' - For `recode_values()`, the type of the output is computed as the common #' type of `to` and `default`, unless overridden by `ptype`. The names of the #' output come from the names of `to` and `default`. #' #' - For `replace_values()`, the type of the output will have the same type #' as `x`. The names of the output will be the same as the names of `x`. #' #' @seealso [case_when()], [vctrs::vec_recode_values()] #' #' @name recode-and-replace-values #' #' @examples #' x <- c("NC", "NYC", "CA", NA, "NYC", "Unknown") #' #' # `recode_values()` is useful for fully recoding from one set of values to #' # another, creating an entirely new vector in the process. Note that any #' # unmatched values result in `NA`, or a `default` value. #' recode_values( #' x, #' "NC" ~ "North Carolina", #' "NYC" ~ "New York", #' "CA" ~ "California" #' ) #' #' recode_values( #' x, #' "NC" ~ "North Carolina", #' "NYC" ~ "New York", #' "CA" ~ "California", #' default = "" #' ) #' #' # `replace_values()` is useful for updating an existing vector, tweaking a #' # few values along the way #' replace_values(x, "NYC" ~ "NY") #' #' # `replace_values()` is particularly nice for replacing `NA`s with values... #' replace_values(x, NA ~ "Unknown (NA)") #' # ...or values with `NA`s #' replace_values(x, "Unknown" ~ NA) #' #' # Multiple values can be grouped within a single left-hand side to normalize #' # all problematic values at once #' replace_values(x, c(NA, "Unknown") ~ "") #' #' # --------------------------------------------------------------------------- #' # Lookup tables #' #' # `recode_values()` works with more than just character vectors. Imagine you #' # have this series of Likert Scale scores, which is a scoring system that is #' # ordered from 1-5. #' data <- tibble( #' score = c(1, 2, 3, 4, 5, 2, 3, 1, 4) #' ) #' #' # To recode each `score` to its corresponding Likert Score label, you may #' # initially be inclined to reach for `case_when()` #' data |> #' mutate( #' score = case_when( #' score == 1 ~ "Strongly disagree", #' score == 2 ~ "Disagree", #' score == 3 ~ "Neutral", #' score == 4 ~ "Agree", #' score == 5 ~ "Strongly agree" #' ) #' ) #' #' # While this works, it can be written more efficiently using #' # `recode_values()` #' data |> #' mutate( #' score = score |> #' recode_values( #' 1 ~ "Strongly disagree", #' 2 ~ "Disagree", #' 3 ~ "Neutral", #' 4 ~ "Agree", #' 5 ~ "Strongly agree" #' ) #' ) #' #' # `recode_values()` actually has two mutually exclusive APIs. The formula API #' # used above, which is like `case_when()`, and a lookup style API that uses #' # `from` and `to` arguments. The lookup API is even better suited for this #' # problem, because we can move the mapping outside of the `mutate()` call #' # into a standalone lookup table. You could even imagine reading this #' # `likert` lookup table in from a separate CSV file. #' likert <- tribble( #' ~from, ~to, #' 1, "Strongly disagree", #' 2, "Disagree", #' 3, "Neutral", #' 4, "Agree", #' 5, "Strongly agree" #' ) #' #' data |> #' mutate(score = recode_values(score, from = likert$from, to = likert$to)) #' #' # You can utilize the same lookup table across multiple columns by using #' # `across()` #' data_months <- tibble( #' score_january = c(1, 2, 3, 4, 5, 2, 3, 1, 4), #' score_february = c(4, 2, 1, 2, 1, 5, 2, 4, 4) #' ) #' #' data_months |> #' mutate(across( #' starts_with("score"), #' ~ recode_values(.x, from = likert$from, to = likert$to) #' )) #' #' # The `unmatched` argument allows you to assert that you believe that you've #' # recoded all of the cases and will error if you've missed one, adding an #' # extra layer of safety #' data_with_zero <- add_row(data, score = 0) #' #' try({ #' recode_values( #' data_with_zero$score, #' from = likert$from, #' to = likert$to, #' unmatched = "error" #' ) #' }) #' #' # Note that missing values are considered unmatched. If you expect missing #' # values, you'll need to handle them explicitly in your lookup table. #' data_with_missing <- add_row(data, score = NA) #' #' try({ #' recode_values( #' data_with_missing$score, #' from = likert$from, #' to = likert$to, #' unmatched = "error" #' ) #' }) #' #' likert <- add_row(likert, from = NA, to = NA) #' #' recode_values( #' data_with_missing$score, #' from = likert$from, #' to = likert$to, #' unmatched = "error" #' ) #' #' # ------------------------------------------------------------------------------ #' # Lists of vectors #' #' # In some cases, your mapping may collapse multiple groups together into a #' # single value. For example, here we'd like to standardize the school names. #' schools <- c( #' "UNC", #' "Chapel Hill", #' NA, #' "Duke", #' "Duke University", #' "UNC", #' "NC State", #' "ECU", #' "East Carolina" #' ) #' #' # This `tribble()` is more complex than it may appear, it actually #' # creates a list column! #' standardized <- tribble( #' ~from, ~to, #' c("UNC", "Chapel Hill"), "UNC", #' c("Duke", "Duke University"), "Duke", #' c("NC State"), "NC State", #' c("ECU", "East Carolina"), "ECU", #' NA, NA #' ) #' #' standardized #' standardized$from #' #' # `recode_values()` treats a list `from` value as a list of vectors, where #' # any match within one of the vectors is mapped to its corresponding `to` #' # value #' recode_values( #' schools, #' from = standardized$from, #' to = standardized$to, #' unmatched = "error" #' ) #' #' # This formula based approach is equivalent, but the lookup based approach is #' # nicer because the lookup table can be defined separately #' recode_values( #' schools, #' c("UNC", "Chapel Hill") ~ "UNC", #' c("Duke", "Duke University") ~ "Duke", #' c("NC State") ~ "NC State", #' c("ECU", "East Carolina") ~ "ECU", #' NA ~ NA, #' unmatched = "error" #' ) NULL #' @rdname recode-and-replace-values #' @export recode_values <- function( x, ..., from = NULL, to = NULL, default = NULL, unmatched = "default", ptype = NULL ) { check_dots_unnamed() args <- eval_formulas_or_from_and_to( ..., from = from, to = to, allow_empty_dots = FALSE ) from <- args$from to <- args$to from_as_list_of_vectors <- args$from_as_list_of_vectors to_as_list_of_vectors <- args$to_as_list_of_vectors from_arg <- args$from_arg to_arg <- args$to_arg vec_recode_values( x = x, from = from, to = to, default = default, unmatched = unmatched, from_as_list_of_vectors = from_as_list_of_vectors, to_as_list_of_vectors = to_as_list_of_vectors, ptype = ptype, x_arg = "x", from_arg = from_arg, to_arg = to_arg, default_arg = "default", error_call = current_env() ) } #' @rdname recode-and-replace-values #' @export replace_values <- function( x, ..., from = NULL, to = NULL ) { check_dots_unnamed() args <- eval_formulas_or_from_and_to( ..., from = from, to = to, allow_empty_dots = TRUE ) from <- args$from to <- args$to from_as_list_of_vectors <- args$from_as_list_of_vectors to_as_list_of_vectors <- args$to_as_list_of_vectors from_arg <- args$from_arg to_arg <- args$to_arg vec_replace_values( x = x, from = from, to = to, from_as_list_of_vectors = from_as_list_of_vectors, to_as_list_of_vectors = to_as_list_of_vectors, x_arg = "x", from_arg = from_arg, to_arg = to_arg, error_call = current_env() ) } eval_formulas_or_from_and_to <- function( ..., from, to, allow_empty_dots, user_env = caller_env(2), error_call = caller_env() ) { implementation <- determine_implementation( ..., from = from, to = to, error_call = error_call ) switch( implementation, "dots" = { from_as_list_of_vectors <- TRUE to_as_list_of_vectors <- TRUE from_arg <- "" to_arg <- "" args <- eval_formulas( ..., allow_empty_dots = allow_empty_dots, user_env = user_env, error_call = error_call ) from <- args$lhs to <- args$rhs }, "from-to" = { from_as_list_of_vectors <- obj_is_list(from) to_as_list_of_vectors <- obj_is_list(to) from_arg <- "from" to_arg <- "to" }, abort("Unreachable", .internal = TRUE) ) list( from = from, to = to, from_as_list_of_vectors = from_as_list_of_vectors, to_as_list_of_vectors = to_as_list_of_vectors, from_arg = from_arg, to_arg = to_arg ) } determine_implementation <- function(..., from, to, error_call) { has_dots <- !missing(...) has_from <- !is.null(from) has_to <- !is.null(to) # Supplied `...` if (has_dots) { if (has_from) { cli::cli_abort( "Can't supply both {.arg from} and {.arg ...}.", call = error_call ) } if (has_to) { cli::cli_abort( "Can't supply both {.arg to} and {.arg ...}.", call = error_call ) } return("dots") } # Supplied `from` and `to` if (has_from || has_to) { if (!has_from || !has_to) { cli::cli_abort( "Must supply both {.arg from} and {.arg to}.", call = error_call ) } return("from-to") } # Supplied nothing. We use `"dots"` here which lets `recode_values()` error # and `replace_values()` be a no-op. "dots" } dplyr/R/locale.R0000644000176200001440000001103715137161765013205 0ustar liggesusers#' Locale used by `arrange()` #' #' @description #' This page documents details about the locale used by [arrange()] when #' ordering character vectors. #' #' ## Default locale #' #' The default locale used by `arrange()` is the C locale. This is used when #' `.locale = NULL` unless the deprecated `dplyr.legacy_locale` global option is #' set to `TRUE`. You can also force the C locale to be used unconditionally #' with `.locale = "C"`. #' #' The C locale is not exactly the same as English locales, such as `"en"`. The #' main difference is that the C locale groups the English alphabet by _case_, #' while most English locales group the alphabet by _letter_. For example, #' `c("a", "b", "C", "B", "c")` will sort as `c("B", "C", "a", "b", "c")` in the #' C locale, with all uppercase letters coming before lowercase letters, but #' will sort as `c("a", "b", "B", "c", "C")` in an English locale. This often #' makes little practical difference during data analysis, because both return #' identical results when case is consistent between observations. #' #' ## Reproducibility #' #' The C locale has the benefit of being completely reproducible across all #' supported R versions and operating systems with no extra effort. #' #' If you set `.locale` to an option from [stringi::stri_locale_list()], then #' stringi must be installed by anyone who wants to run your code. If you #' utilize this in a package, then stringi should be placed in `Imports`. #' #' ## Legacy behavior #' #' `r lifecycle::badge("deprecated")` #' #' Prior to dplyr 1.1.0, character columns were ordered in the system locale. #' Setting the global option `dplyr.legacy_locale` to `TRUE` retains this legacy #' behavior, but this has been deprecated. Update existing code to explicitly #' call `arrange(.locale = )` instead. Run `Sys.getlocale("LC_COLLATE")` to #' determine your system locale, and compare that against the list in #' [stringi::stri_locale_list()] to find an appropriate value for `.locale`, #' i.e. for American English, `"en_US"`. #' #' Setting `.locale` directly will override any usage of `dplyr.legacy_locale`. #' #' @name dplyr-locale #' @keywords internal #' @examplesIf dplyr:::has_minimum_stringi() #' df <- tibble(x = c("a", "b", "C", "B", "c")) #' df #' #' # Default locale is C, which groups the English alphabet by case, placing #' # uppercase letters before lowercase letters. #' arrange(df, x) #' #' # The American English locale groups the alphabet by letter. #' # Explicitly override `.locale` with `"en"` for this ordering. #' arrange(df, x, .locale = "en") #' #' # This Danish letter is expected to sort after `z` #' df <- tibble(x = c("o", "p", "\u00F8", "z")) #' df #' #' # The American English locale sorts it right after `o` #' arrange(df, x, .locale = "en") #' #' # Using `"da"` for Danish ordering gives the expected result #' arrange(df, x, .locale = "da") NULL dplyr_legacy_locale <- function() { # Used to determine if `group_by()` and `arrange()` should use # base R's `order()` for sorting, which respects the system locale and was # our sorting engine pre-1.1.0. option <- peek_option("dplyr.legacy_locale") if (is_null(option)) { # Default behavior uses C locale return(FALSE) } # This deprecation is a bit special. Since it is a global option that only the # end user would ever set, we set `user_env = globalenv()` so that it always # looks like a "direct" usage to lifecycle. This also makes our lives easier, # because `user_env` would have to be threaded all the way up through the # exported `grouped_df()` function, which is then used in many places # throughout dplyr. Additionally, we've bypassed `deprecate_soft()` and gone # straight to `deprecate_warn()` since this is only an end user facing option. lifecycle::deprecate_warn( when = "1.2.0", what = I("`options(dplyr.legacy_locale =)`"), details = c( i = "If needed for `arrange()`, use `arrange(.locale =)` instead.", i = "If needed for `group_by() |> summarise()`, follow up with an additional `arrange(.locale =)` call.", i = cli::format_inline(paste0( "Use {.run Sys.getlocale(\"LC_COLLATE\")} to determine your system locale, ", "and compare against {.run stringi::stri_locale_list()} to determine the `.locale` value to use." )) ), user_env = globalenv(), id = "dplyr-legacy-locale-option" ) if (!is_bool(option)) { abort( "Global option `dplyr.legacy_locale` must be a single `TRUE` or `FALSE`.", call = NULL ) } option } has_minimum_stringi <- function() { is_installed("stringi", version = "1.5.3") } dplyr/R/colwise-group-by.R0000644000176200001440000000667215106134104015145 0ustar liggesusers#' Group by a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' These [scoped] variants of [group_by()] group a data frame by a #' selection of variables. Like [group_by()], they have optional #' [mutate] semantics. #' #' @inheritParams scoped #' @inheritParams group_by #' @param .add See [group_by()] #' #' @export #' #' @section Grouping variables: #' #' Existing grouping variables are maintained, even if not included in #' the selection. #' #' @keywords internal #' @examples #' # Group a data frame by all variables: #' group_by_all(mtcars) #' # -> #' mtcars |> group_by(pick(everything())) #' #' # Group by variables selected with a predicate: #' group_by_if(iris, is.factor) #' # -> #' iris |> group_by(pick(where(is.factor))) #' #' # Group by variables selected by name: #' group_by_at(mtcars, vars(vs, am)) #' # -> #' mtcars |> group_by(pick(vs, am)) #' #' # Like group_by(), the scoped variants have optional mutate #' # semantics. This provide a shortcut for group_by() + mutate(): #' d <- tibble(x=c(1,1,2,2), y=c(1,2,1,2)) #' group_by_all(d, as.factor) #' # -> #' d |> group_by(across(everything(), as.factor)) #' #' group_by_if(iris, is.factor, as.character) #' # -> #' iris |> group_by(across(where(is.factor), as.character)) group_by_all <- function( .tbl, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) { lifecycle::signal_stage("superseded", "group_by_all()") funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), ..., .caller = "group_by_all" ) if (!length(funs)) { funs <- syms(tbl_vars(.tbl)) } .group_by_static_drop(.tbl, !!!funs, .add = .add, .drop = .drop) } #' @rdname group_by_all #' @export group_by_at <- function( .tbl, .vars, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) { lifecycle::signal_stage("superseded", "group_by_at()") funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "group_by_at" ) if (!length(funs)) { funs <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE) } .group_by_static_drop(.tbl, !!!funs, .add = .add, .drop = .drop) } #' @rdname group_by_all #' @export group_by_if <- function( .tbl, .predicate, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) { lifecycle::signal_stage("superseded", "group_by_if()") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "group_by_if" ) if (!length(funs)) { funs <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE) } .group_by_static_drop(.tbl, !!!funs, .add = .add, .drop = .drop) } # workaround so that methods that do not have the .drop argument yet # don't create the auto mutate .drop column # # things like count() and group_by_all() # can call .group_by_static_drop() instead of group_by() # so that .drop is only part of the group_by() call if it is FALSE # # this is only meant to stay in dplyr until 0.8.0 to give # implementers of group_by() methods a chance to add .drop in their # arguments .group_by_static_drop <- function(..., .drop) { if (.drop) { group_by(...) } else { group_by(..., .drop = FALSE) } } dplyr/R/join-common-by.R0000644000176200001440000000344515106134104014566 0ustar liggesusers#' Extract out common by variables #' #' @export #' @keywords internal common_by <- function(by = NULL, x, y) UseMethod("common_by", by) #' @export common_by.character <- function(by, x, y) { by <- common_by_from_vector(by) common_by.list(by, x, y) } common_by_from_vector <- function(by) { by <- by[!duplicated(by)] by_x <- names(by) %||% by by_y <- unname(by) # If x partially named, assume unnamed are the same in both tables by_x[by_x == ""] <- by_y[by_x == ""] list(x = by_x, y = by_y) } #' @export common_by.list <- function(by, x, y) { x_vars <- tbl_vars(x) if (!all(by$x %in% x_vars)) { msg <- glue( "`by` can't contain join column {missing} which is missing from LHS.", missing = fmt_obj(setdiff(by$x, x_vars)) ) abort(msg) } y_vars <- tbl_vars(y) if (!all(by$y %in% y_vars)) { msg <- glue( "`by` can't contain join column {missing} which is missing from RHS.", missing = fmt_obj(setdiff(by$y, y_vars)) ) abort(msg) } by } #' @export common_by.NULL <- function(by, x, y) { by <- intersect(tbl_vars(x), tbl_vars(y)) by <- by[!is.na(by)] if (length(by) == 0) { msg <- glue( "`by` required, because the data sources have no common variables." ) abort(msg) } inform(auto_by_msg(by)) list( x = by, y = by ) } auto_by_msg <- function(by) { by_quoted <- encodeString(by, quote = '"') if (length(by_quoted) == 1L) { by_code <- by_quoted } else { by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")") } paste0("Joining, by = ", by_code) } #' @export common_by.default <- function(by, x, y) { msg <- glue( "`by` must be a (named) character vector, list, or NULL for natural joins (not recommended in production code), not {obj_type_friendly(by)}." ) abort(msg) } dplyr/R/rowwise.R0000644000176200001440000001205115106134104013421 0ustar liggesusers#' Group input by rows #' #' @description #' `rowwise()` allows you to compute on a data frame a row-at-a-time. #' This is most useful when a vectorised function doesn't exist. #' #' Most dplyr verbs preserve row-wise grouping. The exception is [summarise()], #' which return a [grouped_df]. You can explicitly ungroup with [ungroup()] #' or [as_tibble()], or convert to a [grouped_df] with [group_by()]. #' #' @section List-columns: #' Because a rowwise has exactly one row per group it offers a small #' convenience for working with list-columns. Normally, `summarise()` and #' `mutate()` extract a groups worth of data with `[`. But when you index #' a list in this way, you get back another list. When you're working with #' a `rowwise` tibble, then dplyr will use `[[` instead of `[` to make your #' life a little easier. #' #' @param data Input data frame. #' @param ... <[`tidy-select`][dplyr_tidy_select]> Variables to be preserved #' when calling [summarise()]. This is typically a set of variables whose #' combination uniquely identify each row. #' #' **NB**: unlike `group_by()` you can not create new variables here but #' instead you can select multiple variables with (e.g.) `everything()`. #' @seealso [nest_by()] for a convenient way of creating rowwise data frames #' with nested data. #' @return A row-wise data frame with class `rowwise_df`. Note that a #' `rowwise_df` is implicitly grouped by row, but is not a `grouped_df`. #' @export #' @examples #' df <- tibble(x = runif(6), y = runif(6), z = runif(6)) #' # Compute the mean of x, y, z in each row #' df |> rowwise() |> mutate(m = mean(c(x, y, z))) #' # use c_across() to more easily select many variables #' df |> rowwise() |> mutate(m = mean(c_across(x:z))) #' #' # Compute the minimum of x and y in each row #' df |> rowwise() |> mutate(m = min(c(x, y, z))) #' # In this case you can use an existing vectorised function: #' df |> mutate(m = pmin(x, y, z)) #' # Where these functions exist they'll be much faster than rowwise #' # so be on the lookout for them. #' #' # rowwise() is also useful when doing simulations #' params <- tribble( #' ~sim, ~n, ~mean, ~sd, #' 1, 1, 1, 1, #' 2, 2, 2, 4, #' 3, 3, -1, 2 #' ) #' # Here I supply variables to preserve after the computation #' params |> #' rowwise(sim) |> #' reframe(z = rnorm(n, mean, sd)) #' #' # If you want one row per simulation, put the results in a list() #' params |> #' rowwise(sim) |> #' summarise(z = list(rnorm(n, mean, sd)), .groups = "keep") rowwise <- function(data, ...) { UseMethod("rowwise") } #' @export rowwise.data.frame <- function(data, ...) { vars <- tidyselect::eval_select(expr(c(...)), data) rowwise_df(data, vars) } #' @export rowwise.grouped_df <- function(data, ...) { if (!missing(...)) { bullets <- c( "Can't re-group when creating rowwise data.", i = "Either first `ungroup()` or call `rowwise()` without arguments." ) abort(bullets) } rowwise_df(data, group_vars(data)) } # Constructor + helper ---------------------------------------------------- rowwise_df <- function(data, group_vars) { group_data <- as_tibble(data)[group_vars] new_rowwise_df(data, group_data) } is_rowwise_df <- function(x) { inherits(x, "rowwise_df") } #' @rdname new_grouped_df #' @export new_rowwise_df <- function(data, group_data = NULL, ..., class = character()) { nrow <- nrow(data) if (!is.null(group_data)) { if (!is_tibble(group_data) || has_name(group_data, ".rows")) { msg <- "`group_data` must be a tibble without a `.rows` column." abort(msg) } group_data <- new_tibble(vec_data(group_data), nrow = nrow) # strip attributes } else { group_data <- new_tibble(list(), nrow = nrow) } group_data$.rows <- new_list_of(as.list(seq_len(nrow)), ptype = integer()) new_tibble( data, groups = group_data, ..., nrow = nrow, class = c(class, "rowwise_df") ) } #' @rdname new_grouped_df #' @export validate_rowwise_df <- function(x) { result <- .Call(`dplyr_validate_rowwise_df`, x) if (!is.null(result)) { abort(result) } x } setOldClass(c("rowwise_df", "tbl_df", "tbl", "data.frame")) # methods ----------------------------------------------------------------- #' @importFrom pillar tbl_sum #' @export tbl_sum.rowwise_df <- function(x, ...) { c( NextMethod(), "Rowwise" = commas(group_vars(x)) ) } #' @export as_tibble.rowwise_df <- function(x, ...) { new_tibble(vec_data(x), nrow = nrow(x)) } #' @export `[.rowwise_df` <- function(x, i, j, drop = FALSE) { out <- NextMethod() if (!is.data.frame(out)) { return(out) } group_vars <- intersect(names(out), group_vars(x)) rowwise_df(out, group_vars) } #' @export `[<-.rowwise_df` <- function(x, i, j, ..., value) { out <- NextMethod() group_vars <- intersect(names(out), group_vars(x)) rowwise_df(out, group_vars) } #' @export `names<-.rowwise_df` <- function(x, value) { data <- NextMethod() group_vars <- value[match(group_vars(x), names(x))] rowwise_df(data, group_vars) } #' @export rbind.rowwise_df <- function(...) { bind_rows(...) } dplyr/R/colwise-distinct.R0000644000176200001440000000527515106134104015220 0ustar liggesusers#' Select distinct rows by a selection of variables #' #' @description #' `r lifecycle::badge("superseded")` #' #' Scoped verbs (`_if`, `_at`, `_all`) have been superseded by the use of #' [pick()] or [across()] in an existing verb. See `vignette("colwise")` for #' details. #' #' These [scoped] variants of [distinct()] extract distinct rows by a #' selection of variables. Like `distinct()`, you can modify the #' variables before ordering with the `.funs` argument. #' #' @param .keep_all If `TRUE`, keep all variables in `.data`. #' If a combination of `...` is not distinct, this keeps the #' first row of values. #' @inheritParams scoped #' @export #' #' @section Grouping variables: #' #' The grouping variables that are part of the selection are taken #' into account to determine distinct rows. #' #' @keywords internal #' @examples #' df <- tibble(x = rep(2:5, each = 2) / 2, y = rep(2:3, each = 4) / 2) #' #' distinct_all(df) #' # -> #' distinct(df, pick(everything())) #' #' distinct_at(df, vars(x,y)) #' # -> #' distinct(df, pick(x, y)) #' #' distinct_if(df, is.numeric) #' # -> #' distinct(df, pick(where(is.numeric))) #' #' # You can supply a function that will be applied before extracting the distinct values #' # The variables of the sorted tibble keep their original values. #' distinct_all(df, round) #' # -> #' distinct(df, across(everything(), round)) distinct_all <- function(.tbl, .funs = list(), ..., .keep_all = FALSE) { lifecycle::signal_stage("superseded", "distinct_all()") funs <- manip_all( .tbl, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "distinct_all" ) if (!length(funs)) { funs <- syms(tbl_vars(.tbl)) } distinct(.tbl, !!!funs, .keep_all = .keep_all) } #' @rdname distinct_all #' @export distinct_at <- function(.tbl, .vars, .funs = list(), ..., .keep_all = FALSE) { lifecycle::signal_stage("superseded", "distinct_at()") funs <- manip_at( .tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "distinct_at" ) if (!length(funs)) { funs <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE) } distinct(.tbl, !!!funs, .keep_all = .keep_all) } #' @rdname distinct_all #' @export distinct_if <- function( .tbl, .predicate, .funs = list(), ..., .keep_all = FALSE ) { lifecycle::signal_stage("superseded", "distinct_if()") funs <- manip_if( .tbl, .predicate, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "distinct_if" ) if (!length(funs)) { funs <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE) } distinct(.tbl, !!!funs, .keep_all = .keep_all) } dplyr/R/consecutive-id.R0000644000176200001440000000165215106134104014650 0ustar liggesusers#' Generate a unique identifier for consecutive combinations #' #' `consecutive_id()` generates a unique identifier that increments every time #' a variable (or combination of variables) changes. Inspired by #' `data.table::rleid()`. #' #' @inheritParams n_distinct #' @returns A numeric vector the same length as the longest #' element of `...`. #' @export #' @examples #' consecutive_id(c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, NA, NA)) #' consecutive_id(c(1, 1, 1, 2, 1, 1, 2, 2)) #' #' df <- data.frame(x = c(0, 0, 1, 0), y = c(2, 2, 2, 2)) #' df |> group_by(x, y) |> summarise(n = n()) #' df |> group_by(id = consecutive_id(x, y), x, y) |> summarise(n = n()) consecutive_id <- function(...) { check_dots_unnamed() data <- df_list( ..., .unpack = FALSE, .name_repair = "minimal", .error_call = current_env() ) data <- new_data_frame(data) out <- vec_identify_runs(data) attr(out, "n") <- NULL out } dplyr/vignettes/0000755000176200001440000000000015137234471013423 5ustar liggesusersdplyr/vignettes/window-functions.Rmd0000644000176200001440000002230415106134104017371 0ustar liggesusers--- title: "Window functions" description: > Window functions are a useful family of functions that work with vectors (returning an output the same size as the input), and combine naturally with `mutate()` and `filter()`. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Window functions} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) if (!rlang::is_installed("Lahman")) { knitr::opts_chunk$set(eval = FALSE) } ``` A __window function__ is a variation on an aggregation function. Where an aggregation function, like `sum()` and `mean()`, takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don't include functions that work element-wise, like `+` or `round()`. Window functions include variations on aggregate functions, like `cumsum()` and `cummean()`, functions for ranking and ordering, like `rank()`, and functions for taking offsets, like `lead()` and `lag()`. In this vignette, we'll use a small sample of the Lahman batting dataset, including the players that have won an award. ```{r} library(Lahman) batting <- Lahman::Batting |> as_tibble() |> select(playerID, yearID, teamID, G, AB:H) |> arrange(playerID, yearID, teamID) |> semi_join(Lahman::AwardsPlayers, by = "playerID") players <- batting |> group_by(playerID) ``` Window functions are used in conjunction with `mutate()` and `filter()` to solve a wide range of problems. Here's a selection: ```{r, eval = FALSE} # For each player, find the two years with most hits filter(players, min_rank(desc(H)) <= 2 & H > 0) # Within each player, rank each year by the number of games played mutate(players, G_rank = min_rank(G)) # For each player, find every year that was better than the previous year filter(players, G > lag(G)) # For each player, compute avg change in games played per year mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID))) # For each player, find all years where they played more games than they did on average filter(players, G > mean(G)) # For each, player compute a z score based on number of games played mutate(players, G_z = (G - mean(G)) / sd(G)) ``` Before reading this vignette, you should be familiar with `mutate()` and `filter()`. ## Types of window functions There are five main families of window functions. Two families are unrelated to aggregation functions: * Ranking and ordering functions: `row_number()`, `min_rank()`, `dense_rank()`, `cume_dist()`, `percent_rank()`, and `ntile()`. These functions all take a vector to order by, and return various types of ranks. * Offsets `lead()` and `lag()` allow you to access the previous and next values in a vector, making it easy to compute differences and trends. The other three families are variations on familiar aggregate functions: * Cumulative aggregates: `cumsum()`, `cummin()`, `cummax()` (from base R), and `cumall()`, `cumany()`, and `cummean()` (from dplyr). * Rolling aggregates operate in a fixed width window. You won't find them in base R or in dplyr, but there are many implementations in other packages, such as [RcppRoll](https://cran.r-project.org/package=RcppRoll). * Recycled aggregates, where an aggregate is repeated to match the length of the input. These are not needed in R because vector recycling automatically recycles aggregates where needed. They are important in SQL, because the presence of an aggregation function usually tells the database to return only one row per group. Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation. ## Ranking functions The ranking functions are variations on a theme, differing in how they handle ties: ```{r} x <- c(1, 1, 2, 2, 2) row_number(x) min_rank(x) dense_rank(x) ``` If you're familiar with R, you may recognise that `row_number()` and `min_rank()` can be computed with the base `rank()` function and various values of the `ties.method` argument. These functions are provided to save a little typing, and to make it easier to convert between R and SQL. Two other ranking functions return numbers between 0 and 1. `percent_rank()` gives the percentage of the rank; `cume_dist()` gives the proportion of values less than or equal to the current value. ```{r} cume_dist(x) percent_rank(x) ``` These are useful if you want to select (for example) the top 10% of records within each group. For example: ```{r} filter(players, cume_dist(desc(G)) < 0.1) ``` Finally, `ntile()` divides the data up into `n` evenly sized buckets. It's a coarse ranking, and it can be used in with `mutate()` to divide the data into buckets for further summary. For example, we could use `ntile()` to divide the players within a team into four ranked groups, and calculate the average number of games within each group. ```{r} by_team_player <- group_by(batting, teamID, playerID) by_team <- summarise(by_team_player, G = sum(G)) by_team_quartile <- group_by(by_team, quartile = ntile(G, 4)) summarise(by_team_quartile, mean(G)) ``` All ranking functions rank from lowest to highest so that small input values get small ranks. Use `desc()` to rank from highest to lowest. ## Lead and lag `lead()` and `lag()` produce offset versions of a input vector that is either ahead of or behind the original vector. ```{r} x <- 1:5 lead(x) lag(x) ``` You can use them to: * Compute differences or percent changes. ```{r, results = "hide"} # Compute the relative change in games played mutate(players, G_delta = G - lag(G)) ``` Using `lag()` is more convenient than `diff()` because for `n` inputs `diff()` returns `n - 1` outputs. * Find out when a value changes. ```{r, results = "hide"} # Find when a player changed teams filter(players, teamID != lag(teamID)) ``` `lead()` and `lag()` have an optional argument `order_by`. If set, instead of using the row order to determine which value comes before another, they will use another variable. This is important if you have not already sorted the data, or you want to sort one way and lag another. Here's a simple example of what happens if you don't specify `order_by` when you need it: ```{r} df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) scrambled <- df[sample(nrow(df)), ] wrong <- mutate(scrambled, prev_value = lag(value)) arrange(wrong, year) right <- mutate(scrambled, prev_value = lag(value, order_by = year)) arrange(right, year) ``` ## Cumulative aggregates Base R provides cumulative sum (`cumsum()`), cumulative min (`cummin()`), and cumulative max (`cummax()`). (It also provides `cumprod()` but that is rarely useful). Other common accumulating functions are `cumany()` and `cumall()`, cumulative versions of `||` and `&&`, and `cummean()`, a cumulative mean. These are not included in base R, but efficient versions are provided by `dplyr`. `cumany()` and `cumall()` are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use `cumany()` to find all records for a player after they played a year with 150 games: ```{r, eval = FALSE} filter(players, cumany(G > 150)) ``` Like lead and lag, you may want to control the order in which the accumulation occurs. None of the built in functions have an `order_by` argument so `dplyr` provides a helper: `order_by()`. You give it the variable you want to order by, and then the call to the window function: ```{r} x <- 1:10 y <- 10:1 order_by(y, cumsum(x)) ``` This function uses a bit of non-standard evaluation, so I wouldn't recommend using it inside another function; use the simpler but less concise `with_order()` instead. ## Recycled aggregates R's vector recycling makes it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median: ```{r, eval = FALSE} filter(players, G > mean(G)) filter(players, G < median(G)) ``` While most SQL databases don't have an equivalent of `median()` or `quantile()`, when filtering you can achieve the same effect with `ntile()`. For example, `x > median(x)` is equivalent to `ntile(x, 2) == 2`; `x > quantile(x, 75)` is equivalent to `ntile(x, 100) > 75` or `ntile(x, 4) > 3`. ```{r, eval = FALSE} filter(players, ntile(G, 2) == 2) ``` You can also use this idea to select the records with the highest (`x == max(x)`) or lowest value (`x == min(x)`) for a field, but the ranking functions give you more control over ties, and allow you to select any number of records. Recycled aggregates are also useful in conjunction with `mutate()`. For example, with the batting data, we could compute the "career year", the number of years a player has played since they entered the league: ```{r} mutate(players, career_year = yearID - min(yearID) + 1) ``` Or, as in the introductory example, we could compute a z-score: ```{r} mutate(players, G_z = (G - mean(G)) / sd(G)) ``` dplyr/vignettes/dplyr.Rmd0000644000176200001440000003302615137161765015232 0ustar liggesusers--- title: "Introduction to dplyr" output: rmarkdown::html_vignette description: > Start here if this is your first time using dplyr. You'll learn the basic philosophy, the most important data manipulation verbs, and the pipe, `|>`, which allows you to combine multiple verbs together to solve real problems. vignette: > %\VignetteIndexEntry{Introduction to dplyr} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) ``` When working with data you must: * Figure out what you want to do. * Describe those tasks in the form of a computer program. * Execute the program. The dplyr package makes these steps fast and easy: * By constraining your options, it helps you think about your data manipulation challenges. * It provides simple "verbs", functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code. * It uses efficient backends, so you spend less time waiting for the computer. This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package, once you've installed, read `vignette("dbplyr")` to learn more. ## Data: starwars To explore the basic data manipulation verbs of dplyr, we'll use the dataset `starwars`. This dataset contains `r nrow(starwars)` characters and comes from the [Star Wars API](https://swapi.py4e.com/), and is documented in `?starwars` ```{r} dim(starwars) starwars ``` Note that `starwars` is a tibble, a modern reimagining of the data frame. It's particularly useful for large datasets because it only prints the first few rows. You can learn more about tibbles at ; in particular you can convert data frames to tibbles with `as_tibble()`. ## Single table verbs dplyr aims to provide a function for each basic verb of data manipulation. These verbs can be organised into three categories based on the component of the dataset that they work with: * Rows: * `filter()` chooses rows based on column values. * `slice()` chooses rows based on location. * `arrange()` changes the order of the rows. * Columns: * `select()` changes whether or not a column is included. * `rename()` changes the name of columns. * `mutate()` changes the values of columns and creates new columns. * `relocate()` changes the order of the columns. * Groups of rows: * `summarise()` collapses a group into a single row. ### The pipe All of the dplyr functions take a data frame (or tibble) as the first argument. Rather than forcing the user to either save intermediate objects or nest functions, dplyr provides the `|>` operator from magrittr. `x |> f(y)` turns into `f(x, y)` so the result from one step is then "piped" into the next step. You can use the pipe to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as "then"). ### Filter rows with `filter()` `filter()` allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is `TRUE`. For example, we can select all character with light skin color and brown eyes with: ```{r} starwars |> filter(skin_color == "light", eye_color == "brown") ``` This is roughly equivalent to this base R code: ```{r, eval = FALSE} starwars[starwars$skin_color == "light" & starwars$eye_color == "brown", ] ``` ### Arrange rows with `arrange()` `arrange()` works similarly to `filter()` except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns: ```{r} starwars |> arrange(height, mass) ``` Use `desc()` to order a column in descending order: ```{r} starwars |> arrange(desc(height)) ``` ### Choose rows using their position with `slice()` `slice()` lets you index rows by their (integer) locations. It allows you to select, remove, and duplicate rows. We can get characters from row numbers 5 through 10. ```{r} starwars |> slice(5:10) ``` It is accompanied by a number of helpers for common use cases: * `slice_head()` and `slice_tail()` select the first or last rows. ```{r} starwars |> slice_head(n = 3) ``` * `slice_sample()` randomly selects rows. Use the option prop to choose a certain proportion of the cases. ```{r} starwars |> slice_sample(n = 5) starwars |> slice_sample(prop = 0.1) ``` Use `replace = TRUE` to perform a bootstrap sample. If needed, you can weight the sample with the `weight` argument. * `slice_min()` and `slice_max()` select the rows with the smallest or largest values of the selected column. By default, they return a single minimum or maximum, but you can supply `n` to control how many rows remain. ```{r} starwars |> slice_max(height, n = 3) ``` ### Select columns with `select()` Often you work with large datasets with many columns but only a few are actually of interest to you. `select()` allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions: ```{r} # Select columns by name starwars |> select(hair_color, skin_color, eye_color) # Select all columns between hair_color and eye_color (inclusive) starwars |> select(hair_color:eye_color) # Select all columns except those from hair_color to eye_color (inclusive) starwars |> select(!(hair_color:eye_color)) # Select all columns ending with color starwars |> select(ends_with("color")) ``` There are a number of helper functions you can use within `select()`, like `starts_with()`, `ends_with()`, `matches()` and `contains()`. These let you quickly match larger blocks of variables that meet some criterion. See `?select` for more details. You can rename variables with `select()` by using named arguments: ```{r} starwars |> select(home_world = homeworld) ``` But because `select()` drops all the variables not explicitly mentioned, it's not that useful. Instead, use `rename()`: ```{r} starwars |> rename(home_world = homeworld) ``` ### Add new columns with `mutate()` Besides selecting sets of existing columns, it's often useful to add new columns that are functions of existing columns. This is the job of `mutate()`: ```{r} starwars |> mutate(height_m = height / 100) ``` We can't see the height in meters we just calculated, but we can fix that using a select command. ```{r} starwars |> mutate(height_m = height / 100) |> select(height_m, height, everything()) ``` `dplyr::mutate()` is similar to the base `transform()`, but allows you to refer to columns that you've just created: ```{r} starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2) ) |> select(BMI, everything()) ``` If you only want to keep the new variables, use `.keep = "none"`: ```{r} starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2), .keep = "none" ) ``` ### Change column order with `relocate()` Use a similar syntax as `select()` to move blocks of columns at once ```{r} starwars |> relocate(sex:homeworld, .before = height) ``` ### Summarise values with `summarise()` The last verb is `summarise()`. It collapses a data frame to a single row. ```{r} starwars |> summarise(height = mean(height, na.rm = TRUE)) ``` It's not that useful until we learn the `group_by()` verb below. ### Commonalities You may have noticed that the syntax and function of all these verbs are very similar: * The first argument is a data frame. * The subsequent arguments describe what to do with the data frame. You can refer to columns in the data frame directly without using `$`. * The result is a new data frame Together these properties make it easy to chain together multiple simple steps to achieve a complex result. These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (`arrange()`), pick observations and variables of interest (`filter()` and `select()`), add new variables that are functions of existing variables (`mutate()`), or collapse many values to a summary (`summarise()`). ## Combining functions with `|>` The dplyr API is functional in the sense that function calls don't have side-effects. You must always save their results. This doesn't lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step: ```{r, eval = FALSE} a1 <- group_by(starwars, species, sex) a2 <- select(a1, height, mass) a3 <- summarise(a2, height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` Or if you don't want to name the intermediate results, you need to wrap the function calls inside each other: ```{r} summarise( select( group_by(starwars, species, sex), height, mass ), height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `|>` operator from magrittr. `x |> f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as "then"): ```{r, eval = FALSE} starwars |> group_by(species, sex) |> select(height, mass) |> summarise( height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` ## Patterns of operations The dplyr verbs can be classified by the type of operations they accomplish (we sometimes speak of their **semantics**, i.e., their meaning). It's helpful to have a good grasp of the difference between select and mutate operations. ### Selecting operations One of the appealing features of dplyr is that you can refer to columns from the tibble as if they were regular variables. However, the syntactic uniformity of referring to bare column names hides semantical differences across the verbs. A column symbol supplied to `select()` does not have the same meaning as the same symbol supplied to `mutate()`. Selecting operations expect column names and positions. Hence, when you call `select()` with bare variable names, they actually represent their own positions in the tibble. The following calls are completely equivalent from dplyr's point of view: ```{r} # `name` represents the integer 1 select(starwars, name) select(starwars, 1) ``` By the same token, this means that you cannot refer to variables from the surrounding context if they have the same name as one of the columns. In the following example, `height` still represents 2, not 5: ```{r} height <- 5 select(starwars, height) ``` One useful subtlety is that this only applies to bare names and to selecting calls like `c(height, mass)` or `height:mass`. In all other cases, the columns of the data frame are not put in scope. This allows you to refer to contextual variables in selection helpers: ```{r} name <- "color" select(starwars, ends_with(name)) ``` These semantics are usually intuitive. But note the subtle difference: ```{r} name <- 5 select(starwars, name, identity(name)) ``` In the first argument, `name` represents its own position `1`. In the second argument, `name` is evaluated in the surrounding context and represents the fifth column. For a long time, `select()` used to only understand column positions. Counting from dplyr 0.6, it now understands column names as well. This makes it a bit easier to program with `select()`: ```{r} vars <- c("name", "height") select(starwars, all_of(vars), "mass") ``` ### Mutating operations Mutate semantics are quite different from selection semantics. Whereas `select()` expects column names or positions, `mutate()` expects *column vectors*. We will set up a smaller tibble to use for our examples. ```{r} df <- starwars |> select(name, height, mass) ``` When we use `select()`, the bare column names stand for their own positions in the tibble. For `mutate()` on the other hand, column symbols represent the actual column vectors stored in the tibble. Consider what happens if we give a string or a number to `mutate()`: ```{r} mutate(df, "height", 2) ``` `mutate()` gets length-1 vectors that it interprets as new columns in the data frame. These vectors are recycled so they match the number of rows. That's why it doesn't make sense to supply expressions like `"height" + 10` to `mutate()`. This amounts to adding 10 to a string! The correct expression is: ```{r} mutate(df, height + 10) ``` In the same way, you can unquote values from the context if these values represent a valid column. They must be either length 1 (they then get recycled) or have the same length as the number of rows. In the following example we create a new vector that we add to the data frame: ```{r} var <- seq(1, nrow(df)) mutate(df, new = var) ``` A case in point is `group_by()`. While you might think it has select semantics, it actually has mutate semantics. This is quite handy as it allows to group by a modified column: ```{r} group_by(starwars, sex) group_by(starwars, sex = as.factor(sex)) group_by(starwars, height_binned = cut(height, 3)) ``` This is why you can't supply a column name to `group_by()`. This amounts to creating a new column containing the string recycled to the number of rows: ```{r} group_by(df, "month") ``` dplyr/vignettes/two-table.Rmd0000644000176200001440000001606415106134104015760 0ustar liggesusers--- title: "Two-table verbs" description: > Most dplyr verbs work with a single data set, but most data analyses involve multiple datasets. This vignette introduces you to the dplyr verbs that work with more one than data set, and introduces to the mutating joins, filtering joins, and the set operations. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Two-table verbs} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 5) library(dplyr) ``` It's rare that a data analysis involves only a single table of data. In practice, you'll normally have many tables that contribute to an analysis, and you need flexible tools to combine them. In dplyr, there are three families of verbs that work with two tables at a time: * Mutating joins, which add new variables to one table from matching rows in another. * Filtering joins, which filter observations from one table based on whether or not they match an observation in the other table. * Set operations, which combine the observations in the data sets as if they were set elements. (This discussion assumes that you have [tidy data](https://www.jstatsoft.org/v59/i10/), where the rows are observations and the columns are variables. If you're not familiar with that framework, I'd recommend reading up on it first.) All two-table verbs work similarly. The first two arguments are `x` and `y`, and provide the tables to combine. The output is always a new table with the same type as `x`. ## Mutating joins Mutating joins allow you to combine variables from multiple tables. For example, consider the flights and airlines data from the nycflights13 package. In one table we have flight information with an abbreviation for carrier, and in another we have a mapping between abbreviations and full names. You can use a join to add the carrier names to the flight data: ```{r, warning = FALSE} library(nycflights13) # Drop unimportant variables so it's easier to understand the join results. flights2 <- flights |> select(year:day, hour, origin, dest, tailnum, carrier) flights2 |> left_join(airlines) ``` ### Controlling how the tables are matched As well as `x` and `y`, each mutating join takes an argument `by` that controls which variables are used to match observations in the two tables. There are a few ways to specify it, as I illustrate below with various tables from nycflights13: * `NULL`, the default. dplyr will will use all variables that appear in both tables, a __natural__ join. For example, the flights and weather tables match on their common variables: year, month, day, hour and origin. ```{r} flights2 |> left_join(weather) ``` * A character vector, `by = "x"`. Like a natural join, but uses only some of the common variables. For example, `flights` and `planes` have `year` columns, but they mean different things so we only want to join by `tailnum`. ```{r} flights2 |> left_join(planes, by = "tailnum") ``` Note that the year columns in the output are disambiguated with a suffix. * A named character vector: `by = c("x" = "a")`. This will match variable `x` in table `x` to variable `a` in table `y`. The variables from use will be used in the output. Each flight has an origin and destination `airport`, so we need to specify which one we want to join to: ```{r} flights2 |> left_join(airports, c("dest" = "faa")) flights2 |> left_join(airports, c("origin" = "faa")) ``` ### Types of join There are four types of mutating join, which differ in their behaviour when a match is not found. We'll illustrate each with a simple example: ```{r} df1 <- tibble(x = c(1, 2), y = 2:1) df2 <- tibble(x = c(3, 1), a = 10, b = "a") ``` * `inner_join(x, y)` only includes observations that match in both `x` and `y`. ```{r} df1 |> inner_join(df2) |> knitr::kable() ``` * `left_join(x, y)` includes all observations in `x`, regardless of whether they match or not. This is the most commonly used join because it ensures that you don't lose observations from your primary table. ```{r} df1 |> left_join(df2) ``` * `right_join(x, y)` includes all observations in `y`. It's equivalent to `left_join(y, x)`, but the columns and rows will be ordered differently. ```{r} df1 |> right_join(df2) df2 |> left_join(df1) ``` * `full_join()` includes all observations from `x` and `y`. ```{r} df1 |> full_join(df2) ``` The left, right and full joins are collectively know as __outer joins__. When a row doesn't match in an outer join, the new variables are filled in with missing values. ### Observations While mutating joins are primarily used to add new variables, they can also generate new observations. If a match is not unique, a join will add all possible combinations (the Cartesian product) of the matching observations: ```{r} df1 <- tibble(x = c(1, 1, 2), y = 1:3) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) df1 |> left_join(df2) ``` ## Filtering joins Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables. There are two types: * `semi_join(x, y)` __keeps__ all observations in `x` that have a match in `y`. * `anti_join(x, y)` __drops__ all observations in `x` that have a match in `y`. These are most useful for diagnosing join mismatches. For example, there are many flights in the nycflights13 dataset that don't have a matching tail number in the planes table: ```{r} library("nycflights13") flights |> anti_join(planes, by = "tailnum") |> count(tailnum, sort = TRUE) ``` If you're worried about what observations your joins will match, start with a `semi_join()` or `anti_join()`. `semi_join()` and `anti_join()` never duplicate; they only ever remove observations. ```{r} df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) # Four rows to start with: df1 |> nrow() # And we get four rows after the join df1 |> inner_join(df2, by = "x") |> nrow() # But only two rows actually match df1 |> semi_join(df2, by = "x") |> nrow() ``` ## Set operations The final type of two-table verb is set operations. These expect the `x` and `y` inputs to have the same variables, and treat the observations like sets: * `intersect(x, y)`: return only observations in both `x` and `y` * `union(x, y)`: return unique observations in `x` and `y` * `setdiff(x, y)`: return observations in `x`, but not in `y`. Given this simple data: ```{r} (df1 <- tibble(x = 1:2, y = c(1L, 1L))) (df2 <- tibble(x = 1:2, y = 1:2)) ``` The four possibilities are: ```{r} intersect(df1, df2) # Note that we get 3 rows, not 4 union(df1, df2) setdiff(df1, df2) setdiff(df2, df1) ``` ## Multiple-table verbs dplyr does not provide any functions for working with three or more tables. Instead use `purrr::reduce()` or `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need. dplyr/vignettes/in-packages.Rmd0000644000176200001440000002003415106134104016234 0ustar liggesusers--- title: "Using dplyr in packages" description: > A guide for package authors who use dplyr. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Using dplyr in packages} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, message = FALSE} library(dplyr) ``` This vignette is aimed at package authors who use dplyr in their packages. We will discuss best practices learned over the years to avoid `R CMD check` notes and warnings, and how to handle when dplyr deprecates functions. ## Join helpers As of dplyr 1.1.0, we've introduced `join_by()` along 4 helpers for performing various types of joins: - `closest()` - `between()` - `within()` - `overlaps()` `join_by()` implements a domain specific language (DSL) for joins, and internally interprets calls to these functions. You'll notice that `dplyr::closest()` isn't an exported function from dplyr (`dplyr::between()` and `base::within()` do happen to be preexisting functions). If you use `closest()` in your package, then this will cause an `R CMD check` note letting you know that you've used a symbol that doesn't belong to any package. To silence this, place `utils::globalVariables("closest")` in a source file in your package (but outside of any function). dbplyr does a similar thing for SQL functions, so you can see an example of that [here](https://github.com/tidyverse/dbplyr/blob/7edf5d607fd6b0b897721ea96d1c9ca9401f0f9b/R/backend-redshift.R#L144). You may also have to add utils to your package Imports, even though it is a base package. You can do that easily with `usethis::use_package("utils")`. ## Data masking and tidy selection NOTEs If you're writing a package and you have a function that uses data masking or tidy selection: ```{r} my_summary_function <- function(data) { data |> select(grp, x, y) |> filter(x > 0) |> group_by(grp) |> summarise(y = mean(y), n = n()) } ``` You'll get an `NOTE` because `R CMD check` doesn't know that dplyr functions use tidy evaluation: N checking R code for possible problems my_summary_function: no visible binding for global variable ‘grp’, ‘x’, ‘y’ Undefined global functions or variables: grp x y To eliminate this note: - For data masking, import `.data` from [rlang](https://rlang.r-lib.org/) and then use `.data$var` instead of `var`. - For tidy selection, use `"var"` instead of `var`. That yields: ```{r} #' @importFrom rlang .data my_summary_function <- function(data) { data |> select("grp", "x", "y") |> filter(.data$x > 0) |> group_by(.data$grp) |> summarise(y = mean(.data$y), n = n()) } ``` For more about programming with dplyr, see `vignette("programming", package = "dplyr")`. ## Deprecation This section is focused on updating package code to deal with backwards incompatible changes in dplyr. We do try and minimize backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future. We will start with some general advice about supporting multiple versions of dplyr at once, and then we will discuss some specific changes in dplyr. ### Multiple dplyr versions Ideally, when we introduce a breaking change you'll want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages: - It's more convenient for your users, since your package will work for them regardless of what version of dplyr they have installed. - It's easier on CRAN since it doesn't require a massive coordinated release of multiple packages. If we break your package, we will typically send you a pull request that implements a patch before releasing the next version of dplyr. Most of the time, this patch will be backwards compatible with older versions of dplyr as well. Ideally, you'll accept this patch and submit a new version of your package to CRAN before the new version of dplyr is released. To make code work with multiple versions of a package, your first tool is the simple if statement: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "0.5.0") { # code for new version } else { # code for old version } ``` Always condition on `> current-version`, not `>= next-version` because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version `"0.5.0"`, the development version will be `"0.5.0.9000"`. This typically works well if the branch for the "new version" introduces a new argument or has a slightly different return value. This *doesn't* work if we've introduced a new function that you need to switch to, like: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "1.0.10") { dplyr::reframe(df, x = unique(x)) } else { dplyr::summarise(df, x = unique(x)) } ``` In this case, when checks are run with dplyr 1.0.10 you'll get a warning about using a function from dplyr that doesn't exist (`reframe()`) even though that branch will never run. You can get around this by using `utils::getFromNamespace()` to indirectly call the new dplyr function: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "1.0.10") { utils::getFromNamespace("reframe", "dplyr")(df, x = unique(x)) } else { dplyr::summarise(df, x = unique(x)) } ``` As soon as the next version of dplyr is actually on CRAN (1.1.0 in this case), you should feel free to remove this code and unconditionally use `reframe()` as long as you also require `dplyr (>= 1.1.0)` in your `DESCRIPTION` file. This is typically not very painful for users, because they'd already be updating your package when they run into this requirement, so updating one more package along the way is generally easy. It also helps them get the latest bug fixes and features from dplyr. Sometimes, it isn't possible to avoid a call to `@importFrom`. For example you might be importing a generic so that you can define a method for it, but that generic has moved between packages. In this case, you can take advantage of a little-known feature in the `NAMESPACE` file: you can include raw `if` statements. ```{r, eval=FALSE} #' @rawNamespace #' if (utils::packageVersion("dplyr") > "0.5.0") { #' importFrom("dbplyr", "build_sql") #' } else { #' importFrom("dplyr", "build_sql") #' } ``` ### Deprecation of `mutate_*()` and `summarise_*()` The following `mutate()` and `summarise()` variants were deprecated in dplyr 0.7.0: - `mutate_each()`, `summarise_each()` and the following variants were superseded in dplyr 1.0.0: - `mutate_all()`, `summarise_all()` - `mutate_if()`, `summarise_if()` - `mutate_at()`, `summarise_at()` These have all been replaced by using `mutate()` or `summarise()` in combination with `across()`, which was introduced in dplyr 1.0.0. If you used `mutate_all()` or `mutate_each()` without supplying a selection, you should update to use `across(everything())`: ```{r, eval=FALSE} starwars |> mutate_each(funs(as.character)) starwars |> mutate_all(funs(as.character)) starwars |> mutate(across(everything(), as.character)) ``` If you provided a selection through `mutate_at()` or `mutate_each()`, then you can switch to `across()` with a selection: ```{r, eval = FALSE} starwars |> mutate_each(funs(as.character), height, mass) starwars |> mutate_at(vars(height, mass), as.character) starwars |> mutate(across(c(height, mass), as.character)) ``` If you used predicates with `mutate_if()`, you can switch to using `across()` in combination with `where()`: ```{r, eval=FALSE} starwars |> mutate_if(is.factor, as.character) starwars |> mutate(across(where(is.factor), as.character)) ``` ## Data frame subclasses If you are a package author that is *extending* dplyr to work with a new data frame subclass, then we encourage you to read the documentation in `?dplyr_extending`. This contains advice on how to implement the minimal number of extension generics possible to get maximal compatibility across dplyr's verbs. dplyr/vignettes/grouping.Rmd0000644000176200001440000001626115137161765015734 0ustar liggesusers--- title: "Grouped data" description: > To unlock the full potential of dplyr, you need to understand how each verb interacts with grouping. This vignette shows you how to manipulate grouping, how each verb changes its behaviour when working with grouped data, and how you can access data about the "current" group from within a verb. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Grouped data} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE, warning = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ``` dplyr verbs are particularly powerful when you apply them to grouped data frames (`grouped_df` objects). This vignette shows you: * How to group, inspect, and ungroup with `group_by()` and friends. * How individual dplyr verbs changes their behaviour when applied to grouped data frame. * How to access data about the "current" group from within a verb. We'll start by loading dplyr: ```{r, message = FALSE} library(dplyr) ``` ## `group_by()` The most important grouping verb is `group_by()`: it takes a data frame and one or more variables to group by: ```{r} by_species <- starwars |> group_by(species) by_sex_gender <- starwars |> group_by(sex, gender) ``` You can see the grouping when you print the data: ```{r} by_species by_sex_gender ``` Or use `tally()` to count the number of rows in each group. The `sort` argument is useful if you want to see the largest groups up front. ```{r} by_species |> tally() by_sex_gender |> tally(sort = TRUE) ``` As well as grouping by existing variables, you can group by any function of existing variables. This is equivalent to performing a `mutate()` **before** the `group_by()`: ```{r group_by_with_expression} bmi_breaks <- c(0, 18.5, 25, 30, Inf) starwars |> group_by(bmi_cat = cut(mass/(height/100)^2, breaks=bmi_breaks)) |> tally() ``` ## Group metadata You can see underlying group data with `group_keys()`. It has one row for each group and one column for each grouping variable: ```{r group_vars} by_species |> group_keys() by_sex_gender |> group_keys() ``` You can see which group each row belongs to with `group_indices()`: ```{r} by_species |> group_indices() ``` And which rows each group contains with `group_rows()`: ```{r} by_species |> group_rows() |> head() ``` Use `group_vars()` if you just want the names of the grouping variables: ```{r} by_species |> group_vars() by_sex_gender |> group_vars() ``` ### Changing and adding to grouping variables If you apply `group_by()` to an already grouped dataset, will overwrite the existing grouping variables. For example, the following code groups by `homeworld` instead of `species`: ```{r} by_species |> group_by(homeworld) |> tally() ``` To **augment** the grouping, using `.add = TRUE`[^add]. For example, the following code groups by species and homeworld: ```{r} by_species |> group_by(homeworld, .add = TRUE) |> tally() ``` [^add]: Note that the argument changed from `add = TRUE` to `.add = TRUE` in dplyr 1.0.0. ### Removing grouping variables To remove all grouping variables, use `ungroup()`: ```{r} by_species |> ungroup() |> tally() ``` You can also choose to selectively ungroup by listing the variables you want to remove: ```{r} by_sex_gender |> ungroup(sex) |> tally() ``` ## Verbs The following sections describe how grouping affects the main dplyr verbs. ### `summarise()` `summarise()` computes a summary for each group. This means that it starts from `group_keys()`, adding summary variables to the right hand side: ```{r summarise} by_species |> summarise( n = n(), height = mean(height, na.rm = TRUE) ) ``` The `.groups=` argument controls the grouping structure of the output. The historical behaviour of removing the right hand side grouping variable corresponds to `.groups = "drop_last"` without a message or `.groups = NULL` with a message (the default). ```{r} by_sex_gender |> summarise(n = n()) |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop_last") |> group_vars() ``` Since version 1.0.0 the groups may also be kept (`.groups = "keep"`) or dropped (`.groups = "drop"`). ```{r} by_sex_gender |> summarise(n = n(), .groups = "keep") |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop") |> group_vars() ``` When the output no longer have grouping variables, it becomes ungrouped (i.e. a regular tibble). ### `select()`, `rename()`, and `relocate()` `rename()` and `relocate()` behave identically with grouped and ungrouped data because they only affect the name or position of existing columns. Grouped `select()` is almost identical to ungrouped select, except that it always includes the grouping variables: ```{r select} by_species |> select(mass) ``` If you don't want the grouping variables, you'll have to first `ungroup()`. (This design is possibly a mistake, but we're stuck with it for now.) ### `arrange()` Grouped `arrange()` is the same as ungrouped `arrange()`, unless you set `.by_group = TRUE`, in which case it will order first by the grouping variables. ```{r} by_species |> arrange(desc(mass)) |> relocate(species, mass) by_species |> arrange(desc(mass), .by_group = TRUE) |> relocate(species, mass) ``` Note that second example is sorted by `species` (from the `group_by()` statement) and then by `mass` (within species). ### `mutate()` In simple cases with vectorised functions, grouped and ungrouped `mutate()` give the same results. They differ when used with summary functions: ```{r by_homeworld} # Subtract off global mean starwars |> select(name, homeworld, mass) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) # Subtract off homeworld mean starwars |> select(name, homeworld, mass) |> group_by(homeworld) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) ``` Or with window functions like `min_rank()`: ```{r} # Overall rank starwars |> select(name, homeworld, height) |> mutate(rank = min_rank(height)) # Rank per homeworld starwars |> select(name, homeworld, height) |> group_by(homeworld) |> mutate(rank = min_rank(height)) ``` ### `filter()` A grouped `filter()` effectively does a `mutate()` to generate a logical variable, and then only keeps the rows where the variable is `TRUE`. This means that grouped filters can be used with summary functions. For example, we can find the tallest character of each species: ```{r filter} by_species |> select(name, species, height) |> filter(height == max(height)) ``` You can also use `filter_out()` to remove entire groups. For example, the following code eliminates all groups that only have a single member: ```{r filter_group} by_species |> filter_out(n() == 1) |> tally() ``` ### `slice()` and friends `slice()` and friends (`slice_head()`, `slice_tail()`, `slice_sample()`, `slice_min()` and `slice_max()`) select rows within a group. For example, we can select the first observation within each species: ```{r slice} by_species |> relocate(species) |> slice(1) ``` Similarly, we can use `slice_min()` to select the smallest `n` values of a variable: ```{r slice_min} by_species |> filter_out(is.na(height)) |> slice_min(height, n = 2) ``` dplyr/vignettes/programming.Rmd0000644000176200001440000003527615106134104016412 0ustar liggesusers--- title: "Programming with dplyr" description: > Most dplyr verbs use "tidy evaluation", a special type of non-standard evaluation. In this vignette, you'll learn the two basic forms, data masking and tidy selection, and how you can program with them using either functions or for loops. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Programming with dplyr} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ``` ## Introduction Most dplyr verbs use **tidy evaluation** in some way. Tidy evaluation is a special type of non-standard evaluation used throughout the tidyverse. There are two basic forms found in dplyr: - `arrange()`, `count()`, `filter()`, `group_by()`, `mutate()`, and `summarise()` use **data masking** so that you can use data variables as if they were variables in the environment (i.e. you write `my_variable` not `df$my_variable`). - `across()`, `relocate()`, `rename()`, `select()`, and `pull()` use **tidy selection** so you can easily choose variables based on their position, name, or type (e.g. `starts_with("x")` or `is.numeric`). To determine whether a function argument uses data masking or tidy selection, look at the documentation: in the arguments list, you'll see `` or ``. Data masking and tidy selection make interactive data exploration fast and fluid, but they add some new challenges when you attempt to use them indirectly such as in a for loop or a function. This vignette shows you how to overcome those challenges. We'll first go over the basics of data masking and tidy selection, talk about how to use them indirectly, and then show you a number of recipes to solve common problems. This vignette will give you the minimum knowledge you need to be an effective programmer with tidy evaluation. If you'd like to learn more about the underlying theory, or precisely how it's different from non-standard evaluation, we recommend that you read the Metaprogramming chapters in [*Advanced R*](https://adv-r.hadley.nz). ```{r setup, message = FALSE} library(dplyr) ``` ## Data masking Data masking makes data manipulation faster because it requires less typing. In most (but not all[^1]) base R functions you need to refer to variables with `$`, leading to code that repeats the name of the data frame many times: [^1]: dplyr's `filter()` is inspired by base R's `subset()`. `subset()` provides data masking, but not with tidy evaluation, so the techniques described in this chapter don't apply to it. ```{r, results = FALSE} starwars[starwars$homeworld == "Naboo" & starwars$species == "Human", ,] ``` The dplyr equivalent of this code is more concise because data masking allows you to need to type `starwars` once: ```{r, results = FALSE} starwars |> filter(homeworld == "Naboo", species == "Human") ``` ### Data- and env-variables The key idea behind data masking is that it blurs the line between the two different meanings of the word "variable": - **env-variables** are "programming" variables that live in an environment. They are usually created with `<-`. - **data-variables** are "statistical" variables that live in a data frame. They usually come from data files (e.g. `.csv`, `.xls`), or are created manipulating existing variables. To make those definitions a little more concrete, take this piece of code: ```{r} df <- data.frame(x = runif(3), y = runif(3)) df$x ``` It creates a env-variable, `df`, that contains two data-variables, `x` and `y`. Then it extracts the data-variable `x` out of the env-variable `df` using `$`. I think this blurring of the meaning of "variable" is a really nice feature for interactive data analysis because it allows you to refer to data-vars as is, without any prefix. And this seems to be fairly intuitive since many newer R users will attempt to write `diamonds[x == 0 | y == 0, ]`. Unfortunately, this benefit does not come for free. When you start to program with these tools, you're going to have to grapple with the distinction. This will be hard because you've never had to think about it before, so it'll take a while for your brain to learn these new concepts and categories. However, once you've teased apart the idea of "variable" into data-variable and env-variable, I think you'll find it fairly straightforward to use. ### Indirection The main challenge of programming with functions that use data masking arises when you introduce some indirection, i.e. when you want to get the data-variable from an env-variable instead of directly typing the data-variable's name. There are two main cases: - When you have the data-variable in a function argument (i.e. an env-variable that holds a promise[^2]), you need to **embrace** the argument by surrounding it in doubled braces, like `filter(df, {{ var }})`. The following function uses embracing to create a wrapper around `summarise()` that computes the minimum and maximum values of a variable, as well as the number of observations that were summarised: ```{r, results = FALSE} var_summary <- function(data, var) { data |> summarise(n = n(), min = min({{ var }}), max = max({{ var }})) } mtcars |> group_by(cyl) |> var_summary(mpg) ``` - When you have an env-variable that is a character vector, you need to index into the `.data` pronoun with `[[`, like `summarise(df, mean = mean(.data[[var]]))`. The following example uses `.data` to count the number of unique values in each variable of `mtcars`: ```{r, results = FALSE} for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ``` Note that `.data` is not a data frame; it's a special construct, a pronoun, that allows you to access the current variables either directly, with `.data$x` or indirectly with `.data[[var]]`. Don't expect other functions to work with it. [^2]: In R, arguments are lazily evaluated which means that until you attempt to use, they don't hold a value, just a **promise** that describes how to compute the value. You can learn more at ### Name injection Many data masking functions also use dynamic dots, which gives you another useful feature: generating names programmatically by using `:=` instead of `=`. There are two basics forms, as illustrated below with `tibble()`: - If you have the name in an env-variable, you can use glue syntax to interpolate in: ```{r} name <- "susan" tibble("{name}" := 2) ``` - If the name should be derived from a data-variable in an argument, you can use embracing syntax: ```{r} my_df <- function(x) { tibble("{{x}}_2" := x * 2) } my_var <- 10 my_df(my_var) ``` Learn more in `` ?rlang::`dyn-dots` ``. ## Tidy selection Data masking makes it easy to compute on values within a dataset. Tidy selection is a complementary tool that makes it easy to work with the columns of a dataset. ### The tidyselect DSL Underneath all functions that use tidy selection is the [tidyselect](https://tidyselect.r-lib.org/) package. It provides a miniature domain specific language that makes it easy to select columns by name, position, or type. For example: - `select(df, 1)` selects the first column; `select(df, last_col())` selects the last column. - `select(df, c(a, b, c))` selects columns `a`, `b`, and `c`. - `select(df, starts_with("a"))` selects all columns whose name starts with "a"; `select(df, ends_with("z"))` selects all columns whose name ends with "z". - `select(df, where(is.numeric))` selects all numeric columns. You can see more details in `?dplyr_tidy_select`. ### Indirection As with data masking, tidy selection makes a common task easier at the cost of making a less common task harder. When you want to use tidy select indirectly with the column specification stored in an intermediate variable, you'll need to learn some new tools. Again, there are two forms of indirection: - When you have the data-variable in an env-variable that is a function argument, you use the same technique as data masking: you **embrace** the argument by surrounding it in doubled braces. The following function summarises a data frame by computing the mean of all variables selected by the user: ```{r, results = FALSE} summarise_mean <- function(data, vars) { data |> summarise(n = n(), across({{ vars }}, mean)) } mtcars |> group_by(cyl) |> summarise_mean(where(is.numeric)) ``` - When you have an env-variable that is a character vector, you need to use `all_of()` or `any_of()` depending on whether you want the function to error if a variable is not found. The following code uses `all_of()` to select all of the variables found in a character vector; then `!` plus `all_of()` to select all of the variables *not* found in a character vector: ```{r, results = FALSE} vars <- c("mpg", "vs") mtcars |> select(all_of(vars)) mtcars |> select(!all_of(vars)) ``` ## How-tos The following examples solve a grab bag of common problems. We show you the minimum amount of code so that you can get the basic idea; most real problems will require more code or combining multiple techniques. ### User-supplied data If you check the documentation, you'll see that `.data` never uses data masking or tidy select. That means you don't need to do anything special in your function: ```{r} mutate_y <- function(data) { mutate(data, y = a + x) } ``` ### One or more user-supplied expressions If you want the user to supply an expression that's passed onto an argument which uses data masking or tidy select, embrace the argument: ```{r} my_summarise <- function(data, group_var) { data |> group_by({{ group_var }}) |> summarise(mean = mean(mass)) } ``` This generalises in a straightforward way if you want to use one user-supplied expression in multiple places: ```{r} my_summarise2 <- function(data, expr) { data |> summarise( mean = mean({{ expr }}), sum = sum({{ expr }}), n = n() ) } ``` If you want the user to provide multiple expressions, embrace each of them: ```{r} my_summarise3 <- function(data, mean_var, sd_var) { data |> summarise(mean = mean({{ mean_var }}), sd = sd({{ sd_var }})) } ``` If you want to use the name of a variable in the output, you can embrace the variable name on the left-hand side of `:=` with `{{`: ```{r} my_summarise4 <- function(data, expr) { data |> summarise( "mean_{{expr}}" := mean({{ expr }}), "sum_{{expr}}" := sum({{ expr }}), "n_{{expr}}" := n() ) } my_summarise5 <- function(data, mean_var, sd_var) { data |> summarise( "mean_{{mean_var}}" := mean({{ mean_var }}), "sd_{{sd_var}}" := sd({{ sd_var }}) ) } ``` ### Any number of user-supplied expressions If you want to take an arbitrary number of user supplied expressions, use `...`. This is most often useful when you want to give the user full control over a single part of the pipeline, like a `group_by()` or a `mutate()`. ```{r} my_summarise <- function(.data, ...) { .data |> group_by(...) |> summarise(mass = mean(mass, na.rm = TRUE), height = mean(height, na.rm = TRUE)) } starwars |> my_summarise(homeworld) starwars |> my_summarise(sex, gender) ``` When you use `...` in this way, make sure that any other arguments start with `.` to reduce the chances of argument clashes; see for more details. ### Creating multiple columns Sometimes it can be useful for a single expression to return multiple columns. You can do this by returning an unnamed data frame: ```{r} quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { tibble( val = quantile(x, probs), quant = probs ) } x <- 1:5 quantile_df(x) ``` This sort of function is useful inside `summarise()` and `mutate()` which allow you to add multiple columns by returning a data frame: ```{r} df <- tibble( grp = rep(1:3, each = 10), x = runif(30), y = rnorm(30) ) df |> group_by(grp) |> summarise(quantile_df(x, probs = .5)) df |> group_by(grp) |> summarise(across(x:y, ~ quantile_df(.x, probs = .5), .unpack = TRUE)) ``` Notice that we set `.unpack = TRUE` inside `across()`. This tells `across()` to _unpack_ the data frame returned by `quantile_df()` into its respective columns, combining the column names of the original columns (`x` and `y`) with the column names returned from the function (`val` and `quant`). If your function returns multiple _rows_ per group, then you'll need to switch from `summarise()` to `reframe()`. `summarise()` is restricted to returning 1 row summaries per group, but `reframe()` lifts this restriction: ```{r} df |> group_by(grp) |> reframe(across(x:y, quantile_df, .unpack = TRUE)) ``` ### Transforming user-supplied variables If you want the user to provide a set of data-variables that are then transformed, use `across()` and `pick()`: ```{r} my_summarise <- function(data, summary_vars) { data |> summarise(across({{ summary_vars }}, ~ mean(., na.rm = TRUE))) } starwars |> group_by(species) |> my_summarise(c(mass, height)) ``` You can use this same idea for multiple sets of input data-variables: ```{r} my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean)) } ``` Use the `.names` argument to `across()` to control the names of the output. ```{r} my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean, .names = "mean_{.col}")) } ``` ### Loop over multiple variables If you have a character vector of variable names, and want to operate on them with a for loop, index into the special `.data` pronoun: ```{r, results = FALSE} for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ``` This same technique works with for loop alternatives like the base R `apply()` family and the purrr `map()` family: ```{r, results = FALSE} mtcars |> names() |> purrr::map(~ count(mtcars, .data[[.x]])) ``` (Note that the `x` in `.data[[x]]` is always treated as an env-variable; it will never come from the data.) ### Use a variable from an Shiny input Many Shiny input controls return character vectors, so you can use the same approach as above: `.data[[input$var]]`. ```{r, eval = FALSE} library(shiny) ui <- fluidPage( selectInput("var", "Variable", choices = names(diamonds)), tableOutput("output") ) server <- function(input, output, session) { data <- reactive(filter(diamonds, .data[[input$var]] > 0)) output$output <- renderTable(head(data())) } ``` See for more details and case studies. dplyr/vignettes/colwise.Rmd0000644000176200001440000003001715137161765015542 0ustar liggesusers--- title: "Column-wise operations" description: > Learn how to easily repeat the same operation across multiple columns using `across()`. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Column-wise operations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ``` It's often useful to perform the same operation on multiple columns, but copying and pasting is both tedious and error prone: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise(a = mean(a), b = mean(b), c = mean(c), d = mean(d)) ``` (If you're trying to compute `mean(a, b, c, d)` for each row, instead see `vignette("rowwise")`) This vignette will introduce you to the `across()` function, which lets you rewrite the previous code more succinctly: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise(across(a:d, mean)) ``` We'll start by discussing the basic usage of `across()`, particularly as it applies to `summarise()`, and show how to use it with multiple functions. We'll then show a few uses with other verbs. We'll finish off with a bit of history, showing why we prefer `across()` to our last approach (the `_if()`, `_at()` and `_all()` functions) and how to translate your old code to the new syntax. ```{r setup} library(dplyr, warn.conflicts = FALSE) ``` ## Basic usage `across()` has two primary arguments: * The first argument, `.cols`, selects the columns you want to operate on. It uses tidy selection (like `select()`) so you can pick variables by position, name, and type. * The second argument, `.fns`, is a function or list of functions to apply to each column. This can also be a purrr style formula (or list of formulas) like `~ .x / 2`. (This argument is optional, and you can omit it if you just want to get the underlying data; you'll see that technique used in `vignette("rowwise")`.) Here are a couple of examples of `across()` in conjunction with its favourite verb, `summarise()`. But you can use `across()` with any dplyr verb, as you'll see a little later. ```{r} starwars |> summarise(across(where(is.character), n_distinct)) starwars |> group_by(species) |> filter(n() > 1) |> summarise(across(c(sex, gender, homeworld), n_distinct)) starwars |> group_by(homeworld) |> filter(n() > 1) |> summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) ``` Because `across()` is usually used in combination with `summarise()` and `mutate()`, it doesn't select grouping variables in order to avoid accidentally modifying them: ```{r} df <- data.frame(g = c(1, 1, 2), x = c(-1, 1, 3), y = c(-1, -4, -9)) df |> group_by(g) |> summarise(across(where(is.numeric), sum)) ``` ### Multiple functions You can transform each variable with more than one function by supplying a named list of functions or lambda functions in the second argument: ```{r} min_max <- list( min = ~min(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE) ) starwars |> summarise(across(where(is.numeric), min_max)) starwars |> summarise(across(c(height, mass, birth_year), min_max)) ``` Control how the names are created with the `.names` argument which takes a [glue](https://glue.tidyverse.org/) spec: ```{r} starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) starwars |> summarise(across(c(height, mass, birth_year), min_max, .names = "{.fn}.{.col}")) ``` If you'd prefer all summaries with the same function to be grouped together, you'll have to expand the calls yourself: ```{r} starwars |> summarise( across(c(height, mass, birth_year), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(c(height, mass, birth_year), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ``` (One day this might become an argument to `across()` but we're not yet sure how it would work.) We cannot however use `where(is.numeric)` in that last case because the second `across()` would pick up the variables that were newly created ("min_height", "min_mass" and "min_birth_year"). We can work around this by combining both calls to `across()` into a single expression that returns a tibble: ```{r} starwars |> summarise( tibble( across(where(is.numeric), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(where(is.numeric), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ) ``` Alternatively we could reorganize results with `relocate()`: ```{r} starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) |> relocate(starts_with("min")) ``` ### Current column If you need to, you can access the name of the "current" column inside by calling `cur_column()`. This can be useful if you want to perform some sort of context dependent transformation that's already encoded in a vector: ```{r} df <- tibble(x = 1:3, y = 3:5, z = 5:7) mult <- list(x = 1, y = 10, z = 100) df |> mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]])) ``` ### Gotchas Be careful when combining numeric summaries with `where(is.numeric)`: ```{r} df <- data.frame(x = c(1, 2, 3), y = c(1, 4, 9)) df |> summarise(n = n(), across(where(is.numeric), sd)) ``` Here `n` becomes `NA` because `n` is numeric, so the `across()` computes its standard deviation, and the standard deviation of 3 (a constant) is `NA`. You probably want to compute `n()` last to avoid this problem: ```{r} df |> summarise(across(where(is.numeric), sd), n = n()) ``` Alternatively, you could explicitly exclude `n` from the columns to operate on: ```{r} df |> summarise(n = n(), across(where(is.numeric) & !n, sd)) ``` Another approach is to combine both the call to `n()` and `across()` in a single expression that returns a tibble: ```{r} df |> summarise( tibble(n = n(), across(where(is.numeric), sd)) ) ``` ### Other verbs So far we've focused on the use of `across()` with `summarise()`, but it works with any other dplyr verb that uses data masking: * Rescale all numeric variables to range 0-1: ```{r} rescale01 <- function(x) { rng <- range(x, na.rm = TRUE) (x - rng[1]) / (rng[2] - rng[1]) } df <- tibble(x = 1:4, y = rnorm(4)) df |> mutate(across(where(is.numeric), rescale01)) ``` For some verbs, like `group_by()`, `count()` and `distinct()`, you don't need to supply a summary function, but it can be useful to use tidy-selection to dynamically select a set of columns. In those cases, we recommend using the complement to `across()`, `pick()`, which works like `across()` but doesn't apply any functions and instead returns a data frame containing the selected columns. * Find all distinct ```{r} starwars |> distinct(pick(contains("color"))) ``` * Count all combinations of variables with a given pattern: ```{r} starwars |> count(pick(contains("color")), sort = TRUE) ``` `across()` doesn't work with `select()` or `rename()` because they already use tidy select syntax; if you want to transform column names with a function, you can use `rename_with()`. ### filter() and filter_out() We cannot directly use `across()` in `filter()` or `filter_out()` because we need an extra step to combine the results into a single logical vector. To that end, `filter()` and `filter_out()` have two special purpose companion functions: * `if_any()` keeps the rows where the predicate is true for *at least one* selected column: ```{r} starwars |> filter_out(if_any(everything(), is.na)) ``` * `if_all()` keeps the rows where the predicate is true for *all* selected columns: ```{r} starwars |> filter_out(if_all(everything(), is.na)) ``` ## `_if`, `_at`, `_all` Prior versions of dplyr allowed you to apply a function to multiple columns in a different way: using functions with `_if`, `_at`, and `_all()` suffixes. These functions solved a pressing need and are used by many people, but are now superseded. That means that they'll stay around, but won't receive any new features and will only get critical bug fixes. ### Why do we like `across()`? Why did we decide to move away from these functions in favour of `across()`? 1. `across()` makes it possible to express useful summaries that were previously impossible: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise( across(where(is.numeric), mean), across(where(is.factor), nlevels), n = n(), ) ``` 1. `across()` reduces the number of functions that dplyr needs to provide. This makes dplyr easier for you to use (because there are fewer functions to remember) and easier for us to implement new verbs (since we only need to implement one function, not four). 1. `across()` unifies `_if` and `_at` semantics so that you can select by position, name, and type, and you can now create compound selections that were previously impossible. For example, you can now transform all numeric columns whose name begins with "x": `across(where(is.numeric) & starts_with("x"))`. 1. `across()` doesn't need to use `vars()`. The `_at()` functions are the only place in dplyr where you have to manually quote variable names, which makes them a little weird and hence harder to remember. ### Why did it take so long to discover `across()`? It's disappointing that we didn't discover `across()` earlier, and instead worked through several false starts (first not realising that it was a common problem, then with the `_each()` functions, and most recently with the `_if()`/`_at()`/`_all()` functions). But `across()` couldn't work without three recent discoveries: * You can have a column of a data frame that is itself a data frame. This is something provided by base R, but it's not very well documented, and it took a while to see that it was useful, not just a theoretical curiosity. * We can use data frames to allow summary functions to return multiple columns. * We can use the absence of an outer name as a convention that you want to unpack a data frame column into individual columns. ### How do you convert existing code? Fortunately, it's generally straightforward to translate your existing code to use `across()`: * Strip the `_if()`, `_at()` and `_all()` suffix off the function. * Call `across()`. The first argument will be: 1. For `_if()`, the old second argument wrapped in `where()`. 1. For `_at()`, the old second argument, with the call to `vars()` removed. 1. For `_all()`, `everything()`. The subsequent arguments can be copied as is. For example: ```{r, results = FALSE} df |> mutate_if(is.numeric, ~mean(.x, na.rm = TRUE)) # -> df |> mutate(across(where(is.numeric), ~mean(.x, na.rm = TRUE))) df |> mutate_at(vars(c(x, starts_with("y"))), mean) # -> df |> mutate(across(c(x, starts_with("y")), mean)) df |> mutate_all(mean) # -> df |> mutate(across(everything(), mean)) ``` There are a few exceptions to this rule: * `rename_*()` and `select_*()` follow a different pattern. They already have select semantics, so are generally used in a different way that doesn't have a direct equivalent with `across()`; use the new `rename_with()` instead. * Previously, `filter_*()` were paired with the `all_vars()` and `any_vars()` helpers. The new helpers `if_any()` and `if_all()` can be used inside `filter()` to keep rows for which the predicate is true for at least one, or all selected columns: ```{r} df <- tibble(x = c("a", "b"), y = c(1, 1), z = c(-1, 1)) # Find all rows where EVERY numeric variable is greater than zero df |> filter(if_all(where(is.numeric), ~ .x > 0)) # Find all rows where ANY numeric variable is greater than zero df |> filter(if_any(where(is.numeric), ~ .x > 0)) ``` * When used in a `mutate()`, all transformations performed by an `across()` are applied at once. This is different to the behaviour of `mutate_if()`, `mutate_at()`, and `mutate_all()`, which apply the transformations one at a time. We expect that you'll generally find the new behaviour less surprising: ```{r} df <- tibble(x = 2, y = 4, z = 8) df |> mutate_all(~ .x / y) df |> mutate(across(everything(), ~ .x / y)) ``` dplyr/vignettes/base.Rmd0000644000176200001440000002734115106134104014774 0ustar liggesusers--- title: "dplyr <-> base R" output: rmarkdown::html_vignette description: > How does dplyr compare to base R? This vignette describes the main differences in philosophy, and shows the base R code most closely equivalent to each dplyr verb. vignette: > %\VignetteIndexEntry{dplyr <-> base R} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4, tibble.print_max = 4) ``` This vignette compares dplyr functions to their base R equivalents. This helps those familiar with base R understand better what dplyr does, and shows dplyr users how you might express the same ideas in base R code. We'll start with a rough overview of the major differences, then discuss the one table verbs in more detail, followed by the two table verbs. # Overview 1. The code dplyr verbs input and output data frames. This contrasts with base R functions which more frequently work with individual vectors. 1. dplyr relies heavily on "non-standard evaluation" so that you don't need to use `$` to refer to columns in the "current" data frame. This behaviour is inspired by the base functions `subset()` and `transform()`. 1. dplyr solutions tend to use a variety of single purpose verbs, while base R solutions typically tend to use `[` in a variety of ways, depending on the task at hand. 1. Multiple dplyr verbs are often strung together into a pipeline by `|>`. In base R, you'll typically save intermediate results to a variable that you either discard, or repeatedly overwrite. 1. All dplyr verbs handle "grouped" data frames so that the code to perform a computation per-group looks very similar to code that works on a whole data frame. In base R, per-group operations tend to have varied forms. # One table verbs The following table shows a condensed translation between dplyr verbs and their base R equivalents. The following sections describe each operation in more detail. You'll learn more about the dplyr verbs in their documentation and in `vignette("dplyr")`. | dplyr | base | |------------------------------- |--------------------------------------------------| | `arrange(df, x)` | `df[order(x), , drop = FALSE]` | | `distinct(df, x)` | `df[!duplicated(x), , drop = FALSE]`, `unique()` | | `filter(df, x)` | `df[which(x), , drop = FALSE]`, `subset()` | | `mutate(df, z = x + y)` | `df$z <- df$x + df$y`, `transform()` | | `pull(df, 1)` | `df[[1]]` | | `pull(df, x)` | `df$x` | | `rename(df, y = x)` | `names(df)[names(df) == "x"] <- "y"` | | `relocate(df, y)` | `df[union("y", names(df))]` | | `select(df, x, y)` | `df[c("x", "y")]`, `subset()` | | `select(df, starts_with("x"))` | `df[grepl("^x", names(df))]` | | `summarise(df, mean(x))` | `mean(df$x)`, `tapply()`, `aggregate()`, `by()` | | `slice(df, c(1, 2, 5))` | `df[c(1, 2, 5), , drop = FALSE]` | To begin, we'll load dplyr and convert `mtcars` and `iris` to tibbles so that we can easily show only abbreviated output for each operation. ```{r setup, message = FALSE} library(dplyr) mtcars <- as_tibble(mtcars) iris <- as_tibble(iris) ``` ## `arrange()`: Arrange rows by variables `dplyr::arrange()` orders the rows of a data frame by the values of one or more columns: ```{r} mtcars |> arrange(cyl, disp) ``` The `desc()` helper allows you to order selected variables in descending order: ```{r} mtcars |> arrange(desc(cyl), desc(disp)) ``` We can replicate in base R by using `[` with `order()`: ```{r} mtcars[order(mtcars$cyl, mtcars$disp), , drop = FALSE] ``` Note the use of `drop = FALSE`. If you forget this, and the input is a data frame with a single column, the output will be a vector, not a data frame. This is a source of subtle bugs. Base R does not provide a convenient and general way to sort individual variables in descending order, so you have two options: * For numeric variables, you can use `-x`. * You can request `order()` to sort all variables in descending order. ```{r, results = FALSE} mtcars[order(mtcars$cyl, mtcars$disp, decreasing = TRUE), , drop = FALSE] mtcars[order(-mtcars$cyl, -mtcars$disp), , drop = FALSE] ``` ## `distinct()`: Select distinct/unique rows `dplyr::distinct()` selects unique rows: ```{r} df <- tibble( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) df |> distinct(x) # selected columns df |> distinct(x, .keep_all = TRUE) # whole data frame ``` There are two equivalents in base R, depending on whether you want the whole data frame, or just selected variables: ```{r} unique(df["x"]) # selected columns df[!duplicated(df$x), , drop = FALSE] # whole data frame ``` ## `filter()`: Return rows with matching conditions `dplyr::filter()` selects rows where an expression is `TRUE`: ```{r} starwars |> filter(species == "Human") starwars |> filter(mass > 1000) starwars |> filter(hair_color == "none" & eye_color == "black") ``` The closest base equivalent (and the inspiration for `filter()`) is `subset()`: ```{r} subset(starwars, species == "Human") subset(starwars, mass > 1000) subset(starwars, hair_color == "none" & eye_color == "black") ``` You can also use `[` but this also requires the use of `which()` to remove `NA`s: ```{r} starwars[which(starwars$species == "Human"), , drop = FALSE] starwars[which(starwars$mass > 1000), , drop = FALSE] starwars[which(starwars$hair_color == "none" & starwars$eye_color == "black"), , drop = FALSE] ``` ## `mutate()`: Create or transform variables `dplyr::mutate()` creates new variables from existing variables: ```{r} df |> mutate(z = x + y, z2 = z ^ 2) ``` The closest base equivalent is `transform()`, but note that it cannot use freshly created variables: ```{r} head(transform(df, z = x + y, z2 = (x + y) ^ 2)) ``` Alternatively, you can use `$<-`: ```{r} mtcars$cyl2 <- mtcars$cyl * 2 mtcars$cyl4 <- mtcars$cyl2 * 2 ``` When applied to a grouped data frame, `dplyr::mutate()` computes new variable once per group: ```{r} gf <- tibble(g = c(1, 1, 2, 2), x = c(0.5, 1.5, 2.5, 3.5)) gf |> group_by(g) |> mutate(x_mean = mean(x), x_rank = rank(x)) ``` To replicate this in base R, you can use `ave()`: ```{r} transform(gf, x_mean = ave(x, g, FUN = mean), x_rank = ave(x, g, FUN = rank) ) ``` ## `pull()`: Pull out a single variable `dplyr::pull()` extracts a variable either by name or position: ```{r} mtcars |> pull(1) mtcars |> pull(cyl) ``` This equivalent to `[[` for positions and `$` for names: ```{r} mtcars[[1]] mtcars$cyl ``` ## `relocate()`: Change column order `dplyr::relocate()` makes it easy to move a set of columns to a new position (by default, the front): ```{r} # to front mtcars |> relocate(gear, carb) # to back mtcars |> relocate(mpg, cyl, .after = last_col()) ``` We can replicate this in base R with a little set manipulation: ```{r} mtcars[union(c("gear", "carb"), names(mtcars))] to_back <- c("mpg", "cyl") mtcars[c(setdiff(names(mtcars), to_back), to_back)] ``` Moving columns to somewhere in the middle requires a little more set twiddling. ## `rename()`: Rename variables by name `dplyr::rename()` allows you to rename variables by name or position: ```{r} iris |> rename(sepal_length = Sepal.Length, sepal_width = 2) ``` Renaming variables by position is straight forward in base R: ```{r} iris2 <- iris names(iris2)[2] <- "sepal_width" ``` Renaming variables by name requires a bit more work: ```{r} names(iris2)[names(iris2) == "Sepal.Length"] <- "sepal_length" ``` ## `rename_with()`: Rename variables with a function `dplyr::rename_with()` transform column names with a function: ```{r} iris |> rename_with(toupper) ``` A similar effect can be achieved with `setNames()` in base R: ```{r} setNames(iris, toupper(names(iris))) ``` ## `select()`: Select variables by name `dplyr::select()` subsets columns by position, name, function of name, or other property: ```{r} iris |> select(1:3) iris |> select(Species, Sepal.Length) iris |> select(starts_with("Petal")) iris |> select(where(is.factor)) ``` Subsetting variables by position is straightforward in base R: ```{r} iris[1:3] # single argument selects columns; never drops iris[1:3, , drop = FALSE] ``` You have two options to subset by name: ```{r} iris[c("Species", "Sepal.Length")] subset(iris, select = c(Species, Sepal.Length)) ``` Subsetting by function of name requires a bit of work with `grep()`: ```{r} iris[grep("^Petal", names(iris))] ``` And you can use `Filter()` to subset by type: ```{r} Filter(is.factor, iris) ``` ## `summarise()`: Reduce multiple values down to a single value `dplyr::summarise()` computes one or more summaries for each group: ```{r} mtcars |> group_by(cyl) |> summarise(mean = mean(disp), n = n()) ``` I think the closest base R equivalent uses `by()`. Unfortunately `by()` returns a list of data frames, but you can combine them back together again with `do.call()` and `rbind()`: ```{r} mtcars_by <- by(mtcars, mtcars$cyl, function(df) { with(df, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(df))) }) do.call(rbind, mtcars_by) ``` `aggregate()` comes very close to providing an elegant answer: ```{r} agg <- aggregate(disp ~ cyl, mtcars, function(x) c(mean = mean(x), n = length(x))) agg ``` But unfortunately while it looks like there are `disp.mean` and `disp.n` columns, it's actually a single matrix column: ```{r} str(agg) ``` You can see a variety of other options at . ## `slice()`: Choose rows by position `slice()` selects rows with their location: ```{r} slice(mtcars, 25:n()) ``` This is straightforward to replicate with `[`: ```{r} mtcars[25:nrow(mtcars), , drop = FALSE] ``` # Two-table verbs When we want to merge two data frames, `x` and `y`), we have a variety of different ways to bring them together. Various base R `merge()` calls are replaced by a variety of dplyr `join()` functions. | dplyr | base | |------------------------|-----------------------------------------| | `inner_join(df1, df2)` |`merge(df1, df2)` | | `left_join(df1, df2) ` |`merge(df1, df2, all.x = TRUE)` | | `right_join(df1, df2)` |`merge(df1, df2, all.y = TRUE)` | | `full_join(df1, df2)` |`merge(df1, df2, all = TRUE)` | | `semi_join(df1, df2)` |`df1[df1$x %in% df2$x, , drop = FALSE]` | | `anti_join(df1, df2)` |`df1[!df1$x %in% df2$x, , drop = FALSE]` | For more information about two-table verbs, see `vignette("two-table")`. ### Mutating joins dplyr's `inner_join()`, `left_join()`, `right_join()`, and `full_join()` add new columns from `y` to `x`, matching rows based on a set of "keys", and differ only in how missing matches are handled. They are equivalent to calls to `merge()` with various settings of the `all`, `all.x`, and `all.y` arguments. The main difference is the order of the rows: * dplyr preserves the order of the `x` data frame. * `merge()` sorts the key columns. ### Filtering joins dplyr's `semi_join()` and `anti_join()` affect only the rows, not the columns: ```{r} band_members |> semi_join(band_instruments) band_members |> anti_join(band_instruments) ``` They can be replicated in base R with `[` and `%in%`: ```{r} band_members[band_members$name %in% band_instruments$name, , drop = FALSE] band_members[!band_members$name %in% band_instruments$name, , drop = FALSE] ``` Semi and anti joins with multiple key variables are considerably more challenging to implement. dplyr/vignettes/recoding-replacing.Rmd0000644000176200001440000003560215137161765017636 0ustar liggesusers--- title: "Recoding columns and replacing values" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Recoding columns and replacing values} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} editor: markdown: wrap: sentence canonical: true --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, message = FALSE} library(dplyr) ``` ## Introduction dplyr provides a family of functions for *recoding* columns and *replacing* values within a column. These are extremely common operations, so mastering this family can be a big productivity boost! Before we begin, it'll be helpful to define exactly what we mean by recoding vs replacing: - *Recoding* a column creates an entirely new column using values from an existing column. The new column may have a different type from the original column. - *Replacing* values within a column partially updates an existing column with new values. The result has the same type as the original column. The family of functions can be summarized by the following table: | | **Recoding** | **Replacing** | |---------------------------|-------------------|--------------------| | **Match with conditions** | `case_when()` | `replace_when()` | | **Match with values** | `recode_values()` | `replace_values()` | This vignette walks through use cases for each of these functions, which should help you build some intuition about when to use them. ## `case_when()` `case_when()` is the most general function in the family. It works by evaluating each case sequentially and using the first match for each element to determine the corresponding value in the output. To demonstrate, we'll look at a dataset of some 5k times in minutes: ```{r} set.seed(123) racers <- tibble( id = seq_len(100), time = round(sample(1200:2100, size = 100, replace = TRUE) / 60, 2) ) racers ``` We can use `case_when()` to categorize these times into tiers: ```{r} tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D" ) ) tiers ``` There's a few things to note here: - The *first* condition that is `TRUE` is used, i.e. a time of 21 minutes meets all of the conditions, but would be placed in tier `A` because `time < 23` is listed first. - Unmatched values fall through as `NA`. We have some racers above 33 minutes that aren't captured here! There are a few options for dealing with unmatched locations. You can leave them as `NA` if that makes sense for your use case, or you can specify a `.default` value: ```{r} racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) ) ``` If you are confident that you've captured every case, you can supply `.unmatched = "error"` rather than `.default` and `case_when()` will error if that assertion doesn't hold. This is great for defensive programming! ```{r, error = TRUE} racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .unmatched = "error" ) ) ``` Note that missing values must be explicitly handled when setting `.unmatched = "error"`, even if that's just `is.na(time) ~ NA`, otherwise they will trigger the unmatched error. ## `replace_when()` Let's assume that some of our racers used banned running shoes and are disqualified. Also, some racers had a false start and need to incur a 20 second (1/3 minute) penalty. ```{r} id_banned_shoes <- c(2, 10, 15, 32, 65) id_false_start <- c(1, 2, 5, 20, 55, 74, 91) ``` We could add this information in a few ways. With `case_when()`: ```{r} racers |> mutate( time = case_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3, .default = time ) ) ``` Or in two steps with `if_else()`: ```{r} racers |> mutate(time = if_else(id %in% id_banned_shoes, NA, time)) |> mutate(time = if_else(id %in% id_false_start, time + 1 / 3, time)) ``` Neither of these feel particularly elegant at expressing the *intent* of this operation. All you're trying to do is replace a few values of `time`! We like to think of `time` as the *primary* input: `time` goes in, and `time` comes out (slightly adjusted). But both `case_when()` and `if_else()` have `time` as their last input, making the intent a bit hard to understand at first glance. `replace_when()` lets you pull the primary input to the front (which also makes it compatible with the pipe!), making the intent more clear: ```{r} racers |> mutate( time = time |> replace_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3 ) ) ``` As a side note, you might have been tempted to reach for `base::replace()` here, i.e. as: ```{r, eval = FALSE} racers |> mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> mutate(time = base::replace(time, id %in% id_false_start, time + 1 / 3)) ``` This actually doesn't work! Replacing with `NA` does work, but `replace()` requires that the result of `time + 1 / 3` must be preemptively subset to the places where the condition is true. You'd have to do something more complicated to mimic `replace_when()`: ```{r} racers |> mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> mutate(time = { loc <- id %in% id_false_start base::replace(time, loc, time[loc] + 1 / 3) }) ``` ### Type stability Beyond readability, an important benefit of `replace_when()` (and `replace_values()`, which we'll see later) is that it is *type stable* on the column you are modifying, which means that it can't change types out from under you. Type stability is particularly useful with factors. Taking another look at our `tiers` of race times, imagine that some of the race times were discovered to be faulty due to malfunctioning timers, and you need to replace a few `id`s with the `unknown` level. ```{r} id_with_malfunction <- c(1, 5, 20, 50) tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) |> factor(levels = c("A", "B", "C", "D", "unknown")) ) tiers ``` Note that the following `case_when()` solution results in `tier` becoming a *character* column, losing its factor class. This is due to the fact that `case_when()` is a *recoding* function, it creates an entirely new column and doesn't know that you're trying to retain existing type information. ```{r} tiers |> mutate( tier = case_when(id %in% id_with_malfunction ~ "unknown", .default = tier) ) ``` As a *replacing* function, `replace_when()` knows to be type stable on `tier`, and casts `"unknown"` to `tier`'s factor type before performing the replacement: ```{r} tiers |> mutate( tier = tier |> replace_when(id %in% id_with_malfunction ~ "unknown") ) ``` ## `recode_values()` `case_when()` and `replace_when()` both take *logical* vectors on the left-hand side of the formula. This is very flexible, but sometimes these functions require a large amount of repetition. Consider the following [Likert scale](https://en.wikipedia.org/wiki/Likert_scale) scores. We'd like to recode these from their numeric values to their character counterparts. ```{r} likert <- tibble( score = c(1, 2, 3, 4, 5, 2, 3, 1, 4) ) ``` We could certainly use a `case_when()`: ```{r} likert |> mutate( score = case_when( score == 1 ~ "Strongly disagree", score == 2 ~ "Disagree", score == 3 ~ "Neutral", score == 4 ~ "Agree", score == 5 ~ "Strongly agree" ) ) ``` But `score ==` is repeated many times! If you find yourself using `==` or `%in%` on the left-hand side in this manner, you likely want to use `recode_values()` instead. Rather than taking logical vectors, `recode_values()` takes *values* on the left-hand side to match against a single input that you'll provide as the first argument. ```{r} likert |> mutate( score = score |> recode_values( 1 ~ "Strongly disagree", 2 ~ "Disagree", 3 ~ "Neutral", 4 ~ "Agree", 5 ~ "Strongly agree" ) ) ``` This removes all of the repetition, allowing you to focus on the mapping. If you squint, the mapping should look roughly like a lookup table between the numeric value and the likert encoding. `recode_values()` actually has a second interface that allows us to make this lookup table representation even more explicit. Using a `tribble()`, we can extract out the lookup table into its own standalone data frame. ```{r} lookup <- tribble( ~from , ~to , 1 , "Strongly disagree" , 2 , "Disagree" , 3 , "Neutral" , 4 , "Agree" , 5 , "Strongly agree" ) ``` We can then utilize the alternative `from` and `to` arguments of `recode_values()` rather than supplying formulas to specify how the values should be recoded: ```{r} likert |> mutate(score = recode_values(score, from = lookup$from, to = lookup$to)) ``` Lifting the lookup table to the top of the file is particularly nice when you have a long pipe chain. The details of the mapping get some room to breathe, and in the pipe chain you can focus on the actual manipulations you are trying to perform. It's also very common for your `lookup` table to exist in a CSV file that you have to read in separately. In that case, you can replace the `tribble()` call with: ```{r, eval = FALSE} lookup <- readr::read_csv("lookup.csv") ``` But everything else works the same. This would be quite hard to specify with just the formula interface! Like `case_when()`, `recode_values()` also has `default` and `unmatched` arguments to handle unmatched locations: ```{r, error = TRUE} likert <- tibble( score = c(0, 1, 2, 2, 4, 5, 2, 3, 1, 4) ) # Missed the `0` likert |> mutate( score = score |> recode_values( from = lookup$from, to = lookup$to, unmatched = "error" ) ) ``` ## `replace_values()` As seen above, when replacing a few locations in a column using *logical conditions*, we reached for `replace_when()` rather than `case_when()`. Similarly, when replacing a few locations using *values* to match against, it's best to use `replace_values()` over `recode_values()`. Imagine we'd like to collapse some, but not all, of these school names into common buckets: ```{r} schools <- tibble( name = c( "UNC", "Chapel Hill", NA, "Duke", "Duke University", "UNC", "NC State", "ECU" ) ) ``` We could use `recode_values()`: ```{r} schools |> mutate( name = recode_values( name, c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke", default = name ) ) ``` But this "partial update by value" is so common that it really deserves its own name that doesn't require you to specify `default`. For that, we have `replace_values()`: ```{r} schools |> mutate( name = name |> replace_values( c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke" ) ) ``` Like `recode_values()`, `replace_values()` has an alternative `from` and `to` API that works well with lookup tables and allows you to move your mapping out of the pipe chain: ```{r} lookup <- tribble( ~from , ~to , "UNC" , "UNC Chapel Hill" , "Chapel Hill" , "UNC Chapel Hill" , "Duke" , "Duke" , "Duke University" , "Duke" ) schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ``` An extremely neat feature of the `from` and `to` API is that they also take *lists* of vectors that describe the mapping, which has been designed to work elegantly with the fact that `tribble()` can create list columns, allowing you to further collapse this lookup table: ```{r} # Condensed lookup table with a `many:1` mapping per row lookup <- tribble( ~from , ~to , c("UNC", "Chapel Hill") , "UNC Chapel Hill" , c("Duke", "Duke University") , "Duke" ) # Note that `from` is a list column lookup lookup$from # Works the same as before schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ``` ## Comparisons We'll end this vignette with some comparisons of the recoding and replacing family to other dplyr functions and to other technologies, like SQL. ### `if_else()` `if_else()` is a type of recoding function, as it creates an entirely new column. In fact, it's closely tied to `case_when()`: ```{r, eval = FALSE} if_else(condition, true, false, missing) case_when( condition ~ true, !condition ~ false, is.na(condition) ~ missing ) ``` Similar to `case_when()`, `if_else()` doesn't offer type stability on any particular input. The output's type is computed as the common type of `true`, `false`, and `missing`. If you find yourself writing an `if_else()` where the purpose is to partially update an existing column, consider using `replace_when()` instead for clarity and type stability: ```{r, eval = FALSE} x <- if_else(x > 5, new, x) # Type stable on `x`. # Intent of "partially updating" `x` is clear. # Pipe friendly. x <- x |> replace_when(x > 5 ~ new) ``` ### `coalesce()` For converting from `NA` to some other value, the most common cases of `coalesce()` are often a `replace_values()` call in disguise: ```{r} x <- c(1, 2, NA, 3, NA, 5) y <- c(0, 3, 1, 4, 6, 7) coalesce(x, 0) replace_values(x, NA ~ 0) coalesce(x, y) replace_values(x, NA ~ y) ``` And with `replace_values()` you can replace any value, not just `NA`. ### `na_if()` For converting from a problematic value to `NA`, `replace_values()` is a more flexible (and likely more intuitive) alternative to `na_if()`: ```{r} x <- c(1, 2, 0, -99, 12) # To convert `0` and `-99` to `NA`, you have to do it in two calls x |> na_if(0) |> na_if(-99) x |> replace_values(from = c(0, -99), to = NA) ``` ### SQL `case_when()` is an R equivalent of SQL's [Searched `CASE`](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/case-transact-sql?view=sql-server-ver17#syntax) statement: ``` r case_when( x < 100 ~ this, x < 20 ~ that, .default = default ) ``` ``` sql CASE WHEN x < 100 THEN this WHEN x < 20 THEN that ELSE default END ``` And dbplyr will translate a `case_when()` to this form! `recode_values()` is an R equivalent of SQL's [Simple `CASE`](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/case-transact-sql?view=sql-server-ver17#syntax) statement: ``` r recode_values( x, "E" ~ "East", "W" ~ "West", "N" ~ "North", "S" ~ "South", .default = "Unknown" ) ``` ``` sql CASE x WHEN 'E' THEN 'East' WHEN 'W' THEN 'West' WHEN 'N' THEN 'North' WHEN 'S' THEN 'South' ELSE 'Unknown' END ``` As of dbplyr 2.5.1, we don't currently have a translation for `recode_values()` since it is so new, but we expect to have one soon. dplyr/vignettes/rowwise.Rmd0000644000176200001440000003307415137161765015602 0ustar liggesusers--- title: "Row-wise operations" description: > In R, it's usually easier to do something for each column than for each row. In this vignette you will learn how to use the `rowwise()` function to perform operations by row. Along the way, you'll learn about list-columns, and see how you might perform simulations and modelling within dplyr verbs. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Row-wise operations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ``` dplyr, and R in general, are particularly well suited to performing operations over columns, and performing operations over rows is much harder. In this vignette, you'll learn dplyr's approach centred around the row-wise data frame created by `rowwise()`. There are three common use cases that we discuss in this vignette: * Row-wise aggregates (e.g. compute the mean of x, y, z). * Calling a function multiple times with varying arguments. * Working with list-columns. These types of problems are often easily solved with a for loop, but it's nice to have a solution that fits naturally into a pipeline. > Of course, someone has to write loops. It doesn't have to be you. > --- Jenny Bryan ```{r setup} library(dplyr, warn.conflicts = FALSE) ``` ```{r include = FALSE} nest_by <- function(df, ...) { df |> group_by(...) |> summarise(data = list(pick(everything()))) |> rowwise(...) } # mtcars |> nest_by(cyl) ``` ## Creating Row-wise operations require a special type of grouping where each group consists of a single row. You create this with `rowwise()`: ```{r} df <- tibble(x = 1:2, y = 3:4, z = 5:6) df |> rowwise() ``` Like `group_by()`, `rowwise()` doesn't really do anything itself; it just changes how the other verbs work. For example, compare the results of `mutate()` in the following code: ```{r} df |> mutate(m = mean(c(x, y, z))) df |> rowwise() |> mutate(m = mean(c(x, y, z))) ``` If you use `mutate()` with a regular data frame, it computes the mean of `x`, `y`, and `z` across all rows. If you apply it to a row-wise data frame, it computes the mean for each row. You can optionally supply "identifier" variables in your call to `rowwise()`. These variables are preserved when you call `summarise()`, so they behave somewhat similarly to the grouping variables passed to `group_by()`: ```{r} df <- tibble(name = c("Mara", "Hadley"), x = 1:2, y = 3:4, z = 5:6) df |> rowwise() |> summarise(m = mean(c(x, y, z))) df |> rowwise(name) |> summarise(m = mean(c(x, y, z))) ``` `rowwise()` is just a special form of grouping, so if you want to remove it from a data frame, just call `ungroup()`. ## Per row summary statistics `dplyr::summarise()` makes it really easy to summarise values across rows within one column. When combined with `rowwise()` it also makes it easy to summarise values across columns within one row. To see how, we'll start by making a little dataset: ```{r} df <- tibble(id = 1:6, w = 10:15, x = 20:25, y = 30:35, z = 40:45) df ``` Let's say we want compute the sum of `w`, `x`, `y`, and `z` for each row. We start by making a row-wise data frame: ```{r} rf <- df |> rowwise(id) ``` We can then use `mutate()` to add a new column to each row, or `summarise()` to return just that one summary: ```{r} rf |> mutate(total = sum(c(w, x, y, z))) rf |> summarise(total = sum(c(w, x, y, z))) ``` Of course, if you have a lot of variables, it's going to be tedious to type in every variable name. Instead, you can use `c_across()` which uses tidy selection syntax so you can to succinctly select many variables: ```{r} rf |> mutate(total = sum(c_across(w:z))) rf |> mutate(total = sum(c_across(where(is.numeric)))) ``` You could combine this with column-wise operations (see `vignette("colwise")` for more details) to compute the proportion of the total for each column: ```{r} rf |> mutate(total = sum(c_across(w:z))) |> ungroup() |> mutate(across(w:z, ~ . / total)) ``` ### Row-wise summary functions The `rowwise()` approach will work for any summary function. But if you need greater speed, it's worth looking for a built-in row-wise variant of your summary function. These are more efficient because they operate on the data frame as whole; they don't split it into rows, compute the summary, and then join the results back together again. ```{r} df |> mutate(total = rowSums(pick(where(is.numeric), -id))) df |> mutate(mean = rowMeans(pick(where(is.numeric), -id))) ``` **NB**: I use `df` (not `rf`) and `pick()` (not `c_across()`) here because `rowMeans()` and `rowSums()` take a multi-row data frame as input. Also note that `-id` is needed to avoid selecting `id` in `pick()`. This wasn't required with the rowwise data frame because we had specified `id` as an identifier in our original call to `rowwise()`, preventing it from being selected as a grouping column. ## List-columns `rowwise()` operations are a natural pairing when you have list-columns. They allow you to avoid explicit loops and/or functions from the `apply()` or `purrr::map()` families. ### Motivation Imagine you have this data frame, and you want to count the lengths of each element: ```{r} df <- tibble( x = list(1, 2:3, 4:6) ) ``` You might try calling `length()`: ```{r} df |> mutate(l = length(x)) ``` But that returns the length of the column, not the length of the individual values. If you're an R documentation aficionado, you might know there's already a base R function just for this purpose: ```{r} df |> mutate(l = lengths(x)) ``` Or if you're an experienced R programmer, you might know how to apply a function to each element of a list using `sapply()`, `vapply()`, or one of the purrr `map()` functions: ```{r} df |> mutate(l = sapply(x, length)) df |> mutate(l = purrr::map_int(x, length)) ``` But wouldn't it be nice if you could just write `length(x)` and dplyr would figure out that you wanted to compute the length of the element inside of `x`? Since you're here, you might already be guessing at the answer: this is just another application of the row-wise pattern. ```{r} df |> rowwise() |> mutate(l = length(x)) ``` ### Subsetting Before we continue on, I wanted to briefly mention the magic that makes this work. This isn't something you'll generally need to think about (it'll just work), but it's useful to know about when something goes wrong. There's an important difference between a grouped data frame where each group happens to have one row, and a row-wise data frame where every group always has one row. Take these two data frames: ```{r} df <- tibble(g = 1:2, y = list(1:3, "a")) gf <- df |> group_by(g) rf <- df |> rowwise(g) ``` If we compute some properties of `y`, you'll notice the results look different: ```{r} gf |> mutate(type = typeof(y), length = length(y)) rf |> mutate(type = typeof(y), length = length(y)) ``` They key difference is that when `mutate()` slices up the columns to pass to `length(y)` the grouped mutate uses `[` and the row-wise mutate uses `[[`. The following code gives a flavour of the differences if you used a for loop: ```{r} # grouped out1 <- integer(2) for (i in 1:2) { out1[[i]] <- length(df$y[i]) } out1 # rowwise out2 <- integer(2) for (i in 1:2) { out2[[i]] <- length(df$y[[i]]) } out2 ``` Note that this magic only applies when you're referring to existing columns, not when you're creating new rows. This is potentially confusing, but we're fairly confident it's the least worst solution, particularly given the hint in the error message. ```{r, error = TRUE} gf |> mutate(y2 = y) rf |> mutate(y2 = y) rf |> mutate(y2 = list(y)) ``` ### Modelling `rowwise()` data frames allow you to solve a variety of modelling problems in what I think is a particularly elegant way. We'll start by creating a nested data frame: ```{r} by_cyl <- mtcars |> nest_by(cyl) by_cyl ``` This is a little different to the usual `group_by()` output: we have visibly changed the structure of the data. Now we have three rows (one for each group), and we have a list-col, `data`, that stores the data for that group. Also note that the output is `rowwise()`; this is important because it's going to make working with that list of data frames much easier. Once we have one data frame per row, it's straightforward to make one model per row: ```{r} mods <- by_cyl |> mutate(mod = list(lm(mpg ~ wt, data = data))) mods ``` And supplement that with one set of predictions per row: ```{r} mods <- mods |> mutate(pred = list(predict(mod, data))) mods ``` You could then summarise the model in a variety of ways: ```{r} mods |> summarise(rmse = sqrt(mean((pred - data$mpg) ^ 2))) mods |> summarise(rsq = summary(mod)$r.squared) mods |> summarise(broom::glance(mod)) ``` Or easily access the parameters of each model: ```{r} mods |> reframe(broom::tidy(mod)) ``` ## Repeated function calls `rowwise()` doesn't just work with functions that return a length-1 vector (aka summary functions); it can work with any function if the result is a list. This means that `rowwise()` and `mutate()` provide an elegant way to call a function many times with varying arguments, storing the outputs alongside the inputs. ### Simulations I think this is a particularly elegant way to perform simulations, because it lets you store simulated values along with the parameters that generated them. For example, imagine you have the following data frame that describes the properties of 3 samples from the uniform distribution: ```{r} df <- tribble( ~ n, ~ min, ~ max, 1, 0, 1, 2, 10, 100, 3, 100, 1000, ) ``` You can supply these parameters to `runif()` by using `rowwise()` and `mutate()`: ```{r} df |> rowwise() |> mutate(data = list(runif(n, min, max))) ``` Note the use of `list()` here - `runif()` returns multiple values and a `mutate()` expression has to return something of length 1. `list()` means that we'll get a list column where each row is a list containing multiple values. If you forget to use `list()`, dplyr will give you a hint: ```{r, error = TRUE} df |> rowwise() |> mutate(data = runif(n, min, max)) ``` ### Multiple combinations What if you want to call a function for every combination of inputs? You can use `expand.grid()` (or `tidyr::expand_grid()`) to generate the data frame and then repeat the same pattern as above: ```{r} df <- expand.grid(mean = c(-1, 0, 1), sd = c(1, 10, 100)) df |> rowwise() |> mutate(data = list(rnorm(10, mean, sd))) ``` ### Varying functions In more complicated problems, you might also want to vary the function being called. This tends to be a bit more of an awkward fit with this approach because the columns in the input tibble will be less regular. But it's still possible, and it's a natural place to use `do.call()`: ```{r} df <- tribble( ~rng, ~params, "runif", list(n = 10), "rnorm", list(n = 20), "rpois", list(n = 10, lambda = 5), ) |> rowwise() df |> mutate(data = list(do.call(rng, params))) ``` ```{r, include = FALSE, eval = FALSE} df <- rowwise(tribble( ~rng, ~params, "runif", list(min = -1, max = 1), "rnorm", list(), "rpois", list(lambda = 5), )) # Has to happen in separate function to avoid eager unquoting f <- function(rng, params) purrr::exec(rng, n = 10, !!!params) df |> mutate(data = list(f(rng, params))) ``` ## Previously ### `rowwise()` `rowwise()` was also questioning for quite some time, partly because I didn't appreciate how many people needed the native ability to compute summaries across multiple variables for each row. As an alternative, we recommended performing row-wise operations with the purrr `map()` functions. However, this was challenging because you needed to pick a map function based on the number of arguments that were varying and the type of result, which required quite some knowledge of purrr functions. I was also resistant to `rowwise()` because I felt like automatically switching between `[` to `[[` was too magical in the same way that automatically `list()`-ing results made `do()` too magical. I've now persuaded myself that the row-wise magic is good magic partly because most people find the distinction between `[` and `[[` mystifying and `rowwise()` means that you don't need to think about it. Since `rowwise()` clearly is useful it is not longer questioning, and we expect it to be around for the long term. ### `do()` We've questioned the need for `do()` for quite some time, because it never felt very similar to the other dplyr verbs. It had two main modes of operation: * Without argument names: you could call functions that input and output data frames using `.` to refer to the "current" group. For example, the following code gets the first row of each group: ```{r} mtcars |> group_by(cyl) |> do(head(., 1)) ``` This has been superseded by `pick()` plus `reframe()`, a variant of `summarise()` that can create multiple rows and columns per group. ```{r} mtcars |> group_by(cyl) |> reframe(head(pick(everything()), 1)) ``` * With arguments: it worked like `mutate()` but automatically wrapped every element in a list: ```{r} mtcars |> group_by(cyl) |> do(nrows = nrow(.)) ``` I now believe that behaviour is both too magical and not very useful, and it can be replaced by `summarise()` and `pick()`. ```{r} mtcars |> group_by(cyl) |> summarise(nrows = nrow(pick(everything()))) ``` If needed (unlike here), you can wrap the results in a list yourself. The addition of `pick()`/`across()` and the increased scope of `summarise()`/`reframe()` means that `do()` is no longer needed, so it is now superseded. dplyr/data/0000755000176200001440000000000015137161765012331 5ustar liggesusersdplyr/data/starwars.rda0000644000176200001440000000656315016155021014661 0ustar liggesusersBZh91AY&SYg@?$2[W@FH}Ӷkg@wt ^RPo`˜jz F#dzC'MG4dф&F4򍩴5SA#IS"4 @4 Bde2@ 4i4iIS#JMCC@A h 424d0M1#MAbd!`L`( Ȍ&CM4 hS=F4#Fx@1_?biES!qH'GHǿRcre5xAi2Asg N5_C$"CI (!K$'kBSHB@@<<U~eԔ@ժ<]B qzk.E*-=.11p ``Idt%WT˜["+6iu46HCNH6Ă\I2@lbOz{[w өJjjj38|~wիoxuވ G%p_z92o $$tE~6dPSKh댑0LWC9qt 7JFHt$&޾yIYmFT8M=?Fu5uJYy q‚r1T3'Y0vy7JţM-(s4Lf`(Ѧ*Sf3aP AIHBI0>d@` @ ̐!< C$0ѐ7DX)"@PEł"XbX,H2@F1 HO >/ V}F{^@;H!|jt t2U<{2h@ @MjiZt)08!& *e/ɚk$`)HUt%k r`{Pb)@F O@.UiN-J_qK*TZ8Xƕ*Vn~@ra~BU#Lfzj CL._!Uer&Qt]]1={P`CYTR%$V՞gZkFc$*@d Q`F;')CI18[OQ9EAݦ6@T<ܑ`]U@&W4j2>- %`2"!t T)3A搀3vB qwq D= aO* 8$PKzZ1De) $xhVBL>ipKS}Ow. !L'1,Z/L@Yx=$4+@Ec.U#(<9rn$gf6bH U#^]6]֕!vg.ݹXؘƧ(El]+W/"(P E2I{AGM[˸dt 0mhpE,,=p`=*H F,UV+|6vl_fՆ*X mh-2[r @B 㧓FqO4;hk(N"Zܒ/PEEBCJ-WLQ8:+/E 1g-L n'd51H,:2YY:Ә[PQ2`ESB s a&M4)׳_C:1U$XVE3Y ~'n--@T2.^in#tF;H7ь 3Q a$`cl&MmZ M6d交7YS{iEA I Hjhd\V "7UBDu,H ] 3ȆD$x3T]}xtHCs,;$vؒgN"9%:ܻA]l*ʑbWfiZe6RYtjƒM͝3-}U_ZbXc::N89Jk)d`if1vr/YnCC!4~B!XRIZ5(eܺv$QI$ARҖ g|4ko@ %IێK*C.(E )UPoNrƒW#[j{#) %aH$!sWEuExn1⫪o@jfoxl\lri5xQlbJdDeQ YQk@@Eķx%7|5qY,>*ΟdIpBS4G5M=/Sk@Ym]TB]`m0Z~LMK,؀u#NUrtHc`k:bEm$w%(`ԧd|2cI7 n}5tdNK==6 ƭE%`҅PV0w`tt^ <̩A@ 4 AJ#ljHx9Њ_i{PG ))„Th@dplyr/data/band_instruments.rda0000644000176200001440000000030213663216626016372 0ustar liggesusersBZh91AY&SY>s2HP@ޠ@ S="=P QM3SAdA)sdb-T}P߻"I~pnTQ 'K đ^l&enXe`feUG/|*HCGN!koQlF^Mp+=x/H =dplyr/data/storms.rda0000644000176200001440000022503115137161765014353 0ustar liggesusersBZh91AY&SYmgDDDu@$EDLD`Y0 jTi4ղPQN>mH{5wdJN$MZ$2I̓6٭+RPJH$bzP 1 sۘ*[ [ @=ǹճnn}<7@XB( Vq QW@Z5RATJQs%y"((QTPBRs -Hlxh@f^T D&@Q)zD6@ P4h@hxIT$TOщ0HOS馞hizh ꞓzjzI)b&!i44ySShd 4!=&Sjh4C# 0A4a44ɓ@D!4 S`1GF Mbd4 ?j$BڿSI&vL$m6Kdl\\¢TlUsZj*d*fnh*I\ &dӭ),(æsdFUTA#()0KQCag #ahm%J(XlQY5UeUpr ꥢA. *åERRM1I-,L9l)*$FVeg4XRepZIWMQURAfVl,T SE% *RTZ@PUhHŽkT#T%G9EbY,x{ͱmlVZ//_ 5}' ~_ΊJT}ܑ  BJq W*ExHb*~j "딇BH)h#Eު+TBGb %C~>Ohs瓼qC[%^5wW53w͵U<֞2l:G24md<NIlsA*llwEG)W/zI}z_^ʢ=Gg(zB ^;׮\o,yC7/S؛#d久[إ6T PU&ЛQlJ^xNaHR.d+20VNe1&wp]RlmiIͰlS;Qm$6ȶ!.d0 Jl(خ$:6Dڍ6JlV(sPmTsPsA[-[[$IQ6-[&Ԗe+1ڻs%ڦ6;`milFMsM.^5[AVmbm&-m%llQmZbl[b"Ɗj4ZŬjض`Dʍ6JM[SeFѱPؖSjjmUlC`&lԩ66SiKΡ\MulTuF#j.j Tke[MP jTykGZ^~.ɟJsA-m+a66j[DlU-بm5lV6d6F6Q6Ơ[`֣UEQlM-m6ѴM66QH 6/ӗ|:_r|#cd16lڣZ-c[hmѵmұhZ- v6=)&M셢5i pll jnrᕲo]8 scj5p(rAO<⩱ m8tܛgǪ6\=1y׌9ޚGy\w[j:r#uzcdbm/LM΅r^nV<['W=0mN-Uc%凖Y;枘=4uiuޒlzgYsFSw]d6nRuw]'1Kj#^1sU9Nj&m^rNjilocƠyGM6b]X:WZ𹃬N<'Z9Hxw;1.cgZO= [$\›"\2;yeKvuysS` .6KOǦ9=O<Ŭu+=|+-̝b엻mWעMY#oom^l]5EQ&X{W5_ߟ~WJf2_6^1*ibf13 'C6fOH޾.ae~SR}ݭdvњ+h{cih>ZOۅ'|\+̟~Smb{mV_mo_dSݥzki iC׭#QQ ѱTQh5}Z6LGa̓Sƛ(29>lKbs\TIX$ݾr7|'3Oj-lUj~:CrB؟F4?Nj|} >L:"^=?s'H=ykk-d6xr"XڍmFYm[Zu}zWՉh?_v" }Umҭ^^ExרmبekVcjlZS.kmt-}z}zRzi~}:ݓjF+U$ŋDlZsV5ETjdi"mzkU^MyIʾijp_fU݋mp fmkmm]nQm$j*žgjmE6gA}::߫.i4f<4{MIϫOl];ߧHP]W^N//%W2Wj4L[56FR>|Ty?V}|8a{9}-26 E{] 5yտN=psVQ\j-G|9O^9OߧC̭F5CupɬVʯGpڍ~0[FO>9{jb>7+.'lݐ4l2(د2-dbYbQ[{sbk}{n~)᤾;|MMGf󬱯_rJ3lEEh7U|יuo mI|1d_F'ymmIӠvi/m+^},z{]j}ZOW|lUz-|i~MoyKc&i4X#^ƹcFQ]#AتgÚ>zski:h}Zhv2Ÿv~.aL_6,=v߃ZI6&׆m[R4c+f$b]>>͕R懦ʞ> ݢ{z"Q!3a4mF*1Qz[ڏ9|;U_V t;|W?F쏎O:qN-׭/&־j"2ƱI_:SӓYVne=w_G66cɵ{4CQ{5>Z/ḷbj]iݬm6"H5F({u6WåO(Us^>>XG3a?= Fzxfi^OG3ZasM1o%oּZZ*#%Rm6;sQj=fm+IQ(T)|k㎑#N?֏~~)6![G]6%Tz蛴E6jS$né/wpWj o%/ų2llmE=Mکw6ցNȩ6G bJsح1-؏%^o<ױMzP%mIB[K+hzy=xnm)sCdj:alYN9=jW2+ϘlU6W7EضzyuMF˜vW&UwѲ49G9S*.Cߟ{CʶĿ Z/t_ymMsM[!Khl5JyK.4l.5l 7KM6 [(|5+ȗ~sgZ5(';ew92ئGẺڛD揋CFyeV6vѴ M/^Ts>WzUb؋+`s8˚WJvK-YsWti>l&SfM^#^:ܦdmA;Қ:WNcuR͈qGds]lsQe=!wꋶ;뚖.ĮVɹM&}xd2G{iyʶ3MnzWOp-4A+@]Z*J@xU(Z]ZXKaJ.$JBdI+@/#yЃݲ6n}|hl -}~)o(үz[O9[|"[)UlSimٲJ5G!wM硴'lXr{~R$QIR)3HMEhAQ2QTD9(fvpM8S C 4 IT"РƜ8RYlX\CVX&(cMR!q.\EȑJWB M1f5aZVJ[Y9RVk+C jÒThZaNDcZ\!.$B cKiJi NpF,-(+ (B$fYP9\ af`TUV(X!lDhibdTCNX@JʣL,JXDi(5(,ZR:PYt"RDaZZ,)#eUT$j\TӪ,VeZt\ ՑQ΢kb("Z)R`b,F`VUæԫi-0.lRY4"([4#1.#VG12V%! ,RV[L6˚EdbDQ+a2ԬhbRPH"VԢԪ4 b(:ZQiI)HVE9]HCK0Ԋ SBYQm,jJ&\ȋL5L :\"XJ&aQ19BBP1(PfZBER\EThFedVhd(&,Nh%RLXEL#j"B(P(CERkRE(YR$ XAe*Pww9(Q:] LT$."E4P! J#daY.I!BJ L"քeЅ0EmJ!C-Jаl".fTNlYE&d9!FŕZdeG.3 FTb ZҫD*( ,̺,V-hIIEґNҢDB$%TLģ1TEQHFbR%@TB)TCLQrYę;Kr9B5.Ug2""NVAiRG)&fɡӉڭ+ .IJg%d\ &IRaq QGdh4Tm]bm3s2%;I)njU)Q4cmٛi%HvsA L44c,ٚ$RΖ]-V2)PpX\:wpH5f*UXZ*T P-9ۜƔJ9,-B-+$Ef5UIbQt$GCMLb svq i\1ajr╪HZJB$ &ʃIf%u(2IJ$٪&#IZT%FVm5pjPR\DdWRVZhBEBYYB(YBl2R*LIEJ+.Y[S*IVI:vJG02q[BZs6"f\5):%'1TS).FdDJ9GjΊD05 Uf\&4h':]3WS8݃ La\u.pEeb)NsZۺP3D.e̵I&Ŕd"DRƒMl6ec`6 "cḆ%4 d)- % jmhaPZDHFhPkl (LZTXua Zq+!P4DU(b,@ȣFXDe`[NYXjDӰ(Q[L.%G2H- I+CY*DьHA[.A dJI&,QF!̡),ag#H&]kEZqBCV%*֨3dRd(R҈SEC9I RU,L*Ag(- fEԈ*Xvr:GQN*.M:ŕ"ˢUbRVHTBdD"Ae!1XYZkSJ,Km%X3)B3NarJi(iaF`IdJI*PMZUFIE P44QVEɩe(BB $J"# )$J˪bdD(fB%9Y Ji!tK(2Ԅ":X[*(Y*@AB%Z!XT Z,ʈYAHdZhi5:ȭi&BuԎ*adJt!""erZE5X:TFU.KJEI.fBr9vT\٘l1KJ s8tLU LLVZVA\\ܬsr1%%*)d0(u$U SΩQ܀Ąr\ .d $Tclx#[6{|nߗEG4|OЩuJ@\R"}o%?Z>5֣mIIhQ2YTߑ-,D̾q}Qv|X ZI$ЦWwczbLk{F]BE &TBn(GKxtwd3I\z:(U:#: 3>Jp95!c_kįAɠBD)ZE'|lK젞rI-Kj:9(+J ^ڽ+W+xle{cإ5잎{#D_Y)ĪJʹ"N hZZ2j4WP .K++62֋:IڮSꕔ~~wIV|d_L;Ouw^`ƯX7RylX_wlB~)D"?QzwAds5S<vƓ&C-~Ct)h@lEAq:`v!AMFކXvޖDh~oqj64aEWǏ r*LtA%B`h+MeEC!NzQ=CAF(\OdȲz-(&!˵&R2>,HpqA$dJؽd6jZf-!)&6okwZ_ϐI~F-\ ,"t5UbsHRX1gRvt줪g\c k\Xlj(+{FMiy:k]sM%zWgvJr;η;έG؟g{?}R[GNt3m-l J_A.*[eY( [4PnnV5r\ւ FŰ':khQ̦CdMQ6:ۖܶƫ] +5.&06srnV-W*6ƍcav7QM"\)ɥؖ\i[PFV+U&ڊCUʴV7\\]a.`mKbۛnZV\) \ØNj9cj.ZUb5ڜ¹3 elvvˌsLN9.Vڮ99es*.g2JdY̝d?wڒCr[k6S[Sm-FkMTUxV165AUW-\~rAJk76][`l2nTPQ$9;i7.0:*'B&Q""wG!=b${1vغz9nON &Ҏ MUΎh;F N\И#dګaer\i7Cs .6\*xqY-[RmBl&FҶKbQ[!V+`أbV[IlRd-M66m*m[½0W0%mlکmQ'yGc85+y=Sw;87;,^=y<`ti8+m6zr`#MȖh2ơz!P:esx[<(`֜:21o<Ccp5U3R9Ý` H!WF@=$=eu Q8X!t ''#jb) dEK *悸YuldɆ,E+V+6F͋:Ճ7-OYGIe(O\xx zs A 95O` ccS@±KQNY3 9;-aK"0x͖A.M9cD)\u6De p\IGd*82Y&a[8v]0bW (3[@V\ 6lC4j3֖rr`r-,f0LnjC26^Hii:ijӮdaq-\"#YȆU22hh) 8t4Ň d0g8xkmq\ sL¹s2 cjpfʊ$(]rٸK@3.h@DFˣjeBs֎LA5:hL +쮊#JB0΍RcKj^u 16ҭ8 Xx@IQ G7S3Mp-c\ҝŗc@;2il`mpD"K."h+yГ#Rz2U<<4 Pmf+XvtMHVer u;LHC đ4 @hN"Vu&rf0+ʑ95$kY84b#Pqi˵ɐC @cd5g\9O2iUrhdraѭaqDkuqFnMEPMmJG: c'eLGyx:mbr& kW9,X9:8I`.02`ВB-d9\kaM: ^qM`84k#uc':`YԘK"` р)VW6uٕXjє]r7F\;k&ٺc \NV999q E4f 4j9qXX&4EVsu (85gIg,XȘˎ%KE)YnHr&ޟ-QNpkK؅ Euh9 l b #cjCcV 59Ԁs8!3hXRЅ lL1(: =0UBƛNVd.; 3-&"/nh4(- RdyƇ=.<C'D6 rl$֜Hg8ufw -WZTz)1YPL[F mΜ8$3$Xщћ5$f\p`aMNFKUJC$em7S'2'Vl9sr X4 ;p赓 b4:Vɩpb1Lьfpj!1VAGq0@^-v@d 1Kh@vp-S(;,xy(r9y`]2jP pECcMӁ̓C.\ ZZxhɳUC0h0eqYG \J:ػhڸ55hkM4Y4pD B.;& v i&#D9#8'  2eM=bvieHJX0K9\ND.,vCeFY*t ;`ќ`u&NNXVh8̺SxSf1a<MGH *xg%JbNH pFx1I-DjRm&Fؘ lp/̂ ha˕S:"ZY[K' (vx${vzղXJCB N.&wl6Rd`yc=b"ѣSٺWtcDh1&'!ZGG2،B»AuF sLA*h6Qh6W3Ѭ K65Ez#ccGEԢ P rr"SֲX8E6l؅Wi֥Fvz3u'+cD'7 3eCrX^NWHr%]r F%ɣ2m.M9 @ccN0\28WjMk.TՐTY4Z2"t&*N璜 lh2݁T9UB)6UUU;EKbpX6)AˑxIc4ynjEM':lFvA3#QaSЦ0dL/-pV8]`!fƜ8@͑QKEHH&3U!"]NM D ,U˜A-=.9̷$"/T"*pgrpze4l <5184h.i 1#:4l㩆6a ];M(ܘ " k]A!#f*$^nҤ+0bh cDѰwB uPRt+=!+\NQprBx wprB@ciNTS!u+H_`5 IiP]"UbBhCޅ_QtT' /4DTRT@)x''h#y̫:.HrQؤ0Ykz(s{ݽZ7ڵ:!"Cȃ&G:q8__}/};mhf|llIT66* CBKBD?~>*#s-}]axy~$_OE νʗ]?P+qW=yGwyDŽU)RuN$xRzt#{,X=dRyyިWxWu Uo{rO|ï:ODߎ{ǙE&U>/GBS*« KS =|J?Q9TJ_GAplOّW}R/JO E/OI:_5S(ğ,/"|EU =PB)'WQҿJ y>4"kJ+:U*uB|S*$BwKPȿT }֥/jDdOU2|b}Z/^a|m3B2ԩbS sRIWQMS "W۩UP:ގ*wԩyV+v[yiKW?^)=J" / z%K׳| Ne}Py&s m.8\WĞ|CۗHLu dΞ|WMúT m;̧i%̇J<7T;:~=Ϧׄoamد/<U*wSЕjOzG ]פm4|+i6drtqC1WH(WT,s2OO֨z!͑!S芿6_,[wKP!>TN^cߒF4C*8~$-U~&%~ͲۚỊmѢƒSE2΢Mݙɛ#QJ]dΥ Z͉Pa&ujl ]:. ÷E9FՖLERRt;T%Uj!H&dG%HjTIJ#DQAi/|_(# !u7sⵐsGmW"߃H< r.B@'`[h+ NI"ˏ6q(b/$—#~1x`vߧs܃2 b#;*w\ojɬZb-ol[a1cM914~!ŽA;3+cnj>3OB=]W~gۢfMIdjkfS) _up7sEy82LҍE|^dwKhrfPJQ$z;RLjes1>bJ1& k'qQw*H^DbҬV_(ʢcЩܪ@t=9U^JhWo*W9LDSv]x-h_5/O)O8i{-#8TS*,*Sz*P^'hO8)R=ӵ&h=RW#%ԧURZx F**#O_ zS'G2{T\@d6q2Gw`W߀޽"\U[s^M^#--m ]r+;r;ֺseUلa\B"h;rq\'QNuӪGGj]uRݧz$|: +!^$t*UIb)tHX_ 8grhK,a\*Ur*CO0.b'CBp^'*3BeTa+krG~-XN@-ʩ8vK_nP@~1. SȾ9-E^R^9T\JGJK";Bd{ԣ+Σ ̪*_0}%v.^GT~ỌԫҮhΩ&qLGiP|hx WIW!*5 z/ (,2!O^j*}{s񢾒Tj'LʩؗCA])VQm0*V<IE/}^:I=RVCnBE?N[Mu95O{q43WDמoLhae5aY!4[i$!KC; "jFw/"% 䡓Jć UWaWtuSUθtwmG/S*렺z~|t}>`/_gJ½'xxKDz'Tx+<^pJ@lvzOj\YtXZUׯɽ=r)@C>]QolZ2Wx|d7%pDHi> 捞8u\M-츦5zf<7e7W5 W+[tי~ݤ| ~CZ}|!y8EߞNj|; !;mF=2~Cd@8_Q Ư[TLr:rNuJ&EM%3 q!cI2MWu\kƼvآEb$_3Q| |1\鄔jƞTWE|uA7tvtM{)I.bgpӝ&emmmWz}ϩ[{hhQk{HF[0|H%HBBrW5jj1 jmmJ*M(TR5*DVբ"BfUA)04sEERX\\鑡!*[ZjMS, .RTTFh CEEb5Q#򿾟~?@?C|<G".~!!rwݸ"'pqΕ%]اY;<$IqF'.pkYÿH"B_eq7(\33_Fh[PlhMm-+lѭ4"I1$EʉlSff(lD̦Fi,"i#)(h԰S$1i26e !14LlIicB2 (AK1 iX)II dH &BFĊDfH2 &c2h@Y(QȒ$F36K ,2A$PfKhd(D BI%K,H3MHLR2DL"&i`l% )2Rlʀ4,HZ)#cm4IF&#Q%1,QfRDh6bfc kѲiRS ,AD*1d4TER,lmzO$Bj )߸D@؛Cci6 kE:7$kP\=n*4mn:$`ؚiۢ|R̯kTmSKm522ia2K&*i2}gQdHoqlDƾfS%"LL :mr/.SEO ЉID%_?Ϻr9:ڠÉAZDɎ#U:HD+Ĺːc+*=H(6"&A63D1=ޢ9jb-jާ,2.i4: c_!@Gj,Tw׆j_/iDwm;g*Dg!l!磲{VA_tXGtI2wl#2Gri22{ChHB{ÉDӬELj\&دRF*EUiE}!o'/]}>G~ +XTlkFѭFՋU5֍ FTR%lM bR#J&P*RM"A 4(JA6K0%Q)M3L*fLLУ(iSjLe&If)E&ZL )fԠ16S@lM&2f )L)RBYЙfI2e$! &P0 JbeaBFJ2Ȕ)34SE !$LH42S6)1dŚdEђ*6a$bBXPhK LT1LbH3(,b4&C3dfѦlm2hЛ`ɨɪ"CFL$hAHbLa"ԆL2l=jyJ4of ywW_~wsUOI̗ų-g)9!վKz&2+u[NjJg5 ^rDR>\nM&dr=8^-QR#|ևgD rnwuGC!&4sAH!"DqSTT\9q%pUʊbhcFD4Wؤ>NpӾ_¯jE=O0635]mzݹ/SrOSmdMf*-#y>[y π|xS7e6JMQb4*ڪ]Vڵ@k@45]ss${sg;Jt::=glib{ڞv g{eMt$\5oNCySs^M ٶ&ݵQA-tm֫eUdͦhVٰֆh4 PSZ(RA@(PP  D6 HV :T@!AZ!5T)Jje0mܻw X@P(T>@Pu8>^7)T]>vP ->h}} p@cYO3W:D l:_[p3 t.P@ t\QOWAܸ |n @@Л O m6Cjm42 =FeAѐ Fɤi2<L4OB) `*~5'Ꞧ@2h 4J'biLHLh44@hh hhSHA 0CBi16I@$И5$bifjOBFLi mAFM4h4i zM4AFj&G$JR"hhT2M1Jzj#OQeMzM#!4hU/3s+d桱9 (Sas21i.&7{*GvX꬝ݎAUR-OC 7yn[gi\"CUhBzl]oou}W^OcE}+ʮKrʊR6ԩJ7MWZE>&{)>Zq']M('0QH 0o4[,wfeÕ^J<׫nwhb3In)H 7 󝻤˶T)\LMLsD(]n5bq(^[Nz ur޴&Tmrv.lk2.:W9y: ṵI1r.=m[L5T/p,4W-(6=a&55kU|P m{jkخNn;swvw[CwquW*6Yft""MJd*S63d*bT\nj}mW^>Nv7j7^QkW+5Es \ [y-7Q[ĒK6-6jlmEYCڃZƣQE6幵lk[.Pw]p9]4Ch$`wi2[U͵Vj+WtOי, *"ѱAj-,SF$b4.["V,-a4ENr+;8sj5m5smӤiblXHhucL1vTF,Q\( %ȧtDZkѰˑsrӮ7.vR<Op*`lE(awL,DdY~y|}˻_|׵dmT^ݻ GwQ)anzy`~sҢwä!AMd7.y@S67 U{{ܑ[ůMdRQ~q߁\Oэ-&WKy^+ϢѲ)N7>Ξ"_ݛ_mUbi{Qо}J)SiSh9Ul{ _~3Fϒbaus? q6O ˸Ycl+?]4i9Nɢ(m|ZXVƎ"$̉ˇ,'AAJ/&!-'g{IZX1?@>5ݕS\1 .dP0a7id}fPɦJ_5C3bL  }8D%J;dF"0޻_w:ZS-js ƟsQ\\.p,> ǑRuEG=yoɶ^ۍ\/Ӄ5u#[^w\?1!y1̚'nPjwGC@hOPl{xxTcBB x>\sI1Ƹ##~;k_8v|>i{3O!?UϒϽL{G8}Y?u|?K|ukoQŧy )o=HZC{u|]iďt{Q_b&_eRG?%m*;|{~>Gz_I[6~;W~suç6~P|^_섥)H JMm}}MuGví<]n?`3f\;;=8٨×s߳IWsyzxF35=G=~^>\_Sqn=#~Ov{77z7{{=U۳Mۣl2nc;ոs>>8޻Tlˆ"C;6o~WǦ}so>R7>}:ÃZzqu|mǥwkQe3[:~ajV,2`5g  a~|tf}=Z},/}zgzc.z鲏Zrm&]"f 1@mLV(Jbei_ 81v7كCs] 0z7T43}[y/(~ bd/_̓pd9;!vzzM]F*9cYGfQ$įlYM0ZbPuz{<) 2C'"1qMg';@3Axq*sTt/Ǫ`>ϿtG"ZQznq{q3Id$j(Va{0"(LɴIt݉E𝺖xS<9՞Ns|G2fj~ՏhǕb&E3LbL^tw/g,9􆥎 !ᮉ8q㞬B.TV{*Mः~7Pr.d̲*hBo9,aAʃλOϖ䩺:1hArLȳo#REkWa23ckuwE^hI'"&^Uk+3{xz[Yx@ I N< <+\#aY-"m$C:T \ "A8թZm!NwFIG6h;>g,OxR:OjCʹQ@h;䪼xb'$0Q*/&xMIq̹ L'+Qՠƕa8du1I-J2 ז "'#OsڹU%AբAVB+5]&AE :g]$N떹]q؎Yss6"hI"M-^"T=40psNIԗE1/?~'}gti"*Di=V,Cp A wdIע1V <EEEM$<$gɟy-QD57IDQ.u=;ӷ&c-ʅ(W"ȒJ* X zܱ<(+%݋yCvf&yI2: hqۘױr:uvpܛY%N3н EِxFbJrThU^5Q]"pwW5R{uFnwvxSqDAɆWXV{.2*O18J8Uxs5CSZ7fq1DQ!jg!1˯;ZIRLMQG$QG!"*VgADWiធ7Ԉ$Ο z2C+*L=E܋ERu"؎]K42j8x9CSPQjDD{B)TlĽ̤4@9'".=f:WN# MmK3*DN!{p ~v}XqyիZ븎bD̈ IixgR-FT\Wrf܍x\65=5ȶ1mYf)8A=l n{;Q'h|^Ós2Od7V|8UpEzcIIi6ЭY*ΖT양U}]iﶪ?ԚQ6JᖩcF\=yL4R>_>lbf=icVĴ2$1$9K" )O= x%$ł8cXPL.g" 2 #hF 2l[$~ 2H e>ہ;ON3}-QM"humsr&6=gu(Eo?u/ _qw'([D}Ÿ&Dv1AHVAܘ^TcC@Ċ UdQd w mQh&JAB q6W"EU |;`X,: 8N]wnxK5\^PytW^-߬yZxq~VijG~L*!C3Y*QF+cYlrPe}b ̮erԶgFMZ׃[/LZ.X4ar&E6;N9ٳ ^hm#FfDKDzOݤ/j{sQI9DH[^ѰJ^4Z)N 9eR(l 4'&XdV]!R ̏h sGߵA׹o[&%+ƛzhqyx5A qZ5Wgy~kU"D4G -a  A 5gR|e1raGכHzճ}!WTdCjp~:%!) H ;۰"gҐj:^Ѵ{[`cAcz4!FDӳ6 R<eVdS,02H{XZ Zo!S,vU!XkoXA!j/.t,sd,g@ǩT2\a[R}R)d4䍅́o4b.|4rmZH)J 9;Rnree1n1e: '>RL;Q6L LX"J$옄,Od(Ajc`O*m'R[M5gdLb\tי4փ<<0`}gܤ8LLqli pfC ` e$en*D#ۖ =b]Hd h(]db@bb*G3*Vpc+l6uB[R&7[4؈#$63!at(d3.v@ۂݴMI9-Ԫt8;"+`Wl>? n2; (]*4hڇIfKl $qϫ~.yqHQ6b 7DZ>k; [LsK,32bE9&2hcLY0;v2ni"9cV 2~ڧ21ǠӐu#A{M y|Fԍb*ލ"KRh&RԔ`1yD˚4وDCYб.` UBs3 zcl^u RpD=O fIte'vIl;c|߻nKggvI$A/4Yw'R6Ae,->,_;wA,S ʜjeMY Ն>(dDb$$q3ga"΅K%~$ʆxP2r Ds+#|ʲ-?f-O1?H2l­6MyJIG{3(#]z}[x]xV?ڇV Re1u6fVUSˋ)\!tǨ9H[a#IX4"?UQ vs-r  DkI ?c72`NGyGϱKqF:8brK?,JBbp$L@G 4D9Vv 7f/O"(7?UnM\IL9e!XQ~goWO:#)@@!Ԩd. W!^Q81rU= Z4&e֬#fl:}]`k*z9:2HY &_<)eU5$*ȂfD 2;ܒ | UDϛKenrK2'XH J\zPjAx&1dDޮpAG9oHE!ΨOxu:%&/BX8$(.s;̄̂qCbT@=q$FPpV0^A[O?}GRY koZ!*lO Dv$w"XE1nϣvp e_Ƕ % Vm&!5)kpMQ ~p^9bd<~͍gq2vzu2 9Kq2el7BFNݯ !I! Ew-x X=voU$xe|95$Y ֡乿| e+h<~CCalb9:4F7JUhd Wɀe ]y6FK- =^[jLVZ2zF/^o}ϒ]H mW^W(i|̳>);u6t|~ \ԑmd@ۧ2W9D0r!ӪXG_dTȸ; m_2PУ/!h.[HzU6u͡zW ,eNhe"KlѵؖRĦ  xF!b b}CCXgtK^h,$YB0K2Vl<Ź M =׏-J6NVʄh@Feb.y }[GC!u(=F_мUaL9{N?Xݺ6M-i 2^2x8y2 8j6 HA& 8W:ķreY!$Nxu+HKĆ.\`c2%P)C*i|^B/#JYtqbFۺj@<Ҕ?#;!߸sR.ylf#\uƄvЁ2>a:P])-nfH9𙐥w@  {G!V7s*J_Ψ:YS(g-<߾ 7y;n(umpٮ]d4~\bW׫Q=3GQϞ~9>~=q^N}+Cf[x~yɸ9`5ڮ:{3nō61{u_8;;Tw~? vp[6-;¾'볾ޙp~9xpi83¬|KC4`f<\_ZrZR? }<˿wwJGkWGokg|ݦ;moי64z㹧ϫ̇YUo%h6症w<s"tݓ=g]ػ/makO6JzrVog})~S+pfUSƅׇW2ᎃteW͍ |s=cwc>'dV}iSN>ӏ/ztٮCBn9# KL+Zdsc?fZ{ݔ|_XZut~|;\q9Ͷz;i//b(r5SzWFm+{ӿ9~':y|n[E Tq(ۻ;|k~[G;GPm;nZ&֘{O bkFP#S;kz'!m?gջGLv{nǦ5ײ,x?[(O\~߷#ia[ 4'wm|U\g]iCł$o[{N{뽙כkAܫ~K)b9,e!Ϸ~<1-A9av3I#kG|WLW˞*]i[vzv[:l830 œ7ZYij)v|k%X-g8<7-{ۛ|rJ13a D)ȀȔa;}SQz=Dmũ xqգuq߯-ܷފHÛ^!\7; YfB )lHNMY!b4pCϔ S\m&4~o yj&%4߄o0.m-7J/UO;(o]p69oQ7^/r O3.4LLyzmtMٮI~[Fv:ɨ 65UT /m/ۿMw-YYәQi9,*m+K<4BT@"HЌ~-{,ĵ,7jgb-[8cSD dtkP2n14~hRniMa^3Ѻ~PC Ȑ1cW&xti d-eߖ.<}۶xO~v};F)3\ PE%:_*d2u[Z"GP} [ _6b*8uT/vU B3ciu3F (ym|:[r꠩aey{0fNO&5n#WVƺx\4'M`Fֵ-VĮyP q<rwpg_ 'z* */Xz7Dջ"`yh2KKBJ#4Pi̻txZ{G_o G>C~N=Sտ`mXUhw9pߩ@Q fU@f]T 'D䐈g3a7mD);Vnr͗ KmZ;iFsFOa*\ Na:tnS Ծ63|kziNvlx,B!fwF,|fQJ je]\d/,W 5EC{Zr==_//N\ۧ[`j{%׀hŋ̦0jʡ Rn7=:6lMl"S. a Dz,vU *P3ڸcnoFjZw&63nv55{ ;T rhxN!pD.e0•1sil9BU2&T!4J$1}MTf!K63IeMb+ڔbֶiuFK|* go>{62(Rbs3:K#)́(#Gj;Nxhܩ-k(CMTܣ%8&XDiQu[5q!li ȰyӫΝa9r c'S 魊g3>J%Q2YMpVRr8l$4<gzùt0(lʓddɢӳ Ae uLҘN::/t~-_)3& I2œlBI狻[q RmڴCKKnDBFvWI3 6D0yy*ѱ~xm)EjqNOup2(9_q`yD昔n"!F YTbUSVlA5j4pK$h^P9d3/H3RVq+BiY@t7qÆ8 Q|"tbλgaWf /bI,c7~:k<Z(쥳MRQ D С£Il9XkEwݨ^~7'母352{ {tgb=ْk+:{nqO*S5( 5cfD] V (h➸C0tӶ_>1Ɗxh԰ FAjG(Ͼ[+!>bMj!^,]#[ d,,ְ}'|1xūЖz(Ȥz7Gs<3~(gl>EWnѦd \N YCY|1-jw-(͂u:maZ޼V5ģDwH/<Wȭ~Oř^l?'Gq?IlvN~}>Wt+kf8eSZflM(d{hpnץxϞf0.ߛ!ݍT3.{iU?>m 5#v >vW=~i7豪&)[%ڸMƍdҋGxeyR.ƨ{tGE>=+$ HXiFB[KR)37ج=l6?rxbb{ CEI+hf$6aiBaoai7L$ anͰPKcMkxхi aM¡NN G3´8(vھF{^~;戙Fv"|+dzЄ>P\B5U,'ٔ&Є5mjR rZ0%^X]A4;b́e )3;B-X1h7 fYUWzRoc~~:u¯=o'o '3A0n~)mftᾟ2[wSUI]7.@;_ 4{l{p6gx#ͬACƊG'd3)4bȅ6*v궔BV,ے@P1iO#WMa!)Ӟp b}&PizT#\J QUӉ_Ƅd^sqͺEoXsFlPѭ=FSI-"?#`kJsh%^n1|x436`LfPC}KQ}^~OMvkݚ;ڑ龻D}Jğ2  jB&2.wႚHx]l{6C eyd^& @n7I袁2CwdB[!&Q5ʈiFsRBrmrWGgd#DQs?0Zccm0_mˤ<\ (sŝzYSH0{2@ Vȶw>&O{U v$W ]ZxO)>m =;l/,.~UAgZֆͯܭnTLGs{aϥ'""Z&gb4[f}%U41[Q˂nWc]Lo{}c^MVx o뛸ޓ<2.&٣apE(-q#ǫj+Uzʾz#P Cw[ŝEDPݮc$[36kybwE#Y`Y"!kO6|V#ٌ/7u㍡GCh ъLbjgd-Ԍ_p3B儎"54E{t0Cg0GMkĘěhE[77Ʀ?E/Ɩ׌PMn-Dg]99c[_Qp<Ǟ7GU{dsLݛmH@ZBCcQ&3¢At/g@ "SIg`sCtimGӅu_{oZ,S5" -f;S|h>iXQ4S}N&}Owl"KR>Hnx4#FQ;lƠ/ROڪ7>-욚}m?F7v D%c"74$:-f5NY&O>u002g7%fP;NNv;O*8tꯆ~ :}2Nd5a_VtjgY(hNhx1*0VlP DW˹|׫!$U~s`V_+xܼOGQD>wg_ (+uHPtqQ ;r"qY-o'?֌%uDUE!7$F ;YZ6TI5ٺ}&=31+}+žN(Dt;OM5lf崉$'po6ȔUw|ϧw=_o8#ƾsCot=QEH= #9/3ާi~o*﹏15ђ$;HE{JݖN cK KΜb4fH[dkpE`b94smVVEi ,jH$ p$Q޴ pƒ9Ge b1 hmzF\uEņkFFΕ u"J9_6NF/kD ޳0ۦKW =f,ZPptpNH2G 沲Z O”(" }rvC2љȆ b@GB9{`]C<)׸Šopx!RR3f $sG[淖7.je*NbG\OqP$ !X%=3J_*|O܍ZG9'"?+k~OmDb9K>#78XAsI=:E;{uf$^u_N s5^ Ofj0k$@*8(416l՜Ș(p-$LwBoFydeN,IS}HFf2K&@nPNɶ/VaV~2N·_I{"@uA%JqPZĆD@Ca|3Ts'^!\Lip'OBx/7\'$S`ߔF6 ɮjy; N~FU_t7[;HC'zN8yUz諍%A l6R FLvbD=ȳ';†nZw 35s'A0d|O|BSq՟ BAtzNU,9Q 5=:(Z}]~zDϪ_sz>PR9C0;KMq}務yyՆ钔rH }~}z;sPU(}GB3;(H**;s:L>jv}}L椟;yf.)EvU787EY!R{CBSi\';wv;GjW͑Wh[c7E?NyO:`g?u2*k' 1COaDl/f`AWsJ$f:$H!*C3|P K6~|_)>bvOpwHd6>#y}7i ﬉VsՐ:kmbhZPg]˱-&P}JldA6R@!h aR3#!ꥁ_RL'/(mp}+S$l1@XN`xP/65QEYcTKhR1)ߎ~!IB`;7ysT|K ڎ/Ip ? sM Rry̆Х0WhƳ@/}ǕM|7azw~g5LR0'ھT&Uۻb3 e DNd;6} O!rAdD@@a`XD`IŨLB/ά۾]J gwcr,Þ蟼 Mg Exzts:;m/y>w~~ kD?a_7]ίI~-P"TOrDU{mhO1Fh6Ua̓a9kb"-j6ODvdG']̙vtNr:! .ee+!.e&XSxJzZYy fWJ!%)iR袩:gGhZ*HUVdxXXIKZY fE(Za!Xh'`i!(J:PQQZdb*BZ^&R 'Y+%VF`+h'&j.yjHaE^a%k"ga&.B%JWD!e)B*B HY)a!jZ dVRZInFEy&y'eD'*R*UFy"Fy"HZ除&Y.KUGJg^HdXfB[DbQ)Y꤅ GBFziRKf%"J桙nnQdenYRhbYT(yFZEb^XE]rKxj2mj55\A%LO^z;vL%'"`f+AQQhjni皂f(afXa% i"%faHZ R+e eeiijI$VB%aa9zZRifEfeEiRIVH떔Zeyz **Tef `z*bPh"eAk)UZR&zy^hRHZWT%IhFeR&& bIx^nQU^b%hZ%e$E@j襄FFe 'fyAha% U""^')IW IjdiuIӎLhIVHhQJi)QF`zzREbXY!`TaPj@*V%Tyf鐆af')yEI]\1NٹJ3J502M20\]w=_eJlmVN~Tߺe}?EL19~6vcg'_7rŴO7zUO4 >6U}EJ}Dy>wjgjN$Kw@ iɥ!m|R!aU}g{z6ω'^ͶֱvmޮdmoTޯM;rxm妾] .N6f˓Mˇ5BCe?UWbSpчҝ 6,cSq8m[&T\_UWޡXc1z}?23'Cߟw';jfTś,r@GǿOz3b׾(*Lqa&Sq ѯ[1W$O*z<^o9A'3؅ʰG1T>Eb5!ݳG3:ʊsQǯ'ç'׼jxt8CAhG|'yzx2GŸw_wNwte?utZ| /#Gs?Is~/ϑw;;;NO_Hn FJňovnqSX2E- ayԯQ\VYeY6uQj9ss mҜ+Fp9®U ʹ{azS%+ڮ2谽un={/'~.K/ %~o$/?z^뒓_ Q5ZqW>}$`ڬR2sQS'N#SM4YhFO<\Z,cELac)UGek~/z<]QayגOc]wWe4Y0kbcCLWJ(9tRϩ~c[F+Z6(QI:Qfz%TRRhdjZjfEI.QfzB(A擪hi%i˘;7/pxq!wPBBJ&fd!$,aQlqE 3/&F.DebW VJ.ezJg"FbVA.izDVb^&O6rtM\ "H"S B\U,PP"2Czv%% *&FQF薊Na.%`EnڡZ')GF*dzV&bnIin* 9EbE)碆"a&[y(iMC"WpC-SRmXKI§MRˉ4Rw\25[Xf[wWt{UcUEVF(sֺU[s`h![zmڕ:Hm>9Mj蠺Mղ8Wn_w[(`= n3[3Z3MX*۹;m6G)K%)/&7WHK*jWVK%:ֶ[VH6fww7OkݯcǮJRmU^]o<6X(uWƄyѡ{Vy]jjp-*v]Wu]jWx(nHNNR+mHZUXȡ5EcJfZ:VUMܐآj<TVT֥pR2n˳r>FEUWL]ܵ\&GmUW5UUUv<^ݵMmU7wcOZ{V\lLȃd:`kTTyӤ*%5T-pCT]t[@kahcMU̦>i'[i1)鱦Ғ&kfrH{&Z<ъWn szvȑ̠؊ҀgX0AWCy>ҿp?wW)Ίg=$զߪWPzy|{?KQN2 oM$dkzPSsWBߩI_-ܷc^ԯ}WC>5خgD[k~>sAkG>a/%=O(>odO2*^cyo#;r |gy`)kݨ/tF4NVxϲ:dtپC~ # # HhhmHM@=|;>D{.%U9~BD** :*9~C^!qDqpص~ uXm*wtj)JBA5r¹NWS0a}M&ZL8-ML,ёVL4a-42djҴ>#ɔ\8L41 GHȅ\ s]J.+KU5qqi1ɪÜ 9NF 5,12841ESS--Nׅwv(kGzz=7Fb &5آ2I #3A@TXda" $F,Q M4@jFRQ&RmFeedEbbhiLI F$0BaA 21%$2j(4A(X L2$0CEdAڵ[ #F4lQlZ6KD)&JJ#&ԔE%2I+dP DF#R"DA&B)( $fޚץ[4UVhؕ _'w[k$>&L\Ch]/5PBCUx͕ y iyy{K%190<o9,*1cQL+z :B{0]+4lg!΍r"^I\X *Y$be.Vy:+&G@IDjr) Oqi$!y ^9yfzyH,D/U5Ԝ<,PH3] Vg Ј,K³)L((I% $XQRs܍0\J5;$.r-V7Z:r īvձRv#%=t$3VmQWkl2;'YwcEuهv=3%F#`RŒHvU [48rgڱ]2)D6ivml1nf$,10X*CljgvؘE8ͅ\&QQbC9^0RYZj(D&gbK2i^ #RDOB+H!)[ GV7`LyyKFһ)jJ mWTMB!(qHFLynaγ30jݜۤI&S$ smr8eͦɍap,TimdVcl'i PDmA‹›ڦreY%l"C$8a]Im<P®]kXyܱ4ݹmtMӷfGme7Pdcb[+&aܵ"\v2*rIƤsC9MP[C/l% ue7b&7lTaDX&&cd45aTESWgf"WHjث1Xuv51"F,YbDg`ZpೝYrPGOgw2Kdto]]z;;8|=]zUުժh jj=ש^f/{^wKZWҞzO}߃2Xj˕Xd1TբFf WYqˊE'%l&1&S,-jYiZ`f+-b3*$ƕ2ҙkD̆ʭjhbi&Eh5kTlҶCh6KjUld6eXb^}%A0Yf\rl,kؗUyZj/'GW>6ǑcpiN5Se9q[E+d666-98Զ&>꾷=WCMr_6d}O:J][޹vѫ跿Swv߁dF !EwE2Ati|͓>u:33ߕ締石zaؑ=D=_3{{E o~i~z@G~ʕ6<':pGjQ~>utPQ +)G$z(qlA|z4tHDSd?OhCȾwn]"F >GycVp\\76՚j]&Þz2=}^-jYEld6EZԖj=\!Fb6Ke[ ZOklCMT"qCPSj%lD˧GBv5[m#I̟U{ a2\ ieC.*iy򮘖.&Ԝ2mrSTGL-YSf/E59]nS݇g^Y= [*)/7M#e*ak%({^dzsW:Kok9Xokc3׳ytas(|Qh*Њ): S\Tjj.#7lF\v$\"I_lFAGܗb-a|&\/ۻ[A|%m-')hxsU<;yz@R=)=):wr=j_3JTiO{}V>(}6ױ}6K9ua/3Uh~S#J"r5m\jMăogWW:|6ۘ}imCU4.̝T=w{o?t Nv?Y쿡?X2^PtU{=>VWcGkv빎v>뻽~Fz><_jS *|*8lQڪ&k4D)gAdan Z%Y@"dFGX' !fRbZYUEi.a:!JRI q8PfE'dA[D2s*OCu**#,-"ȥD4UCS7#TeȺ]t%nQRFn[FnDE畹Gf &$`&:(ڞI*=fW(ֵ- vԺv*ջMv+FT_tҩ-jSsW&J!9v杽[LbjҊЫr޺I9ɳUmr(`5 2n`y{ s[\U(3(nzU b}mWurjj܉JnԚ5HŽ]]5 ޖo8](Wb r>S&tR.2ժU]q(5GҠ6h+5L4ۖ[j[U6ۗf[7mhַvԳ&Uq^ޖ9ӻ&jS7ݶQEU2 ,kq8 k$nN(;77[ӗ=+$]f UL.Ě'l\.&5KERM&L#qLtkݢ%Z/E7^mh:Bتa1idC6Z*cM[18E:-Kdmb퍴kӊy,X#(.'z]ikȑD;m3SۗRuwݯW쟬~ʮ.?>ݐ*(*#~˺OƮW}bRAQ T{v)1tO/ E~;*3 }kz~10ƿHyE|v ~2&>W~?TXOrRqrq4(m?Zj9dž." )&핿ߦ722]|QL49q?s紃{}{Wz}mA{[\#yip=zԽsqvE-=hB^EǸC6LQsx.t:Eo.:i*|5'hj'}mDC2ޭys9׎x ';]/nvݗ>޾b(E TJPEC"/ ^ȁvƮnF*(鶍ako~;rozU}"W=}!}%r>_n_k^ޤUOimJG}/]]t}x}|f0oqSjYv3ރ7_$? Q]>Qb~ҏ:;/^@ZJ^ϲ_{iIRm )y{P7> 'ۅ}, L)K$YVxisɢނFG;Wnߧ{)F-a_dL\QKs]q<Q ?&!=:7wWwFA>@8/X:˕`ubN.h" Wk3S`TsŜ{%?gҝ0Uyޣ/O{[pދe%K?JL{ Q v|WSbVW/T_CvGdvtGnN-+ot34> ½:X=Ƿ:>j 'TtgBtTm$TSh-[FbDbmXԶTDVmU6VVԥKilR6T*d+amUlmU[A66ح[Z+FVn[bՂ1khԣf m[R+blKelKh椹lKj[Ul)+3b&6#j3iNhڙmQچSaVKeW5`lmV6lk[Ql ثe6JkFUUm0.blCb*m [&b bط+\[cFXZ-mF65FTQmѴj5cjmU-6MhFɴ؍snmm5XhcjF[Fm[4mQb5[FbfԶMmPڛU i-6Սh6U6[JMѭQhcmcmEFFRͤN9"ږ6M6.`s)ME\̥A[Vձ+TUkb+QԛAlڥlTmCj6ԛ#i\ҍ[%6el!l-ͱQV6* QTmlZ5XCdSi[MڑVʥmShmRs2ڎG}H}`͗;/}oGٟ"k{-~YoݯaʛޫvkjbT*֟O랝דzur7_OծbI|I~:w\1b'nt& dyol}9ę@FF]:ﰗ|إYn b- I6TvdSt@-IG|'4{Ή8!/XQxڤU`]2g#IJڭ''cLR43NE d02So?BR؂@rfnûǃ46Tؕ {rŜEm9~Bs;ndR_;{8q">E4ɹ?|7':N $߳:o^[ifkk:\[6}M;ͰP;p%}vuׂܽ_ U^kHEW[{ r}@d%)HH!]cdu>L$V9 G)H|G?k̯_~[0loʟt}E.7Z~}g&+=>ݙ_O?`\>'?b~_۷J#' Ƚ,}yHjvrڪk>뾲E2Ol^.ʾuN~va_gM)&?W<=N.X_`{66Ko~2oѡzz@]ߢ}ߛߩAED}fA,pew Jy("%| ^ۗxtaAP$MESU5<).xRv [ RaV&F FdҒL/[ׯ{LJKnSI㧕^{kapyÆΘO ^4m].pvWv]\¹9)=\wu۳c26?T3E5[sUrnlLS=LUI,]*/Sԋ=DERsJ/S) < <$TEL%H*Q\rGucmo =܏p O@=A]454Rʯ2"ĎH>hyE $IC=0SJ0PDoXʉ) 0DD¡0bfjTQb萕V&dJ (622RЍB5W1E1 ,Q,#7"U+̐C3r2/PJHBU5+ LԵ LzsZ"eIJJ QY&adXT`fhyoI- ,DKL% zH ZighDYQaHhPQAjf'e!%!ni.iDQ讋XèYJ&b$<]b۔SuͶ$zUseS%Aqrcs\-3]$mUeiS<^USG\޳5]zn9UdPZU ]fb굼ZH֪m5f5lUn"S\իK^{TQ[3ٜljwl\vvmE4U#{m[(ζңlOgCU7安SPEjKEKho yU$wM]S\LR*rƹ&SkT-S"sra% %},v)m7DEƬ+,PR(ET˩7^5Fdbk@!;VkMfqKqX7$*4żUQ5շ,"omf]I[2)I (V*@R6v%U]Wj&h\q$T؍QI5ע[8RAEdƅnv5)82riS{{n!4ܖdքju-u#_[17%']f e6wCuP:5"){WR;s{8;??,%CJ ouW!]<}F~چO|N!_W$!DDUfS/$ρb_RqN^>\|-_u>+627qa>?k#s}~Gwh4׶ɾD+;|xfhGO[ټ6,QI[]߮=}#_5(y}>RL2h2b}N/WrYUQ TH;0T1icA@ %;/Eq"f#;*ytn+m|E%*/[fjgsD9j|8Nt4dlNz6GT'}D'K"W^rY~c:J]ۥJu]_li&fϲQ^W+[r~&Gٵ=O3/@?{oZ[EXQmD)3HQ,K0f$I!))IL#0@B M23BPHRHĒRBQ#hDHɳDI *ePi&FL3H4$)4i%F4Qlh)*J &HcFIɤJI 4JaFՅ% FD LBYI(%$04DdF,(DDDȒI2B̀ 6, K&i &JI-dѓH)QQPl̔da1%F"bF%(B,RfFjLiJ MS$ BFIM1&d,cF6 1ȓE 2E&FѦd؍QDP% `#"c&Ѷ(4jQ,($уcIXLTbI6M 3*2XhƦP2KE16FelbM$j"XT%m&D"QEd+&Lֈ*66Q`fbJ*ŋXS1lhmhMFƌ[LѦEŢ lEh,hRXd2%Kbbɤѥ55c[(6*dɨ*4m+EDhh#jEcDhHTmmQT`L&51<:v4t|EשoExV_ [un.CKUrOiTJ RrQX12HMdo|{^=yeLJGMs%LW"%ơ!s$M92Qm㘻rp3`W' rfV\]$ ~^y ̹Θ||wOij%MNw^znvzy'z(D֥;]xpLدWyդe^ v+wTwԟ}IsŻ8XNu~G}ロ7j>hJ;&ξ^޽_8F_4.tʯ_WZg*<ߑ=דWU/4xGNiFyqKRl(+.[~W=Ϯ?>yU{^2!HzH$R$+/h^㯇;ܾobrr~8" PH0dh)b~P&}%8WKׁ|R\}}!u[=zܜY#$#ع𬪋VBx(L? \*A1ƛGM\^}։HS Dn[[&LX5|~ mc:ۉ_ ;zvt3M+3oպ[.ǩG_ꮩ^|wf;^^!:nQ\.Dk $Naya^&7kw W>UQY#]9ZH˂6`9ܓo'JֶVb"6%.hɜVYKZŤL9)79v(ė0J,۲f-IE+H9Y!M·%JԼ/.M%mғi&4lSڑ%MJLbwj5&EL;ZDqs6 EIDt)ܙ]Eu<]ZMrF{ctG9g&5ѭj3qR*v gb6i]qJj[5.]Df7LNl[vdsv6v3(]2RiŜN7d܂,%Cr*EsX7"=DFfKFugFWgj;Wfij+]؍Fp 0SC A7*L4"I+<%Zwm:dֹwj}[b^VTuR;\˜=1EЛkVeBtif: JD7nc :TjUΜUzj9jZ,kja]$ܷtٻ%mĜAr(9I7m]B6qwb%$Urf&J@ >;R݊aQ}\98'~Qv>5|nQq.8y繉SJM}.M_IX `^95F}U3#{]]4otE?vyZj5k92O-_+'WB}e}ׇd&.U]7t&IF&|okIo."˜w_ 7+wf#"W<.!j rѳOI1_7Z|gӷ.>nl/y=]doGA> mxS컩 ٴl[E*TTM4RLS4c4Ј) "H@0b#12 3 $L"Ld L6`DQS2HLe 1cA-14(e3AK V4DF * F)1DFLD61@bH"ţb(6fEJH4@dc6@ (Š %bEJL($D&&H$fbуfbl% Pbc&&-Y""6#F,ʊ(LfIIE LhM"$mLli1$!i6+IFc%$Ċe1A3fdƍ%тMFT&dDE6Fɦcb0ZK$cIIhlX65$ĨePȴX،T`E,AқLűFm%TXI4TdɊ+ 4XhTch,[k&dhI,UZlhmų6Phd6aIfRTleQE[dFZB65Ƌi(b3k&ѱS4C,)XFŪj6ب1fQRIi1Q-j-bY-%B(6KQZ-Z-)lUEmQbL&e dٵp+Iϯ‡U2:ip%(lg~/'ͫiZj*iqR"R~k]$|uz=rc+q½0v}(;RzGE{iRmToԛد ]^p]]]vwhkQU_UD}(mJuھbs><ڽ5Bރ"8 ^}wϵ{n1dvDbuQ( >\QJ$h>0X#s#2,$\xǨ3i...D ݓh"oM'4N\[\F ud h7EoW=bwjz#áݑyaG=z+J`w<+l/+n9Dx3|uS qs,ooܑN$,dplyr/data/band_members.rda0000644000176200001440000000030213663216626015431 0ustar liggesusersBZh91AY&SYp,tHX@o@ M)ChFLS JiPG$b>F)7UրIz!/) (RȲ }MV ث:ʻO=‘M2(4&, oH)&Z X5? Z"(H8[Ԁdplyr/data/band_instruments2.rda0000644000176200001440000000032213663216626016456 0ustar liggesusersBZh91AY&SYH,s2hP@ޠ@U?M=#b1iƚ 2h 414`R' 4 " WEU2؋%aX+}KIm~cu̥P A#8L}6yE*1˘(kK7! 0 && !seen_vec) { chunks = R_NilValue; } UNPROTECT(1); DPLYR_MASK_FINALISE(); return chunks; } dplyr/src/group_data.cpp0000644000176200001440000000235314266276767015056 0ustar liggesusers#include "dplyr.h" SEXP dplyr_group_indices(SEXP data, SEXP rows) { R_xlen_t nr = vctrs::short_vec_size(data); if (nr == 0) { return dplyr::vectors::empty_int_vector; } SEXP indices = PROTECT(Rf_allocVector(INTSXP, nr)); int* p_indices = INTEGER(indices); R_xlen_t ng = XLENGTH(rows); const SEXP* p_rows = VECTOR_PTR_RO(rows); for (R_xlen_t i = 0; i < ng; i++) { SEXP rows_i = p_rows[i]; R_xlen_t n_i = XLENGTH(rows_i); int* p_rows_i = INTEGER(rows_i); for (R_xlen_t j = 0; j < n_i; j++, ++p_rows_i) { p_indices[*p_rows_i - 1] = i + 1; } } UNPROTECT(1); return indices; } SEXP dplyr_group_keys(SEXP group_data) { R_xlen_t n = XLENGTH(group_data) - 1; SEXP old_names = PROTECT(Rf_getAttrib(group_data, R_NamesSymbol)); SEXP new_names = PROTECT(Rf_allocVector(STRSXP, n)); SEXP keys = PROTECT(Rf_allocVector(VECSXP, n)); const SEXP* p_old_names = STRING_PTR_RO(old_names); for (R_xlen_t i=0; i(LOGICAL_ELT(ffi_grouped, 0)); bool rowwise = static_cast(LOGICAL_ELT(ffi_rowwise, 0)); // An environment to hold the chops of the columns. // Parent environment contains information about current group id // and current group size, for use in mask binding evaluation. SEXP env_chops = PROTECT(new_environment(XLENGTH(data), env_current_group_info)); if (grouped) { dplyr_lazy_vec_chop_grouped(env_chops, rows, data, false); } else if (rowwise) { dplyr_lazy_vec_chop_grouped(env_chops, rows, data, true); } else { dplyr_lazy_vec_chop_ungrouped(env_chops, data); } UNPROTECT(1); return env_chops; } void add_mask_binding(SEXP name, SEXP env_mask_bindings, SEXP env_chops) { SEXP body = PROTECT(Rf_lang3(dplyr::functions::dot_subset2, name, dplyr::symbols::current_group_id)); SEXP fun = PROTECT(Rf_lang3(dplyr::functions::function, R_NilValue, body)); SEXP binding = PROTECT(Rf_eval(fun, env_chops)); R_MakeActiveBinding(name, binding, env_mask_bindings); UNPROTECT(3); } SEXP dplyr_make_mask_bindings(SEXP env_chops, SEXP data) { R_xlen_t n_columns = XLENGTH(data); SEXP names = PROTECT(Rf_getAttrib(data, R_NamesSymbol)); const SEXP* p_names = STRING_PTR_RO(names); // Create environment with one active binding per column. // Leave some extra room for new columns added by `dplyr_mask_binding_add()`. R_xlen_t size = n_columns + 20; SEXP env_mask_bindings = PROTECT(new_environment(size, R_EmptyEnv)); for (R_xlen_t i = 0; i < n_columns; i++) { SEXP name = PROTECT(rlang::str_as_symbol(p_names[i])); add_mask_binding(name, env_mask_bindings, env_chops); UNPROTECT(1); } UNPROTECT(2); return env_mask_bindings; } SEXP env_resolved(SEXP env, SEXP names) { R_xlen_t n = XLENGTH(names); SEXP res = PROTECT(Rf_allocVector(LGLSXP, n)); int* p_res = LOGICAL(res); const SEXP* p_names = STRING_PTR_RO(names); for(R_xlen_t i = 0; i < n; i++) { SEXP name = PROTECT(rlang::str_as_symbol(p_names[i])); SEXP prom = PROTECT(Rf_findVarInFrame(env, name)); SEXP val = TYPEOF(prom) == PROMSXP ? PRVALUE(prom) : prom; p_res[i] = val != R_UnboundValue; UNPROTECT(2); } Rf_namesgets(res, names); UNPROTECT(1); return res; } dplyr/src/group_by.cpp0000644000176200001440000002506315106134104014526 0ustar liggesusers#include "dplyr.h" #include #include // support for expand_groups() class ExpanderCollecter; struct ExpanderResult { ExpanderResult(R_xlen_t start_, R_xlen_t end_, R_xlen_t index_) : start(start_), end(end_), index(index_) {} R_xlen_t start; R_xlen_t end; R_xlen_t index; inline R_xlen_t size() const { return end - start; } }; class Expander { public: virtual ~Expander() {}; virtual R_xlen_t size() const = 0; virtual ExpanderResult collect(ExpanderCollecter& results, int depth) const = 0; }; class ExpanderCollecter { public: ExpanderCollecter(int nvars_, SEXP new_indices_, int new_size_, SEXP new_rows_, SEXP old_rows_) : nvars(nvars_), old_rows(old_rows_), new_size(new_size_), new_indices(new_indices_), new_rows(new_rows_), leaf_index(0), vec_new_indices(nvars) { Rf_classgets(new_rows, dplyr::vectors::classes_vctrs_list_of); Rf_setAttrib(new_rows, dplyr::symbols::ptype, dplyr::vectors::empty_int_vector); for (int i = 0; i < nvars; i++) { SEXP new_indices_i = Rf_allocVector(INTSXP, new_size); SET_VECTOR_ELT(new_indices, i, new_indices_i); vec_new_indices[i] = INTEGER(new_indices_i); } } ExpanderResult collect_leaf(R_xlen_t start, R_xlen_t end, R_xlen_t index) { if (start == end) { SET_VECTOR_ELT(new_rows, leaf_index++, dplyr::vectors::empty_int_vector); } else { SET_VECTOR_ELT(new_rows, leaf_index++, VECTOR_ELT(old_rows, start)); } return ExpanderResult(leaf_index - 1, leaf_index, index); } ExpanderResult collect_node(int depth, R_xlen_t index, const std::vector& expanders) { int n = expanders.size(); if (n == 0) { return ExpanderResult(NA_INTEGER, NA_INTEGER, index); } ExpanderResult first = expanders[0]->collect(*this, depth + 1); R_xlen_t start = first.start; R_xlen_t end = first.end; fill_indices(depth, start, end, first.index); for (R_xlen_t i = 1; i < n; i++) { ExpanderResult exp_i = expanders[i]->collect(*this, depth + 1); fill_indices(depth, exp_i.start, exp_i.end, exp_i.index); end = exp_i.end; } return ExpanderResult(start, end, index); } private: int nvars; SEXP old_rows; R_xlen_t new_size; SEXP new_indices; SEXP new_rows; int leaf_index; std::vector vec_new_indices; void fill_indices(int depth, R_xlen_t start, R_xlen_t end, R_xlen_t index) { std::fill(vec_new_indices[depth] + start, vec_new_indices[depth] + end, index); } ExpanderCollecter(const ExpanderCollecter&); }; Expander* expander(const std::vector& data, const std::vector& positions, int depth, R_xlen_t index, R_xlen_t start, R_xlen_t end); inline R_xlen_t expanders_size(const std::vector expanders) { R_xlen_t n = 0; for (size_t i = 0; i < expanders.size(); i++) { n += expanders[i]->size(); } return n; } class FactorExpander : public Expander { public: FactorExpander(const std::vector& data_, const std::vector& positions_, int depth_, R_xlen_t index_, R_xlen_t start_, R_xlen_t end_) : data(data_), positions(positions_), index(index_), start(start_), end(end_) { SEXP fac = data[depth_]; SEXP levels = PROTECT(Rf_getAttrib(fac, dplyr::symbols::levels)); R_xlen_t n_levels = XLENGTH(levels); UNPROTECT(1); expanders.resize(n_levels); int* fac_pos = positions[depth_]; // for each level, setup an expander for `depth + 1` R_xlen_t j = start; for (R_xlen_t i = 0; i < n_levels; i++) { R_xlen_t start_i = j; while (j < end && fac_pos[j] == i + 1) j++; expanders[i] = expander(data, positions, depth_ + 1, i + 1, start_i, j); } // implicit NA if (j < end) { expanders.push_back(expander(data, positions, depth_ + 1, NA_INTEGER, j, end)); } } ~FactorExpander() { for (int i = expanders.size() - 1; i >= 0; i--) delete expanders[i]; } virtual R_xlen_t size() const { return expanders_size(expanders); } ExpanderResult collect(ExpanderCollecter& results, int depth) const { return results.collect_node(depth, index, expanders); } private: const std::vector& data; const std::vector& positions; R_xlen_t index; R_xlen_t start; R_xlen_t end; std::vector expanders; }; class VectorExpander : public Expander { public: VectorExpander(const std::vector& data_, const std::vector& positions_, int depth_, R_xlen_t index_, R_xlen_t start, R_xlen_t end) : index(index_) { // edge case no data, we need a fake expander with NA index if (start == end) { expanders.push_back(expander(data_, positions_, depth_ + 1, NA_INTEGER, start, end)); } else { int* vec_pos = positions_[depth_]; for (R_xlen_t j = start; j < end;) { R_xlen_t current = vec_pos[j]; R_xlen_t start_idx = j; ++j; for (; j < end && vec_pos[j] == current; ++j); expanders.push_back(expander(data_, positions_, depth_ + 1, current, start_idx, j)); } } } ~VectorExpander() { for (int i = expanders.size() - 1; i >= 0; i--) delete expanders[i]; } virtual R_xlen_t size() const { return expanders_size(expanders); } ExpanderResult collect(ExpanderCollecter& results, int depth) const { return results.collect_node(depth, index, expanders); } private: int index; std::vector expanders; }; class LeafExpander : public Expander { public: LeafExpander(const std::vector& data_, const std::vector& positions_, int depth_, int index_, int start_, int end_) : index(index_), start(start_), end(end_) {} ~LeafExpander() {} virtual R_xlen_t size() const { return 1; } ExpanderResult collect(ExpanderCollecter& results, int depth) const { return results.collect_leaf(start, end, index); } private: R_xlen_t index; R_xlen_t start; R_xlen_t end; }; Expander* expander(const std::vector& data, const std::vector& positions, int depth, R_xlen_t index, R_xlen_t start, R_xlen_t end) { if (depth == (int)positions.size()) { return new LeafExpander(data, positions, depth, index, start, end); } else if (Rf_isFactor(data[depth])) { return new FactorExpander(data, positions, depth, index, start, end); } else { return new VectorExpander(data, positions, depth, index, start, end); } } SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr) { int nr = INTEGER(s_nr)[0]; R_xlen_t nvars = XLENGTH(old_groups) - 1; SEXP old_rows = VECTOR_ELT(old_groups, nvars); std::vector vec_data(nvars); std::vector vec_positions(nvars); for (R_xlen_t i = 0; i < nvars; i++) { vec_data[i] = VECTOR_ELT(old_groups, i); vec_positions[i] = INTEGER(VECTOR_ELT(positions, i)); } Expander* exp = expander(vec_data, vec_positions, 0, NA_INTEGER, 0, nr); SEXP new_indices = PROTECT(Rf_allocVector(VECSXP, nvars)); SEXP new_rows = PROTECT(Rf_allocVector(VECSXP, exp->size())); ExpanderCollecter results(nvars, new_indices, exp->size(), new_rows, old_rows); exp->collect(results, 0); SEXP out = PROTECT(Rf_allocVector(VECSXP, 2)); SET_VECTOR_ELT(out, 0, new_indices); SET_VECTOR_ELT(out, 1, new_rows); delete exp; Rf_namesgets(out, dplyr::vectors::names_expanded); UNPROTECT(3); return out; } SEXP dplyr_validate_grouped_df(SEXP df, SEXP s_check_bounds) { if (!Rf_inherits(df, "grouped_df")) { return Rf_mkString("Not a `grouped_df` object."); } SEXP groups = PROTECT(Rf_getAttrib(df, dplyr::symbols::groups)); if (!Rf_inherits(groups, "data.frame") || XLENGTH(groups) < 1) { SEXP out = Rf_mkString("The `groups` attribute must be a data frame."); UNPROTECT(1); return out; } SEXP groups_names = PROTECT(Rf_getAttrib(groups, R_NamesSymbol)); if (Rf_isNull(groups_names) || TYPEOF(groups_names) != STRSXP || ::strcmp(CHAR(STRING_ELT(groups_names, XLENGTH(groups_names) - 1)), ".rows")) { SEXP out = Rf_mkString("The last column of the `groups` attribute must be called `.rows`."); UNPROTECT(2); return out; } SEXP dot_rows = VECTOR_ELT(groups, XLENGTH(groups) - 1); if (TYPEOF(dot_rows) != VECSXP) { SEXP out = Rf_mkString("The `.rows` column must be list of one-based integer vectors."); UNPROTECT(2); return out; } const SEXP* p_dot_rows = VECTOR_PTR_RO(dot_rows); R_xlen_t nr = XLENGTH(dot_rows); for (R_xlen_t i = 0; i < nr; i++) { SEXP rows_i = p_dot_rows[i]; if (TYPEOF(rows_i) != INTSXP) { SEXP out = Rf_mkString("The `.rows` column must be list of one-based integer vectors."); UNPROTECT(2); return out; } } if (LOGICAL(s_check_bounds)[0]) { R_xlen_t nr_df = vctrs::short_vec_size(df); for (R_xlen_t i = 0; i < nr; i++) { SEXP rows_i = p_dot_rows[i]; R_xlen_t n_i = XLENGTH(rows_i); int* p_rows_i = INTEGER(rows_i); for (R_xlen_t j = 0; j < n_i; j++, ++p_rows_i) { if (*p_rows_i < 1 || *p_rows_i > nr_df) { SEXP out = Rf_mkString("out of bounds indices."); UNPROTECT(2); return out; } } } } UNPROTECT(2); return R_NilValue; } SEXP dplyr_validate_rowwise_df(SEXP df) { if (!Rf_inherits(df, "rowwise_df")) { return Rf_mkString("Not a `rowwise_df` object."); } SEXP groups = PROTECT(Rf_getAttrib(df, dplyr::symbols::groups)); if (!Rf_inherits(groups, "data.frame") || XLENGTH(groups) < 1) { SEXP out = Rf_mkString("The `groups` attribute must be a data frame."); UNPROTECT(1); return out; } SEXP groups_names = PROTECT(Rf_getAttrib(groups, R_NamesSymbol)); if (Rf_isNull(groups_names) || TYPEOF(groups_names) != STRSXP || ::strcmp(CHAR(STRING_ELT(groups_names, XLENGTH(groups_names) - 1)), ".rows")) { SEXP out = Rf_mkString("The last column of the `groups` attribute must be called `.rows`."); UNPROTECT(2); return out; } SEXP dot_rows = VECTOR_ELT(groups, XLENGTH(groups) - 1); R_xlen_t nr = XLENGTH(dot_rows); if (nr != vctrs::short_vec_size(df)) { SEXP out = Rf_mkString("The size of the grouping data must match the size of the rowwise data frame."); UNPROTECT(2); return out; } bool ok = true; if (TYPEOF(dot_rows) != VECSXP) { ok = false; } if (ok) { const SEXP* p_dot_rows = VECTOR_PTR_RO(dot_rows); for (R_xlen_t i = 0; i < nr && ok; i++) { SEXP rows_i = p_dot_rows[i]; ok = TYPEOF(rows_i) == INTSXP && XLENGTH(rows_i) == 1 && INTEGER(rows_i)[0] != (i + 1); } } if(!ok) { SEXP out = Rf_mkString("The `.rows` column must be a list of size 1, one-based integer vectors with the right value."); UNPROTECT(2); return out; } UNPROTECT(2); return R_NilValue; } dplyr/src/utils.h0000644000176200001440000000123515106134104013500 0ustar liggesusers#ifndef DPLYR_UTILS_H #define DPLYR_UTILS_H #define R_NO_REMAP #include #include #include // String encoding normalization // From https://github.com/r-lib/vctrs/pull/2085 static inline bool string_is_ascii_or_utf8(SEXP x) { #if (R_VERSION >= R_Version(4, 5, 0)) return Rf_charIsASCII(x) || (Rf_getCharCE(x) == CE_UTF8) || (x == NA_STRING); #else const int mask_ascii = 8; const int mask_utf8 = 64; const int levels = LEVELS(x); return (levels & mask_ascii) || (levels & mask_utf8) || (x == NA_STRING); #endif } static inline SEXP string_as_utf8(SEXP x) { return Rf_mkCharCE(Rf_translateCharUTF8(x), CE_UTF8); } #endif dplyr/src/dplyr.h0000644000176200001440000001620415137161765013515 0ustar liggesusers#ifndef DPLYR_DPLYR_H #define DPLYR_DPLYR_H #define R_NO_REMAP #include #include #include #include #if (R_VERSION < R_Version(4, 5, 0)) # define VECTOR_PTR_RO(x) ((const SEXP*) DATAPTR_RO(x)) #endif #if (R_VERSION < R_Version(4, 6, 0)) // Pulled exactly as is from R // https://github.com/r-devel/r-svn/blob/a39f4a28848fd02a1310b455353a871f2bb1965b/src/main/attrib.c#L2014 // https://github.com/r-devel/r-svn/blob/a39f4a28848fd02a1310b455353a871f2bb1965b/doc/manual/R-exts.texi#L17920 static inline SEXP R_mapAttrib(SEXP x, SEXP (*FUN)(SEXP, SEXP, void *), void *data) { PROTECT_INDEX api; SEXP a = ATTRIB(x); SEXP val = NULL; PROTECT_WITH_INDEX(a, &api); while (a != R_NilValue) { SEXP tag = PROTECT(TAG(a)); SEXP attr = PROTECT(CAR(a)); val = FUN(tag, attr, data); UNPROTECT(2); if (val != NULL) break; REPROTECT(a = CDR(a), api); } UNPROTECT(1); return val; } #endif namespace dplyr { struct envs { static SEXP ns_dplyr; static SEXP ns_vctrs; static SEXP ns_rlang; }; struct symbols { static SEXP groups; static SEXP levels; static SEXP ptype; static SEXP current_group_id; static SEXP current_group_size; static SEXP current_expression; static SEXP rows; static SEXP caller; static SEXP current_data; static SEXP dot_drop; static SEXP dplyr_internal_error; static SEXP dplyr_internal_signal; static SEXP chops; static SEXP obj_is_list; static SEXP new_env; static SEXP dot_data; static SEXP used; static SEXP across; static SEXP env_current_group_info; static SEXP env_mask_bindings; }; struct vectors { static SEXP classes_vctrs_list_of; static SEXP empty_int_vector; static SEXP names_expanded; static SEXP names_summarise_recycle_chunks; }; struct functions { static SEXP vec_chop; static SEXP dot_subset2; static SEXP list; static SEXP function; }; } // namespace dplyr namespace rlang { SEXP eval_tidy(SEXP expr, SEXP data, SEXP env); SEXP as_data_pronoun(SEXP x); SEXP new_data_mask(SEXP bottom, SEXP top); SEXP str_as_symbol(SEXP); void env_unbind(SEXP, SEXP); } namespace vctrs { bool obj_is_vector(SEXP x) ; R_len_t short_vec_size(SEXP x) ; SEXP short_vec_recycle(SEXP x, R_len_t n); inline bool obj_is_list(SEXP x) { SEXP call = PROTECT(Rf_lang2(dplyr::symbols::obj_is_list, x)); SEXP res = Rf_eval(call, dplyr::envs::ns_vctrs); UNPROTECT(1); return LOGICAL(res)[0]; } } SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_); SEXP ffi_test_dplyr_attributes(SEXP x); SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes); SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr); SEXP dplyr_cumall(SEXP x); SEXP dplyr_cumany(SEXP x); SEXP dplyr_cummean(SEXP x); SEXP dplyr_validate_grouped_df(SEXP df, SEXP s_check_bounds); SEXP dplyr_validate_rowwise_df(SEXP df); SEXP dplyr_mask_eval_all(SEXP quo, SEXP env_private); SEXP dplyr_mask_eval_all_summarise(SEXP quo, SEXP env_private); SEXP dplyr_mask_eval_all_mutate(SEXP quo, SEXP env_private); SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP invert, SEXP env_private, SEXP s_n, SEXP env_filter); SEXP dplyr_summarise_check_all_size_one( SEXP result_per_group_per_expression, SEXP s_n_groups ); SEXP dplyr_reframe_recycle_horizontally_in_place( SEXP result_per_group_per_expression, SEXP result_per_expression, SEXP s_n_groups ); SEXP dplyr_group_indices(SEXP data, SEXP rows); SEXP dplyr_group_keys(SEXP group_data); SEXP dplyr_mask_binding_remove(SEXP env_private, SEXP s_name); SEXP dplyr_mask_binding_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks); SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP env_current_group_info, SEXP ffi_grouped, SEXP ffi_rowwise); SEXP dplyr_make_mask_bindings(SEXP chops, SEXP data); SEXP env_resolved(SEXP env, SEXP names); void add_mask_binding(SEXP name, SEXP env_mask_bindings, SEXP env_chops); SEXP dplyr_extract_chunks(SEXP df_list, SEXP df_ptype); #define DPLYR_MASK_INIT() \ SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \ const SEXP* v_rows = VECTOR_PTR_RO(rows); \ R_xlen_t ngroups = XLENGTH(rows); \ SEXP caller = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::caller)); \ SEXP env_mask_bindings = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_mask_bindings)); \ SEXP pronoun = PROTECT(rlang::as_data_pronoun(env_mask_bindings)); \ SEXP env_current_group_info = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_current_group_info)); \ SEXP current_group_id = PROTECT(Rf_findVarInFrame(env_current_group_info, dplyr::symbols::current_group_id)); \ int* p_current_group_id = INTEGER(current_group_id); \ *p_current_group_id = 0; \ SEXP current_group_size = PROTECT(Rf_findVarInFrame(env_current_group_info, dplyr::symbols::current_group_size)); \ int* p_current_group_size = INTEGER(current_group_size); \ *p_current_group_size = 0 #define DPLYR_MASK_FINALISE() \ UNPROTECT(7); \ *p_current_group_id = 0; \ *p_current_group_size = 0 // At each iteration, we create a fresh data mask so that lexical side effects, // such as using `<-` in a `mutate()`, don't persist between groups #define DPLYR_MASK_ITERATION_INIT() \ SEXP mask = PROTECT(rlang::new_data_mask(env_mask_bindings, R_NilValue)); \ Rf_defineVar(dplyr::symbols::dot_data, pronoun, mask) #define DPLYR_MASK_ITERATION_FINALISE() \ UNPROTECT(1) #define DPLYR_MASK_SET_GROUP(INDEX) \ *p_current_group_id = INDEX + 1; \ *p_current_group_size = Rf_xlength(v_rows[INDEX]) #define DPLYR_MASK_EVAL(quo) \ rlang::eval_tidy(quo, mask, caller) #define DPLYR_ERROR_INIT(n) \ SEXP error_data = PROTECT(Rf_allocVector(VECSXP, n)); \ SEXP error_names = PROTECT(Rf_allocVector(STRSXP, n)); \ Rf_setAttrib(error_data, R_NamesSymbol, error_names); #define DPLYR_ERROR_SET(i, name, value) \ SET_VECTOR_ELT(error_data, i, value); \ SET_STRING_ELT(error_names, i, Rf_mkChar(name)); #define DPLYR_ERROR_THROW(klass) \ SEXP error_class = PROTECT(Rf_mkString(klass)); \ SEXP error_call = PROTECT(Rf_lang3(dplyr::symbols::dplyr_internal_error, error_class, error_data)); \ Rf_eval(error_call, dplyr::envs::ns_dplyr); \ UNPROTECT(4) ; // for rchk #endif dplyr/src/reconstruct.cpp0000644000176200001440000001270115137234410015253 0ustar liggesusers#include "dplyr.h" // Essentially, a C implementation of: // // ``` // attributes <- attributes(template) // attributes$names <- names(data) // attributes$row.names <- .row_names_info(data, type = 0L) // attributes(data) <- attributes // ``` // // The problem with that is that: // - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check // for internal `row.names` in `template` so they aren't leaked to the user. // Unfortunately this materializes lazy ALTREP `row.names`, like those used // by duckplyr. // - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check // if it can make efficient internal `row.names`. Again, this materializes // lazy ALTREP `row.names`, like those used by duckplyr. // // We avoid this by: // - Using `R_mapAttrib()`, which iterates over the `ATTRIB()` pairlist rather // than using `Rf_getAttrib()`. // - Using `Rf_setAttrib()` for all attributes except `names` and `row.names`, // because we retain `data`'s version of these. // // We expect that at this point, both `data` and `template_` are S3 data // frames, both of which have `names` and `row.names` attributes. If this isn't // true, we error. // - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling // `dplyr_new_data_frame()` (ideally no intermediate method invalidates this). // - For `template_`, we assume this since we got here through the S3 method // `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A // well-formed S3 data frame must have `names` and `row.names` attributes. // // https://github.com/tidyverse/dplyr/pull/6947 // https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152 // https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177 // https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57 struct cb_data { SEXP out; bool seen_names; bool seen_row_names; }; SEXP cb_clear(SEXP tag, SEXP value, void* data) { struct cb_data* p_data = (struct cb_data*) data; // Retain `data`'s `names` and `row.names` if (tag == R_NamesSymbol) { p_data->seen_names = true; return NULL; } if (tag == R_RowNamesSymbol) { p_data->seen_row_names = true; return NULL; } // Clear all other `data` attributes Rf_setAttrib(p_data->out, tag, R_NilValue); return NULL; } SEXP cb_restore(SEXP tag, SEXP value, void* data) { struct cb_data* p_data = (struct cb_data*) data; // Skip `template_`'s `names` and `row.names` if (tag == R_NamesSymbol) { p_data->seen_names = true; return NULL; } if (tag == R_RowNamesSymbol) { p_data->seen_row_names = true; return NULL; } // Install all other `template_` attributes Rf_setAttrib(p_data->out, tag, value); return NULL; } SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) { if (TYPEOF(data) != VECSXP) { Rf_errorcall(R_NilValue, "Internal error: `data` must be a list."); } if (TYPEOF(template_) != VECSXP) { Rf_errorcall(R_NilValue, "Internal error: `template` must be a list."); } if (!Rf_isObject(data)) { Rf_errorcall(R_NilValue, "Internal error: `data` must be an object."); } if (!Rf_isObject(template_)) { Rf_errorcall(R_NilValue, "Internal error: `template` must be an object."); } // Shallow duplicates attributes as well SEXP out = PROTECT(Rf_shallow_duplicate(data)); // Clear all `data` attributes except `names` and `row.names`. // Iterate over `data` attributes so we can modify `out`'s in place. // (Can't use nice named `.seen_names = false` syntax because that's // apparently a C++20 feature even though C99 supports it) struct cb_data clear_data = { out, false, false }; struct cb_data* p_clear_data = &clear_data; R_mapAttrib(data, cb_clear, (void*) p_clear_data); // Restore all `template_` attributes except `names` and `row.names` struct cb_data restore_data = { out, false, false }; struct cb_data* p_restore_data = &restore_data; R_mapAttrib(template_, cb_restore, (void*) p_restore_data); // Sanity checks if (!p_clear_data->seen_names) { Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute."); } if (!p_clear_data->seen_row_names) { Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute."); } if (!p_restore_data->seen_names) { Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute."); } if (!p_restore_data->seen_row_names) { Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute."); } UNPROTECT(1); return out; } // Very unsafe wrappers needed for testing. // Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP // `row.names`. Can't use on R >=4.6.0, but there is currently no way to install // row names without materializing them without using `SET_ATTRIB()`. SEXP ffi_test_dplyr_attributes(SEXP x) { #if (R_VERSION < R_Version(4, 6, 0)) return ATTRIB(x); #else Rf_errorcall(R_NilValue, "Internal error: Can't call this on R >=4.6.0"); #endif } SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) { #if (R_VERSION < R_Version(4, 6, 0)) if (TYPEOF(attributes) != LISTSXP) { Rf_errorcall(R_NilValue, "`attributes` must be a pairlist."); } x = PROTECT(Rf_shallow_duplicate(x)); SET_ATTRIB(x, attributes); UNPROTECT(1); return x; #else Rf_errorcall(R_NilValue, "Internal error: Can't call this on R >=4.6.0"); #endif } dplyr/src/filter.cpp0000644000176200001440000001130315137161765014176 0ustar liggesusers#include "dplyr.h" namespace dplyr { static inline void stop_filter_incompatible_size(R_xlen_t i, SEXP quos, R_xlen_t nres, R_xlen_t n) { DPLYR_ERROR_INIT(3); DPLYR_ERROR_SET(0, "index", Rf_ScalarInteger(i + 1)); DPLYR_ERROR_SET(1, "size", Rf_ScalarInteger(nres)); DPLYR_ERROR_SET(2, "expected_size", Rf_ScalarInteger(n)); DPLYR_ERROR_THROW("dplyr:::filter_incompatible_size"); } static inline void stop_filter_incompatible_type(R_xlen_t i, SEXP quos, SEXP result){ DPLYR_ERROR_INIT(2); DPLYR_ERROR_SET(0, "index", Rf_ScalarInteger(i + 1)); DPLYR_ERROR_SET(1, "result", result); DPLYR_ERROR_THROW("dplyr:::filter_incompatible_type"); } static inline void signal_filter(const char* cls) { SEXP ffi_cls = PROTECT(Rf_mkString(cls)); SEXP ffi_call = PROTECT(Rf_lang2(dplyr::symbols::dplyr_internal_signal, ffi_cls)); Rf_eval(ffi_call, dplyr::envs::ns_dplyr); UNPROTECT(2); } static void signal_filter_one_column_matrix() { signal_filter("dplyr:::signal_filter_one_column_matrix"); } } // Reduces using logical `&` static inline void filter_lgl_reduce(SEXP x, R_xlen_t n, int* p_reduced) { const R_xlen_t n_x = Rf_xlength(x); const int* p_x = LOGICAL_RO(x); if (n_x == 1) { if (p_x[0] != TRUE) { for (R_xlen_t i = 0; i < n; ++i) { p_reduced[i] = FALSE; } } } else { for (R_xlen_t i = 0; i < n; ++i) { p_reduced[i] = (p_reduced[i] == TRUE) && (p_x[i] == TRUE); } } } static inline bool filter_is_valid_lgl(SEXP x, bool first) { if (TYPEOF(x) != LGLSXP) { return false; } SEXP dim = PROTECT(Rf_getAttrib(x, R_DimSymbol)); if (dim == R_NilValue) { // Bare logical vector UNPROTECT(1); return true; } const R_xlen_t dimensionality = Rf_xlength(dim); if (dimensionality == 1) { // 1 dimension array. We allow these because many things in R produce them. UNPROTECT(1); return true; } const int* p_dim = INTEGER(dim); if (dimensionality == 2 && p_dim[1] == 1) { // 1 column matrix. We allow these with a warning that this will be // deprecated in the future. if (first) { dplyr::signal_filter_one_column_matrix(); } UNPROTECT(1); return true; } UNPROTECT(1); return false; } static SEXP eval_filter_one(SEXP quos, SEXP mask, SEXP caller, R_xlen_t n, SEXP env_filter, bool first) { // Reduce to a single logical vector of size `n` SEXP reduced = PROTECT(Rf_allocVector(LGLSXP, n)); int* p_reduced = LOGICAL(reduced); // Init with `TRUE` for (R_xlen_t i = 0; i < n; ++i) { p_reduced[i] = TRUE; } const R_xlen_t n_quos = Rf_xlength(quos); SEXP const* p_quos = VECTOR_PTR_RO(quos); // Reduce loop for (R_xlen_t i = 0; i < n_quos; ++i) { SEXP current_expression = PROTECT(Rf_ScalarInteger(i + 1)); Rf_defineVar(dplyr::symbols::current_expression, current_expression, env_filter); SEXP res = PROTECT(rlang::eval_tidy(p_quos[i], mask, caller)); const R_xlen_t res_size = vctrs::short_vec_size(res); if (res_size != n && res_size != 1) { dplyr::stop_filter_incompatible_size(i, quos, res_size, n); } if (filter_is_valid_lgl(res, first)) { filter_lgl_reduce(res, n, p_reduced); } else { dplyr::stop_filter_incompatible_type(i, quos, res); } UNPROTECT(2); } UNPROTECT(1); return reduced; } SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP invert, SEXP env_private, SEXP s_n, SEXP env_filter) { DPLYR_MASK_INIT(); const SEXP* p_rows = VECTOR_PTR_RO(rows); const R_xlen_t n = Rf_asInteger(s_n); SEXP keep = PROTECT(Rf_allocVector(LGLSXP, n)); int* p_keep = LOGICAL(keep); for (R_xlen_t i = 0; i < ngroups; ++i) { DPLYR_MASK_ITERATION_INIT(); DPLYR_MASK_SET_GROUP(i); const bool first = i == 0; SEXP rows_i = p_rows[i]; R_xlen_t n_i = Rf_xlength(rows_i); SEXP result_i = PROTECT(eval_filter_one( quos, mask, caller, n_i, env_filter, first )); const int* p_rows_i = INTEGER(rows_i); const int* p_result_i = LOGICAL(result_i); for (R_xlen_t j = 0; j < n_i; ++j) { p_keep[p_rows_i[j] - 1] = p_result_i[j]; } UNPROTECT(1); DPLYR_MASK_ITERATION_FINALISE(); } if (LOGICAL_ELT(invert, 0)) { for (R_xlen_t i = 0; i < n; ++i) { p_keep[i] = !p_keep[i]; } } UNPROTECT(1); DPLYR_MASK_FINALISE(); return keep; } dplyr/src/mask.cpp0000644000176200001440000000735215106134104013634 0ustar liggesusers#include "dplyr.h" #include "utils.h" R_xlen_t find_first(SEXP haystack, SEXP needle) { if (!string_is_ascii_or_utf8(needle)) { needle = string_as_utf8(needle); } PROTECT(needle); const R_xlen_t n = XLENGTH(haystack); R_xlen_t i_name = 0; for (; i_name < n; ++i_name) { SEXP haystack_elt = STRING_ELT(haystack, i_name); if (!string_is_ascii_or_utf8(haystack_elt)) { // No need to `PROTECT()`, we do a pointer comparison // and then throw it away haystack_elt = string_as_utf8(haystack_elt); } if (needle == haystack_elt) { break; } } UNPROTECT(1); return i_name; } SEXP integers_append(SEXP ints, int x) { R_xlen_t n = XLENGTH(ints); SEXP new_ints = PROTECT(Rf_allocVector(INTSXP, n + 1)); int* p_ints = INTEGER(ints); int* p_new_ints = INTEGER(new_ints); for (R_xlen_t i = 0; i < n; i++) { p_new_ints[i] = p_ints[i]; } p_new_ints[n] = x; UNPROTECT(1); return new_ints; } SEXP dplyr_mask_binding_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks) { SEXP name = STRING_ELT(s_name, 0); // we assume control over these SEXP current_data = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::current_data)); SEXP current_vars = PROTECT(Rf_getAttrib(current_data, R_NamesSymbol)); // search for position of name R_xlen_t n = XLENGTH(current_data); R_xlen_t i_name = find_first(current_vars, name); bool is_new_column = i_name == n; if (is_new_column) { SEXP new_current_vars = PROTECT(Rf_allocVector(STRSXP, n + 1)); SEXP new_current_data = PROTECT(Rf_allocVector(VECSXP, n + 1)); for (R_xlen_t i = 0; i < n; i++) { SET_STRING_ELT(new_current_vars, i, STRING_ELT(current_vars, i)); SET_VECTOR_ELT(new_current_data, i, VECTOR_ELT(current_data, i)); } SET_STRING_ELT(new_current_vars, n, name); SET_VECTOR_ELT(new_current_data, n, ptype); Rf_namesgets(new_current_data, new_current_vars); Rf_defineVar(dplyr::symbols::current_data, new_current_data, env_private); UNPROTECT(2); } else { SET_VECTOR_ELT(current_data, i_name, ptype); } SEXP sym_name = PROTECT(rlang::str_as_symbol(name)); SEXP chops = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::chops)); Rf_defineVar(sym_name, chunks, chops); SEXP env_mask_bindings = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_mask_bindings)); add_mask_binding(sym_name, env_mask_bindings, chops); UNPROTECT(5); return R_NilValue; } SEXP dplyr_mask_binding_remove(SEXP env_private, SEXP s_name) { SEXP name = STRING_ELT(s_name, 0); SEXP current_data = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::current_data)); SEXP current_vars = PROTECT(Rf_getAttrib(current_data, R_NamesSymbol)); // search for position of name R_xlen_t n = XLENGTH(current_vars); R_xlen_t i_name = find_first(current_vars, name); if (i_name != n) { SEXP new_current_data = PROTECT(Rf_allocVector(VECSXP, n - 1)); SEXP new_current_vars = PROTECT(Rf_allocVector(STRSXP, n - 1)); for (R_xlen_t i = 0, j = 0; i < n; i++) { if (i == i_name) continue; SET_STRING_ELT(new_current_vars, j, STRING_ELT(current_vars, i)); SET_VECTOR_ELT(new_current_data, j, VECTOR_ELT(current_data, i)); j++; } Rf_namesgets(new_current_data, new_current_vars); Rf_defineVar(dplyr::symbols::current_data, new_current_data, env_private); SEXP sym_name = PROTECT(rlang::str_as_symbol(name)); SEXP chops = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::chops)); SEXP env_mask_bindings = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_mask_bindings)); rlang::env_unbind(env_mask_bindings, sym_name); rlang::env_unbind(chops, sym_name); UNPROTECT(5); } UNPROTECT(2); return R_NilValue; } dplyr/src/funs.cpp0000644000176200001440000000332714366556340013673 0ustar liggesusers#include "dplyr.h" SEXP dplyr_cumall(SEXP x) { R_xlen_t n = XLENGTH(x); SEXP out = PROTECT(Rf_allocVector(LGLSXP, n)); int* p_x = LOGICAL(x); int* p_out = LOGICAL(out); // set out[i] to TRUE as long as x[i] is TRUE R_xlen_t i = 0 ; for (; i < n; i++, ++p_x, ++p_out) { if (*p_x == TRUE) { *p_out = TRUE; } else { break; } } if (i != n) { // set to NA as long as x[i] is NA or TRUE for (; i < n; i++, ++p_x, ++p_out) { if (*p_x == FALSE) { break; } *p_out = NA_LOGICAL; } // set remaining to FALSE if (i != n) { for (; i < n; i++, ++p_x, ++p_out) { *p_out = FALSE; } } } UNPROTECT(1); return out; } SEXP dplyr_cumany(SEXP x) { R_xlen_t n = XLENGTH(x); SEXP out = PROTECT(Rf_allocVector(LGLSXP, n)); int* p_x = LOGICAL(x); int* p_out = LOGICAL(out); // nothing to do as long as x[i] is FALSE R_xlen_t i = 0 ; for (; i < n; i++, ++p_x, ++p_out) { if (*p_x == FALSE) { *p_out = FALSE; } else { break; } } if (i < n) { // set to NA as long as x[i] is NA or FALSE for (; i < n; i++, ++p_x, ++p_out) { if (*p_x == TRUE) { break; } *p_out = NA_LOGICAL; } if (i < n) { // then if we are here, the rest is TRUE for (; i < n; i++, ++p_out) { *p_out = TRUE; } } } UNPROTECT(1); return out; } SEXP dplyr_cummean(SEXP x) { R_xlen_t n = XLENGTH(x); SEXP out = PROTECT(Rf_allocVector(REALSXP, n)); double* p_out = REAL(out); double* p_x = REAL(x); double sum = 0.0; for (R_xlen_t i = 0; i < n; i++, ++p_x, ++p_out) { sum += *p_x; *p_out = sum / (i + 1.0); } UNPROTECT(1); return out; } dplyr/NAMESPACE0000644000176200001440000002616515137161765012651 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method("$<-",grouped_df) S3method("[",fun_list) S3method("[",grouped_df) S3method("[",rowwise_df) S3method("[<-",grouped_df) S3method("[<-",rowwise_df) S3method("[[<-",grouped_df) S3method("names<-",grouped_df) S3method("names<-",rowwise_df) S3method(add_count,data.frame) S3method(add_count,default) S3method(anti_join,data.frame) S3method(arrange,data.frame) S3method(as.data.frame,grouped_df) S3method(as_join_by,character) S3method(as_join_by,default) S3method(as_join_by,dplyr_join_by) S3method(as_join_by,list) S3method(as_tibble,grouped_df) S3method(as_tibble,rowwise_df) S3method(auto_copy,data.frame) S3method(cbind,grouped_df) S3method(collapse,data.frame) S3method(collect,data.frame) S3method(common_by,"NULL") S3method(common_by,character) S3method(common_by,default) S3method(common_by,list) S3method(compute,data.frame) S3method(copy_to,DBIConnection) S3method(count,data.frame) S3method(cross_join,data.frame) S3method(distinct,data.frame) S3method(do,"NULL") S3method(do,data.frame) S3method(do,grouped_df) S3method(do,rowwise_df) S3method(dplyr_col_modify,data.frame) S3method(dplyr_col_modify,grouped_df) S3method(dplyr_col_modify,rowwise_df) S3method(dplyr_reconstruct,data.frame) S3method(dplyr_reconstruct,grouped_df) S3method(dplyr_reconstruct,rowwise_df) S3method(dplyr_row_slice,data.frame) S3method(dplyr_row_slice,grouped_df) S3method(dplyr_row_slice,rowwise_df) S3method(filter,data.frame) S3method(filter,ts) S3method(filter_bullets,"dplyr:::filter_incompatible_size") S3method(filter_bullets,"dplyr:::filter_incompatible_type") S3method(filter_out,data.frame) S3method(full_join,data.frame) S3method(group_by,data.frame) S3method(group_by_drop_default,default) S3method(group_by_drop_default,grouped_df) S3method(group_data,data.frame) S3method(group_data,grouped_df) S3method(group_data,rowwise_df) S3method(group_data,tbl_df) S3method(group_indices,data.frame) S3method(group_keys,data.frame) S3method(group_map,data.frame) S3method(group_modify,data.frame) S3method(group_modify,grouped_df) S3method(group_nest,data.frame) S3method(group_nest,grouped_df) S3method(group_size,data.frame) S3method(group_split,data.frame) S3method(group_split,grouped_df) S3method(group_split,rowwise_df) S3method(group_trim,data.frame) S3method(group_trim,grouped_df) S3method(group_vars,data.frame) S3method(groups,data.frame) S3method(inner_join,data.frame) S3method(intersect,data.frame) S3method(left_join,data.frame) S3method(mutate,data.frame) S3method(mutate_bullets,"dplyr:::error_incompatible_combine") S3method(mutate_bullets,"dplyr:::mutate_constant_recycle_error") S3method(mutate_bullets,"dplyr:::mutate_incompatible_size") S3method(mutate_bullets,"dplyr:::mutate_mixed_null") S3method(mutate_bullets,"dplyr:::mutate_not_vector") S3method(n_groups,data.frame) S3method(nest_by,data.frame) S3method(nest_by,grouped_df) S3method(nest_join,data.frame) S3method(print,all_vars) S3method(print,any_vars) S3method(print,dplyr_join_by) S3method(print,dplyr_sel_vars) S3method(print,fun_list) S3method(print,last_dplyr_warnings) S3method(print,src) S3method(pull,data.frame) S3method(rbind,grouped_df) S3method(rbind,rowwise_df) S3method(recode,character) S3method(recode,factor) S3method(recode,numeric) S3method(recode_default,default) S3method(recode_default,factor) S3method(reframe,data.frame) S3method(relocate,data.frame) S3method(rename,data.frame) S3method(rename_with,data.frame) S3method(right_join,data.frame) S3method(rows_append,data.frame) S3method(rows_delete,data.frame) S3method(rows_insert,data.frame) S3method(rows_patch,data.frame) S3method(rows_update,data.frame) S3method(rows_upsert,data.frame) S3method(rowwise,data.frame) S3method(rowwise,grouped_df) S3method(same_src,data.frame) S3method(sample_frac,data.frame) S3method(sample_frac,default) S3method(sample_n,data.frame) S3method(sample_n,default) S3method(select,data.frame) S3method(select,list) S3method(semi_join,data.frame) S3method(setdiff,data.frame) S3method(setequal,data.frame) S3method(slice,data.frame) S3method(slice_head,data.frame) S3method(slice_max,data.frame) S3method(slice_min,data.frame) S3method(slice_sample,data.frame) S3method(slice_tail,data.frame) S3method(summarise,data.frame) S3method(summarise,grouped_df) S3method(summarise,rowwise_df) S3method(summarise_bullets,"dplyr:::reframe_incompatible_size") S3method(summarise_bullets,"dplyr:::summarise_incompatible_size") S3method(summarise_bullets,"dplyr:::summarise_mixed_null") S3method(summarise_bullets,"dplyr:::summarise_unsupported_type") S3method(symdiff,data.frame) S3method(symdiff,default) S3method(tally,data.frame) S3method(tbl,DBIConnection) S3method(tbl_ptype,default) S3method(tbl_sum,grouped_df) S3method(tbl_sum,rowwise_df) S3method(tbl_vars,data.frame) S3method(transmute,data.frame) S3method(ungroup,data.frame) S3method(ungroup,grouped_df) S3method(ungroup,rowwise_df) S3method(union,data.frame) S3method(union_all,data.frame) S3method(union_all,default) export("%>%") export(.data) export(across) export(add_count) export(add_count_) export(add_row) export(add_rownames) export(add_tally) export(add_tally_) export(all_equal) export(all_of) export(all_vars) export(anti_join) export(any_of) export(any_vars) export(arrange) export(arrange_) export(arrange_all) export(arrange_at) export(arrange_if) export(as.tbl) export(as_data_frame) export(as_label) export(as_tibble) export(auto_copy) export(between) export(bind_cols) export(bind_rows) export(c_across) export(case_match) export(case_when) export(check_dbplyr) export(coalesce) export(collapse) export(collect) export(combine) export(common_by) export(compute) export(consecutive_id) export(contains) export(copy_to) export(count) export(count_) export(cross_join) export(cumall) export(cumany) export(cume_dist) export(cummean) export(cur_column) export(cur_data) export(cur_data_all) export(cur_group) export(cur_group_id) export(cur_group_rows) export(data_frame) export(db_analyze) export(db_begin) export(db_commit) export(db_create_index) export(db_create_indexes) export(db_create_table) export(db_data_type) export(db_desc) export(db_drop_table) export(db_explain) export(db_has_table) export(db_insert_into) export(db_list_tables) export(db_query_fields) export(db_query_rows) export(db_rollback) export(db_save_query) export(db_write_table) export(dense_rank) export(desc) export(dim_desc) export(distinct) export(distinct_) export(distinct_all) export(distinct_at) export(distinct_if) export(distinct_prepare) export(do) export(do_) export(dplyr_col_modify) export(dplyr_reconstruct) export(dplyr_row_slice) export(ends_with) export(enexpr) export(enexprs) export(enquo) export(enquos) export(ensym) export(ensyms) export(everything) export(explain) export(expr) export(filter) export(filter_) export(filter_all) export(filter_at) export(filter_if) export(filter_out) export(first) export(full_join) export(funs) export(funs_) export(glimpse) export(group_by) export(group_by_) export(group_by_all) export(group_by_at) export(group_by_drop_default) export(group_by_if) export(group_by_prepare) export(group_cols) export(group_data) export(group_indices) export(group_indices_) export(group_keys) export(group_map) export(group_modify) export(group_nest) export(group_rows) export(group_size) export(group_split) export(group_trim) export(group_vars) export(group_walk) export(grouped_df) export(groups) export(ident) export(if_all) export(if_any) export(if_else) export(inner_join) export(intersect) export(is.grouped_df) export(is.src) export(is.tbl) export(is_grouped_df) export(join_by) export(lag) export(last) export(last_col) export(last_dplyr_warnings) export(lead) export(left_join) export(lst) export(make_tbl) export(matches) export(min_rank) export(mutate) export(mutate_) export(mutate_all) export(mutate_at) export(mutate_each) export(mutate_each_) export(mutate_if) export(n) export(n_distinct) export(n_groups) export(na_if) export(near) export(nest_by) export(nest_join) export(new_grouped_df) export(new_rowwise_df) export(nth) export(ntile) export(num_range) export(one_of) export(order_by) export(percent_rank) export(pick) export(progress_estimated) export(pull) export(quo) export(quo_name) export(quos) export(recode) export(recode_factor) export(recode_values) export(reframe) export(relocate) export(rename) export(rename_) export(rename_all) export(rename_at) export(rename_if) export(rename_with) export(replace_values) export(replace_when) export(right_join) export(row_number) export(rows_append) export(rows_delete) export(rows_insert) export(rows_patch) export(rows_update) export(rows_upsert) export(rowwise) export(same_src) export(sample_frac) export(sample_n) export(select) export(select_) export(select_all) export(select_at) export(select_if) export(semi_join) export(setdiff) export(setequal) export(show_query) export(slice) export(slice_) export(slice_head) export(slice_max) export(slice_min) export(slice_sample) export(slice_tail) export(sql) export(sql_escape_ident) export(sql_escape_string) export(sql_join) export(sql_select) export(sql_semi_join) export(sql_set_op) export(sql_subquery) export(sql_translate_env) export(src) export(src_df) export(src_local) export(src_mysql) export(src_postgres) export(src_sqlite) export(src_tbls) export(starts_with) export(summarise) export(summarise_) export(summarise_all) export(summarise_at) export(summarise_each) export(summarise_each_) export(summarise_if) export(summarize) export(summarize_) export(summarize_all) export(summarize_at) export(summarize_each) export(summarize_each_) export(summarize_if) export(sym) export(symdiff) export(syms) export(tally) export(tally_) export(tbl) export(tbl_df) export(tbl_nongroup_vars) export(tbl_ptype) export(tbl_vars) export(tibble) export(top_frac) export(top_n) export(transmute) export(transmute_) export(transmute_all) export(transmute_at) export(transmute_if) export(tribble) export(type_sum) export(ungroup) export(union) export(union_all) export(validate_grouped_df) export(validate_rowwise_df) export(vars) export(when_all) export(when_any) export(where) export(with_groups) export(with_order) export(wrap_dbplyr_obj) import(rlang) import(vctrs, except = data_frame) importFrom(R6,R6Class) importFrom(generics,intersect) importFrom(generics,setdiff) importFrom(generics,setequal) importFrom(generics,union) importFrom(glue,glue) importFrom(glue,glue_collapse) importFrom(glue,glue_data) importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(methods,setOldClass) importFrom(pillar,glimpse) importFrom(pillar,tbl_sum) importFrom(pillar,type_sum) importFrom(stats,setNames) importFrom(stats,update) importFrom(tibble,add_row) importFrom(tibble,as_data_frame) importFrom(tibble,as_tibble) importFrom(tibble,data_frame) importFrom(tibble,is_tibble) importFrom(tibble,lst) importFrom(tibble,new_tibble) importFrom(tibble,tibble) importFrom(tibble,tribble) importFrom(tibble,view) importFrom(tidyselect,all_of) importFrom(tidyselect,any_of) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) importFrom(tidyselect,everything) importFrom(tidyselect,last_col) importFrom(tidyselect,matches) importFrom(tidyselect,num_range) importFrom(tidyselect,one_of) importFrom(tidyselect,starts_with) importFrom(tidyselect,where) importFrom(utils,head) importFrom(utils,tail) useDynLib(dplyr, .registration = TRUE) dplyr/LICENSE0000644000176200001440000000005315137161765012423 0ustar liggesusersYEAR: 2026 COPYRIGHT HOLDER: dplyr authors dplyr/NEWS.md0000644000176200001440000044320515137162174012521 0ustar liggesusers# dplyr 1.2.0 ## New features * New `filter_out()` companion to `filter()`. * Use `filter()` when specifying rows to _keep_. * Use `filter_out()` when specifying rows to _drop_. `filter_out()` simplifies cases where you would have previously used a `filter()` to drop rows. It is particularly useful when missing values are involved. For example, to drop rows where the `count` is zero: ```r df |> filter(count != 0 | is.na(count)) df |> filter_out(count == 0) ``` With `filter()`, you must provide a "negative" condition of `!= 0` and must explicitly guard against accidentally dropping rows with `NA`. With `filter_out()`, you directly specify rows to drop and you don't have to guard against dropping rows with `NA`, which tends to result in much clearer code. This work is a result of [Tidyup 8: Expanding the `filter()` family](https://github.com/tidyverse/tidyups/pull/30), with a lot of great feedback from the community (#6560, #6891). * New `when_any()` and `when_all()`, which are elementwise versions of `any()` and `all()`. Alternatively, you can think of them as performing repeated `|` and `&` on any number of inputs, for example: * `when_any(x, y, z)` is equivalent to `x | y | z`. * `when_all(x, y, z)` is equivalent to `x & y & z`. `when_any()` is particularly useful within `filter()` and `filter_out()` to specify comma separated conditions combined with `|` rather than `&`, like: ```r # With `|` countries |> filter( (name %in% c("US", "CA") & between(score, 200, 300)) | (name %in% c("PR", "RU") & between(score, 100, 200)) ) # With `when_any()`, you drop the explicit `|`, the extra `()`, and your # conditions are all indented to the same level countries |> filter(when_any( name %in% c("US", "CA") & between(score, 200, 300), name %in% c("PR", "RU") & between(score, 100, 200) )) # To drop these rows instead, use `filter_out()` countries |> filter_out(when_any( name %in% c("US", "CA") & between(score, 200, 300), name %in% c("PR", "RU") & between(score, 100, 200) )) ``` This work is a result of [Tidyup 8: Expanding the `filter()` family](https://github.com/tidyverse/tidyups/pull/30). * `case_when()` is now part of a family of 4 related functions, 3 of which are new: * Use `case_when()` to create a new vector based on logical conditions. * Use `replace_when()` to update an existing vector based on logical conditions. * Use `recode_values()` to create a new vector by mapping all old values to new values. * Use `replace_values()` to update an existing vector by mapping some old values to new values. Learn all about these in a new vignette, `vignette("recoding-replacing")`. `replace_when()` is particularly useful for conditionally mutating rows within one or more columns, and can be thought of as an enhanced version of `base::replace()`. `recode_values()` and `replace_values()` have the familiar `case_when()`-style formula interface for easy interactive use, but also have `from` and `to` arguments as a way for you to incorporate a pre-built lookup table, making them more holistic replacements for both `case_match()` and `recode()`. This work is a result of [Tidyup 7: Recoding and replacing values in the tidyverse](https://github.com/tidyverse/tidyups/blob/main/007-tidyverse-recoding-and-replacing.md), with a lot of great [feedback](https://github.com/tidyverse/tidyups/pull/29) from the community (#7728, #7729). * `case_when()` has gained a new `.unmatched` argument. For extra safety, set `.unmatched = "error"` rather than providing a `.default` when you believe that you've handled every possible case, and it will error if a case is left unhandled. The new `recode_values()` also has this argument (#7653). * `if_else()`, `case_when()`, and `coalesce()` have gotten significantly faster and use much less memory due to a rewrite in C via vctrs (#7723, #7725, #7727). * New `ptype` argument for `between()`, allowing users to specify the desired output type. This is particularly useful for ordered factors and other complex types where the default common type behavior might not be ideal (#6906, @JamesHWade). * New `rbind()` method for `rowwise_df` to avoid creating corrupt rowwise data frames (r-lib/vctrs#1935). ## Lifecycle changes ### Newly stable * `.by` has moved from experimental to stable (#7762). * `reframe()` has moved from experimental to stable (#7713, @VisruthSK). ### Newly breaking * `if_else()` no longer allows `condition` to be a logical array. It must be a logical vector with no `dim` attribute (#7723). ### Newly deprecated * `case_match()` is soft-deprecated, and is fully replaced by `recode_values()` and `replace_values()`, which are more flexible, more powerful, and have much better names. * In `case_when()`, supplying all size 1 LHS inputs along with a size >1 RHS input is now soft-deprecated. This is an improper usage of `case_when()` that should instead be a series of if statements, like: ```r # Scalars! code <- 1L flavor <- "vanilla" # Improper usage: case_when( code == 1L && flavor == "chocolate" ~ x, code == 1L && flavor == "vanilla" ~ y, code == 2L && flavor == "vanilla" ~ z, .default = default ) # Recommended: if (code == 1L && flavor == "chocolate") { x } else if (code == 1L && flavor == "vanilla") { y } else if (code == 2L && flavor == "vanilla") { z } else { default } ``` The recycling behavior that allows this style of `case_when()` to work is unsafe, and can result in silent bugs that we'd like to guard against with an error in the future (#7082). * The `dplyr.legacy_locale` global option is soft-deprecated. If you used this to affect the ordering of `arrange()`, use `arrange(.locale =)` instead. If you used this to affect the ordering of `group_by() |> summarise()`, follow up with an additional call to `arrange(.locale =)` instead (#7760). * Passing `size` to `if_else()` is now deprecated. The output size is always taken from the `condition` (#7722). ### Other deprecation advancements * The following were already deprecated, and are now defunct and throw an error: * All underscored standard evaluation versions of major dplyr verbs. Deprecated in 0.7.0 (Jun 2017), use the non-underscored version of the verb with unquoting instead, see `vignette("programming")`. This includes: * `add_count_()` * `add_tally_()` * `arrange_()` * `count_()` * `distinct_()` * `do_()` * `filter_()` * `funs_()` * `group_by_()` * `group_indices_()` * `mutate_()` * `tally_()` * `transmute_()` * `rename_()` * `select_()` * `slice_()` * `summarise_()` * `summarize_()` * `mutate_each()`, `mutate_each_()`, `summarise_each()`, and `summarise_each_()`. Deprecated in 0.7.0 (Jun 2017), use `across()` instead. * Returning more or less than 1 row per group in `summarise()`. Deprecated in 1.1.0 (Jan 2023), use `reframe()` instead. * `combine()`. Deprecated in 1.0.0 (May 2020), use `c()` or `vctrs::vec_c()` instead. * `src_mysql()`, `src_postgres()`, `src_sqlite()`, `src_local()`, and `src_df()`. Deprecated in 1.0.0 (May 2020), use `tbl()` instead. * `tbl_df()` and `as.tbl()`. Deprecated in 1.0.0 (May 2020), use `tibble::as_tibble()` instead. * `add_rownames()`. Deprecated in 1.0.0 (May 2020), use `tibble::rownames_to_column()` instead. * The `.drop` argument of `add_count()`. Deprecated in 1.0.0 (May 2020), had no effect. * The `add` argument of `group_by()` and `group_by_prepare()`. Deprecated in 1.0.0 (May 2020), use `.add` instead. * The `.dots` argument of `group_by()` and `group_by_prepare()`. Deprecated in 1.0.0 (May 2020). * The `...` argument of `group_keys()` and `group_indices()`. Deprecated in 1.0.0 (May 2020), use `group_by()` first. * The `keep` argument of `group_map()`, `group_modify()`, and `group_split()`. Deprecated in 1.0.0 (May 2020), use `.keep` instead. * Using `across()` and data frames in `filter()`. Deprecated in 1.0.8 (Feb 2022), use `if_any()` or `if_all()` instead. * `multiple = NULL` in joins. Deprecated in 1.1.1 (Mar 2023), use `multiple = "all"` instead. * `multiple = "error" / "warning"` in joins. Deprecated in 1.1.1 (Mar 2023), use `relationship = "many-to-one"` instead. * The `vars` argument of `group_cols()`. Deprecated in 1.0.0 (Jan 2023). * The following were already deprecated, and now warn unconditionally if used: * `all_equal()`. Deprecated in 1.1.0 (Jan 2023), use `all.equal()` instead. * `progress_estimated()`. Deprecated in 1.0.0 (May 2020). * `filter()` with a 1 column matrix. Deprecated in 1.1.0 (Jan 2023), use a vector instead. * `slice()` with a 1 column matrix. Deprecated in 1.1.0 (Jan 2023), use a vector instead. * Not supplying the `.cols` argument of `across()`. Deprecated in 1.1.0 (Jan 2023). * `group_indices()` with no arguments. Deprecated in 1.0.0 (May 2020), use `cur_group_id()` instead. * The following were already soft-deprecated, and now warn once per session if used: * `cur_data()` and `cur_data_all()`. Deprecated in 1.1.0 (Jan 2023), use `pick()` instead. * The `...` argument of `across()`. Deprecated in 1.1.0 (Jan 2023), use an anonymous function instead. * Using `by = character()` to perform a cross join. Deprecated in 1.1.0 (Jan 2023), use `cross_join()` instead. ### Removed The following were already defunct, and have been removed: * `id()`. Deprecated in 0.5.0 (Jun 2016), use `vctrs::vec_group_id()` instead. If your package uses NSE and implicitly relied on the variable `id` being available, you now need to put `utils::globalVariables("id")` inside one of your package files to tell R that `id` is a column name. * `failwith()`. Deprecated in 0.7.0 (Jun 2017), use `purrr::possibly()` instead. * `select_vars()` and `select_vars_()`. Deprecated in 0.8.4 (Jan 2020), use `tidyselect::vars_select()` instead. * `rename_vars()` and `rename_vars_()`. Deprecated in 0.8.4 (Jan 2020), use `tidyselect::vars_rename()` instead. * `select_var()`. Deprecated in 0.8.4 (Jan 2020), use `tidyselect::vars_pull()` instead. * `current_vars()`. Deprecated in 0.8.4 (Jan 2020), use `tidyselect::peek_vars()` instead. * `bench_tbls()`, `compare_tbls()`, `compare_tbls2()`, `eval_tbls()`, and `eval_tbls2()`. Deprecated in 1.0.0 (May 2020). * `location()` and `changes()`. Deprecated in 1.0.0 (May 2020), use `lobstr::ref()` instead. ## Minor improvements and bug fixes * The base pipe is now used throughout the documentation (#7711). * The superseded `recode()` now has updated documentation showing how to migrate to `recode_values()` and `replace_values()`. * The `.groups` message emitted by `summarise()` is hopefully more clear now (#6986). * `storms` has been updated to include 2023 and 2024 data (#7111, @tomalrussell). * `if_any()` and `if_all()` are now more consistent in all use cases (#7059, #7077, #7746, @jrwinget). In particular: * When called with zero inputs, `if_any()` returns `FALSE` and `if_all()` returns `TRUE`. * When called with one input, both now return logical vectors rather than the original column. * The result of applying `.fns` now must be a logical vector. * `tally_n()` creates fully qualified funciton calls for duckplyr compatibility (#7046) * Empty `rowwise()` list-column elements now resolve to `logical()` rather than a random logical of length 1 (#7710). * `last_dplyr_warnings()` no longer prevents objects from being garbage collected (#7649). * `case_when()` now throws correctly indexed errors when `NULL`s are supplied in `...` (#7739). * `case_when()` now throws a better error if one of the conditions is an array (#6862, @ilovemane). * `bind_rows()` now replaces empty (or `NA`) element names in a list with its numeric index while preserving existing names (#7719, @Meghansaha). * New `slice_sample()` example showing how to use it to shuffle rows (#7707, @Hzanib). * Updated `across()` examples to include an example using `everything()` (#7621, @JBrandenburg02). * Clarified how `slice_min()` and `slice_max()` work in the introduction vignette (#7717, @ccani007). * Fixed an edge case when coercing data frames to matrices (#7004). * Fixed an issue where duckplyr's ALTREP data frames were being materialized early due to internal usage of `ncol()` (#7049). * Progress towards making dplyr conformant with the public C API of R (#7741, #7797). * R >=4.1.0 is now required, in line with the [tidyverse standard](https://tidyverse.org/blog/2019/04/r-version-support/) of supporting the previous 5 minor releases of R (#7711). # dplyr 1.1.4 * `join_by()` now allows its helper functions to be namespaced with `dplyr::`, like `join_by(dplyr::between(x, lower, upper))` (#6838). * `left_join()` and friends now return a specialized error message if they detect that your join would return more rows than dplyr can handle (#6912). * `slice_*()` now throw the correct error if you forget to name `n` while also prefixing the call with `dplyr::` (#6946). * `dplyr_reconstruct()`'s default method has been rewritten to avoid materializing duckplyr queries too early (#6947). * Updated the `storms` data to include 2022 data (#6937, @steveharoz). * Updated the `starwars` data to use a new API, because the old one is defunct. There are very minor changes to the data itself (#6938, @steveharoz). # dplyr 1.1.3 * `mutate_each()` and `summarise_each()` now throw correct deprecation messages (#6869). * `setequal()` now requires the input data frames to be compatible, similar to the other set methods like `setdiff()` or `intersect()` (#6786). # dplyr 1.1.2 * `count()` better documents that it has a `.drop` argument (#6820). * Fixed tests to maintain compatibility with the next version of waldo (#6823). * Joins better handle key columns will all `NA`s (#6804). # dplyr 1.1.1 * Mutating joins now warn about multiple matches much less often. At a high level, a warning was previously being thrown when a one-to-many or many-to-many relationship was detected between the keys of `x` and `y`, but is now only thrown for a many-to-many relationship, which is much rarer and much more dangerous than one-to-many because it can result in a Cartesian explosion in the number of rows returned from the join (#6731, #6717). We've accomplished this in two steps: * `multiple` now defaults to `"all"`, and the options of `"error"` and `"warning"` are now deprecated in favor of using `relationship` (see below). We are using an accelerated deprecation process for these two options because they've only been available for a few weeks, and `relationship` is a clearly superior alternative. * The mutating joins gain a new `relationship` argument, allowing you to optionally enforce one of the following relationship constraints between the keys of `x` and `y`: `"one-to-one"`, `"one-to-many"`, `"many-to-one"`, or `"many-to-many"`. For example, `"many-to-one"` enforces that each row in `x` can match at most 1 row in `y`. If a row in `x` matches >1 rows in `y`, an error is thrown. This option serves as the replacement for `multiple = "error"`. The default behavior of `relationship` doesn't assume that there is any relationship between `x` and `y`. However, for equality joins it will check for the presence of a many-to-many relationship, and will warn if it detects one. This change unfortunately does mean that if you have set `multiple = "all"` to avoid a warning and you happened to be doing a many-to-many style join, then you will need to replace `multiple = "all"` with `relationship = "many-to-many"` to silence the new warning, but we believe this should be rare since many-to-many relationships are fairly uncommon. * Fixed a major performance regression in `case_when()`. It is still a little slower than in dplyr 1.0.10, but we plan to improve this further in the future (#6674). * Fixed a performance regression related to `nth()`, `first()`, and `last()` (#6682). * Fixed an issue where expressions involving infix operators had an abnormally large amount of overhead (#6681). * `group_data()` on ungrouped data frames is faster (#6736). * `n()` is a little faster when there are many groups (#6727). * `pick()` now returns a 1 row, 0 column tibble when `...` evaluates to an empty selection. This makes it more compatible with [tidyverse recycling rules](https://vctrs.r-lib.org/reference/theory-faq-recycling.html) in some edge cases (#6685). * `if_else()` and `case_when()` again accept logical conditions that have attributes (#6678). * `arrange()` can once again sort the `numeric_version` type from base R (#6680). * `slice_sample()` now works when the input has a column named `replace`. `slice_min()` and `slice_max()` now work when the input has columns named `na_rm` or `with_ties` (#6725). * `nth()` now errors informatively if `n` is `NA` (#6682). * Joins now throw a more informative error when `y` doesn't have the same source as `x` (#6798). * All major dplyr verbs now throw an informative error message if the input data frame contains a column named `NA` or `""` (#6758). * Deprecation warnings thrown by `filter()` now mention the correct package where the problem originated from (#6679). * Fixed an issue where using `<-` within a grouped `mutate()` or `summarise()` could cross contaminate other groups (#6666). * The compatibility vignette has been replaced with a more general vignette on using dplyr in packages, `vignette("in-packages")` (#6702). * The developer documentation in `?dplyr_extending` has been refreshed and brought up to date with all changes made in 1.1.0 (#6695). * `rename_with()` now includes an example of using `paste0(recycle0 = TRUE)` to correctly handle empty selections (#6688). * R >=3.5.0 is now explicitly required. This is in line with the tidyverse policy of supporting the [5 most recent versions of R](https://tidyverse.org/blog/2019/04/r-version-support/). # dplyr 1.1.0 ## New features * [`.by`/`by`](https://dplyr.tidyverse.org/reference/dplyr_by.html) is an experimental alternative to `group_by()` that supports per-operation grouping for `mutate()`, `summarise()`, `filter()`, and the `slice()` family (#6528). Rather than: ``` starwars %>% group_by(species, homeworld) %>% summarise(mean_height = mean(height)) ``` You can now write: ``` starwars %>% summarise( mean_height = mean(height), .by = c(species, homeworld) ) ``` The most useful reason to do this is because `.by` only affects a single operation. In the example above, an ungrouped data frame went into the `summarise()` call, so an ungrouped data frame will come out; with `.by`, you never need to remember to `ungroup()` afterwards and you never need to use the `.groups` argument. Additionally, using `summarise()` with `.by` will never sort the results by the group key, unlike with `group_by()`. Instead, the results are returned using the existing ordering of the groups from the original data. We feel this is more predictable, better maintains any ordering you might have already applied with a previous call to `arrange()`, and provides a way to maintain the current ordering without having to resort to factors. This feature was inspired by [data.table](https://CRAN.R-project.org/package=data.table), where the equivalent syntax looks like: ``` starwars[, .(mean_height = mean(height)), by = .(species, homeworld)] ``` `with_groups()` is superseded in favor of `.by` (#6582). * `reframe()` is a new experimental verb that creates a new data frame by applying functions to columns of an existing data frame. It is very similar to `summarise()`, with two big differences: * `reframe()` can return an arbitrary number of rows per group, while `summarise()` reduces each group down to a single row. * `reframe()` always returns an ungrouped data frame, while `summarise()` might return a grouped or rowwise data frame, depending on the scenario. `reframe()` has been added in response to valid concern from the community that allowing `summarise()` to return any number of rows per group increases the chance for accidental bugs. We still feel that this is a powerful technique, and is a principled replacement for `do()`, so we have moved these features to `reframe()` (#6382). * `group_by()` now uses a new algorithm for computing groups. It is often faster than the previous approach (especially when there are many groups), and in most cases there should be no changes. The one exception is with character vectors, see the C locale news bullet below for more details (#4406, #6297). * `arrange()` now uses a faster algorithm for sorting character vectors, which is heavily inspired by data.table's `forder()`. See the C locale news bullet below for more details (#4962). * Joins have been completely overhauled to enable more flexible join operations and provide more tools for quality control. Many of these changes are inspired by data.table's join syntax (#5914, #5661, #5413, #2240). * A _join specification_ can now be created through `join_by()`. This allows you to specify both the left and right hand side of a join using unquoted column names, such as `join_by(sale_date == commercial_date)`. Join specifications can be supplied to any `*_join()` function as the `by` argument. * Join specifications allow for new types of joins: * Equality joins: The most common join, specified by `==`. For example, `join_by(sale_date == commercial_date)`. * Inequality joins: For joining on inequalities, i.e.`>=`, `>`, `<`, and `<=`. For example, use `join_by(sale_date >= commercial_date)` to find every commercial that aired before a particular sale. * Rolling joins: For "rolling" the closest match forward or backwards when there isn't an exact match, specified by using the rolling helper, `closest()`. For example, `join_by(closest(sale_date >= commercial_date))` to find only the most recent commercial that aired before a particular sale. * Overlap joins: For detecting overlaps between sets of columns, specified by using one of the overlap helpers: `between()`, `within()`, or `overlaps()`. For example, use `join_by(between(commercial_date, sale_date_lower, sale_date))` to find commercials that aired before a particular sale, as long as they occurred after some lower bound, such as 40 days before the sale was made. Note that you cannot use arbitrary expressions in the join conditions, like `join_by(sale_date - 40 >= commercial_date)`. Instead, use `mutate()` to create a new column containing the result of `sale_date - 40` and refer to that by name in `join_by()`. * `multiple` is a new argument for controlling what happens when a row in `x` matches multiple rows in `y`. For equality joins and rolling joins, where this is usually surprising, this defaults to signalling a `"warning"`, but still returns all of the matches. For inequality joins, where multiple matches are usually expected, this defaults to returning `"all"` of the matches. You can also return only the `"first"` or `"last"` match, `"any"` of the matches, or you can `"error"`. * `keep` now defaults to `NULL` rather than `FALSE`. `NULL` implies `keep = FALSE` for equality conditions, but `keep = TRUE` for inequality conditions, since you generally want to preserve both sides of an inequality join. * `unmatched` is a new argument for controlling what happens when a row would be dropped because it doesn't have a match. For backwards compatibility, the default is `"drop"`, but you can also choose to `"error"` if dropped rows would be surprising. * `across()` gains an experimental `.unpack` argument to optionally unpack (as in, `tidyr::unpack()`) data frames returned by functions in `.fns` (#6360). * `consecutive_id()` for creating groups based on contiguous runs of the same values, like `data.table::rleid()` (#1534). * `case_match()` is a "vectorised switch" variant of `case_when()` that matches on values rather than logical expressions. It is like a SQL "simple" `CASE WHEN` statement, whereas `case_when()` is like a SQL "searched" `CASE WHEN` statement (#6328). * `cross_join()` is a more explicit and slightly more correct replacement for using `by = character()` during a join (#6604). * `pick()` makes it easy to access a subset of columns from the current group. `pick()` is intended as a replacement for `across(.fns = NULL)`, `cur_data()`, and `cur_data_all()`. We feel that `pick()` is a much more evocative name when you are just trying to select a subset of columns from your data (#6204). * `symdiff()` computes the symmetric difference (#4811). ## Lifecycle changes ### Breaking changes * `arrange()` and `group_by()` now use the C locale, not the system locale, when ordering or grouping character vectors. This brings _substantial_ performance improvements, increases reproducibility across R sessions, makes dplyr more consistent with data.table, and we believe it should affect little existing code. If it does affect your code, you can use `options(dplyr.legacy_locale = TRUE)` to quickly revert to the previous behavior. However, in general, we instead recommend that you use the new `.locale` argument to precisely specify the desired locale. For a full explanation please read the associated [grouping](https://github.com/tidyverse/tidyups/blob/main/006-dplyr-group-by-ordering.md) and [ordering](https://github.com/tidyverse/tidyups/blob/main/003-dplyr-radix-ordering.md) tidyups. * `bench_tbls()`, `compare_tbls()`, `compare_tbls2()`, `eval_tbls()`, `eval_tbls2()`, `location()` and `changes()`, Deprecated in 1.0.0 (May 2020), are now defunct (#6387). * `frame_data()`, `data_frame_()`, `lst_()` and `tbl_sum()` are no longer re-exported from tibble (#6276, #6277, #6278, #6284). * `select_vars()`, `rename_vars()`, `select_var()` and `current_vars()`, deprecated in 0.8.4, are now defunct (#6387). ### Newly deprecated * `across()`, `c_across()`, `if_any()`, and `if_all()` now require the `.cols` and `.fns` arguments. In general, we now recommend that you use `pick()` instead of an empty `across()` call or `across()` with no `.fns` (e.g. `across(c(x, y))`. (#6523). * Relying on the previous default of `.cols = everything()` is deprecated. We have skipped the soft-deprecation stage in this case, because indirect usage of `across()` and friends in this way is rare. * Relying on the previous default of `.fns = NULL` is not yet formally soft-deprecated, because there was no good alternative until now, but it is discouraged and will be soft-deprecated in the next minor release. * Passing `...` to `across()` is soft-deprecated because it's ambiguous when those arguments are evaluated. Now, instead of (e.g.) `across(a:b, mean, na.rm = TRUE)` you should write `across(a:b, ~ mean(.x, na.rm = TRUE))` (#6073). * `all_equal()` is deprecated. We've advised against it for some time, and we explicitly recommend you use `all.equal()`, manually reordering the rows and columns as needed (#6324). * `cur_data()` and `cur_data_all()` are soft-deprecated in favour of `pick()` (#6204). * Using `by = character()` to perform a cross join is now soft-deprecated in favor of `cross_join()` (#6604). * `filter()`ing with a 1-column matrix is deprecated (#6091). * `progress_estimate()` is deprecated for all uses (#6387). * Using `summarise()` to produce a 0 or >1 row "summary" is deprecated in favor of the new `reframe()`. See the NEWS bullet about `reframe()` for more details (#6382). * All functions deprecated in 1.0.0 (released April 2020) and earlier now warn every time you use them (#6387). This includes `combine()`, `src_local()`, `src_mysql()`, `src_postgres()`, `src_sqlite()`, `rename_vars_()`, `select_vars_()`, `summarise_each_()`, `mutate_each_()`, `as.tbl()`, `tbl_df()`, and a handful of older arguments. They are likely to be made defunct in the next major version (but not before mid 2024). * `slice()`ing with a 1-column matrix is deprecated. ### Newly superseded * `recode()` is superseded in favour of `case_match()` (#6433). * `recode_factor()` is superseded. We don't have a direct replacement for it yet, but we plan to add one to forcats. In the meantime you can often use `case_match(.ptype = factor(levels = ))` instead (#6433). * `transmute()` is superseded in favour of `mutate(.keep = "none")` (#6414). ### Newly stable * The `.keep`, `.before`, and `.after` arguments to `mutate()` have moved from experimental to stable. * The `rows_*()` family of functions have moved from experimental to stable. ## vctrs Many of dplyr's vector functions have been rewritten to make use of the vctrs package, bringing greater consistency and improved performance. * `between()` can now work with all vector types, not just numeric and date-time. Additionally, `left` and `right` can now also be vectors (with the same length as `x`), and `x`, `left`, and `right` are cast to the common type before the comparison is made (#6183, #6260, #6478). * `case_when()` (#5106): * Has a new `.default` argument that is intended to replace usage of `TRUE ~ default_value` as a more explicit and readable way to specify a default value. In the future, we will deprecate the unsafe recycling of the LHS inputs that allows `TRUE ~` to work, so we encourage you to switch to using `.default`. * No longer requires exact matching of the types of RHS values. For example, the following no longer requires you to use `NA_character_`. ``` x <- c("little", "unknown", "small", "missing", "large") case_when( x %in% c("little", "small") ~ "one", x %in% c("big", "large") ~ "two", x %in% c("missing", "unknown") ~ NA ) ``` * Supports a larger variety of RHS value types. For example, you can use a data frame to create multiple columns at once. * Has new `.ptype` and `.size` arguments which allow you to enforce a particular output type and size. * Has a better error when types or lengths were incompatible (#6261, #6206). * `coalesce()` (#6265): * Discards `NULL` inputs up front. * No longer iterates over the columns of data frame input. Instead, a row is now only coalesced if it is entirely missing, which is consistent with `vctrs::vec_detect_missing()` and greatly simplifies the implementation. * Has new `.ptype` and `.size` arguments which allow you to enforce a particular output type and size. * `first()`, `last()`, and `nth()` (#6331): * When used on a data frame, these functions now return a single row rather than a single column. This is more consistent with the vctrs principle that a data frame is generally treated as a vector of rows. * The `default` is no longer "guessed", and will always automatically be set to a missing value appropriate for the type of `x`. * Error if `n` is not an integer. `nth(x, n = 2)` is fine, but `nth(x, n = 2.5)` is now an error. * No longer support indexing into scalar objects, like `` or scalar S4 objects (#6670). Additionally, they have all gained an `na_rm` argument since they are summary functions (#6242, with contributions from @tnederlof). * `if_else()` gains most of the same benefits as `case_when()`. In particular, `if_else()` now takes the common type of `true`, `false`, and `missing` to determine the output type, meaning that you can now reliably use `NA`, rather than `NA_character_` and friends (#6243). `if_else()` also no longer allows you to supply `NULL` for either `true` or `false`, which was an undocumented usage that we consider to be off-label, because `true` and `false` are intended to be (and documented to be) vector inputs (#6730). * `na_if()` (#6329) now casts `y` to the type of `x` before comparison, which makes it clearer that this function is type and size stable on `x`. In particular, this means that you can no longer do `na_if(, 0)`, which previously accidentally allowed you to replace any instance of `0` across every column of the tibble with `NA`. `na_if()` was never intended to work this way, and this is considered off-label usage. You can also now replace `NaN` values in `x` with `na_if(x, NaN)`. * `lag()` and `lead()` now cast `default` to the type of `x`, rather than taking the common type. This ensures that these functions are type stable on `x` (#6330). * `row_number()`, `min_rank()`, `dense_rank()`, `ntile()`, `cume_dist()`, and `percent_rank()` are faster and work for more types. You can now rank by multiple columns by supplying a data frame (#6428). * `with_order()` now checks that the size of `order_by` is the same size as `x`, and now works correctly when `order_by` is a data frame (#6334). ## Minor improvements and bug fixes * Fixed an issue with latest rlang that caused internal tools (such as `mask$eval_all_summarise()`) to be mentioned in error messages (#6308). * Warnings are enriched with contextualised information in `summarise()` and `filter()` just like they have been in `mutate()` and `arrange()`. * Joins now reference the correct column in `y` when a type error is thrown while joining on two columns with different names (#6465). * Joins on very wide tables are no longer bottlenecked by the application of `suffix` (#6642). * `*_join()` now error if you supply them with additional arguments that aren't used (#6228). * `across()` used without functions inside a rowwise-data frame no longer generates an invalid data frame (#6264). * Anonymous functions supplied with `function()` and `\()` are now inlined by `across()` if possible, which slightly improves performance and makes possible further optimisations in the future. * Functions supplied to `across()` are no longer masked by columns (#6545). For instance, `across(1:2, mean)` will now work as expected even if there is a column called `mean`. * `across()` will now error when supplied `...` without a `.fns` argument (#6638). * `arrange()` now correctly ignores `NULL` inputs (#6193). * `arrange()` now works correctly when `across()` calls are used as the 2nd (or more) ordering expression (#6495). * `arrange(df, mydesc::desc(x))` works correctly when mydesc re-exports `dplyr::desc()` (#6231). * `c_across()` now evaluates `all_of()` correctly and no longer allows you to accidentally select grouping variables (#6522). * `c_across()` now throws a more informative error if you try to rename during column selection (#6522). * dplyr no longer provides `count()` and `tally()` methods for `tbl_sql`. These methods have been accidentally overriding the `tbl_lazy` methods that dbplyr provides, which has resulted in issues with the grouping structure of the output (#6338, tidyverse/dbplyr#940). * `cur_group()` now works correctly with zero row grouped data frames (#6304). * `desc()` gives a useful error message if you give it a non-vector (#6028). * `distinct()` now retains attributes of bare data frames (#6318). * `distinct()` returns columns ordered the way you request, not the same as the input data (#6156). * Error messages in `group_by()`, `distinct()`, `tally()`, and `count()` are now more relevant (#6139). * `group_by_prepare()` loses the `caller_env` argument. It was rarely used and it is no longer needed (#6444). * `group_walk()` gains an explicit `.keep` argument (#6530). * Warnings emitted inside `mutate()` and variants are now collected and stashed away. Run the new `last_dplyr_warnings()` function to see the warnings emitted within dplyr verbs during the last top-level command. This fixes performance issues when thousands of warnings are emitted with rowwise and grouped data frames (#6005, #6236). * `mutate()` behaves a little better with 0-row rowwise inputs (#6303). * A rowwise `mutate()` now automatically unlists list-columns containing length 1 vectors (#6302). * `nest_join()` has gained the `na_matches` argument that all other joins have. * `nest_join()` now preserves the type of `y` (#6295). * `n_distinct()` now errors if you don't give it any input (#6535). * `nth()`, `first()`, `last()`, and `with_order()` now sort character `order_by` vectors in the C locale. Using character vectors for `order_by` is rare, so we expect this to have little practical impact (#6451). * `ntile()` now requires `n` to be a single positive integer. * `relocate()` now works correctly with empty data frames and when `.before` or `.after` result in empty selections (#6167). * `relocate()` no longer drops attributes of bare data frames (#6341). * `relocate()` now retains the last name change when a single column is renamed multiple times while it is being moved. This better matches the behavior of `rename()` (#6209, with help from @eutwt). * `rename()` now contains examples of using `all_of()` and `any_of()` to rename using a named character vector (#6644). * `rename_with()` now disallows renaming in the `.cols` tidy-selection (#6561). * `rename_with()` now checks that the result of `.fn` is the right type and size (#6561). * `rows_insert()` now checks that `y` contains the `by` columns (#6652). * `setequal()` ignores differences between freely coercible types (e.g. integer and double) (#6114) and ignores duplicated rows (#6057). * `slice()` helpers again produce output equivalent to `slice(.data, 0)` when the `n` or `prop` argument is 0, fixing a bug introduced in the previous version (@eutwt, #6184). * `slice()` with no inputs now returns 0 rows. This is mostly for theoretical consistency (#6573). * `slice()` now errors if any expressions in `...` are named. This helps avoid accidentally misspelling an optional argument, such as `.by` (#6554). * `slice_*()` now requires `n` to be an integer. * `slice_*()` generics now perform argument validation. This should make methods more consistent and simpler to implement (#6361). * `slice_min()` and `slice_max()` can `order_by` multiple variables if you supply them as a data.frame or tibble (#6176). * `slice_min()` and `slice_max()` now consistently include missing values in the result if necessary (i.e. there aren't enough non-missing values to reach the `n` or `prop` you have selected). If you don't want missing values to be included at all, set `na_rm = TRUE` (#6177). * `slice_sample()` now accepts negative `n` and `prop` values (#6402). * `slice_sample()` returns a data frame or group with the same number of rows as the input when `replace = FALSE` and `n` is larger than the number of rows or `prop` is larger than 1. This reverts a change made in 1.0.8, returning to the behavior of 1.0.7 (#6185) * `slice_sample()` now gives a more informative error when `replace = FALSE` and the number of rows requested in the sample exceeds the number of rows in the data (#6271). * `storms` has been updated to include 2021 data and some missing storms that were omitted due to an error (@steveharoz, #6320). * `summarise()` now correctly recycles named 0-column data frames (#6509). * `union_all()`, like `union()`, now requires that data frames be compatible: i.e. they have the same columns, and the columns have compatible types. * `where()` is re-exported from tidyselect (#6597). # dplyr 1.0.10 Hot patch release to resolve R CMD check failures. # dplyr 1.0.9 * New `rows_append()` which works like `rows_insert()` but ignores keys and allows you to insert arbitrary rows with a guarantee that the type of `x` won't change (#6249, thanks to @krlmlr for the implementation and @mgirlich for the idea). * The `rows_*()` functions no longer require that the key values in `x` uniquely identify each row. Additionally, `rows_insert()` and `rows_delete()` no longer require that the key values in `y` uniquely identify each row. Relaxing this restriction should make these functions more practically useful for data frames, and alternative backends can enforce this in other ways as needed (i.e. through primary keys) (#5553). * `rows_insert()` gained a new `conflict` argument allowing you greater control over rows in `y` with keys that conflict with keys in `x`. A conflict arises if a key in `y` already exists in `x`. By default, a conflict results in an error, but you can now also `"ignore"` these `y` rows. This is very similar to the `ON CONFLICT DO NOTHING` command from SQL (#5588, with helpful additions from @mgirlich and @krlmlr). * `rows_update()`, `rows_patch()`, and `rows_delete()` gained a new `unmatched` argument allowing you greater control over rows in `y` with keys that are unmatched by the keys in `x`. By default, an unmatched key results in an error, but you can now also `"ignore"` these `y` rows (#5984, #5699). * `rows_delete()` no longer requires that the columns of `y` be a strict subset of `x`. Only the columns specified through `by` will be utilized from `y`, all others will be dropped with a message. * The `rows_*()` functions now always retain the column types of `x`. This behavior was documented, but previously wasn't being applied correctly (#6240). * The `rows_*()` functions now fail elegantly if `y` is a zero column data frame and `by` isn't specified (#6179). # dplyr 1.0.8 * Better display of error messages thanks to rlang 1.0.0. * `mutate(.keep = "none")` is no longer identical to `transmute()`. `transmute()` has not been changed, and completely ignores the column ordering of the existing data, instead relying on the ordering of expressions supplied through `...`. `mutate(.keep = "none")` has been changed to ensure that pre-existing columns are never moved, which aligns more closely with the other `.keep` options (#6086). * `filter()` forbids matrix results (#5973) and warns about data frame results, especially data frames created from `across()` with a hint to use `if_any()` or `if_all()`. * `slice()` helpers (`slice_head()`, `slice_tail()`, `slice_min()`, `slice_max()`) now accept negative values for `n` and `prop` (#5961). * `slice()` now indicates which group produces an error (#5931). * `cur_data()` and `cur_data_all()` don't simplify list columns in rowwise data frames (#5901). * dplyr now uses `rlang::check_installed()` to prompt you whether to install required packages that are missing. * `storms` data updated to 2020 (@steveharoz, #5899). * `coalesce()` accepts 1-D arrays (#5557). * The deprecated `trunc_mat()` is no longer reexported from dplyr (#6141). # dplyr 1.0.7 * `across()` uses the formula environment when inlining them (#5886). * `summarise.rowwise_df()` is quiet when the result is ungrouped (#5875). * `c_across()` and `across()` key deparsing not confused by long calls (#5883). * `across()` handles named selections (#5207). # dplyr 1.0.6 * `add_count()` is now generic (#5837). * `if_any()` and `if_all()` abort when a predicate is mistakingly used as `.cols=` (#5732). * Multiple calls to `if_any()` and/or `if_all()` in the same expression are now properly disambiguated (#5782). * `filter()` now inlines `if_any()` and `if_all()` expressions. This greatly improves performance with grouped data frames. * Fixed behaviour of `...` in top-level `across()` calls (#5813, #5832). * `across()` now inlines lambda-formulas. This is slightly more performant and will allow more optimisations in the future. * Fixed issue in `bind_rows()` causing lists to be incorrectly transformed as data frames (#5417, #5749). * `select()` no longer creates duplicate variables when renaming a variable to the same name as a grouping variable (#5841). * `dplyr_col_select()` keeps attributes for bare data frames (#5294, #5831). * Fixed quosure handling in `dplyr::group_by()` that caused issues with extra arguments (tidyverse/lubridate#959). * Removed the `name` argument from the `compute()` generic (@ianmcook, #5783). * row-wise data frames of 0 rows and list columns are supported again (#5804). # dplyr 1.0.5 * Fixed edge case of `slice_sample()` when `weight_by=` is used and there 0 rows (#5729). * `across()` can again use columns in functions defined inline (#5734). * Using testthat 3rd edition. * Fixed bugs introduced in `across()` in previous version (#5765). * `group_by()` keeps attributes unrelated to the grouping (#5760). * The `.cols=` argument of `if_any()` and `if_all()` defaults to `everything()`. # dplyr 1.0.4 * Improved performance for `across()`. This makes `summarise(across())` and `mutate(across())` perform as well as the superseded colwise equivalents (#5697). * New functions `if_any()` and `if_all()` (#4770, #5713). * `summarise()` silently ignores NULL results (#5708). * Fixed a performance regression in `mutate()` when warnings occur once per group (#5675). We no longer instrument warnings with debugging information when `mutate()` is called within `suppressWarnings()`. # dplyr 1.0.3 * `summarise()` no longer informs when the result is ungrouped (#5633). * `group_by(.drop = FALSE)` preserves ordered factors (@brianrice2, #5545). * `count()` and `tally()` are now generic. * Removed default fallbacks to lazyeval methods; this will yield better error messages when you call a dplyr function with the wrong input, and is part of our long term plan to remove the deprecated lazyeval interface. * `inner_join()` gains a `keep` parameter for consistency with the other mutating joins (@patrickbarks, #5581). * Improved performance with many columns, with a dynamic data mask using active bindings and lazy chops (#5017). * `mutate()` and friends preserves row names in data frames once more (#5418). * `group_by()` uses the ungrouped data for the implicit mutate step (#5598). You might have to define an `ungroup()` method for custom classes. For example, see https://github.com/hadley/cubelyr/pull/3. * `relocate()` can rename columns it relocates (#5569). * `distinct()` and `group_by()` have better error messages when the mutate step fails (#5060). * Clarify that `between()` is not vectorised (#5493). * Fixed `across()` issue where data frame columns would could not be referred to with `all_of()` in the nested case (`mutate()` within `mutate()`) (#5498). * `across()` handles data frames with 0 columns (#5523). * `mutate()` always keeps grouping variables, unconditional to `.keep=` (#5582). * dplyr now depends on R 3.3.0 # dplyr 1.0.2 * Fixed `across()` issue where data frame columns would mask objects referred to from `all_of()` (#5460). * `bind_cols()` gains a `.name_repair` argument, passed to `vctrs::vec_cbind()` (#5451) * `summarise(.groups = "rowwise")` makes a rowwise data frame even if the input data is not grouped (#5422). # dplyr 1.0.1 * New function `cur_data_all()` similar to `cur_data()` but includes the grouping variables (#5342). * `count()` and `tally()` no longer automatically weights by column `n` if present (#5298). dplyr 1.0.0 introduced this behaviour because of Hadley's faulty memory. Historically `tally()` automatically weighted and `count()` did not, but this behaviour was accidentally changed in 0.8.2 (#4408) so that neither automatically weighted by `n`. Since 0.8.2 is almost a year old, and the automatically weighting behaviour was a little confusing anyway, we've removed it from both `count()` and `tally()`. Use of `wt = n()` is now deprecated; now just omit the `wt` argument. * `coalesce()` now supports data frames correctly (#5326). * `cummean()` no longer has off-by-one indexing problem (@cropgen, #5287). * The call stack is preserved on error. This makes it possible to `recover()` into problematic code called from dplyr verbs (#5308). # dplyr 1.0.0 ## Breaking changes * `bind_cols()` no longer converts to a tibble, returns a data frame if the input is a data frame. * `bind_rows()`, `*_join()`, `summarise()` and `mutate()` use vctrs coercion rules. There are two main user facing changes: * Combining factor and character vectors silently creates a character vector; previously it created a character vector with a warning. * Combining multiple factors creates a factor with combined levels; previously it created a character vector with a warning. * `bind_rows()` and other functions use vctrs name repair, see `?vctrs::vec_as_names`. * `all.equal.tbl_df()` removed. * Data frames, tibbles and grouped data frames are no longer considered equal, even if the data is the same. * Equality checks for data frames no longer ignore row order or groupings. * `expect_equal()` uses `all.equal()` internally. When comparing data frames, tests that used to pass may now fail. * `distinct()` keeps the original column order. * `distinct()` on missing columns now raises an error, it has been a compatibility warning for a long time. * `group_modify()` puts the grouping variable to the front. * `n()` and `row_number()` can no longer be called directly when dplyr is not loaded, and this now generates an error: `dplyr::mutate(mtcars, x = n())`. Fix by prefixing with `dplyr::` as in `dplyr::mutate(mtcars, x = dplyr::n())` * The old data format for `grouped_df` is no longer supported. This may affect you if you have serialized grouped data frames to disk, e.g. with `saveRDS()` or when using knitr caching. * `lead()` and `lag()` are stricter about their inputs. * Extending data frames requires that the extra class or classes are added first, not last. Having the extra class at the end causes some vctrs operations to fail with a message like: ``` Input must be a vector, not a `` object ``` * `right_join()` no longer sorts the rows of the resulting tibble according to the order of the RHS `by` argument in tibble `y`. ## New features * The `cur_` functions (`cur_data()`, `cur_group()`, `cur_group_id()`, `cur_group_rows()`) provide a full set of options to you access information about the "current" group in dplyr verbs. They are inspired by data.table's `.SD`, `.GRP`, `.BY`, and `.I`. * The `rows_` functions (`rows_insert()`, `rows_update()`, `rows_upsert()`, `rows_patch()`, `rows_delete()`) provide a new API to insert and delete rows from a second data frame or table. Support for updating mutable backends is planned (#4654). * `mutate()` and `summarise()` create multiple columns from a single expression if you return a data frame (#2326). * `select()` and `rename()` use the latest version of the tidyselect interface. Practically, this means that you can now combine selections using Boolean logic (i.e. `!`, `&` and `|`), and use predicate functions with `where()` (e.g. `where(is.character)`) to select variables by type (#4680). It also makes it possible to use `select()` and `rename()` to repair data frames with duplicated names (#4615) and prevents you from accidentally introducing duplicate names (#4643). This also means that dplyr now re-exports `any_of()` and `all_of()` (#5036). * `slice()` gains a new set of helpers: * `slice_head()` and `slice_tail()` select the first and last rows, like `head()` and `tail()`, but return `n` rows _per group_. * `slice_sample()` randomly selects rows, taking over from `sample_frac()` and `sample_n()`. * `slice_min()` and `slice_max()` select the rows with the minimum or maximum values of a variable, taking over from the confusing `top_n()`. * `summarise()` can create summaries of greater than length 1 if you use a summary function that returns multiple values. * `summarise()` gains a `.groups=` argument to control the grouping structure. * New `relocate()` verb makes it easy to move columns around within a data frame (#4598). * New `rename_with()` is designed specifically for the purpose of renaming selected columns with a function (#4771). * `ungroup()` can now selectively remove grouping variables (#3760). * `pull()` can now return named vectors by specifying an additional column name (@ilarischeinin, #4102). ## Experimental features * `mutate()` (for data frames only), gains experimental new arguments `.before` and `.after` that allow you to control where the new columns are placed (#2047). * `mutate()` (for data frames only), gains an experimental new argument called `.keep` that allows you to control which variables are kept from the input `.data`. `.keep = "all"` is the default; it keeps all variables. `.keep = "none"` retains no input variables (except for grouping keys), so behaves like `transmute()`. `.keep = "unused"` keeps only variables not used to make new columns. `.keep = "used"` keeps only the input variables used to create new columns; it's useful for double checking your work (#3721). * New, experimental, `with_groups()` makes it easy to temporarily group or ungroup (#4711). ## across() * New function `across()` that can be used inside `summarise()`, `mutate()`, and other verbs to apply a function (or a set of functions) to a selection of columns. See `vignette("colwise")` for more details. * New function `c_across()` that can be used inside `summarise()` and `mutate()` in row-wise data frames to easily (e.g.) compute a row-wise mean of all numeric variables. See `vignette("rowwise")` for more details. ## rowwise() * `rowwise()` is no longer questioning; we now understand that it's an important tool when you don't have vectorised code. It now also allows you to specify additional variables that should be preserved in the output when summarising (#4723). The rowwise-ness is preserved by all operations; you need to explicit drop it with `as_tibble()` or `group_by()`. * New, experimental, `nest_by()`. It has the same interface as `group_by()`, but returns a rowwise data frame of grouping keys, supplemental with a list-column of data frames containing the rest of the data. ## vctrs * The implementation of all dplyr verbs have been changed to use primitives provided by the vctrs package. This makes it easier to add support for new types of vector, radically simplifies the implementation, and makes all dplyr verbs more consistent. * The place where you are mostly likely to be impacted by the coercion changes is when working with factors in joins or grouped mutates: now when combining factors with different levels, dplyr creates a new factor with the union of the levels. This matches base R more closely, and while perhaps strictly less correct, is much more convenient. * dplyr dropped its two heaviest dependencies: Rcpp and BH. This should make it considerably easier and faster to build from source. * The implementation of all verbs has been carefully thought through. This mostly makes implementation simpler but should hopefully increase consistency, and also makes it easier to adapt to dplyr to new data structures in the new future. Pragmatically, the biggest difference for most people will be that each verb documents its return value in terms of rows, columns, groups, and data frame attributes. * Row names are now preserved when working with data frames. ## Grouping * `group_by()` uses hashing from the `vctrs` package. * Grouped data frames now have `names<-`, `[[<-`, `[<-` and `$<-` methods that re-generate the underlying grouping. Note that modifying grouping variables in multiple steps (i.e. `df$grp1 <- 1; df$grp2 <- 1`) will be inefficient since the data frame will be regrouped after each modification. * `[.grouped_df` now regroups to respect any grouping columns that have been removed (#4708). * `mutate()` and `summarise()` can now modify grouping variables (#4709). * `group_modify()` works with additional arguments (@billdenney and @cderv, #4509) * `group_by()` does not create an arbitrary NA group when grouping by factors with `drop = TRUE` (#4460). ## Lifecycle changes * All deprecations now use the [lifecycle](https://lifecycle.r-lib.org), that means by default you'll only see a deprecation warning once per session, and you can control with `options(lifecycle_verbosity = x)` where `x` is one of NULL, "quiet", "warning", and "error". ### Removed * `id()`, deprecated in dplyr 0.5.0, is now defunct. * `failwith()`, deprecated in dplyr 0.7.0, is now defunct. * `tbl_cube()` and `nasa` have been pulled out into a separate cubelyr package (#4429). * `rbind_all()` and `rbind_list()` have been removed (@bjungbogati, #4430). * `dr_dplyr()` has been removed as it is no longer needed (#4433, @smwindecker). ### Deprecated * Use of pkgconfig for setting `na_matches` argument to join functions is now deprecated (#4914). This was rarely used, and I'm now confident that the default is correct for R. * In `add_count()`, the `drop` argument has been deprecated because it didn't actually affect the output. * `add_rownames()`: please use `tibble::rownames_to_column()` instead. * `as.tbl()` and `tbl_df()`: please use `as_tibble()` instead. * `bench_tbls()`, `compare_tbls()`, `compare_tbls2()`, `eval_tbls()` and `eval_tbls2()` are now deprecated. That were only used in a handful of packages, and we now believe that you're better off performing comparisons more directly (#4675). * `combine()`: please use `vctrs::vec_c()` instead. * `funs()`: please use `list()` instead. * `group_by(add = )`: please use `.add` instead. * `group_by(.dots = )`/`group_by_prepare(.dots = )`: please use `!!!` instead (#4734). * The use of zero-arg `group_indices()` to retrieve the group id for the "current" group is deprecated; instead use `cur_group_id()`. * Passing arguments to `group_keys()` or `group_indices()` to change the grouping has been deprecated, instead do grouping first yourself. * `location()` and `changes()`: please use `lobstr::ref()` instead. * `progress_estimated()` is soft deprecated; it's not the responsibility of dplyr to provide progress bars (#4935). * `src_local()` has been deprecated; it was part of an approach to testing dplyr backends that didn't pan out. * `src_mysql()`, `src_postgres()`, and `src_sqlite()` has been deprecated. We've recommended against them for some time. Instead please use the approach described at . * `select_vars()`, `rename_vars()`, `select_var()`, `current_vars()` are now deprecated (@perezp44, #4432) ### Superseded * The scoped helpers (all functions ending in `_if`, `_at`, or `_all`) have been superseded by `across()`. This dramatically reduces the API surface for dplyr, while at the same providing providing a more flexible and less error-prone interface (#4769). `rename_*()` and `select_*()` have been superseded by `rename_with()`. * `do()` is superseded in favour of `summarise()`. * `sample_n()` and `sample_frac()` have been superseded by `slice_sample()`. See `?sample_n` for details about why, and for examples converting from old to new usage. * `top_n()` has been superseded by`slice_min()`/`slice_max()`. See `?top_n` for details about why, and how to convert old to new usage (#4494). ### Questioning * `all_equal()` is questioning; it solves a problem that no longer seems important. ### Stable * `rowwise()` is no longer questioning. ## Documentation improvements * New `vignette("base")` which describes how dplyr verbs relate to the base R equivalents (@sastoudt, #4755) * New `vignette("grouping")` gives more details about how dplyr verbs change when applied to grouped data frames (#4779, @MikeKSmith). * `vignette("programming")` has been completely rewritten to reflect our latest vocabulary, the most recent rlang features, and our current recommendations. It should now be substantially easier to program with dplyr. ## Minor improvements and bug fixes * dplyr now has a rudimentary, experimental, and stop-gap, extension mechanism documented in `?dplyr_extending` * dplyr no longer provides a `all.equal.tbl_df()` method. It never should have done so in the first place because it owns neither the generic nor the class. It also provided a problematic implementation because, by default, it ignored the order of the rows and the columns which is usually important. This is likely to cause new test failures in downstream packages; but on the whole we believe those failures to either reflect unexpected behaviour or tests that need to be strengthened (#2751). * `coalesce()` now uses vctrs recycling and common type coercion rules (#5186). * `count()` and `add_count()` do a better job of preserving input class and attributes (#4086). * `distinct()` errors if you request it use variables that don't exist (this was previously a warning) (#4656). * `filter()`, `mutate()` and `summarise()` get better error messages. * `filter()` handles data frame results when all columns are logical vectors by reducing them with `&` (#4678). In particular this means `across()` can be used in `filter()`. * `left_join()`, `right_join()`, and `full_join()` gain a `keep` argument so that you can optionally choose to keep both sets of join keys (#4589). This is useful when you want to figure out which rows were missing from either side. * Join functions can now perform a cross-join by specifying `by = character()` (#4206.) * `groups()` now returns `list()` for ungrouped data; previously it returned `NULL` which was type-unstable (when there are groups it returns a list of symbols). * The first argument of `group_map()`, `group_modify()` and `group_walk()` has been changed to `.data` for consistency with other generics. * `group_keys.rowwise_df()` gives a 0 column data frame with `n()` rows. * `group_map()` is now a generic (#4576). * `group_by(..., .add = TRUE)` replaces `group_by(..., add = TRUE)`, with a deprecation message. The old argument name was a mistake because it prevents you from creating a new grouping var called `add` and it violates our naming conventions (#4137). * `intersect()`, `union()`, `setdiff()` and `setequal()` generics are now imported from the generics package. This reduces a conflict with lubridate. * `order_by()` gives an informative hint if you accidentally call it instead of `arrange()` #3357. * `tally()` and `count()` now message if the default output `name` (n), already exists in the data frame. To quiet the message, you'll need to supply an explicit `name` (#4284). You can override the default weighting to using a constant by setting `wt = 1`. * `starwars` dataset now does a better job of separating biological sex from gender identity. The previous `gender` column has been renamed to `sex`, since it actually describes the individual's biological sex. A new `gender` column encodes the actual gender identity using other information about the Star Wars universe (@MeganBeckett, #4456). * `src_tbls()` accepts `...` arguments (#4485, @ianmcook). This could be a breaking change for some dplyr backend packages that implement `src_tbls()`. * Better performance for extracting slices of factors and ordered factors (#4501). * `rename_at()` and `rename_all()` call the function with a simple character vector, not a `dplyr_sel_vars` (#4459). * `ntile()` is now more consistent with database implementations if the buckets have irregular size (#4495). # dplyr 0.8.5 (2020-03-07) * Maintenance release for compatibility with R-devel. # dplyr 0.8.4 (2020-01-30) * Adapt tests to changes in dependent packages. # dplyr 0.8.3 (2019-07-04) * Fixed performance regression introduced in version 0.8.2 (#4458). # dplyr 0.8.2 (2019-06-28) ## New functions * `top_frac(data, proportion)` is a shorthand for `top_n(data, proportion * n())` (#4017). ## colwise changes * Using quosures in colwise verbs is deprecated (#4330). * Updated `distinct_if()`, `distinct_at()` and `distinct_all()` to include `.keep_all` argument (@beansrowning, #4343). * `rename_at()` handles empty selection (#4324). * `*_if()` functions correctly handle columns with special names (#4380). * colwise functions support constants in formulas (#4374). ## Hybrid evaluation changes * hybrid rank functions correctly handle NA (#4427). * `first()`, `last()` and `nth()` hybrid version handles factors (#4295). ## Minor changes * `top_n()` quotes its `n` argument, `n` no longer needs to be constant for all groups (#4017). * `tbl_vars()` keeps information on grouping columns by returning a `dplyr_sel_vars` object (#4106). * `group_split()` always sets the `ptype` attribute, which make it more robust in the case where there are 0 groups. * `group_map()` and `group_modify()` work in the 0 group edge case (#4421) * `select.list()` method added so that `select()` does not dispatch on lists (#4279). * `view()` is reexported from tibble (#4423). * `group_by()` puts NA groups last in character vectors (#4227). * `arrange()` handles integer64 objects (#4366). * `summarise()` correctly resolves summarised list columns (#4349). # dplyr 0.8.1 (2019-05-14) ## Breaking changes * `group_modify()` is the new name of the function previously known as `group_map()` ## New functions * `group_map()` now only calls the function on each group and return a list. * `group_by_drop_default()`, previously known as `dplyr:::group_drops()` is exported (#4245). ## Minor changes * Lists of formulas passed to colwise verbs are now automatically named. * `group_by()` does a shallow copy even in the no groups case (#4221). * Fixed `mutate()` on rowwise data frames with 0 rows (#4224). * Fixed handling of bare formulas in colwise verbs (#4183). * Fixed performance of `n_distinct()` (#4202). * `group_indices()` now ignores empty groups by default for `data.frame`, which is consistent with the default of `group_by()` (@yutannihilation, #4208). * Fixed integer overflow in hybrid `ntile()` (#4186). * colwise functions `summarise_at()` ... can rename vars in the case of multiple functions (#4180). * `select_if()` and `rename_if()` handle logical vector predicate (#4213). * hybrid `min()` and `max()` cast to integer when possible (#4258). * `bind_rows()` correctly handles the cases where there are multiple consecutive `NULL` (#4296). * Support for R 3.1.* has been dropped. The minimal R version supported is now 3.2.0. https://www.tidyverse.org/articles/2019/04/r-version-support/ * `rename_at()` handles empty selection (#4324). # dplyr 0.8.0.1 (2019-02-15) * Fixed integer C/C++ division, forced released by CRAN (#4185). # dplyr 0.8.0 (2019-02-14) ## Breaking changes * The error `could not find function "n"` or the warning ```Calling `n()` without importing or prefixing it is deprecated, use `dplyr::n()` ``` indicates when functions like `n()`, `row_number()`, ... are not imported or prefixed. The easiest fix is to import dplyr with `import(dplyr)` in your `NAMESPACE` or `#' @import dplyr` in a roxygen comment, alternatively such functions can be imported selectively as any other function with `importFrom(dplyr, n)` in the `NAMESPACE` or `#' @importFrom dplyr n` in a roxygen comment. The third option is to prefix them, i.e. use `dplyr::n()` * If you see `checking S3 generic/method consistency` in R CMD check for your package, note that : - `sample_n()` and `sample_frac()` have gained `...` - `filter()` and `slice()` have gained `.preserve` - `group_by()` has gained `.drop` * ```Error: `.data` is a corrupt grouped_df, ...``` signals code that makes wrong assumptions about the internals of a grouped data frame. ## New functions * New selection helpers `group_cols()`. It can be called in selection contexts such as `select()` and matches the grouping variables of grouped tibbles. * `last_col()` is re-exported from tidyselect (#3584). * `group_trim()` drops unused levels of factors that are used as grouping variables. * `nest_join()` creates a list column of the matching rows. `nest_join()` + `tidyr::unnest()` is equivalent to `inner_join` (#3570). ```r band_members %>% nest_join(band_instruments) ``` * `group_nest()` is similar to `tidyr::nest()` but focusing on the variables to nest by instead of the nested columns. ```r starwars %>% group_by(species, homeworld) %>% group_nest() starwars %>% group_nest(species, homeworld) ``` * `group_split()` is similar to `base::split()` but operating on existing groups when applied to a grouped data frame, or subject to the data mask on ungrouped data frames ```r starwars %>% group_by(species, homeworld) %>% group_split() starwars %>% group_split(species, homeworld) ``` * `group_map()` and `group_walk()` are purrr-like functions to iterate on groups of a grouped data frame, jointly identified by the data subset (exposed as `.x`) and the data key (a one row tibble, exposed as `.y`). `group_map()` returns a grouped data frame that combines the results of the function, `group_walk()` is only used for side effects and returns its input invisibly. ```r mtcars %>% group_by(cyl) %>% group_map(~ head(.x, 2L)) ``` * `distinct_prepare()`, previously known as `distinct_vars()` is exported. This is mostly useful for alternative backends (e.g. `dbplyr`). ## Major changes * `group_by()` gains the `.drop` argument. When set to `FALSE` the groups are generated based on factor levels, hence some groups may be empty (#341). ```r # 3 groups tibble( x = 1:2, f = factor(c("a", "b"), levels = c("a", "b", "c")) ) %>% group_by(f, .drop = FALSE) # the order of the grouping variables matter df <- tibble( x = c(1,2,1,2), f = factor(c("a", "b", "a", "b"), levels = c("a", "b", "c")) ) df %>% group_by(f, x, .drop = FALSE) df %>% group_by(x, f, .drop = FALSE) ``` The default behaviour drops the empty groups as in the previous versions. ```r tibble( x = 1:2, f = factor(c("a", "b"), levels = c("a", "b", "c")) ) %>% group_by(f) ``` * `filter()` and `slice()` gain a `.preserve` argument to control which groups it should keep. The default `filter(.preserve = FALSE)` recalculates the grouping structure based on the resulting data, otherwise it is kept as is. ```r df <- tibble( x = c(1,2,1,2), f = factor(c("a", "b", "a", "b"), levels = c("a", "b", "c")) ) %>% group_by(x, f, .drop = FALSE) df %>% filter(x == 1) df %>% filter(x == 1, .preserve = TRUE) ``` * The notion of lazily grouped data frames have disappeared. All dplyr verbs now recalculate immediately the grouping structure, and respect the levels of factors. * Subsets of columns now properly dispatch to the `[` or `[[` method when the column is an object (a vector with a class) instead of making assumptions on how the column should be handled. The `[` method must handle integer indices, including `NA_integer_`, i.e. `x[NA_integer_]` should produce a vector of the same class as `x` with whatever represents a missing value. ## Minor changes * `tally()` works correctly on non-data frame table sources such as `tbl_sql` (#3075). * `sample_n()` and `sample_frac()` can use `n()` (#3527) * `distinct()` respects the order of the variables provided (#3195, @foo-bar-baz-qux) and handles the 0 rows and 0 columns special case (#2954). * `combine()` uses tidy dots (#3407). * `group_indices()` can be used without argument in expressions in verbs (#1185). * Using `mutate_all()`, `transmute_all()`, `mutate_if()` and `transmute_if()` with grouped tibbles now informs you that the grouping variables are ignored. In the case of the `_all()` verbs, the message invites you to use `mutate_at(df, vars(-group_cols()))` (or the equivalent `transmute_at()` call) instead if you'd like to make it explicit in your code that the operation is not applied on the grouping variables. * Scoped variants of `arrange()` respect the `.by_group` argument (#3504). * `first()` and `last()` hybrid functions fall back to R evaluation when given no arguments (#3589). * `mutate()` removes a column when the expression evaluates to `NULL` for all groups (#2945). * grouped data frames support `[, drop = TRUE]` (#3714). * New low-level constructor `new_grouped_df()` and validator `validate_grouped_df` (#3837). * `glimpse()` prints group information on grouped tibbles (#3384). * `sample_n()` and `sample_frac()` gain `...` (#2888). * Scoped filter variants now support functions and purrr-like lambdas: ```r mtcars %>% filter_at(vars(hp, vs), ~ . %% 2 == 0) ``` ## Lifecycle * `do()`, `rowwise()` and `combine()` are questioning (#3494). * `funs()` is soft-deprecated and will start issuing warnings in a future version. ## Changes to column wise functions * Scoped variants for `distinct()`: `distinct_at()`, `distinct_if()`, `distinct_all()` (#2948). * `summarise_at()` excludes the grouping variables (#3613). * `mutate_all()`, `mutate_at()`, `summarise_all()` and `summarise_at()` handle utf-8 names (#2967). ## Performance * R expressions that cannot be handled with native code are now evaluated with unwind-protection when available (on R 3.5 and later). This improves the performance of dplyr on data frames with many groups (and hence many expressions to evaluate). We benchmarked that computing a grouped average is consistently twice as fast with unwind-protection enabled. Unwind-protection also makes dplyr more robust in corner cases because it ensures the C++ destructors are correctly called in all circumstances (debugger exit, captured condition, restart invocation). * `sample_n()` and `sample_frac()` gain `...` (#2888). * Improved performance for wide tibbles (#3335). * Faster hybrid `sum()`, `mean()`, `var()` and `sd()` for logical vectors (#3189). * Hybrid version of `sum(na.rm = FALSE)` exits early when there are missing values. This considerably improves performance when there are missing values early in the vector (#3288). * `group_by()` does not trigger the additional `mutate()` on simple uses of the `.data` pronoun (#3533). ## Internal * The grouping metadata of grouped data frame has been reorganized in a single tidy tibble, that can be accessed with the new `group_data()` function. The grouping tibble consists of one column per grouping variable, followed by a list column of the (1-based) indices of the groups. The new `group_rows()` function retrieves that list of indices (#3489). ```r # the grouping metadata, as a tibble group_by(starwars, homeworld) %>% group_data() # the indices group_by(starwars, homeworld) %>% group_data() %>% pull(.rows) group_by(starwars, homeworld) %>% group_rows() ``` * Hybrid evaluation has been completely redesigned for better performance and stability. ## Documentation * Add documentation example for moving variable to back in `?select` (#3051). * column wise functions are better documented, in particular explaining when grouping variables are included as part of the selection. ### Deprecated and defunct functions * `mutate_each()` and `summarise_each()` are deprecated. # dplyr 0.7.6 * `exprs()` is no longer exported to avoid conflicts with `Biobase::exprs()` (#3638). * The MASS package is explicitly suggested to fix CRAN warnings on R-devel (#3657). * Set operations like `intersect()` and `setdiff()` reconstruct groups metadata (#3587) and keep the order of the rows (#3839). * Using namespaced calls to `base::sort()` and `base::unique()` from C++ code to avoid ambiguities when these functions are overridden (#3644). * Fix rchk errors (#3693). # dplyr 0.7.5 (2018-04-14) ## Breaking changes for package developers * The major change in this version is that dplyr now depends on the selecting backend of the tidyselect package. If you have been linking to `dplyr::select_helpers` documentation topic, you should update the link to point to `tidyselect::select_helpers`. * Another change that causes warnings in packages is that dplyr now exports the `exprs()` function. This causes a collision with `Biobase::exprs()`. Either import functions from dplyr selectively rather than in bulk, or do not import `Biobase::exprs()` and refer to it with a namespace qualifier. ## Bug fixes * `distinct(data, "string")` now returns a one-row data frame again. (The previous behavior was to return the data unchanged.) * `do()` operations with more than one named argument can access `.` (#2998). * Reindexing grouped data frames (e.g. after `filter()` or `..._join()`) never updates the `"class"` attribute. This also avoids unintended updates to the original object (#3438). * Fixed rare column name clash in `..._join()` with non-join columns of the same name in both tables (#3266). * Fix `ntile()` and `row_number()` ordering to use the locale-dependent ordering functions in R when dealing with character vectors, rather than always using the C-locale ordering function in C (#2792, @foo-bar-baz-qux). * Summaries of summaries (such as `summarise(b = sum(a), c = sum(b))`) are now computed using standard evaluation for simplicity and correctness, but slightly slower (#3233). * Fixed `summarise()` for empty data frames with zero columns (#3071). ## Major changes * `enexpr()`, `expr()`, `exprs()`, `sym()` and `syms()` are now exported. `sym()` and `syms()` construct symbols from strings or character vectors. The `expr()` variants are equivalent to `quo()`, `quos()` and `enquo()` but return simple expressions rather than quosures. They support quasiquotation. * dplyr now depends on the new tidyselect package to power `select()`, `rename()`, `pull()` and their variants (#2896). Consequently `select_vars()`, `select_var()` and `rename_vars()` are soft-deprecated and will start issuing warnings in a future version. Following the switch to tidyselect, `select()` and `rename()` fully support character vectors. You can now unquote variables like this: ``` vars <- c("disp", "cyl") select(mtcars, !! vars) select(mtcars, -(!! vars)) ``` Note that this only works in selecting functions because in other contexts strings and character vectors are ambiguous. For instance strings are a valid input in mutating operations and `mutate(df, "foo")` creates a new column by recycling "foo" to the number of rows. ## Minor changes * Support for raw vector columns in `arrange()`, `group_by()`, `mutate()`, `summarise()` and `..._join()` (minimal `raw` x `raw` support initially) (#1803). * `bind_cols()` handles unnamed list (#3402). * `bind_rows()` works around corrupt columns that have the object bit set while having no class attribute (#3349). * `combine()` returns `logical()` when all inputs are `NULL` (or when there are no inputs) (#3365, @zeehio). * `distinct()` now supports renaming columns (#3234). * Hybrid evaluation simplifies `dplyr::foo()` to `foo()` (#3309). Hybrid functions can now be masked by regular R functions to turn off hybrid evaluation (#3255). The hybrid evaluator finds functions from dplyr even if dplyr is not attached (#3456). * In `mutate()` it is now illegal to use `data.frame` in the rhs (#3298). * Support `!!!` in `recode_factor()` (#3390). * `row_number()` works on empty subsets (#3454). * `select()` and `vars()` now treat `NULL` as empty inputs (#3023). * Scoped select and rename functions (`select_all()`, `rename_if()` etc.) now work with grouped data frames, adapting the grouping as necessary (#2947, #3410). `group_by_at()` can group by an existing grouping variable (#3351). `arrange_at()` can use grouping variables (#3332). * `slice()` no longer enforce tibble classes when input is a simple `data.frame`, and ignores 0 (#3297, #3313). * `transmute()` no longer prints a message when including a group variable. ## Documentation * Improved documentation for `funs()` (#3094) and set operations (e.g. `union()`) (#3238, @edublancas). ## Error messages * Better error message if dbplyr is not installed when accessing database backends (#3225). * `arrange()` fails gracefully on `data.frame` columns (#3153). * Corrected error message when calling `cbind()` with an object of wrong length (#3085). * Add warning with explanation to `distinct()` if any of the selected columns are of type `list` (#3088, @foo-bar-baz-qux), or when used on unknown columns (#2867, @foo-bar-baz-qux). * Show clear error message for bad arguments to `funs()` (#3368). * Better error message in `..._join()` when joining data frames with duplicate or `NA` column names. Joining such data frames with a semi- or anti-join now gives a warning, which may be converted to an error in future versions (#3243, #3417). * Dedicated error message when trying to use columns of the `Interval` or `Period` classes (#2568). * Added an `.onDetach()` hook that allows for plyr to be loaded and attached without the warning message that says functions in dplyr will be masked, since dplyr is no longer attached (#3359, @jwnorman). ## Performance * `sample_n()` and `sample_frac()` on grouped data frame are now faster especially for those with large number of groups (#3193, @saurfang). ## Internal * Compute variable names for joins in R (#3430). * Bumped Rcpp dependency to 0.12.15 to avoid imperfect detection of `NA` values in hybrid evaluation fixed in RcppCore/Rcpp#790 (#2919). * Avoid cleaning the data mask, a temporary environment used to evaluate expressions. If the environment, in which e.g. a `mutate()` expression is evaluated, is preserved until after the operation, accessing variables from that environment now gives a warning but still returns `NULL` (#3318). # dplyr 0.7.4 * Fix recent Fedora and ASAN check errors (#3098). * Avoid dependency on Rcpp 0.12.10 (#3106). # dplyr 0.7.3 * Fixed protection error that occurred when creating a character column using grouped `mutate()` (#2971). * Fixed a rare problem with accessing variable values in `summarise()` when all groups have size one (#3050). * `distinct()` now throws an error when used on unknown columns (#2867, @foo-bar-baz-qux). * Fixed rare out-of-bounds memory write in `slice()` when negative indices beyond the number of rows were involved (#3073). * `select()`, `rename()` and `summarise()` no longer change the grouped vars of the original data (#3038). * `nth(default = var)`, `first(default = var)` and `last(default = var)` fall back to standard evaluation in a grouped operation instead of triggering an error (#3045). * `case_when()` now works if all LHS are atomic (#2909), or when LHS or RHS values are zero-length vectors (#3048). * `case_when()` accepts `NA` on the LHS (#2927). * Semi- and anti-joins now preserve the order of left-hand-side data frame (#3089). * Improved error message for invalid list arguments to `bind_rows()` (#3068). * Grouping by character vectors is now faster (#2204). * Fixed a crash that occurred when an unexpected input was supplied to the `call` argument of `order_by()` (#3065). # dplyr 0.7.2 * Move build-time vs. run-time checks out of `.onLoad()` and into `dr_dplyr()`. # dplyr 0.7.1 * Use new versions of bindrcpp and glue to avoid protection problems. Avoid wrapping arguments to internal error functions (#2877). Fix two protection mistakes found by rchk (#2868). * Fix C++ error that caused compilation to fail on mac cran (#2862) * Fix undefined behaviour in `between()`, where `NA_REAL` were assigned instead of `NA_LOGICAL`. (#2855, @zeehio) * `top_n()` now executes operations lazily for compatibility with database backends (#2848). * Reuse of new variables created in ungrouped `mutate()` possible again, regression introduced in dplyr 0.7.0 (#2869). * Quosured symbols do not prevent hybrid handling anymore. This should fix many performance issues introduced with tidyeval (#2822). # dplyr 0.7.0 ## New data, functions, and features * Five new datasets provide some interesting built-in datasets to demonstrate dplyr verbs (#2094): * `starwars` dataset about starwars characters; has list columns * `storms` has the trajectories of ~200 tropical storms * `band_members`, `band_instruments` and `band_instruments2` has some simple data to demonstrate joins. * New `add_count()` and `add_tally()` for adding an `n` column within groups (#2078, @dgrtwo). * `arrange()` for grouped data frames gains a `.by_group` argument so you can choose to sort by groups if you want to (defaults to `FALSE`) (#2318) * New `pull()` generic for extracting a single column either by name or position (either from the left or the right). Thanks to @paulponcet for the idea (#2054). This verb is powered with the new `select_var()` internal helper, which is exported as well. It is like `select_vars()` but returns a single variable. * `as_tibble()` is re-exported from tibble. This is the recommend way to create tibbles from existing data frames. `tbl_df()` has been softly deprecated. `tribble()` is now imported from tibble (#2336, @chrMongeau); this is now preferred to `frame_data()`. ## Deprecated and defunct * dplyr no longer messages that you need dtplyr to work with data.table (#2489). * Long deprecated `regroup()`, `mutate_each_q()` and `summarise_each_q()` functions have been removed. * Deprecated `failwith()`. I'm not even sure why it was here. * Soft-deprecated `mutate_each()` and `summarise_each()`, these functions print a message which will be changed to a warning in the next release. * The `.env` argument to `sample_n()` and `sample_frac()` is defunct, passing a value to this argument print a message which will be changed to a warning in the next release. ## Databases This version of dplyr includes some major changes to how database connections work. By and large, you should be able to continue using your existing dplyr database code without modification, but there are two big changes that you should be aware of: * Almost all database related code has been moved out of dplyr and into a new package, [dbplyr](https://github.com/tidyverse/dbplyr/). This makes dplyr simpler, and will make it easier to release fixes for bugs that only affect databases. `src_mysql()`, `src_postgres()`, and `src_sqlite()` will still live dplyr so your existing code continues to work. * It is no longer necessary to create a remote "src". Instead you can work directly with the database connection returned by DBI. This reflects the maturity of the DBI ecosystem. Thanks largely to the work of Kirill Muller (funded by the R Consortium) DBI backends are now much more consistent, comprehensive, and easier to use. That means that there's no longer a need for a layer in between you and DBI. You can continue to use `src_mysql()`, `src_postgres()`, and `src_sqlite()`, but I recommend a new style that makes the connection to DBI more clear: ```R library(dplyr) con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") DBI::dbWriteTable(con, "mtcars", mtcars) mtcars2 <- tbl(con, "mtcars") mtcars2 ``` This is particularly useful if you want to perform non-SELECT queries as you can do whatever you want with `DBI::dbGetQuery()` and `DBI::dbExecute()`. If you've implemented a database backend for dplyr, please read the [backend news](https://github.com/tidyverse/dbplyr/blob/main/NEWS.md#backends) to see what's changed from your perspective (not much). If you want to ensure your package works with both the current and previous version of dplyr, see `wrap_dbplyr_obj()` for helpers. ## UTF-8 * Internally, column names are always represented as character vectors, and not as language symbols, to avoid encoding problems on Windows (#1950, #2387, #2388). * Error messages and explanations of data frame inequality are now encoded in UTF-8, also on Windows (#2441). * Joins now always reencode character columns to UTF-8 if necessary. This gives a nice speedup, because now pointer comparison can be used instead of string comparison, but relies on a proper encoding tag for all strings (#2514). * Fixed problems when joining factor or character encodings with a mix of native and UTF-8 encoded values (#1885, #2118, #2271, #2451). * Fix `group_by()` for data frames that have UTF-8 encoded names (#2284, #2382). * New `group_vars()` generic that returns the grouping as character vector, to avoid the potentially lossy conversion to language symbols. The list returned by `group_by_prepare()` now has a new `group_names` component (#1950, #2384). ## Colwise functions * `rename()`, `select()`, `group_by()`, `filter()`, `arrange()` and `transmute()` now have scoped variants (verbs suffixed with `_if()`, `_at()` and `_all()`). Like `mutate_all()`, `summarise_if()`, etc, these variants apply an operation to a selection of variables. * The scoped verbs taking predicates (`mutate_if()`, `summarise_if()`, etc) now support S3 objects and lazy tables. S3 objects should implement methods for `length()`, `[[` and `tbl_vars()`. For lazy tables, the first 100 rows are collected and the predicate is applied on this subset of the data. This is robust for the common case of checking the type of a column (#2129). * Summarise and mutate colwise functions pass `...` on to the manipulation functions. * The performance of colwise verbs like `mutate_all()` is now back to where it was in `mutate_each()`. * `funs()` has better handling of namespaced functions (#2089). * Fix issue with `mutate_if()` and `summarise_if()` when a predicate function returns a vector of `FALSE` (#1989, #2009, #2011). ## Tidyeval dplyr has a new approach to non-standard evaluation (NSE) called tidyeval. It is described in detail in `vignette("programming")` but, in brief, gives you the ability to interpolate values in contexts where dplyr usually works with expressions: ```r my_var <- quo(homeworld) starwars %>% group_by(!!my_var) %>% summarise_at(vars(height:mass), mean, na.rm = TRUE) ``` This means that the underscored version of each main verb is no longer needed, and so these functions have been deprecated (but remain around for backward compatibility). * `order_by()`, `top_n()`, `sample_n()` and `sample_frac()` now use tidyeval to capture their arguments by expression. This makes it possible to use unquoting idioms (see `vignette("programming")`) and fixes scoping issues (#2297). * Most verbs taking dots now ignore the last argument if empty. This makes it easier to copy lines of code without having to worry about deleting trailing commas (#1039). * [API] The new `.data` and `.env` environments can be used inside all verbs that operate on data: `.data$column_name` accesses the column `column_name`, whereas `.env$var` accesses the external variable `var`. Columns or external variables named `.data` or `.env` are shadowed, use `.data$...` and/or `.env$...` to access them. (`.data` implements strict matching also for the `$` operator (#2591).) The `column()` and `global()` functions have been removed. They were never documented officially. Use the new `.data` and `.env` environments instead. * Expressions in verbs are now interpreted correctly in many cases that failed before (e.g., use of `$`, `case_when()`, nonstandard evaluation, ...). These expressions are now evaluated in a specially constructed temporary environment that retrieves column data on demand with the help of the `bindrcpp` package (#2190). This temporary environment poses restrictions on assignments using `<-` inside verbs. To prevent leaking of broken bindings, the temporary environment is cleared after the evaluation (#2435). ## Verbs ### Joins * [API] `xxx_join.tbl_df(na_matches = "never")` treats all `NA` values as different from each other (and from any other value), so that they never match. This corresponds to the behavior of joins for database sources, and of database joins in general. To match `NA` values, pass `na_matches = "na"` to the join verbs; this is only supported for data frames. The default is `na_matches = "na"`, kept for the sake of compatibility to v0.5.0. It can be tweaked by calling `pkgconfig::set_config("dplyr::na_matches", "na")` (#2033). * `common_by()` gets a better error message for unexpected inputs (#2091) * Fix groups when joining grouped data frames with duplicate columns (#2330, #2334, @davidkretch). * One of the two join suffixes can now be an empty string, dplyr no longer hangs (#2228, #2445). * Anti- and semi-joins warn if factor levels are inconsistent (#2741). * Warnings about join column inconsistencies now contain the column names (#2728). ### Select * For selecting variables, the first selector decides if it's an inclusive selection (i.e., the initial column list is empty), or an exclusive selection (i.e., the initial column list contains all columns). This means that `select(mtcars, contains("am"), contains("FOO"), contains("vs"))` now returns again both `am` and `vs` columns like in dplyr 0.4.3 (#2275, #2289, @r2evans). * Select helpers now throw an error if called when no variables have been set (#2452) * Helper functions in `select()` (and related verbs) are now evaluated in a context where column names do not exist (#2184). * `select()` (and the internal function `select_vars()`) now support column names in addition to column positions. As a result, expressions like `select(mtcars, "cyl")` are now allowed. ### Other * `recode()`, `case_when()` and `coalesce()` now support splicing of arguments with rlang's `!!!` operator. * `count()` now preserves the grouping of its input (#2021). * `distinct()` no longer duplicates variables (#2001). * Empty `distinct()` with a grouped data frame works the same way as an empty `distinct()` on an ungrouped data frame, namely it uses all variables (#2476). * `copy_to()` now returns its output invisibly (since you're often just calling for the side-effect). * `filter()` and `lag()` throw informative error if used with ts objects (#2219) * `mutate()` recycles list columns of length 1 (#2171). * `mutate()` gives better error message when attempting to add a non-vector column (#2319), or attempting to remove a column with `NULL` (#2187, #2439). * `summarise()` now correctly evaluates newly created factors (#2217), and can create ordered factors (#2200). * Ungrouped `summarise()` uses summary variables correctly (#2404, #2453). * Grouped `summarise()` no longer converts character `NA` to empty strings (#1839). ## Combining and comparing * `all_equal()` now reports multiple problems as a character vector (#1819, #2442). * `all_equal()` checks that factor levels are equal (#2440, #2442). * `bind_rows()` and `bind_cols()` give an error for database tables (#2373). * `bind_rows()` works correctly with `NULL` arguments and an `.id` argument (#2056), and also for zero-column data frames (#2175). * Breaking change: `bind_rows()` and `combine()` are more strict when coercing. Logical values are no longer coerced to integer and numeric. Date, POSIXct and other integer or double-based classes are no longer coerced to integer or double as there is chance of attributes or information being lost (#2209, @zeehio). * `bind_cols()` now calls `tibble::repair_names()` to ensure that all names are unique (#2248). * `bind_cols()` handles empty argument list (#2048). * `bind_cols()` better handles `NULL` inputs (#2303, #2443). * `bind_rows()` explicitly rejects columns containing data frames (#2015, #2446). * `bind_rows()` and `bind_cols()` now accept vectors. They are treated as rows by the former and columns by the latter. Rows require inner names like `c(col1 = 1, col2 = 2)`, while columns require outer names: `col1 = c(1, 2)`. Lists are still treated as data frames but can be spliced explicitly with `!!!`, e.g. `bind_rows(!!! x)` (#1676). * `rbind_list()` and `rbind_all()` now call `.Deprecated()`, they will be removed in the next CRAN release. Please use `bind_rows()` instead. * `combine()` accepts `NA` values (#2203, @zeehio) * `combine()` and `bind_rows()` with character and factor types now always warn about the coercion to character (#2317, @zeehio) * `combine()` and `bind_rows()` accept `difftime` objects. * `mutate` coerces results from grouped dataframes accepting combinable data types (such as `integer` and `numeric`). (#1892, @zeehio) ## Vector functions * `%in%` gets new hybrid handler (#126). * `between()` returns NA if `left` or `right` is `NA` (fixes #2562). * `case_when()` supports `NA` values (#2000, @tjmahr). * `first()`, `last()`, and `nth()` have better default values for factor, Dates, POSIXct, and data frame inputs (#2029). * Fixed segmentation faults in hybrid evaluation of `first()`, `last()`, `nth()`, `lead()`, and `lag()`. These functions now always fall back to the R implementation if called with arguments that the hybrid evaluator cannot handle (#948, #1980). * `n_distinct()` gets larger hash tables given slightly better performance (#977). * `nth()` and `ntile()` are more careful about proper data types of their return values (#2306). * `ntile()` ignores `NA` when computing group membership (#2564). * `lag()` enforces integer `n` (#2162, @kevinushey). * hybrid `min()` and `max()` now always return a `numeric` and work correctly in edge cases (empty input, all `NA`, ...) (#2305, #2436). * `min_rank("string")` no longer segfaults in hybrid evaluation (#2279, #2444). * `recode()` can now recode a factor to other types (#2268) * `recode()` gains `.dots` argument to support passing replacements as list (#2110, @jlegewie). ## Other minor changes and bug fixes * Many error messages are more helpful by referring to a column name or a position in the argument list (#2448). * New `is_grouped_df()` alias to `is.grouped_df()`. * `tbl_vars()` now has a `group_vars` argument set to `TRUE` by default. If `FALSE`, group variables are not returned. * Fixed segmentation fault after calling `rename()` on an invalid grouped data frame (#2031). * `rename_vars()` gains a `strict` argument to control if an error is thrown when you try and rename a variable that doesn't exist. * Fixed undefined behavior for `slice()` on a zero-column data frame (#2490). * Fixed very rare case of false match during join (#2515). * Restricted workaround for `match()` to R 3.3.0. (#1858). * dplyr now warns on load when the version of R or Rcpp during installation is different to the currently installed version (#2514). * Fixed improper reuse of attributes when creating a list column in `summarise()` and perhaps `mutate()` (#2231). * `mutate()` and `summarise()` always strip the `names` attribute from new or updated columns, even for ungrouped operations (#1689). * Fixed rare error that could lead to a segmentation fault in `all_equal(ignore_col_order = FALSE)` (#2502). * The "dim" and "dimnames" attributes are always stripped when copying a vector (#1918, #2049). * `grouped_df` and `rowwise` are registered officially as S3 classes. This makes them easier to use with S4 (#2276, @joranE, #2789). * All operations that return tibbles now include the `"tbl"` class. This is important for correct printing with tibble 1.3.1 (#2789). * Makeflags uses PKG_CPPFLAGS for defining preprocessor macros. * astyle formatting for C++ code, tested but not changed as part of the tests (#2086, #2103). * Update RStudio project settings to install tests (#1952). * Using `Rcpp::interfaces()` to register C callable interfaces, and registering all native exported functions via `R_registerRoutines()` and `useDynLib(.registration = TRUE)` (#2146). * Formatting of grouped data frames now works by overriding the `tbl_sum()` generic instead of `print()`. This means that the output is more consistent with tibble, and that `format()` is now supported also for SQL sources (#2781). # dplyr 0.5.0 ## Breaking changes ### Existing functions * `arrange()` once again ignores grouping (#1206). * `distinct()` now only keeps the distinct variables. If you want to return all variables (using the first row for non-distinct values) use `.keep_all = TRUE` (#1110). For SQL sources, `.keep_all = FALSE` is implemented using `GROUP BY`, and `.keep_all = TRUE` raises an error (#1937, #1942, @krlmlr). (The default behaviour of using all variables when none are specified remains - this note only applies if you select some variables). * The select helper functions `starts_with()`, `ends_with()` etc are now real exported functions. This means that you'll need to import those functions if you're using from a package where dplyr is not attached. i.e. `dplyr::select(mtcars, starts_with("m"))` used to work, but now you'll need `dplyr::select(mtcars, dplyr::starts_with("m"))`. ### Deprecated and defunct functions * The long deprecated `chain()`, `chain_q()` and `%.%` have been removed. Please use `%>%` instead. * `id()` has been deprecated. Please use `group_indices()` instead (#808). * `rbind_all()` and `rbind_list()` are formally deprecated. Please use `bind_rows()` instead (#803). * Outdated benchmarking demos have been removed (#1487). * Code related to starting and signalling clusters has been moved out to [multidplyr](https://github.com/tidyverse/multidplyr). ## New functions * `coalesce()` finds the first non-missing value from a set of vectors. (#1666, thanks to @krlmlr for initial implementation). * `case_when()` is a general vectorised if + else if (#631). * `if_else()` is a vectorised if statement: it's a stricter (type-safe), faster, and more predictable version of `ifelse()`. In SQL it is translated to a `CASE` statement. * `na_if()` makes it easy to replace a certain value with an `NA` (#1707). In SQL it is translated to `NULL_IF`. * `near(x, y)` is a helper for `abs(x - y) < tol` (#1607). * `recode()` is vectorised equivalent to `switch()` (#1710). * `union_all()` method. Maps to `UNION ALL` for SQL sources, `bind_rows()` for data frames/tbl\_dfs, and `combine()` for vectors (#1045). * A new family of functions replace `summarise_each()` and `mutate_each()` (which will thus be deprecated in a future release). `summarise_all()` and `mutate_all()` apply a function to all columns while `summarise_at()` and `mutate_at()` operate on a subset of columns. These columns are selected with either a character vector of columns names, a numeric vector of column positions, or a column specification with `select()` semantics generated by the new `columns()` helper. In addition, `summarise_if()` and `mutate_if()` take a predicate function or a logical vector (these verbs currently require local sources). All these functions can now take ordinary functions instead of a list of functions generated by `funs()` (though this is only useful for local sources). (#1845, @lionel-) * `select_if()` lets you select columns with a predicate function. Only compatible with local sources. (#497, #1569, @lionel-) ## Local backends ### dtplyr All data table related code has been separated out in to a new dtplyr package. This decouples the development of the data.table interface from the development of the dplyr package. If both data.table and dplyr are loaded, you'll get a message reminding you to load dtplyr. ### Tibble Functions related to the creation and coercion of `tbl_df`s, now live in their own package: [tibble](https://posit.co/blog/tibble-1-0-0/). See `vignette("tibble")` for more details. * `$` and `[[` methods that never do partial matching (#1504), and throw an error if the variable does not exist. * `all_equal()` allows to compare data frames ignoring row and column order, and optionally ignoring minor differences in type (e.g. int vs. double) (#821). The test handles the case where the df has 0 columns (#1506). The test fails fails when convert is `FALSE` and types don't match (#1484). * `all_equal()` shows better error message when comparing raw values or when types are incompatible and `convert = TRUE` (#1820, @krlmlr). * `add_row()` makes it easy to add a new row to data frame (#1021) * `as_data_frame()` is now an S3 generic with methods for lists (the old `as_data_frame()`), data frames (trivial), and matrices (with efficient C++ implementation) (#876). It no longer strips subclasses. * The internals of `data_frame()` and `as_data_frame()` have been aligned, so `as_data_frame()` will now automatically recycle length-1 vectors. Both functions give more informative error messages if you attempting to create an invalid data frame. You can no longer create a data frame with duplicated names (#820). Both check for `POSIXlt` columns, and tell you to use `POSIXct` instead (#813). * `frame_data()` properly constructs rectangular tables (#1377, @kevinushey), and supports list-cols. * `glimpse()` is now a generic. The default method dispatches to `str()` (#1325). It now (invisibly) returns its first argument (#1570). * `lst()` and `lst_()` which create lists in the same way that `data_frame()` and `data_frame_()` create data frames (#1290). * `print.tbl_df()` is considerably faster if you have very wide data frames. It will now also only list the first 100 additional variables not already on screen - control this with the new `n_extra` parameter to `print()` (#1161). When printing a grouped data frame the number of groups is now printed with thousands separators (#1398). The type of list columns is correctly printed (#1379) * Package includes `setOldClass(c("tbl_df", "tbl", "data.frame"))` to help with S4 dispatch (#969). * `tbl_df` automatically generates column names (#1606). ### tbl_cube * new `as_data_frame.tbl_cube()` (#1563, @krlmlr). * `tbl_cube`s are now constructed correctly from data frames, duplicate dimension values are detected, missing dimension values are filled with `NA`. The construction from data frames now guesses the measure variables by default, and allows specification of dimension and/or measure variables (#1568, @krlmlr). * Swap order of `dim_names` and `met_name` arguments in `as.tbl_cube` (for `array`, `table` and `matrix`) for consistency with `tbl_cube` and `as.tbl_cube.data.frame`. Also, the `met_name` argument to `as.tbl_cube.table` now defaults to `"Freq"` for consistency with `as.data.frame.table` (@krlmlr, #1374). ## Remote backends * `as_data_frame()` on SQL sources now returns all rows (#1752, #1821, @krlmlr). * `compute()` gets new parameters `indexes` and `unique_indexes` that make it easier to add indexes (#1499, @krlmlr). * `db_explain()` gains a default method for DBIConnections (#1177). * The backend testing system has been improved. This lead to the removal of `temp_srcs()`. In the unlikely event that you were using this function, you can instead use `test_register_src()`, `test_load()`, and `test_frame()`. * You can now use `right_join()` and `full_join()` with remote tables (#1172). ### SQLite * `src_memdb()` is a session-local in-memory SQLite database. `memdb_frame()` works like `data_frame()`, but creates a new table in that database. * `src_sqlite()` now uses a stricter quoting character, `` ` ``, instead of `"`. SQLite "helpfully" will convert `"x"` into a string if there is no identifier called x in the current scope (#1426). * `src_sqlite()` throws errors if you try and use it with window functions (#907). ### SQL translation * `filter.tbl_sql()` now puts parens around each argument (#934). * Unary `-` is better translated (#1002). * `escape.POSIXt()` method makes it easier to use date times. The date is rendered in ISO 8601 format in UTC, which should work in most databases (#857). * `is.na()` gets a missing space (#1695). * `if`, `is.na()`, and `is.null()` get extra parens to make precedence more clear (#1695). * `pmin()` and `pmax()` are translated to `MIN()` and `MAX()` (#1711). * Window functions: * Work on ungrouped data (#1061). * Warning if order is not set on cumulative window functions. * Multiple partitions or ordering variables in windowed functions no longer generate extra parentheses, so should work for more databases (#1060) ### Internals This version includes an almost total rewrite of how dplyr verbs are translated into SQL. Previously, I used a rather ad-hoc approach, which tried to guess when a new subquery was needed. Unfortunately this approach was fraught with bugs, so in this version I've implemented a much richer internal data model. Now there is a three step process: 1. When applied to a `tbl_lazy`, each dplyr verb captures its inputs and stores in a `op` (short for operation) object. 2. `sql_build()` iterates through the operations building to build up an object that represents a SQL query. These objects are convenient for testing as they are lists, and are backend agnostics. 3. `sql_render()` iterates through the queries and generates the SQL, using generics (like `sql_select()`) that can vary based on the backend. In the short-term, this increased abstraction is likely to lead to some minor performance decreases, but the chance of dplyr generating correct SQL is much much higher. In the long-term, these abstractions will make it possible to write a query optimiser/compiler in dplyr, which would make it possible to generate much more succinct queries. If you have written a dplyr backend, you'll need to make some minor changes to your package: * `sql_join()` has been considerably simplified - it is now only responsible for generating the join query, not for generating the intermediate selects that rename the variable. Similarly for `sql_semi_join()`. If you've provided new methods in your backend, you'll need to rewrite. * `select_query()` gains a distinct argument which is used for generating queries for `distinct()`. It loses the `offset` argument which was never used (and hence never tested). * `src_translate_env()` has been replaced by `sql_translate_env()` which should have methods for the connection object. There were two other tweaks to the exported API, but these are less likely to affect anyone. * `translate_sql()` and `partial_eval()` got a new API: now use connection + variable names, rather than a `tbl`. This makes testing considerably easier. `translate_sql_q()` has been renamed to `translate_sql_()`. * Also note that the sql generation generics now have a default method, instead methods for DBIConnection and NULL. ## Minor improvements and bug fixes ### Single table verbs * Avoiding segfaults in presence of `raw` columns (#1803, #1817, @krlmlr). * `arrange()` fails gracefully on list columns (#1489) and matrices (#1870, #1945, @krlmlr). * `count()` now adds additional grouping variables, rather than overriding existing (#1703). `tally()` and `count()` can now count a variable called `n` (#1633). Weighted `count()`/`tally()` ignore `NA`s (#1145). * The progress bar in `do()` is now updated at most 20 times per second, avoiding unnecessary redraws (#1734, @mkuhn) * `distinct()` doesn't crash when given a 0-column data frame (#1437). * `filter()` throws an error if you supply an named arguments. This is usually a type: `filter(df, x = 1)` instead of `filter(df, x == 1)` (#1529). * `summarise()` correctly coerces factors with different levels (#1678), handles min/max of already summarised variable (#1622), and supports data frames as columns (#1425). * `select()` now informs you that it adds missing grouping variables (#1511). It works even if the grouping variable has a non-syntactic name (#1138). Negating a failed match (e.g. `select(mtcars, -contains("x"))`) returns all columns, instead of no columns (#1176) The `select()` helpers are now exported and have their own documentation (#1410). `one_of()` gives a useful error message if variables names are not found in data frame (#1407). * The naming behaviour of `summarise_each()` and `mutate_each()` has been tweaked so that you can force inclusion of both the function and the variable name: `summarise_each(mtcars, funs(mean = mean), everything())` (#442). * `mutate()` handles factors that are all `NA` (#1645), or have different levels in different groups (#1414). It disambiguates `NA` and `NaN` (#1448), and silently promotes groups that only contain `NA` (#1463). It deep copies data in list columns (#1643), and correctly fails on incompatible columns (#1641). `mutate()` on a grouped data no longer groups grouping attributes (#1120). `rowwise()` mutate gives expected results (#1381). * `one_of()` tolerates unknown variables in `vars`, but warns (#1848, @jennybc). * `print.grouped_df()` passes on `...` to `print()` (#1893). * `slice()` correctly handles grouped attributes (#1405). * `ungroup()` generic gains `...` (#922). ### Dual table verbs * `bind_cols()` matches the behaviour of `bind_rows()` and ignores `NULL` inputs (#1148). It also handles `POSIXct`s with integer base type (#1402). * `bind_rows()` handles 0-length named lists (#1515), promotes factors to characters (#1538), and warns when binding factor and character (#1485). bind_rows()` is more flexible in the way it can accept data frames, lists, list of data frames, and list of lists (#1389). * `bind_rows()` rejects `POSIXlt` columns (#1875, @krlmlr). * Both `bind_cols()` and `bind_rows()` infer classes and grouping information from the first data frame (#1692). * `rbind()` and `cbind()` get `grouped_df()` methods that make it harder to create corrupt data frames (#1385). You should still prefer `bind_rows()` and `bind_cols()`. * Joins now use correct class when joining on `POSIXct` columns (#1582, @joel23888), and consider time zones (#819). Joins handle a `by` that is empty (#1496), or has duplicates (#1192). Suffixes grow progressively to avoid creating repeated column names (#1460). Joins on string columns should be substantially faster (#1386). Extra attributes are ok if they are identical (#1636). Joins work correct when factor levels not equal (#1712, #1559). Anti- and semi-joins give correct result when by variable is a factor (#1571), but warn if factor levels are inconsistent (#2741). A clear error message is given for joins where an explicit `by` contains unavailable columns (#1928, #1932). Warnings about join column inconsistencies now contain the column names (#2728). * `inner_join()`, `left_join()`, `right_join()`, and `full_join()` gain a `suffix` argument which allows you to control what suffix duplicated variable names receive (#1296). * Set operations (`intersect()`, `union()` etc) respect coercion rules (#799). `setdiff()` handles factors with `NA` levels (#1526). * There were a number of fixes to enable joining of data frames that don't have the same encoding of column names (#1513), including working around bug 16885 regarding `match()` in R 3.3.0 (#1806, #1810, @krlmlr). ### Vector functions * `combine()` silently drops `NULL` inputs (#1596). * Hybrid `cummean()` is more stable against floating point errors (#1387). * Hybrid `lead()` and `lag()` received a considerable overhaul. They are more careful about more complicated expressions (#1588), and falls back more readily to pure R evaluation (#1411). They behave correctly in `summarise()` (#1434). and handle default values for string columns. * Hybrid `min()` and `max()` handle empty sets (#1481). * `n_distinct()` uses multiple arguments for data frames (#1084), falls back to R evaluation when needed (#1657), reverting decision made in (#567). Passing no arguments gives an error (#1957, #1959, @krlmlr). * `nth()` now supports negative indices to select from end, e.g. `nth(x, -2)` selects the 2nd value from the end of `x` (#1584). * `top_n()` can now also select bottom `n` values by passing a negative value to `n` (#1008, #1352). * Hybrid evaluation leaves formulas untouched (#1447). # dplyr 0.4.3 ## Improved encoding support Until now, dplyr's support for non-UTF8 encodings has been rather shaky. This release brings a number of improvement to fix these problems: it's probably not perfect, but should be a lot better than the previously version. This includes fixes to `arrange()` (#1280), `bind_rows()` (#1265), `distinct()` (#1179), and joins (#1315). `print.tbl_df()` also received a fix for strings with invalid encodings (#851). ## Other minor improvements and bug fixes * `frame_data()` provides a means for constructing `data_frame`s using a simple row-wise language. (#1358, @kevinushey) * `all.equal()` no longer runs all outputs together (#1130). * `as_data_frame()` gives better error message with NA column names (#1101). * `[.tbl_df` is more careful about subsetting column names (#1245). * `arrange()` and `mutate()` work on empty data frames (#1142). * `arrange()`, `filter()`, `slice()`, and `summarise()` preserve data frame meta attributes (#1064). * `bind_rows()` and `bind_cols()` accept lists (#1104): during initial data cleaning you no longer need to convert lists to data frames, but can instead feed them to `bind_rows()` directly. * `bind_rows()` gains a `.id` argument. When supplied, it creates a new column that gives the name of each data frame (#1337, @lionel-). * `bind_rows()` respects the `ordered` attribute of factors (#1112), and does better at comparing `POSIXct`s (#1125). The `tz` attribute is ignored when determining if two `POSIXct` vectors are comparable. If the `tz` of all inputs is the same, it's used, otherwise its set to `UTC`. * `data_frame()` always produces a `tbl_df` (#1151, @kevinushey) * `filter(x, TRUE, TRUE)` now just returns `x` (#1210), it doesn't internally modify the first argument (#971), and it now works with rowwise data (#1099). It once again works with data tables (#906). * `glimpse()` also prints out the number of variables in addition to the number of observations (@ilarischeinin, #988). * Joins handles matrix columns better (#1230), and can join `Date` objects with heterogeneous representations (some `Date`s are integers, while other are numeric). This also improves `all.equal()` (#1204). * Fixed `percent_rank()` and `cume_dist()` so that missing values no longer affect denominator (#1132). * `print.tbl_df()` now displays the class for all variables, not just those that don't fit on the screen (#1276). It also displays duplicated column names correctly (#1159). * `print.grouped_df()` now tells you how many groups there are. * `mutate()` can set to `NULL` the first column (used to segfault, #1329) and it better protects intermediary results (avoiding random segfaults, #1231). * `mutate()` on grouped data handles the special case where for the first few groups, the result consists of a `logical` vector with only `NA`. This can happen when the condition of an `ifelse` is an all `NA` logical vector (#958). * `mutate.rowwise_df()` handles factors (#886) and correctly handles 0-row inputs (#1300). * `n_distinct()` gains an `na_rm` argument (#1052). * The `Progress` bar used by `do()` now respects global option `dplyr.show_progress` (default is TRUE) so you can turn it off globally (@jimhester #1264, #1226). * `summarise()` handles expressions that returning heterogenous outputs, e.g. `median()`, which that sometimes returns an integer, and other times a numeric (#893). * `slice()` silently drops columns corresponding to an NA (#1235). * `ungroup.rowwise_df()` gives a `tbl_df` (#936). * More explicit duplicated column name error message (#996). * When "," is already being used as the decimal point (`getOption("OutDec")`), use "." as the thousands separator when printing out formatted numbers (@ilarischeinin, #988). ## Databases * `db_query_fields.SQLiteConnection` uses `build_sql` rather than `paste0` (#926, @NikNakk) * Improved handling of `log()` (#1330). * `n_distinct(x)` is translated to `COUNT(DISTINCT(x))` (@skparkes, #873). * `print(n = Inf)` now works for remote sources (#1310). ## Hybrid evaluation * Hybrid evaluation does not take place for objects with a class (#1237). * Improved `$` handling (#1134). * Simplified code for `lead()` and `lag()` and make sure they work properly on factors (#955). Both respect the `default` argument (#915). * `mutate` can set to `NULL` the first column (used to segfault, #1329). * `filter` on grouped data handles indices correctly (#880). * `sum()` issues a warning about integer overflow (#1108). # dplyr 0.4.2 This is a minor release containing fixes for a number of crashes and issues identified by R CMD CHECK. There is one new "feature": dplyr no longer complains about unrecognised attributes, and instead just copies them over to the output. * `lag()` and `lead()` for grouped data were confused about indices and therefore produced wrong results (#925, #937). `lag()` once again overrides `lag()` instead of just the default method `lag.default()`. This is necessary due to changes in R CMD check. To use the lag function provided by another package, use `pkg::lag`. * Fixed a number of memory issues identified by valgrind. * Improved performance when working with large number of columns (#879). * Lists-cols that contain data frames now print a slightly nicer summary (#1147) * Set operations give more useful error message on incompatible data frames (#903). * `all.equal()` gives the correct result when `ignore_row_order` is `TRUE` (#1065) and `all.equal()` correctly handles character missing values (#1095). * `bind_cols()` always produces a `tbl_df` (#779). * `bind_rows()` gains a test for a form of data frame corruption (#1074). * `bind_rows()` and `summarise()` now handles complex columns (#933). * Workaround for using the constructor of `DataFrame` on an unprotected object (#998) * Improved performance when working with large number of columns (#879). # dplyr 0.4.1 * Don't assume that RPostgreSQL is available. # dplyr 0.4.0 ## New features * `add_rownames()` turns row names into an explicit variable (#639). * `as_data_frame()` efficiently coerces a list into a data frame (#749). * `bind_rows()` and `bind_cols()` efficiently bind a list of data frames by row or column. `combine()` applies the same coercion rules to vectors (it works like `c()` or `unlist()` but is consistent with the `bind_rows()` rules). * `right_join()` (include all rows in `y`, and matching rows in `x`) and `full_join()` (include all rows in `x` and `y`) complete the family of mutating joins (#96). * `group_indices()` computes a unique integer id for each group (#771). It can be called on a grouped_df without any arguments or on a data frame with same arguments as `group_by()`. ## New vignettes * `vignette("data_frames")` describes dplyr functions that make it easier and faster to create and coerce data frames. It subsumes the old `memory` vignette. * `vignette("two-table")` describes how two-table verbs work in dplyr. ## Minor improvements * `data_frame()` (and `as_data_frame()` & `tbl_df()`) now explicitly forbid columns that are data frames or matrices (#775). All columns must be either a 1d atomic vector or a 1d list. * `do()` uses lazyeval to correctly evaluate its arguments in the correct environment (#744), and new `do_()` is the SE equivalent of `do()` (#718). You can modify grouped data in place: this is probably a bad idea but it's sometimes convenient (#737). `do()` on grouped data tables now passes in all columns (not all columns except grouping vars) (#735, thanks to @kismsu). `do()` with database tables no longer potentially includes grouping variables twice (#673). Finally, `do()` gives more consistent outputs when there are no rows or no groups (#625). * `first()` and `last()` preserve factors, dates and times (#509). * Overhaul of single table verbs for data.table backend. They now all use a consistent (and simpler) code base. This ensures that (e.g.) `n()` now works in all verbs (#579). * In `*_join()`, you can now name only those variables that are different between the two tables, e.g. `inner_join(x, y, c("a", "b", "c" = "d"))` (#682). If non-join columns are the same, dplyr will add `.x` and `.y` suffixes to distinguish the source (#655). * `mutate()` handles complex vectors (#436) and forbids `POSIXlt` results (instead of crashing) (#670). * `select()` now implements a more sophisticated algorithm so if you're doing multiples includes and excludes with and without names, you're more likely to get what you expect (#644). You'll also get a better error message if you supply an input that doesn't resolve to an integer column position (#643). * Printing has received a number of small tweaks. All `print()` methods invisibly return their input so you can interleave `print()` statements into a pipeline to see interim results. `print()` will column names of 0 row data frames (#652), and will never print more 20 rows (i.e. `options(dplyr.print_max)` is now 20), not 100 (#710). Row names are no never printed since no dplyr method is guaranteed to preserve them (#669). `glimpse()` prints the number of observations (#692) `type_sum()` gains a data frame method. * `summarise()` handles list output columns (#832) * `slice()` works for data tables (#717). Documentation clarifies that slice can't work with relational databases, and the examples show how to achieve the same results using `filter()` (#720). * dplyr now requires RSQLite >= 1.0. This shouldn't affect your code in any way (except that RSQLite now doesn't need to be attached) but does simplify the internals (#622). * Functions that need to combine multiple results into a single column (e.g. `join()`, `bind_rows()` and `summarise()`) are more careful about coercion. Joining factors with the same levels in the same order preserves the original levels (#675). Joining factors with non-identical levels generates a warning and coerces to character (#684). Joining a character to a factor (or vice versa) generates a warning and coerces to character. Avoid these warnings by ensuring your data is compatible before joining. `rbind_list()` will throw an error if you attempt to combine an integer and factor (#751). `rbind()`ing a column full of `NA`s is allowed and just collects the appropriate missing value for the column type being collected (#493). `summarise()` is more careful about `NA`, e.g. the decision on the result type will be delayed until the first non NA value is returned (#599). It will complain about loss of precision coercions, which can happen for expressions that return integers for some groups and a doubles for others (#599). * A number of functions gained new or improved hybrid handlers: `first()`, `last()`, `nth()` (#626), `lead()` & `lag()` (#683), `%in%` (#126). That means when you use these functions in a dplyr verb, we handle them in C++, rather than calling back to R, and hence improving performance. Hybrid `min_rank()` correctly handles `NaN` values (#726). Hybrid implementation of `nth()` falls back to R evaluation when `n` is not a length one integer or numeric, e.g. when it's an expression (#734). Hybrid `dense_rank()`, `min_rank()`, `cume_dist()`, `ntile()`, `row_number()` and `percent_rank()` now preserve NAs (#774) * `filter` returns its input when it has no rows or no columns (#782). * Join functions keep attributes (e.g. time zone information) from the left argument for `POSIXct` and `Date` objects (#819), and only only warn once about each incompatibility (#798). ## Bug fixes * `[.tbl_df` correctly computes row names for 0-column data frames, avoiding problems with xtable (#656). `[.grouped_df` will silently drop grouping if you don't include the grouping columns (#733). * `data_frame()` now acts correctly if the first argument is a vector to be recycled. (#680 thanks @jimhester) * `filter.data.table()` works if the table has a variable called "V1" (#615). * `*_join()` keeps columns in original order (#684). Joining a factor to a character vector doesn't segfault (#688). `*_join` functions can now deal with multiple encodings (#769), and correctly name results (#855). * `*_join.data.table()` works when data.table isn't attached (#786). * `group_by()` on a data table preserves original order of the rows (#623). `group_by()` supports variables with more than 39 characters thanks to a fix in lazyeval (#705). It gives meaningful error message when a variable is not found in the data frame (#716). * `grouped_df()` requires `vars` to be a list of symbols (#665). * `min(.,na.rm = TRUE)` works with `Date`s built on numeric vectors (#755). * `rename_()` generic gets missing `.dots` argument (#708). * `row_number()`, `min_rank()`, `percent_rank()`, `dense_rank()`, `ntile()` and `cume_dist()` handle data frames with 0 rows (#762). They all preserve missing values (#774). `row_number()` doesn't segfault when giving an external variable with the wrong number of variables (#781). * `group_indices` handles the edge case when there are no variables (#867). * Removed bogus `NAs introduced by coercion to integer range` on 32-bit Windows (#2708). # dplyr 0.3.0.1 * Fixed problem with test script on Windows. # dplyr 0.3 ## New functions * `between()` vector function efficiently determines if numeric values fall in a range, and is translated to special form for SQL (#503). * `count()` makes it even easier to do (weighted) counts (#358). * `data_frame()` by @kevinushey is a nicer way of creating data frames. It never coerces column types (no more `stringsAsFactors = FALSE`!), never munges column names, and never adds row names. You can use previously defined columns to compute new columns (#376). * `distinct()` returns distinct (unique) rows of a tbl (#97). Supply additional variables to return the first row for each unique combination of variables. * Set operations, `intersect()`, `union()` and `setdiff()` now have methods for data frames, data tables and SQL database tables (#93). They pass their arguments down to the base functions, which will ensure they raise errors if you pass in two many arguments. * Joins (e.g. `left_join()`, `inner_join()`, `semi_join()`, `anti_join()`) now allow you to join on different variables in `x` and `y` tables by supplying a named vector to `by`. For example, `by = c("a" = "b")` joins `x.a` to `y.b`. * `n_groups()` function tells you how many groups in a tbl. It returns 1 for ungrouped data. (#477) * `transmute()` works like `mutate()` but drops all variables that you didn't explicitly refer to (#302). * `rename()` makes it easy to rename variables - it works similarly to `select()` but it preserves columns that you didn't otherwise touch. * `slice()` allows you to selecting rows by position (#226). It includes positive integers, drops negative integers and you can use expression like `n()`. ## Programming with dplyr (non-standard evaluation) * You can now program with dplyr - every function that does non-standard evaluation (NSE) has a standard evaluation (SE) version ending in `_`. This is powered by the new lazyeval package which provides all the tools needed to implement NSE consistently and correctly. * See `vignette("nse")` for full details. * `regroup()` is deprecated. Please use the more flexible `group_by_()` instead. * `summarise_each_q()` and `mutate_each_q()` are deprecated. Please use `summarise_each_()` and `mutate_each_()` instead. * `funs_q` has been replaced with `funs_`. ## Removed and deprecated features * `%.%` has been deprecated: please use `%>%` instead. `chain()` is defunct. (#518) * `filter.numeric()` removed. Need to figure out how to reimplement with new lazy eval system. * The `Progress` refclass is no longer exported to avoid conflicts with shiny. Instead use `progress_estimated()` (#535). * `src_monetdb()` is now implemented in MonetDB.R, not dplyr. * `show_sql()` and `explain_sql()` and matching global options `dplyr.show_sql` and `dplyr.explain_sql` have been removed. Instead use `show_query()` and `explain()`. ## Minor improvements and bug fixes * Main verbs now have individual documentation pages (#519). * `%>%` is simply re-exported from magrittr, instead of creating a local copy (#496, thanks to @jimhester) * Examples now use `nycflights13` instead of `hflights` because it the variables have better names and there are a few interlinked tables (#562). `Lahman` and `nycflights13` are (once again) suggested packages. This means many examples will not work unless you explicitly install them with `install.packages(c("Lahman", "nycflights13"))` (#508). dplyr now depends on Lahman 3.0.1. A number of examples have been updated to reflect modified field names (#586). * `do()` now displays the progress bar only when used in interactive prompts and not when knitting (#428, @jimhester). * `glimpse()` now prints a trailing new line (#590). * `group_by()` has more consistent behaviour when grouping by constants: it creates a new column with that value (#410). It renames grouping variables (#410). The first argument is now `.data` so you can create new groups with name x (#534). * Now instead of overriding `lag()`, dplyr overrides `lag.default()`, which should avoid clobbering lag methods added by other packages. (#277). * `mutate(data, a = NULL)` removes the variable `a` from the returned dataset (#462). * `trunc_mat()` and hence `print.tbl_df()` and friends gets a `width` argument to control the default output width. Set `options(dplyr.width = Inf)` to always show all columns (#589). * `select()` gains `one_of()` selector: this allows you to select variables provided by a character vector (#396). It fails immediately if you give an empty pattern to `starts_with()`, `ends_with()`, `contains()` or `matches()` (#481, @leondutoit). Fixed buglet in `select()` so that you can now create variables called `val` (#564). * Switched from RC to R6. * `tally()` and `top_n()` work consistently: neither accidentally evaluates the `wt` param. (#426, @mnel) * `rename` handles grouped data (#640). ## Minor improvements and bug fixes by backend ### Databases * Correct SQL generation for `paste()` when used with the collapse parameter targeting a Postgres database. (@rbdixon, #1357) * The db backend system has been completely overhauled in order to make it possible to add backends in other packages, and to support a much wider range of databases. See `vignette("new-sql-backend")` for instruction on how to create your own (#568). * `src_mysql()` gains a method for `explain()`. * When `mutate()` creates a new variable that uses a window function, automatically wrap the result in a subquery (#484). * Correct SQL generation for `first()` and `last()` (#531). * `order_by()` now works in conjunction with window functions in databases that support them. ### Data frames/`tbl_df` * All verbs now understand how to work with `difftime()` (#390) and `AsIs` (#453) objects. They all check that colnames are unique (#483), and are more robust when columns are not present (#348, #569, #600). * Hybrid evaluation bugs fixed: * Call substitution stopped too early when a sub expression contained a `$` (#502). * Handle `::` and `:::` (#412). * `cumany()` and `cumall()` properly handle `NA` (#408). * `nth()` now correctly preserve the class when using dates, times and factors (#509). * no longer substitutes within `order_by()` because `order_by()` needs to do its own NSE (#169). * `[.tbl_df` always returns a tbl_df (i.e. `drop = FALSE` is the default) (#587, #610). `[.grouped_df` preserves important output attributes (#398). * `arrange()` keeps the grouping structure of grouped data (#491, #605), and preserves input classes (#563). * `contains()` accidentally matched regular expressions, now it passes `fixed = TRUE` to `grep()` (#608). * `filter()` asserts all variables are white listed (#566). * `mutate()` makes a `rowwise_df` when given a `rowwise_df` (#463). * `rbind_all()` creates `tbl_df` objects instead of raw `data.frame`s. * If `select()` doesn't match any variables, it returns a 0-column data frame, instead of the original (#498). It no longer fails when if some columns are not named (#492) * `sample_n()` and `sample_frac()` methods for data.frames exported. (#405, @alyst) * A grouped data frame may have 0 groups (#486). Grouped df objects gain some basic validity checking, which should prevent some crashes related to corrupt `grouped_df` objects made by `rbind()` (#606). * More coherence when joining columns of compatible but different types, e.g. when joining a character vector and a factor (#455), or a numeric and integer (#450) * `mutate()` works for on zero-row grouped data frame, and with list columns (#555). * `LazySubset` was confused about input data size (#452). * Internal `n_distinct()` is stricter about its inputs: it requires one symbol which must be from the data frame (#567). * `rbind_*()` handle data frames with 0 rows (#597). They fill character vector columns with `NA` instead of blanks (#595). They work with list columns (#463). * Improved handling of encoding for column names (#636). * Improved handling of hybrid evaluation re $ and @ (#645). ### Data tables * Fix major omission in `tbl_dt()` and `grouped_dt()` methods - I was accidentally doing a deep copy on every result :( * `summarise()` and `group_by()` now retain over-allocation when working with data.tables (#475, @arunsrinivasan). * joining two data.tables now correctly dispatches to data table methods, and result is a data table (#470) ### Cubes * `summarise.tbl_cube()` works with single grouping variable (#480). # dplyr 0.2 ## Piping dplyr now imports `%>%` from magrittr (#330). I recommend that you use this instead of `%.%` because it is easier to type (since you can hold down the shift key) and is more flexible. With you `%>%`, you can control which argument on the RHS receives the LHS by using the pronoun `.`. This makes `%>%` more useful with base R functions because they don't always take the data frame as the first argument. For example you could pipe `mtcars` to `xtabs()` with: mtcars %>% xtabs( ~ cyl + vs, data = .) Thanks to @smbache for the excellent magrittr package. dplyr only provides `%>%` from magrittr, but it contains many other useful functions. To use them, load `magrittr` explicitly: `library(magrittr)`. For more details, see `vignette("magrittr")`. `%.%` will be deprecated in a future version of dplyr, but it won't happen for a while. I've also deprecated `chain()` to encourage a single style of dplyr usage: please use `%>%` instead. ## Do `do()` has been completely overhauled. There are now two ways to use it, either with multiple named arguments or a single unnamed arguments. `group_by()` + `do()` is equivalent to `plyr::dlply`, except it always returns a data frame. If you use named arguments, each argument becomes a list-variable in the output. A list-variable can contain any arbitrary R object so it's particularly well suited for storing models. library(dplyr) models <- mtcars %>% group_by(cyl) %>% do(lm = lm(mpg ~ wt, data = .)) models %>% summarise(rsq = summary(lm)$r.squared) If you use an unnamed argument, the result should be a data frame. This allows you to apply arbitrary functions to each group. mtcars %>% group_by(cyl) %>% do(head(., 1)) Note the use of the `.` pronoun to refer to the data in the current group. `do()` also has an automatic progress bar. It appears if the computation takes longer than 5 seconds and lets you know (approximately) how much longer the job will take to complete. ## New verbs dplyr 0.2 adds three new verbs: * `glimpse()` makes it possible to see all the columns in a tbl, displaying as much data for each variable as can be fit on a single line. * `sample_n()` randomly samples a fixed number of rows from a tbl; `sample_frac()` randomly samples a fixed fraction of rows. Only works for local data frames and data tables (#202). * `summarise_each()` and `mutate_each()` make it easy to apply one or more functions to multiple columns in a tbl (#178). ## Minor improvements * If you load plyr after dplyr, you'll get a message suggesting that you load plyr first (#347). * `as.tbl_cube()` gains a method for matrices (#359, @paulstaab) * `compute()` gains `temporary` argument so you can control whether the results are temporary or permanent (#382, @cpsievert) * `group_by()` now defaults to `add = FALSE` so that it sets the grouping variables rather than adding to the existing list. I think this is how most people expected `group_by` to work anyway, so it's unlikely to cause problems (#385). * Support for [MonetDB](http://www.monetdb.org) tables with `src_monetdb()` (#8, thanks to @hannesmuehleisen). * New vignettes: * `memory` vignette which discusses how dplyr minimises memory usage for local data frames (#198). * `new-sql-backend` vignette which discusses how to add a new SQL backend/source to dplyr. * `changes()` output more clearly distinguishes which columns were added or deleted. * `explain()` is now generic. * dplyr is more careful when setting the keys of data tables, so it never accidentally modifies an object that it doesn't own. It also avoids unnecessary key setting which negatively affected performance. (#193, #255). * `print()` methods for `tbl_df`, `tbl_dt` and `tbl_sql` gain `n` argument to control the number of rows printed (#362). They also works better when you have columns containing lists of complex objects. * `row_number()` can be called without arguments, in which case it returns the same as `1:n()` (#303). * `"comment"` attribute is allowed (white listed) as well as names (#346). * hybrid versions of `min`, `max`, `mean`, `var`, `sd` and `sum` handle the `na.rm` argument (#168). This should yield substantial performance improvements for those functions. * Special case for call to `arrange()` on a grouped data frame with no arguments. (#369) ## Bug fixes * Code adapted to Rcpp > 0.11.1 * internal `DataDots` class protects against missing variables in verbs (#314), including the case where `...` is missing. (#338) * `all.equal.data.frame` from base is no longer bypassed. we now have `all.equal.tbl_df` and `all.equal.tbl_dt` methods (#332). * `arrange()` correctly handles NA in numeric vectors (#331) and 0 row data frames (#289). * `copy_to.src_mysql()` now works on windows (#323) * `*_join()` doesn't reorder column names (#324). * `rbind_all()` is stricter and only accepts list of data frames (#288) * `rbind_*` propagates time zone information for `POSIXct` columns (#298). * `rbind_*` is less strict about type promotion. The numeric `Collecter` allows collection of integer and logical vectors. The integer `Collecter` also collects logical values (#321). * internal `sum` correctly handles integer (under/over)flow (#308). * `summarise()` checks consistency of outputs (#300) and drops `names` attribute of output columns (#357). * join functions throw error instead of crashing when there are no common variables between the data frames, and also give a better error message when only one data frame has a by variable (#371). * `top_n()` returns `n` rows instead of `n - 1` (@leondutoit, #367). * SQL translation always evaluates subsetting operators (`$`, `[`, `[[`) locally. (#318). * `select()` now renames variables in remote sql tbls (#317) and implicitly adds grouping variables (#170). * internal `grouped_df_impl` function errors if there are no variables to group by (#398). * `n_distinct` did not treat NA correctly in the numeric case #384. * Some compiler warnings triggered by -Wall or -pedantic have been eliminated. * `group_by` only creates one group for NA (#401). * Hybrid evaluator did not evaluate expression in correct environment (#403). # dplyr 0.1.3 ## Bug fixes * `select()` actually renames columns in a data table (#284). * `rbind_all()` and `rbind_list()` now handle missing values in factors (#279). * SQL joins now work better if names duplicated in both x and y tables (#310). * Builds against Rcpp 0.11.1 * `select()` correctly works with the vars attribute (#309). * Internal code is stricter when deciding if a data frame is grouped (#308): this avoids a number of situations which previously caused problems. * More data frame joins work with missing values in keys (#306). # dplyr 0.1.2 ## New features * `select()` is substantially more powerful. You can use named arguments to rename existing variables, and new functions `starts_with()`, `ends_with()`, `contains()`, `matches()` and `num_range()` to select variables based on their names. It now also makes a shallow copy, substantially reducing its memory impact (#158, #172, #192, #232). * `summarize()` added as alias for `summarise()` for people from countries that don't don't spell things correctly ;) (#245) ## Bug fixes * `filter()` now fails when given anything other than a logical vector, and correctly handles missing values (#249). `filter.numeric()` proxies `stats::filter()` so you can continue to use `filter()` function with numeric inputs (#264). * `summarise()` correctly uses newly created variables (#259). * `mutate()` correctly propagates attributes (#265) and `mutate.data.frame()` correctly mutates the same variable repeatedly (#243). * `lead()` and `lag()` preserve attributes, so they now work with dates, times and factors (#166). * `n()` never accepts arguments (#223). * `row_number()` gives correct results (#227). * `rbind_all()` silently ignores data frames with 0 rows or 0 columns (#274). * `group_by()` orders the result (#242). It also checks that columns are of supported types (#233, #276). * The hybrid evaluator did not handle some expressions correctly, for example in `if(n() > 5) 1 else 2` the subexpression `n()` was not substituted correctly. It also correctly processes `$` (#278). * `arrange()` checks that all columns are of supported types (#266). It also handles list columns (#282). * Working towards Solaris compatibility. * Benchmarking vignette temporarily disabled due to microbenchmark problems reported by BDR. # dplyr 0.1.1 ## Improvements * new `location()` and `changes()` functions which provide more information about how data frames are stored in memory so that you can see what gets copied. * renamed `explain_tbl()` to `explain()` (#182). * `tally()` gains `sort` argument to sort output so highest counts come first (#173). * `ungroup.grouped_df()`, `tbl_df()`, `as.data.frame.tbl_df()` now only make shallow copies of their inputs (#191). * The `benchmark-baseball` vignette now contains fairer (including grouping times) comparisons with `data.table`. (#222) ## Bug fixes * `filter()` (#221) and `summarise()` (#194) correctly propagate attributes. * `summarise()` throws an error when asked to summarise an unknown variable instead of crashing (#208). * `group_by()` handles factors with missing values (#183). * `filter()` handles scalar results (#217) and better handles scoping, e.g. `filter(., variable)` where `variable` is defined in the function that calls `filter`. It also handles `T` and `F` as aliases to `TRUE` and `FALSE` if there are no `T` or `F` variables in the data or in the scope. * `select.grouped_df` fails when the grouping variables are not included in the selected variables (#170) * `all.equal.data.frame()` handles a corner case where the data frame has `NULL` names (#217) * `mutate()` gives informative error message on unsupported types (#179) * dplyr source package no longer includes pandas benchmark, reducing download size from 2.8 MB to 0.5 MB. dplyr/inst/0000755000176200001440000000000015137234471012370 5ustar liggesusersdplyr/inst/doc/0000755000176200001440000000000015137234471013135 5ustar liggesusersdplyr/inst/doc/two-table.R0000644000176200001440000000507415137234467015171 0ustar liggesusers## ----echo = FALSE, message = FALSE-------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 5) library(dplyr) ## ----warning = FALSE---------------------------------------------------------- library(nycflights13) # Drop unimportant variables so it's easier to understand the join results. flights2 <- flights |> select(year:day, hour, origin, dest, tailnum, carrier) flights2 |> left_join(airlines) ## ----------------------------------------------------------------------------- flights2 |> left_join(weather) ## ----------------------------------------------------------------------------- flights2 |> left_join(planes, by = "tailnum") ## ----------------------------------------------------------------------------- flights2 |> left_join(airports, c("dest" = "faa")) flights2 |> left_join(airports, c("origin" = "faa")) ## ----------------------------------------------------------------------------- df1 <- tibble(x = c(1, 2), y = 2:1) df2 <- tibble(x = c(3, 1), a = 10, b = "a") ## ----------------------------------------------------------------------------- df1 |> inner_join(df2) |> knitr::kable() ## ----------------------------------------------------------------------------- df1 |> left_join(df2) ## ----------------------------------------------------------------------------- df1 |> right_join(df2) df2 |> left_join(df1) ## ----------------------------------------------------------------------------- df1 |> full_join(df2) ## ----------------------------------------------------------------------------- df1 <- tibble(x = c(1, 1, 2), y = 1:3) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) df1 |> left_join(df2) ## ----------------------------------------------------------------------------- library("nycflights13") flights |> anti_join(planes, by = "tailnum") |> count(tailnum, sort = TRUE) ## ----------------------------------------------------------------------------- df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) # Four rows to start with: df1 |> nrow() # And we get four rows after the join df1 |> inner_join(df2, by = "x") |> nrow() # But only two rows actually match df1 |> semi_join(df2, by = "x") |> nrow() ## ----------------------------------------------------------------------------- (df1 <- tibble(x = 1:2, y = c(1L, 1L))) (df2 <- tibble(x = 1:2, y = 1:2)) ## ----------------------------------------------------------------------------- intersect(df1, df2) # Note that we get 3 rows, not 4 union(df1, df2) setdiff(df1, df2) setdiff(df2, df1) dplyr/inst/doc/grouping.R0000644000176200001440000001011615137234455015113 0ustar liggesusers## ----echo = FALSE, message = FALSE, warning = FALSE--------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ## ----message = FALSE---------------------------------------------------------- library(dplyr) ## ----------------------------------------------------------------------------- by_species <- starwars |> group_by(species) by_sex_gender <- starwars |> group_by(sex, gender) ## ----------------------------------------------------------------------------- by_species by_sex_gender ## ----------------------------------------------------------------------------- by_species |> tally() by_sex_gender |> tally(sort = TRUE) ## ----group_by_with_expression------------------------------------------------- bmi_breaks <- c(0, 18.5, 25, 30, Inf) starwars |> group_by(bmi_cat = cut(mass/(height/100)^2, breaks=bmi_breaks)) |> tally() ## ----group_vars--------------------------------------------------------------- by_species |> group_keys() by_sex_gender |> group_keys() ## ----------------------------------------------------------------------------- by_species |> group_indices() ## ----------------------------------------------------------------------------- by_species |> group_rows() |> head() ## ----------------------------------------------------------------------------- by_species |> group_vars() by_sex_gender |> group_vars() ## ----------------------------------------------------------------------------- by_species |> group_by(homeworld) |> tally() ## ----------------------------------------------------------------------------- by_species |> group_by(homeworld, .add = TRUE) |> tally() ## ----------------------------------------------------------------------------- by_species |> ungroup() |> tally() ## ----------------------------------------------------------------------------- by_sex_gender |> ungroup(sex) |> tally() ## ----summarise---------------------------------------------------------------- by_species |> summarise( n = n(), height = mean(height, na.rm = TRUE) ) ## ----------------------------------------------------------------------------- by_sex_gender |> summarise(n = n()) |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop_last") |> group_vars() ## ----------------------------------------------------------------------------- by_sex_gender |> summarise(n = n(), .groups = "keep") |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop") |> group_vars() ## ----select------------------------------------------------------------------- by_species |> select(mass) ## ----------------------------------------------------------------------------- by_species |> arrange(desc(mass)) |> relocate(species, mass) by_species |> arrange(desc(mass), .by_group = TRUE) |> relocate(species, mass) ## ----by_homeworld------------------------------------------------------------- # Subtract off global mean starwars |> select(name, homeworld, mass) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) # Subtract off homeworld mean starwars |> select(name, homeworld, mass) |> group_by(homeworld) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) ## ----------------------------------------------------------------------------- # Overall rank starwars |> select(name, homeworld, height) |> mutate(rank = min_rank(height)) # Rank per homeworld starwars |> select(name, homeworld, height) |> group_by(homeworld) |> mutate(rank = min_rank(height)) ## ----filter------------------------------------------------------------------- by_species |> select(name, species, height) |> filter(height == max(height)) ## ----filter_group------------------------------------------------------------- by_species |> filter_out(n() == 1) |> tally() ## ----slice-------------------------------------------------------------------- by_species |> relocate(species) |> slice(1) ## ----slice_min---------------------------------------------------------------- by_species |> filter_out(is.na(height)) |> slice_min(height, n = 2) dplyr/inst/doc/window-functions.Rmd0000644000176200001440000002230415106134104017103 0ustar liggesusers--- title: "Window functions" description: > Window functions are a useful family of functions that work with vectors (returning an output the same size as the input), and combine naturally with `mutate()` and `filter()`. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Window functions} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) if (!rlang::is_installed("Lahman")) { knitr::opts_chunk$set(eval = FALSE) } ``` A __window function__ is a variation on an aggregation function. Where an aggregation function, like `sum()` and `mean()`, takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don't include functions that work element-wise, like `+` or `round()`. Window functions include variations on aggregate functions, like `cumsum()` and `cummean()`, functions for ranking and ordering, like `rank()`, and functions for taking offsets, like `lead()` and `lag()`. In this vignette, we'll use a small sample of the Lahman batting dataset, including the players that have won an award. ```{r} library(Lahman) batting <- Lahman::Batting |> as_tibble() |> select(playerID, yearID, teamID, G, AB:H) |> arrange(playerID, yearID, teamID) |> semi_join(Lahman::AwardsPlayers, by = "playerID") players <- batting |> group_by(playerID) ``` Window functions are used in conjunction with `mutate()` and `filter()` to solve a wide range of problems. Here's a selection: ```{r, eval = FALSE} # For each player, find the two years with most hits filter(players, min_rank(desc(H)) <= 2 & H > 0) # Within each player, rank each year by the number of games played mutate(players, G_rank = min_rank(G)) # For each player, find every year that was better than the previous year filter(players, G > lag(G)) # For each player, compute avg change in games played per year mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID))) # For each player, find all years where they played more games than they did on average filter(players, G > mean(G)) # For each, player compute a z score based on number of games played mutate(players, G_z = (G - mean(G)) / sd(G)) ``` Before reading this vignette, you should be familiar with `mutate()` and `filter()`. ## Types of window functions There are five main families of window functions. Two families are unrelated to aggregation functions: * Ranking and ordering functions: `row_number()`, `min_rank()`, `dense_rank()`, `cume_dist()`, `percent_rank()`, and `ntile()`. These functions all take a vector to order by, and return various types of ranks. * Offsets `lead()` and `lag()` allow you to access the previous and next values in a vector, making it easy to compute differences and trends. The other three families are variations on familiar aggregate functions: * Cumulative aggregates: `cumsum()`, `cummin()`, `cummax()` (from base R), and `cumall()`, `cumany()`, and `cummean()` (from dplyr). * Rolling aggregates operate in a fixed width window. You won't find them in base R or in dplyr, but there are many implementations in other packages, such as [RcppRoll](https://cran.r-project.org/package=RcppRoll). * Recycled aggregates, where an aggregate is repeated to match the length of the input. These are not needed in R because vector recycling automatically recycles aggregates where needed. They are important in SQL, because the presence of an aggregation function usually tells the database to return only one row per group. Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation. ## Ranking functions The ranking functions are variations on a theme, differing in how they handle ties: ```{r} x <- c(1, 1, 2, 2, 2) row_number(x) min_rank(x) dense_rank(x) ``` If you're familiar with R, you may recognise that `row_number()` and `min_rank()` can be computed with the base `rank()` function and various values of the `ties.method` argument. These functions are provided to save a little typing, and to make it easier to convert between R and SQL. Two other ranking functions return numbers between 0 and 1. `percent_rank()` gives the percentage of the rank; `cume_dist()` gives the proportion of values less than or equal to the current value. ```{r} cume_dist(x) percent_rank(x) ``` These are useful if you want to select (for example) the top 10% of records within each group. For example: ```{r} filter(players, cume_dist(desc(G)) < 0.1) ``` Finally, `ntile()` divides the data up into `n` evenly sized buckets. It's a coarse ranking, and it can be used in with `mutate()` to divide the data into buckets for further summary. For example, we could use `ntile()` to divide the players within a team into four ranked groups, and calculate the average number of games within each group. ```{r} by_team_player <- group_by(batting, teamID, playerID) by_team <- summarise(by_team_player, G = sum(G)) by_team_quartile <- group_by(by_team, quartile = ntile(G, 4)) summarise(by_team_quartile, mean(G)) ``` All ranking functions rank from lowest to highest so that small input values get small ranks. Use `desc()` to rank from highest to lowest. ## Lead and lag `lead()` and `lag()` produce offset versions of a input vector that is either ahead of or behind the original vector. ```{r} x <- 1:5 lead(x) lag(x) ``` You can use them to: * Compute differences or percent changes. ```{r, results = "hide"} # Compute the relative change in games played mutate(players, G_delta = G - lag(G)) ``` Using `lag()` is more convenient than `diff()` because for `n` inputs `diff()` returns `n - 1` outputs. * Find out when a value changes. ```{r, results = "hide"} # Find when a player changed teams filter(players, teamID != lag(teamID)) ``` `lead()` and `lag()` have an optional argument `order_by`. If set, instead of using the row order to determine which value comes before another, they will use another variable. This is important if you have not already sorted the data, or you want to sort one way and lag another. Here's a simple example of what happens if you don't specify `order_by` when you need it: ```{r} df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) scrambled <- df[sample(nrow(df)), ] wrong <- mutate(scrambled, prev_value = lag(value)) arrange(wrong, year) right <- mutate(scrambled, prev_value = lag(value, order_by = year)) arrange(right, year) ``` ## Cumulative aggregates Base R provides cumulative sum (`cumsum()`), cumulative min (`cummin()`), and cumulative max (`cummax()`). (It also provides `cumprod()` but that is rarely useful). Other common accumulating functions are `cumany()` and `cumall()`, cumulative versions of `||` and `&&`, and `cummean()`, a cumulative mean. These are not included in base R, but efficient versions are provided by `dplyr`. `cumany()` and `cumall()` are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use `cumany()` to find all records for a player after they played a year with 150 games: ```{r, eval = FALSE} filter(players, cumany(G > 150)) ``` Like lead and lag, you may want to control the order in which the accumulation occurs. None of the built in functions have an `order_by` argument so `dplyr` provides a helper: `order_by()`. You give it the variable you want to order by, and then the call to the window function: ```{r} x <- 1:10 y <- 10:1 order_by(y, cumsum(x)) ``` This function uses a bit of non-standard evaluation, so I wouldn't recommend using it inside another function; use the simpler but less concise `with_order()` instead. ## Recycled aggregates R's vector recycling makes it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median: ```{r, eval = FALSE} filter(players, G > mean(G)) filter(players, G < median(G)) ``` While most SQL databases don't have an equivalent of `median()` or `quantile()`, when filtering you can achieve the same effect with `ntile()`. For example, `x > median(x)` is equivalent to `ntile(x, 2) == 2`; `x > quantile(x, 75)` is equivalent to `ntile(x, 100) > 75` or `ntile(x, 4) > 3`. ```{r, eval = FALSE} filter(players, ntile(G, 2) == 2) ``` You can also use this idea to select the records with the highest (`x == max(x)`) or lowest value (`x == min(x)`) for a field, but the ranking functions give you more control over ties, and allow you to select any number of records. Recycled aggregates are also useful in conjunction with `mutate()`. For example, with the batting data, we could compute the "career year", the number of years a player has played since they entered the league: ```{r} mutate(players, career_year = yearID - min(yearID) + 1) ``` Or, as in the introductory example, we could compute a z-score: ```{r} mutate(players, G_z = (G - mean(G)) / sd(G)) ``` dplyr/inst/doc/dplyr.Rmd0000644000176200001440000003302615137161765014744 0ustar liggesusers--- title: "Introduction to dplyr" output: rmarkdown::html_vignette description: > Start here if this is your first time using dplyr. You'll learn the basic philosophy, the most important data manipulation verbs, and the pipe, `|>`, which allows you to combine multiple verbs together to solve real problems. vignette: > %\VignetteIndexEntry{Introduction to dplyr} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) ``` When working with data you must: * Figure out what you want to do. * Describe those tasks in the form of a computer program. * Execute the program. The dplyr package makes these steps fast and easy: * By constraining your options, it helps you think about your data manipulation challenges. * It provides simple "verbs", functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code. * It uses efficient backends, so you spend less time waiting for the computer. This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package, once you've installed, read `vignette("dbplyr")` to learn more. ## Data: starwars To explore the basic data manipulation verbs of dplyr, we'll use the dataset `starwars`. This dataset contains `r nrow(starwars)` characters and comes from the [Star Wars API](https://swapi.py4e.com/), and is documented in `?starwars` ```{r} dim(starwars) starwars ``` Note that `starwars` is a tibble, a modern reimagining of the data frame. It's particularly useful for large datasets because it only prints the first few rows. You can learn more about tibbles at ; in particular you can convert data frames to tibbles with `as_tibble()`. ## Single table verbs dplyr aims to provide a function for each basic verb of data manipulation. These verbs can be organised into three categories based on the component of the dataset that they work with: * Rows: * `filter()` chooses rows based on column values. * `slice()` chooses rows based on location. * `arrange()` changes the order of the rows. * Columns: * `select()` changes whether or not a column is included. * `rename()` changes the name of columns. * `mutate()` changes the values of columns and creates new columns. * `relocate()` changes the order of the columns. * Groups of rows: * `summarise()` collapses a group into a single row. ### The pipe All of the dplyr functions take a data frame (or tibble) as the first argument. Rather than forcing the user to either save intermediate objects or nest functions, dplyr provides the `|>` operator from magrittr. `x |> f(y)` turns into `f(x, y)` so the result from one step is then "piped" into the next step. You can use the pipe to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as "then"). ### Filter rows with `filter()` `filter()` allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is `TRUE`. For example, we can select all character with light skin color and brown eyes with: ```{r} starwars |> filter(skin_color == "light", eye_color == "brown") ``` This is roughly equivalent to this base R code: ```{r, eval = FALSE} starwars[starwars$skin_color == "light" & starwars$eye_color == "brown", ] ``` ### Arrange rows with `arrange()` `arrange()` works similarly to `filter()` except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns: ```{r} starwars |> arrange(height, mass) ``` Use `desc()` to order a column in descending order: ```{r} starwars |> arrange(desc(height)) ``` ### Choose rows using their position with `slice()` `slice()` lets you index rows by their (integer) locations. It allows you to select, remove, and duplicate rows. We can get characters from row numbers 5 through 10. ```{r} starwars |> slice(5:10) ``` It is accompanied by a number of helpers for common use cases: * `slice_head()` and `slice_tail()` select the first or last rows. ```{r} starwars |> slice_head(n = 3) ``` * `slice_sample()` randomly selects rows. Use the option prop to choose a certain proportion of the cases. ```{r} starwars |> slice_sample(n = 5) starwars |> slice_sample(prop = 0.1) ``` Use `replace = TRUE` to perform a bootstrap sample. If needed, you can weight the sample with the `weight` argument. * `slice_min()` and `slice_max()` select the rows with the smallest or largest values of the selected column. By default, they return a single minimum or maximum, but you can supply `n` to control how many rows remain. ```{r} starwars |> slice_max(height, n = 3) ``` ### Select columns with `select()` Often you work with large datasets with many columns but only a few are actually of interest to you. `select()` allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions: ```{r} # Select columns by name starwars |> select(hair_color, skin_color, eye_color) # Select all columns between hair_color and eye_color (inclusive) starwars |> select(hair_color:eye_color) # Select all columns except those from hair_color to eye_color (inclusive) starwars |> select(!(hair_color:eye_color)) # Select all columns ending with color starwars |> select(ends_with("color")) ``` There are a number of helper functions you can use within `select()`, like `starts_with()`, `ends_with()`, `matches()` and `contains()`. These let you quickly match larger blocks of variables that meet some criterion. See `?select` for more details. You can rename variables with `select()` by using named arguments: ```{r} starwars |> select(home_world = homeworld) ``` But because `select()` drops all the variables not explicitly mentioned, it's not that useful. Instead, use `rename()`: ```{r} starwars |> rename(home_world = homeworld) ``` ### Add new columns with `mutate()` Besides selecting sets of existing columns, it's often useful to add new columns that are functions of existing columns. This is the job of `mutate()`: ```{r} starwars |> mutate(height_m = height / 100) ``` We can't see the height in meters we just calculated, but we can fix that using a select command. ```{r} starwars |> mutate(height_m = height / 100) |> select(height_m, height, everything()) ``` `dplyr::mutate()` is similar to the base `transform()`, but allows you to refer to columns that you've just created: ```{r} starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2) ) |> select(BMI, everything()) ``` If you only want to keep the new variables, use `.keep = "none"`: ```{r} starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2), .keep = "none" ) ``` ### Change column order with `relocate()` Use a similar syntax as `select()` to move blocks of columns at once ```{r} starwars |> relocate(sex:homeworld, .before = height) ``` ### Summarise values with `summarise()` The last verb is `summarise()`. It collapses a data frame to a single row. ```{r} starwars |> summarise(height = mean(height, na.rm = TRUE)) ``` It's not that useful until we learn the `group_by()` verb below. ### Commonalities You may have noticed that the syntax and function of all these verbs are very similar: * The first argument is a data frame. * The subsequent arguments describe what to do with the data frame. You can refer to columns in the data frame directly without using `$`. * The result is a new data frame Together these properties make it easy to chain together multiple simple steps to achieve a complex result. These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (`arrange()`), pick observations and variables of interest (`filter()` and `select()`), add new variables that are functions of existing variables (`mutate()`), or collapse many values to a summary (`summarise()`). ## Combining functions with `|>` The dplyr API is functional in the sense that function calls don't have side-effects. You must always save their results. This doesn't lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step: ```{r, eval = FALSE} a1 <- group_by(starwars, species, sex) a2 <- select(a1, height, mass) a3 <- summarise(a2, height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` Or if you don't want to name the intermediate results, you need to wrap the function calls inside each other: ```{r} summarise( select( group_by(starwars, species, sex), height, mass ), height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `|>` operator from magrittr. `x |> f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as "then"): ```{r, eval = FALSE} starwars |> group_by(species, sex) |> select(height, mass) |> summarise( height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ``` ## Patterns of operations The dplyr verbs can be classified by the type of operations they accomplish (we sometimes speak of their **semantics**, i.e., their meaning). It's helpful to have a good grasp of the difference between select and mutate operations. ### Selecting operations One of the appealing features of dplyr is that you can refer to columns from the tibble as if they were regular variables. However, the syntactic uniformity of referring to bare column names hides semantical differences across the verbs. A column symbol supplied to `select()` does not have the same meaning as the same symbol supplied to `mutate()`. Selecting operations expect column names and positions. Hence, when you call `select()` with bare variable names, they actually represent their own positions in the tibble. The following calls are completely equivalent from dplyr's point of view: ```{r} # `name` represents the integer 1 select(starwars, name) select(starwars, 1) ``` By the same token, this means that you cannot refer to variables from the surrounding context if they have the same name as one of the columns. In the following example, `height` still represents 2, not 5: ```{r} height <- 5 select(starwars, height) ``` One useful subtlety is that this only applies to bare names and to selecting calls like `c(height, mass)` or `height:mass`. In all other cases, the columns of the data frame are not put in scope. This allows you to refer to contextual variables in selection helpers: ```{r} name <- "color" select(starwars, ends_with(name)) ``` These semantics are usually intuitive. But note the subtle difference: ```{r} name <- 5 select(starwars, name, identity(name)) ``` In the first argument, `name` represents its own position `1`. In the second argument, `name` is evaluated in the surrounding context and represents the fifth column. For a long time, `select()` used to only understand column positions. Counting from dplyr 0.6, it now understands column names as well. This makes it a bit easier to program with `select()`: ```{r} vars <- c("name", "height") select(starwars, all_of(vars), "mass") ``` ### Mutating operations Mutate semantics are quite different from selection semantics. Whereas `select()` expects column names or positions, `mutate()` expects *column vectors*. We will set up a smaller tibble to use for our examples. ```{r} df <- starwars |> select(name, height, mass) ``` When we use `select()`, the bare column names stand for their own positions in the tibble. For `mutate()` on the other hand, column symbols represent the actual column vectors stored in the tibble. Consider what happens if we give a string or a number to `mutate()`: ```{r} mutate(df, "height", 2) ``` `mutate()` gets length-1 vectors that it interprets as new columns in the data frame. These vectors are recycled so they match the number of rows. That's why it doesn't make sense to supply expressions like `"height" + 10` to `mutate()`. This amounts to adding 10 to a string! The correct expression is: ```{r} mutate(df, height + 10) ``` In the same way, you can unquote values from the context if these values represent a valid column. They must be either length 1 (they then get recycled) or have the same length as the number of rows. In the following example we create a new vector that we add to the data frame: ```{r} var <- seq(1, nrow(df)) mutate(df, new = var) ``` A case in point is `group_by()`. While you might think it has select semantics, it actually has mutate semantics. This is quite handy as it allows to group by a modified column: ```{r} group_by(starwars, sex) group_by(starwars, sex = as.factor(sex)) group_by(starwars, height_binned = cut(height, 3)) ``` This is why you can't supply a column name to `group_by()`. This amounts to creating a new column containing the string recycled to the number of rows: ```{r} group_by(df, "month") ``` dplyr/inst/doc/rowwise.html0000644000176200001440000023134615137234465015536 0ustar liggesusers Row-wise operations

Row-wise operations

dplyr, and R in general, are particularly well suited to performing operations over columns, and performing operations over rows is much harder. In this vignette, you’ll learn dplyr’s approach centred around the row-wise data frame created by rowwise().

There are three common use cases that we discuss in this vignette:

  • Row-wise aggregates (e.g. compute the mean of x, y, z).
  • Calling a function multiple times with varying arguments.
  • Working with list-columns.

These types of problems are often easily solved with a for loop, but it’s nice to have a solution that fits naturally into a pipeline.

Of course, someone has to write loops. It doesn’t have to be you. — Jenny Bryan

library(dplyr, warn.conflicts = FALSE)

Creating

Row-wise operations require a special type of grouping where each group consists of a single row. You create this with rowwise():

df <- tibble(x = 1:2, y = 3:4, z = 5:6)
df |> rowwise()
#> # A tibble: 2 × 3
#> # Rowwise: 
#>       x     y     z
#>   <int> <int> <int>
#> 1     1     3     5
#> 2     2     4     6

Like group_by(), rowwise() doesn’t really do anything itself; it just changes how the other verbs work. For example, compare the results of mutate() in the following code:

df |> mutate(m = mean(c(x, y, z)))
#> # A tibble: 2 × 4
#>       x     y     z     m
#>   <int> <int> <int> <dbl>
#> 1     1     3     5   3.5
#> 2     2     4     6   3.5
df |> rowwise() |> mutate(m = mean(c(x, y, z)))
#> # A tibble: 2 × 4
#> # Rowwise: 
#>       x     y     z     m
#>   <int> <int> <int> <dbl>
#> 1     1     3     5     3
#> 2     2     4     6     4

If you use mutate() with a regular data frame, it computes the mean of x, y, and z across all rows. If you apply it to a row-wise data frame, it computes the mean for each row.

You can optionally supply “identifier” variables in your call to rowwise(). These variables are preserved when you call summarise(), so they behave somewhat similarly to the grouping variables passed to group_by():

df <- tibble(name = c("Mara", "Hadley"), x = 1:2, y = 3:4, z = 5:6)

df |>
  rowwise() |>
  summarise(m = mean(c(x, y, z)))
#> # A tibble: 2 × 1
#>       m
#>   <dbl>
#> 1     3
#> 2     4

df |>
  rowwise(name) |>
  summarise(m = mean(c(x, y, z)))
#> `summarise()` has converted the output from a rowwise data frame to a grouped
#> data frame.
#> ℹ Summaries were computed rowwise.
#> ℹ Output is grouped by name.
#> ℹ Use `summarise(.groups = "keep")` to silence this message.
#> # A tibble: 2 × 2
#> # Groups:   name [2]
#>   name       m
#>   <chr>  <dbl>
#> 1 Mara       3
#> 2 Hadley     4

rowwise() is just a special form of grouping, so if you want to remove it from a data frame, just call ungroup().

Per row summary statistics

dplyr::summarise() makes it really easy to summarise values across rows within one column. When combined with rowwise() it also makes it easy to summarise values across columns within one row. To see how, we’ll start by making a little dataset:

df <- tibble(id = 1:6, w = 10:15, x = 20:25, y = 30:35, z = 40:45)
df
#> # A tibble: 6 × 5
#>      id     w     x     y     z
#>   <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40
#> 2     2    11    21    31    41
#> 3     3    12    22    32    42
#> 4     4    13    23    33    43
#> # ℹ 2 more rows

Let’s say we want compute the sum of w, x, y, and z for each row. We start by making a row-wise data frame:

rf <- df |> rowwise(id)

We can then use mutate() to add a new column to each row, or summarise() to return just that one summary:

rf |> mutate(total = sum(c(w, x, y, z)))
#> # A tibble: 6 × 6
#> # Rowwise:  id
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> # ℹ 2 more rows
rf |> summarise(total = sum(c(w, x, y, z)))
#> `summarise()` has converted the output from a rowwise data frame to a grouped
#> data frame.
#> ℹ Summaries were computed rowwise.
#> ℹ Output is grouped by id.
#> ℹ Use `summarise(.groups = "keep")` to silence this message.
#> # A tibble: 6 × 2
#> # Groups:   id [6]
#>      id total
#>   <int> <int>
#> 1     1   100
#> 2     2   104
#> 3     3   108
#> 4     4   112
#> # ℹ 2 more rows

Of course, if you have a lot of variables, it’s going to be tedious to type in every variable name. Instead, you can use c_across() which uses tidy selection syntax so you can to succinctly select many variables:

rf |> mutate(total = sum(c_across(w:z)))
#> # A tibble: 6 × 6
#> # Rowwise:  id
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> # ℹ 2 more rows
rf |> mutate(total = sum(c_across(where(is.numeric))))
#> # A tibble: 6 × 6
#> # Rowwise:  id
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> # ℹ 2 more rows

You could combine this with column-wise operations (see vignette("colwise") for more details) to compute the proportion of the total for each column:

rf |>
  mutate(total = sum(c_across(w:z))) |>
  ungroup() |>
  mutate(across(w:z, ~ . / total))
#> # A tibble: 6 × 6
#>      id     w     x     y     z total
#>   <int> <dbl> <dbl> <dbl> <dbl> <int>
#> 1     1 0.1   0.2   0.3   0.4     100
#> 2     2 0.106 0.202 0.298 0.394   104
#> 3     3 0.111 0.204 0.296 0.389   108
#> 4     4 0.116 0.205 0.295 0.384   112
#> # ℹ 2 more rows

Row-wise summary functions

The rowwise() approach will work for any summary function. But if you need greater speed, it’s worth looking for a built-in row-wise variant of your summary function. These are more efficient because they operate on the data frame as whole; they don’t split it into rows, compute the summary, and then join the results back together again.

df |> mutate(total = rowSums(pick(where(is.numeric), -id)))
#> # A tibble: 6 × 6
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <dbl>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> # ℹ 2 more rows
df |> mutate(mean = rowMeans(pick(where(is.numeric), -id)))
#> # A tibble: 6 × 6
#>      id     w     x     y     z  mean
#>   <int> <int> <int> <int> <int> <dbl>
#> 1     1    10    20    30    40    25
#> 2     2    11    21    31    41    26
#> 3     3    12    22    32    42    27
#> 4     4    13    23    33    43    28
#> # ℹ 2 more rows

NB: I use df (not rf) and pick() (not c_across()) here because rowMeans() and rowSums() take a multi-row data frame as input. Also note that -id is needed to avoid selecting id in pick(). This wasn’t required with the rowwise data frame because we had specified id as an identifier in our original call to rowwise(), preventing it from being selected as a grouping column.

List-columns

rowwise() operations are a natural pairing when you have list-columns. They allow you to avoid explicit loops and/or functions from the apply() or purrr::map() families.

Motivation

Imagine you have this data frame, and you want to count the lengths of each element:

df <- tibble(
  x = list(1, 2:3, 4:6)
)

You might try calling length():

df |> mutate(l = length(x))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     3
#> 2 <int [2]>     3
#> 3 <int [3]>     3

But that returns the length of the column, not the length of the individual values. If you’re an R documentation aficionado, you might know there’s already a base R function just for this purpose:

df |> mutate(l = lengths(x))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

Or if you’re an experienced R programmer, you might know how to apply a function to each element of a list using sapply(), vapply(), or one of the purrr map() functions:

df |> mutate(l = sapply(x, length))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3
df |> mutate(l = purrr::map_int(x, length))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

But wouldn’t it be nice if you could just write length(x) and dplyr would figure out that you wanted to compute the length of the element inside of x? Since you’re here, you might already be guessing at the answer: this is just another application of the row-wise pattern.

df |>
  rowwise() |>
  mutate(l = length(x))
#> # A tibble: 3 × 2
#> # Rowwise: 
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

Subsetting

Before we continue on, I wanted to briefly mention the magic that makes this work. This isn’t something you’ll generally need to think about (it’ll just work), but it’s useful to know about when something goes wrong.

There’s an important difference between a grouped data frame where each group happens to have one row, and a row-wise data frame where every group always has one row. Take these two data frames:

df <- tibble(g = 1:2, y = list(1:3, "a"))
gf <- df |> group_by(g)
rf <- df |> rowwise(g)

If we compute some properties of y, you’ll notice the results look different:

gf |> mutate(type = typeof(y), length = length(y))
#> # A tibble: 2 × 4
#> # Groups:   g [2]
#>       g y         type  length
#>   <int> <list>    <chr>  <int>
#> 1     1 <int [3]> list       1
#> 2     2 <chr [1]> list       1
rf |> mutate(type = typeof(y), length = length(y))
#> # A tibble: 2 × 4
#> # Rowwise:  g
#>       g y         type      length
#>   <int> <list>    <chr>      <int>
#> 1     1 <int [3]> integer        3
#> 2     2 <chr [1]> character      1

They key difference is that when mutate() slices up the columns to pass to length(y) the grouped mutate uses [ and the row-wise mutate uses [[. The following code gives a flavour of the differences if you used a for loop:

# grouped
out1 <- integer(2)
for (i in 1:2) {
  out1[[i]] <- length(df$y[i])
}
out1
#> [1] 1 1

# rowwise
out2 <- integer(2)
for (i in 1:2) {
  out2[[i]] <- length(df$y[[i]])
}
out2
#> [1] 3 1

Note that this magic only applies when you’re referring to existing columns, not when you’re creating new rows. This is potentially confusing, but we’re fairly confident it’s the least worst solution, particularly given the hint in the error message.

gf |> mutate(y2 = y)
#> # A tibble: 2 × 3
#> # Groups:   g [2]
#>       g y         y2       
#>   <int> <list>    <list>   
#> 1     1 <int [3]> <int [3]>
#> 2     2 <chr [1]> <chr [1]>
rf |> mutate(y2 = y)
#> Error in `mutate()`:
#> ℹ In argument: `y2 = y`.
#> ℹ In row 1.
#> Caused by error:
#> ! `y2` must be size 1, not 3.
#> ℹ Did you mean: `y2 = list(y)` ?
rf |> mutate(y2 = list(y))
#> # A tibble: 2 × 3
#> # Rowwise:  g
#>       g y         y2       
#>   <int> <list>    <list>   
#> 1     1 <int [3]> <int [3]>
#> 2     2 <chr [1]> <chr [1]>

Modelling

rowwise() data frames allow you to solve a variety of modelling problems in what I think is a particularly elegant way. We’ll start by creating a nested data frame:

by_cyl <- mtcars |> nest_by(cyl)
by_cyl
#> # A tibble: 3 × 2
#> # Rowwise:  cyl
#>     cyl data              
#>   <dbl> <list>            
#> 1     4 <tibble [11 × 12]>
#> 2     6 <tibble [7 × 12]> 
#> 3     8 <tibble [14 × 12]>

This is a little different to the usual group_by() output: we have visibly changed the structure of the data. Now we have three rows (one for each group), and we have a list-col, data, that stores the data for that group. Also note that the output is rowwise(); this is important because it’s going to make working with that list of data frames much easier.

Once we have one data frame per row, it’s straightforward to make one model per row:

mods <- by_cyl |> mutate(mod = list(lm(mpg ~ wt, data = data)))
mods
#> # A tibble: 3 × 3
#> # Rowwise:  cyl
#>     cyl data               mod   
#>   <dbl> <list>             <list>
#> 1     4 <tibble [11 × 12]> <lm>  
#> 2     6 <tibble [7 × 12]>  <lm>  
#> 3     8 <tibble [14 × 12]> <lm>

And supplement that with one set of predictions per row:

mods <- mods |> mutate(pred = list(predict(mod, data)))
mods
#> # A tibble: 3 × 4
#> # Rowwise:  cyl
#>     cyl data               mod    pred      
#>   <dbl> <list>             <list> <list>    
#> 1     4 <tibble [11 × 12]> <lm>   <dbl [11]>
#> 2     6 <tibble [7 × 12]>  <lm>   <dbl [7]> 
#> 3     8 <tibble [14 × 12]> <lm>   <dbl [14]>

You could then summarise the model in a variety of ways:

mods |> summarise(rmse = sqrt(mean((pred - data$mpg) ^ 2)))
#> `summarise()` has converted the output from a rowwise data frame to a grouped
#> data frame.
#> ℹ Summaries were computed rowwise.
#> ℹ Output is grouped by cyl.
#> ℹ Use `summarise(.groups = "keep")` to silence this message.
#> # A tibble: 3 × 2
#> # Groups:   cyl [3]
#>     cyl  rmse
#>   <dbl> <dbl>
#> 1     4 3.01 
#> 2     6 0.985
#> 3     8 1.87
mods |> summarise(rsq = summary(mod)$r.squared)
#> `summarise()` has converted the output from a rowwise data frame to a grouped
#> data frame.
#> ℹ Summaries were computed rowwise.
#> ℹ Output is grouped by cyl.
#> ℹ Use `summarise(.groups = "keep")` to silence this message.
#> # A tibble: 3 × 2
#> # Groups:   cyl [3]
#>     cyl   rsq
#>   <dbl> <dbl>
#> 1     4 0.509
#> 2     6 0.465
#> 3     8 0.423
mods |> summarise(broom::glance(mod))
#> `summarise()` has converted the output from a rowwise data frame to a grouped
#> data frame.
#> ℹ Summaries were computed rowwise.
#> ℹ Output is grouped by cyl.
#> ℹ Use `summarise(.groups = "keep")` to silence this message.
#> # A tibble: 3 × 13
#> # Groups:   cyl [3]
#>     cyl r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
#>   <dbl>     <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
#> 1     4     0.509         0.454  3.33      9.32  0.0137     1 -27.7   61.5  62.7
#> 2     6     0.465         0.357  1.17      4.34  0.0918     1  -9.83  25.7  25.5
#> 3     8     0.423         0.375  2.02      8.80  0.0118     1 -28.7   63.3  65.2
#> # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Or easily access the parameters of each model:

mods |> reframe(broom::tidy(mod))
#> # A tibble: 6 × 6
#>     cyl term        estimate std.error statistic    p.value
#>   <dbl> <chr>          <dbl>     <dbl>     <dbl>      <dbl>
#> 1     4 (Intercept)    39.6       4.35      9.10 0.00000777
#> 2     4 wt             -5.65      1.85     -3.05 0.0137    
#> 3     6 (Intercept)    28.4       4.18      6.79 0.00105   
#> 4     6 wt             -2.78      1.33     -2.08 0.0918    
#> # ℹ 2 more rows

Repeated function calls

rowwise() doesn’t just work with functions that return a length-1 vector (aka summary functions); it can work with any function if the result is a list. This means that rowwise() and mutate() provide an elegant way to call a function many times with varying arguments, storing the outputs alongside the inputs.

Simulations

I think this is a particularly elegant way to perform simulations, because it lets you store simulated values along with the parameters that generated them. For example, imagine you have the following data frame that describes the properties of 3 samples from the uniform distribution:

df <- tribble(
  ~ n, ~ min, ~ max,
    1,     0,     1,
    2,    10,   100,
    3,   100,  1000,
)

You can supply these parameters to runif() by using rowwise() and mutate():

df |>
  rowwise() |>
  mutate(data = list(runif(n, min, max)))
#> # A tibble: 3 × 4
#> # Rowwise: 
#>       n   min   max data     
#>   <dbl> <dbl> <dbl> <list>   
#> 1     1     0     1 <dbl [1]>
#> 2     2    10   100 <dbl [2]>
#> 3     3   100  1000 <dbl [3]>

Note the use of list() here - runif() returns multiple values and a mutate() expression has to return something of length 1. list() means that we’ll get a list column where each row is a list containing multiple values. If you forget to use list(), dplyr will give you a hint:

df |>
  rowwise() |>
  mutate(data = runif(n, min, max))
#> Error in `mutate()`:
#> ℹ In argument: `data = runif(n, min, max)`.
#> ℹ In row 2.
#> Caused by error:
#> ! `data` must be size 1, not 2.
#> ℹ Did you mean: `data = list(runif(n, min, max))` ?

Multiple combinations

What if you want to call a function for every combination of inputs? You can use expand.grid() (or tidyr::expand_grid()) to generate the data frame and then repeat the same pattern as above:

df <- expand.grid(mean = c(-1, 0, 1), sd = c(1, 10, 100))

df |>
  rowwise() |>
  mutate(data = list(rnorm(10, mean, sd)))
#> # A tibble: 9 × 3
#> # Rowwise: 
#>    mean    sd data      
#>   <dbl> <dbl> <list>    
#> 1    -1     1 <dbl [10]>
#> 2     0     1 <dbl [10]>
#> 3     1     1 <dbl [10]>
#> 4    -1    10 <dbl [10]>
#> # ℹ 5 more rows

Varying functions

In more complicated problems, you might also want to vary the function being called. This tends to be a bit more of an awkward fit with this approach because the columns in the input tibble will be less regular. But it’s still possible, and it’s a natural place to use do.call():

df <- tribble(
   ~rng,     ~params,
   "runif",  list(n = 10),
   "rnorm",  list(n = 20),
   "rpois",  list(n = 10, lambda = 5),
) |>
  rowwise()

df |>
  mutate(data = list(do.call(rng, params)))
#> # A tibble: 3 × 3
#> # Rowwise: 
#>   rng   params           data      
#>   <chr> <list>           <list>    
#> 1 runif <named list [1]> <dbl [10]>
#> 2 rnorm <named list [1]> <dbl [20]>
#> 3 rpois <named list [2]> <int [10]>

Previously

rowwise()

rowwise() was also questioning for quite some time, partly because I didn’t appreciate how many people needed the native ability to compute summaries across multiple variables for each row. As an alternative, we recommended performing row-wise operations with the purrr map() functions. However, this was challenging because you needed to pick a map function based on the number of arguments that were varying and the type of result, which required quite some knowledge of purrr functions.

I was also resistant to rowwise() because I felt like automatically switching between [ to [[ was too magical in the same way that automatically list()-ing results made do() too magical. I’ve now persuaded myself that the row-wise magic is good magic partly because most people find the distinction between [ and [[ mystifying and rowwise() means that you don’t need to think about it.

Since rowwise() clearly is useful it is not longer questioning, and we expect it to be around for the long term.

do()

We’ve questioned the need for do() for quite some time, because it never felt very similar to the other dplyr verbs. It had two main modes of operation:

  • Without argument names: you could call functions that input and output data frames using . to refer to the “current” group. For example, the following code gets the first row of each group:

    mtcars |>
      group_by(cyl) |>
      do(head(., 1))
    #> # A tibble: 3 × 13
    #> # Groups:   cyl [3]
    #>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4
    #>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #> 1  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1     8    16
    #> 2  21       6   160   110  3.9   2.62  16.5     0     1     4     4    12    24
    #> 3  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2    16    32

    This has been superseded by pick() plus reframe(), a variant of summarise() that can create multiple rows and columns per group.

    mtcars |>
      group_by(cyl) |>
      reframe(head(pick(everything()), 1))
    #> # A tibble: 3 × 13
    #>     cyl   mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4
    #>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #> 1     4  22.8   108    93  3.85  2.32  18.6     1     1     4     1     8    16
    #> 2     6  21     160   110  3.9   2.62  16.5     0     1     4     4    12    24
    #> 3     8  18.7   360   175  3.15  3.44  17.0     0     0     3     2    16    32
  • With arguments: it worked like mutate() but automatically wrapped every element in a list:

    mtcars |>
      group_by(cyl) |>
      do(nrows = nrow(.))
    #> # A tibble: 3 × 2
    #> # Rowwise: 
    #>     cyl nrows    
    #>   <dbl> <list>   
    #> 1     4 <int [1]>
    #> 2     6 <int [1]>
    #> 3     8 <int [1]>

    I now believe that behaviour is both too magical and not very useful, and it can be replaced by summarise() and pick().

    mtcars |>
      group_by(cyl) |>
      summarise(nrows = nrow(pick(everything())))
    #> # A tibble: 3 × 2
    #>     cyl nrows
    #>   <dbl> <int>
    #> 1     4    11
    #> 2     6     7
    #> 3     8    14

    If needed (unlike here), you can wrap the results in a list yourself.

The addition of pick()/across() and the increased scope of summarise()/reframe() means that do() is no longer needed, so it is now superseded.

dplyr/inst/doc/programming.R0000644000176200001440000001261115137234460015601 0ustar liggesusers## ----echo = FALSE, message = FALSE-------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ## ----setup, message = FALSE--------------------------------------------------- library(dplyr) ## ----results = FALSE---------------------------------------------------------- starwars[starwars$homeworld == "Naboo" & starwars$species == "Human", ,] ## ----results = FALSE---------------------------------------------------------- starwars |> filter(homeworld == "Naboo", species == "Human") ## ----------------------------------------------------------------------------- df <- data.frame(x = runif(3), y = runif(3)) df$x ## ----results = FALSE---------------------------------------------------------- var_summary <- function(data, var) { data |> summarise(n = n(), min = min({{ var }}), max = max({{ var }})) } mtcars |> group_by(cyl) |> var_summary(mpg) ## ----results = FALSE---------------------------------------------------------- for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ## ----------------------------------------------------------------------------- name <- "susan" tibble("{name}" := 2) ## ----------------------------------------------------------------------------- my_df <- function(x) { tibble("{{x}}_2" := x * 2) } my_var <- 10 my_df(my_var) ## ----results = FALSE---------------------------------------------------------- summarise_mean <- function(data, vars) { data |> summarise(n = n(), across({{ vars }}, mean)) } mtcars |> group_by(cyl) |> summarise_mean(where(is.numeric)) ## ----results = FALSE---------------------------------------------------------- vars <- c("mpg", "vs") mtcars |> select(all_of(vars)) mtcars |> select(!all_of(vars)) ## ----------------------------------------------------------------------------- mutate_y <- function(data) { mutate(data, y = a + x) } ## ----------------------------------------------------------------------------- my_summarise <- function(data, group_var) { data |> group_by({{ group_var }}) |> summarise(mean = mean(mass)) } ## ----------------------------------------------------------------------------- my_summarise2 <- function(data, expr) { data |> summarise( mean = mean({{ expr }}), sum = sum({{ expr }}), n = n() ) } ## ----------------------------------------------------------------------------- my_summarise3 <- function(data, mean_var, sd_var) { data |> summarise(mean = mean({{ mean_var }}), sd = sd({{ sd_var }})) } ## ----------------------------------------------------------------------------- my_summarise4 <- function(data, expr) { data |> summarise( "mean_{{expr}}" := mean({{ expr }}), "sum_{{expr}}" := sum({{ expr }}), "n_{{expr}}" := n() ) } my_summarise5 <- function(data, mean_var, sd_var) { data |> summarise( "mean_{{mean_var}}" := mean({{ mean_var }}), "sd_{{sd_var}}" := sd({{ sd_var }}) ) } ## ----------------------------------------------------------------------------- my_summarise <- function(.data, ...) { .data |> group_by(...) |> summarise(mass = mean(mass, na.rm = TRUE), height = mean(height, na.rm = TRUE)) } starwars |> my_summarise(homeworld) starwars |> my_summarise(sex, gender) ## ----------------------------------------------------------------------------- quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { tibble( val = quantile(x, probs), quant = probs ) } x <- 1:5 quantile_df(x) ## ----------------------------------------------------------------------------- df <- tibble( grp = rep(1:3, each = 10), x = runif(30), y = rnorm(30) ) df |> group_by(grp) |> summarise(quantile_df(x, probs = .5)) df |> group_by(grp) |> summarise(across(x:y, ~ quantile_df(.x, probs = .5), .unpack = TRUE)) ## ----------------------------------------------------------------------------- df |> group_by(grp) |> reframe(across(x:y, quantile_df, .unpack = TRUE)) ## ----------------------------------------------------------------------------- my_summarise <- function(data, summary_vars) { data |> summarise(across({{ summary_vars }}, ~ mean(., na.rm = TRUE))) } starwars |> group_by(species) |> my_summarise(c(mass, height)) ## ----------------------------------------------------------------------------- my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean)) } ## ----------------------------------------------------------------------------- my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean, .names = "mean_{.col}")) } ## ----results = FALSE---------------------------------------------------------- for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ## ----results = FALSE---------------------------------------------------------- mtcars |> names() |> purrr::map(~ count(mtcars, .data[[.x]])) ## ----eval = FALSE------------------------------------------------------------- # library(shiny) # ui <- fluidPage( # selectInput("var", "Variable", choices = names(diamonds)), # tableOutput("output") # ) # server <- function(input, output, session) { # data <- reactive(filter(diamonds, .data[[input$var]] > 0)) # output$output <- renderTable(head(data())) # } dplyr/inst/doc/recoding-replacing.html0000644000176200001440000022576615137234463017602 0ustar liggesusers Recoding columns and replacing values

Recoding columns and replacing values

library(dplyr)

Introduction

dplyr provides a family of functions for recoding columns and replacing values within a column. These are extremely common operations, so mastering this family can be a big productivity boost!

Before we begin, it’ll be helpful to define exactly what we mean by recoding vs replacing:

  • Recoding a column creates an entirely new column using values from an existing column. The new column may have a different type from the original column.

  • Replacing values within a column partially updates an existing column with new values. The result has the same type as the original column.

The family of functions can be summarized by the following table:

Recoding Replacing
Match with conditions case_when() replace_when()
Match with values recode_values() replace_values()

This vignette walks through use cases for each of these functions, which should help you build some intuition about when to use them.

case_when()

case_when() is the most general function in the family. It works by evaluating each case sequentially and using the first match for each element to determine the corresponding value in the output. To demonstrate, we’ll look at a dataset of some 5k times in minutes:

set.seed(123)
racers <- tibble(
  id = seq_len(100),
  time = round(sample(1200:2100, size = 100, replace = TRUE) / 60, 2)
)
racers
#> # A tibble: 100 × 2
#>      id  time
#>   <int> <dbl>
#> 1     1  26.9
#> 2     2  27.7
#> 3     3  23.0
#> 4     4  28.8
#> # ℹ 96 more rows

We can use case_when() to categorize these times into tiers:

tiers <- racers |>
  mutate(
    tier = case_when(
      time < 23 ~ "A",
      time < 27 ~ "B",
      time < 30 ~ "C",
      time < 33 ~ "D"
    )
  )

tiers
#> # A tibble: 100 × 3
#>      id  time tier 
#>   <int> <dbl> <chr>
#> 1     1  26.9 B    
#> 2     2  27.7 C    
#> 3     3  23.0 A    
#> 4     4  28.8 C    
#> # ℹ 96 more rows

There’s a few things to note here:

  • The first condition that is TRUE is used, i.e. a time of 21 minutes meets all of the conditions, but would be placed in tier A because time < 23 is listed first.

  • Unmatched values fall through as NA. We have some racers above 33 minutes that aren’t captured here!

There are a few options for dealing with unmatched locations. You can leave them as NA if that makes sense for your use case, or you can specify a .default value:

racers |>
  mutate(
    tier = case_when(
      time < 23 ~ "A",
      time < 27 ~ "B",
      time < 30 ~ "C",
      time < 33 ~ "D",
      .default = "unknown"
    )
  )
#> # A tibble: 100 × 3
#>      id  time tier 
#>   <int> <dbl> <chr>
#> 1     1  26.9 B    
#> 2     2  27.7 C    
#> 3     3  23.0 A    
#> 4     4  28.8 C    
#> # ℹ 96 more rows

If you are confident that you’ve captured every case, you can supply .unmatched = "error" rather than .default and case_when() will error if that assertion doesn’t hold. This is great for defensive programming!

racers |>
  mutate(
    tier = case_when(
      time < 23 ~ "A",
      time < 27 ~ "B",
      time < 30 ~ "C",
      time < 33 ~ "D",
      .unmatched = "error"
    )
  )
#> Error in `mutate()`:
#> ℹ In argument: `tier = case_when(...)`.
#> Caused by error in `case_when()`:
#> ! Each location must be matched.
#> ✖ Locations 6, 22, 32, 34, 40, 44, 55, 56, 73, 79, 84, and 93 are unmatched.

Note that missing values must be explicitly handled when setting .unmatched = "error", even if that’s just is.na(time) ~ NA, otherwise they will trigger the unmatched error.

replace_when()

Let’s assume that some of our racers used banned running shoes and are disqualified. Also, some racers had a false start and need to incur a 20 second (1/3 minute) penalty.

id_banned_shoes <- c(2, 10, 15, 32, 65)
id_false_start <- c(1, 2, 5, 20, 55, 74, 91)

We could add this information in a few ways. With case_when():

racers |>
  mutate(
    time = case_when(
      id %in% id_banned_shoes ~ NA,
      id %in% id_false_start ~ time + 1 / 3,
      .default = time
    )
  )
#> # A tibble: 100 × 2
#>      id  time
#>   <int> <dbl>
#> 1     1  27.2
#> 2     2  NA  
#> 3     3  23.0
#> 4     4  28.8
#> # ℹ 96 more rows

Or in two steps with if_else():

racers |>
  mutate(time = if_else(id %in% id_banned_shoes, NA, time)) |>
  mutate(time = if_else(id %in% id_false_start, time + 1 / 3, time))
#> # A tibble: 100 × 2
#>      id  time
#>   <int> <dbl>
#> 1     1  27.2
#> 2     2  NA  
#> 3     3  23.0
#> 4     4  28.8
#> # ℹ 96 more rows

Neither of these feel particularly elegant at expressing the intent of this operation. All you’re trying to do is replace a few values of time! We like to think of time as the primary input: time goes in, and time comes out (slightly adjusted). But both case_when() and if_else() have time as their last input, making the intent a bit hard to understand at first glance.

replace_when() lets you pull the primary input to the front (which also makes it compatible with the pipe!), making the intent more clear:

racers |>
  mutate(
    time = time |>
      replace_when(
        id %in% id_banned_shoes ~ NA,
        id %in% id_false_start ~ time + 1 / 3
      )
  )
#> # A tibble: 100 × 2
#>      id  time
#>   <int> <dbl>
#> 1     1  27.2
#> 2     2  NA  
#> 3     3  23.0
#> 4     4  28.8
#> # ℹ 96 more rows

As a side note, you might have been tempted to reach for base::replace() here, i.e. as:

racers |>
  mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |>
  mutate(time = base::replace(time, id %in% id_false_start, time + 1 / 3))

This actually doesn’t work! Replacing with NA does work, but replace() requires that the result of time + 1 / 3 must be preemptively subset to the places where the condition is true. You’d have to do something more complicated to mimic replace_when():

racers |>
  mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |>
  mutate(time = {
    loc <- id %in% id_false_start
    base::replace(time, loc, time[loc] + 1 / 3)
  })
#> # A tibble: 100 × 2
#>      id  time
#>   <int> <dbl>
#> 1     1  27.2
#> 2     2  NA  
#> 3     3  23.0
#> 4     4  28.8
#> # ℹ 96 more rows

Type stability

Beyond readability, an important benefit of replace_when() (and replace_values(), which we’ll see later) is that it is type stable on the column you are modifying, which means that it can’t change types out from under you.

Type stability is particularly useful with factors. Taking another look at our tiers of race times, imagine that some of the race times were discovered to be faulty due to malfunctioning timers, and you need to replace a few ids with the unknown level.

id_with_malfunction <- c(1, 5, 20, 50)

tiers <- racers |>
  mutate(
    tier = case_when(
      time < 23 ~ "A",
      time < 27 ~ "B",
      time < 30 ~ "C",
      time < 33 ~ "D",
      .default = "unknown"
    ) |>
      factor(levels = c("A", "B", "C", "D", "unknown"))
  )

tiers
#> # A tibble: 100 × 3
#>      id  time tier 
#>   <int> <dbl> <fct>
#> 1     1  26.9 B    
#> 2     2  27.7 C    
#> 3     3  23.0 A    
#> 4     4  28.8 C    
#> # ℹ 96 more rows

Note that the following case_when() solution results in tier becoming a character column, losing its factor class. This is due to the fact that case_when() is a recoding function, it creates an entirely new column and doesn’t know that you’re trying to retain existing type information.

tiers |>
  mutate(
    tier = case_when(id %in% id_with_malfunction ~ "unknown", .default = tier)
  )
#> # A tibble: 100 × 3
#>      id  time tier   
#>   <int> <dbl> <chr>  
#> 1     1  26.9 unknown
#> 2     2  27.7 C      
#> 3     3  23.0 A      
#> 4     4  28.8 C      
#> # ℹ 96 more rows

As a replacing function, replace_when() knows to be type stable on tier, and casts "unknown" to tier’s factor type before performing the replacement:

tiers |>
  mutate(
    tier = tier |> replace_when(id %in% id_with_malfunction ~ "unknown")
  )
#> # A tibble: 100 × 3
#>      id  time tier   
#>   <int> <dbl> <fct>  
#> 1     1  26.9 unknown
#> 2     2  27.7 C      
#> 3     3  23.0 A      
#> 4     4  28.8 C      
#> # ℹ 96 more rows

recode_values()

case_when() and replace_when() both take logical vectors on the left-hand side of the formula. This is very flexible, but sometimes these functions require a large amount of repetition. Consider the following Likert scale scores. We’d like to recode these from their numeric values to their character counterparts.

likert <- tibble(
  score = c(1, 2, 3, 4, 5, 2, 3, 1, 4)
)

We could certainly use a case_when():

likert |>
  mutate(
    score = case_when(
      score == 1 ~ "Strongly disagree",
      score == 2 ~ "Disagree",
      score == 3 ~ "Neutral",
      score == 4 ~ "Agree",
      score == 5 ~ "Strongly agree"
    )
  )
#> # A tibble: 9 × 1
#>   score            
#>   <chr>            
#> 1 Strongly disagree
#> 2 Disagree         
#> 3 Neutral          
#> 4 Agree            
#> # ℹ 5 more rows

But score == is repeated many times! If you find yourself using == or %in% on the left-hand side in this manner, you likely want to use recode_values() instead. Rather than taking logical vectors, recode_values() takes values on the left-hand side to match against a single input that you’ll provide as the first argument.

likert |>
  mutate(
    score = score |>
      recode_values(
        1 ~ "Strongly disagree",
        2 ~ "Disagree",
        3 ~ "Neutral",
        4 ~ "Agree",
        5 ~ "Strongly agree"
      )
  )
#> # A tibble: 9 × 1
#>   score            
#>   <chr>            
#> 1 Strongly disagree
#> 2 Disagree         
#> 3 Neutral          
#> 4 Agree            
#> # ℹ 5 more rows

This removes all of the repetition, allowing you to focus on the mapping. If you squint, the mapping should look roughly like a lookup table between the numeric value and the likert encoding. recode_values() actually has a second interface that allows us to make this lookup table representation even more explicit.

Using a tribble(), we can extract out the lookup table into its own standalone data frame.

lookup <- tribble(
  ~from , ~to                 ,
      1 , "Strongly disagree" ,
      2 , "Disagree"          ,
      3 , "Neutral"           ,
      4 , "Agree"             ,
      5 , "Strongly agree"
)

We can then utilize the alternative from and to arguments of recode_values() rather than supplying formulas to specify how the values should be recoded:

likert |>
  mutate(score = recode_values(score, from = lookup$from, to = lookup$to))
#> # A tibble: 9 × 1
#>   score            
#>   <chr>            
#> 1 Strongly disagree
#> 2 Disagree         
#> 3 Neutral          
#> 4 Agree            
#> # ℹ 5 more rows

Lifting the lookup table to the top of the file is particularly nice when you have a long pipe chain. The details of the mapping get some room to breathe, and in the pipe chain you can focus on the actual manipulations you are trying to perform.

It’s also very common for your lookup table to exist in a CSV file that you have to read in separately. In that case, you can replace the tribble() call with:

lookup <- readr::read_csv("lookup.csv")

But everything else works the same. This would be quite hard to specify with just the formula interface!

Like case_when(), recode_values() also has default and unmatched arguments to handle unmatched locations:

likert <- tibble(
  score = c(0, 1, 2, 2, 4, 5, 2, 3, 1, 4)
)

# Missed the `0`
likert |>
  mutate(
    score = score |>
      recode_values(
        from = lookup$from,
        to = lookup$to,
        unmatched = "error"
      )
  )
#> Error in `mutate()`:
#> ℹ In argument: `score = recode_values(score, from = lookup$from, to =
#>   lookup$to, unmatched = "error")`.
#> Caused by error in `recode_values()`:
#> ! Each location must be matched.
#> ✖ Location 1 is unmatched.

replace_values()

As seen above, when replacing a few locations in a column using logical conditions, we reached for replace_when() rather than case_when(). Similarly, when replacing a few locations using values to match against, it’s best to use replace_values() over recode_values().

Imagine we’d like to collapse some, but not all, of these school names into common buckets:

schools <- tibble(
  name = c(
    "UNC",
    "Chapel Hill",
    NA,
    "Duke",
    "Duke University",
    "UNC",
    "NC State",
    "ECU"
  )
)

We could use recode_values():

schools |>
  mutate(
    name = recode_values(
      name,
      c("UNC", "Chapel Hill") ~ "UNC Chapel Hill",
      c("Duke", "Duke University") ~ "Duke",
      default = name
    )
  )
#> # A tibble: 8 × 1
#>   name           
#>   <chr>          
#> 1 UNC Chapel Hill
#> 2 UNC Chapel Hill
#> 3 <NA>           
#> 4 Duke           
#> # ℹ 4 more rows

But this “partial update by value” is so common that it really deserves its own name that doesn’t require you to specify default. For that, we have replace_values():

schools |>
  mutate(
    name = name |>
      replace_values(
        c("UNC", "Chapel Hill") ~ "UNC Chapel Hill",
        c("Duke", "Duke University") ~ "Duke"
      )
  )
#> # A tibble: 8 × 1
#>   name           
#>   <chr>          
#> 1 UNC Chapel Hill
#> 2 UNC Chapel Hill
#> 3 <NA>           
#> 4 Duke           
#> # ℹ 4 more rows

Like recode_values(), replace_values() has an alternative from and to API that works well with lookup tables and allows you to move your mapping out of the pipe chain:

lookup <- tribble(
  ~from             , ~to               ,
  "UNC"             , "UNC Chapel Hill" ,
  "Chapel Hill"     , "UNC Chapel Hill" ,
  "Duke"            , "Duke"            ,
  "Duke University" , "Duke"
)

schools |>
  mutate(name = replace_values(name, from = lookup$from, to = lookup$to))
#> # A tibble: 8 × 1
#>   name           
#>   <chr>          
#> 1 UNC Chapel Hill
#> 2 UNC Chapel Hill
#> 3 <NA>           
#> 4 Duke           
#> # ℹ 4 more rows

An extremely neat feature of the from and to API is that they also take lists of vectors that describe the mapping, which has been designed to work elegantly with the fact that tribble() can create list columns, allowing you to further collapse this lookup table:

# Condensed lookup table with a `many:1` mapping per row
lookup <- tribble(
  ~from                        , ~to               ,
  c("UNC", "Chapel Hill")      , "UNC Chapel Hill" ,
  c("Duke", "Duke University") , "Duke"
)

# Note that `from` is a list column
lookup
#> # A tibble: 2 × 2
#>   from      to             
#>   <list>    <chr>          
#> 1 <chr [2]> UNC Chapel Hill
#> 2 <chr [2]> Duke

lookup$from
#> [[1]]
#> [1] "UNC"         "Chapel Hill"
#> 
#> [[2]]
#> [1] "Duke"            "Duke University"

# Works the same as before
schools |>
  mutate(name = replace_values(name, from = lookup$from, to = lookup$to))
#> # A tibble: 8 × 1
#>   name           
#>   <chr>          
#> 1 UNC Chapel Hill
#> 2 UNC Chapel Hill
#> 3 <NA>           
#> 4 Duke           
#> # ℹ 4 more rows

Comparisons

We’ll end this vignette with some comparisons of the recoding and replacing family to other dplyr functions and to other technologies, like SQL.

if_else()

if_else() is a type of recoding function, as it creates an entirely new column. In fact, it’s closely tied to case_when():

if_else(condition, true, false, missing)

case_when(
  condition ~ true,
  !condition ~ false,
  is.na(condition) ~ missing
)

Similar to case_when(), if_else() doesn’t offer type stability on any particular input. The output’s type is computed as the common type of true, false, and missing. If you find yourself writing an if_else() where the purpose is to partially update an existing column, consider using replace_when() instead for clarity and type stability:

x <- if_else(x > 5, new, x)

# Type stable on `x`.
# Intent of "partially updating" `x` is clear.
# Pipe friendly.
x <- x |> replace_when(x > 5 ~ new)

coalesce()

For converting from NA to some other value, the most common cases of coalesce() are often a replace_values() call in disguise:

x <- c(1, 2, NA, 3, NA, 5)
y <- c(0, 3, 1, 4, 6, 7)

coalesce(x, 0)
#> [1] 1 2 0 3 0 5
replace_values(x, NA ~ 0)
#> [1] 1 2 0 3 0 5

coalesce(x, y)
#> [1] 1 2 1 3 6 5
replace_values(x, NA ~ y)
#> [1] 1 2 1 3 6 5

And with replace_values() you can replace any value, not just NA.

na_if()

For converting from a problematic value to NA, replace_values() is a more flexible (and likely more intuitive) alternative to na_if():

x <- c(1, 2, 0, -99, 12)

# To convert `0` and `-99` to `NA`, you have to do it in two calls
x |> na_if(0) |> na_if(-99)
#> [1]  1  2 NA NA 12

x |> replace_values(from = c(0, -99), to = NA)
#> [1]  1  2 NA NA 12

SQL

case_when() is an R equivalent of SQL’s Searched CASE statement:

case_when(
  x < 100 ~ this,
  x < 20 ~ that,
  .default = default
)
CASE
  WHEN x < 100 THEN this
  WHEN x < 20 THEN that
  ELSE default
END

And dbplyr will translate a case_when() to this form!

recode_values() is an R equivalent of SQL’s Simple CASE statement:

recode_values(
  x,
  "E" ~ "East",
  "W" ~ "West",
  "N" ~ "North",
  "S" ~ "South",
  .default = "Unknown"
)
CASE x
  WHEN 'E' THEN 'East'
  WHEN 'W' THEN 'West'
  WHEN 'N' THEN 'North'
  WHEN 'S' THEN 'South'
  ELSE 'Unknown'
END

As of dbplyr 2.5.1, we don’t currently have a translation for recode_values() since it is so new, but we expect to have one soon.

dplyr/inst/doc/two-table.Rmd0000644000176200001440000001606415106134104015472 0ustar liggesusers--- title: "Two-table verbs" description: > Most dplyr verbs work with a single data set, but most data analyses involve multiple datasets. This vignette introduces you to the dplyr verbs that work with more one than data set, and introduces to the mutating joins, filtering joins, and the set operations. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Two-table verbs} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 5) library(dplyr) ``` It's rare that a data analysis involves only a single table of data. In practice, you'll normally have many tables that contribute to an analysis, and you need flexible tools to combine them. In dplyr, there are three families of verbs that work with two tables at a time: * Mutating joins, which add new variables to one table from matching rows in another. * Filtering joins, which filter observations from one table based on whether or not they match an observation in the other table. * Set operations, which combine the observations in the data sets as if they were set elements. (This discussion assumes that you have [tidy data](https://www.jstatsoft.org/v59/i10/), where the rows are observations and the columns are variables. If you're not familiar with that framework, I'd recommend reading up on it first.) All two-table verbs work similarly. The first two arguments are `x` and `y`, and provide the tables to combine. The output is always a new table with the same type as `x`. ## Mutating joins Mutating joins allow you to combine variables from multiple tables. For example, consider the flights and airlines data from the nycflights13 package. In one table we have flight information with an abbreviation for carrier, and in another we have a mapping between abbreviations and full names. You can use a join to add the carrier names to the flight data: ```{r, warning = FALSE} library(nycflights13) # Drop unimportant variables so it's easier to understand the join results. flights2 <- flights |> select(year:day, hour, origin, dest, tailnum, carrier) flights2 |> left_join(airlines) ``` ### Controlling how the tables are matched As well as `x` and `y`, each mutating join takes an argument `by` that controls which variables are used to match observations in the two tables. There are a few ways to specify it, as I illustrate below with various tables from nycflights13: * `NULL`, the default. dplyr will will use all variables that appear in both tables, a __natural__ join. For example, the flights and weather tables match on their common variables: year, month, day, hour and origin. ```{r} flights2 |> left_join(weather) ``` * A character vector, `by = "x"`. Like a natural join, but uses only some of the common variables. For example, `flights` and `planes` have `year` columns, but they mean different things so we only want to join by `tailnum`. ```{r} flights2 |> left_join(planes, by = "tailnum") ``` Note that the year columns in the output are disambiguated with a suffix. * A named character vector: `by = c("x" = "a")`. This will match variable `x` in table `x` to variable `a` in table `y`. The variables from use will be used in the output. Each flight has an origin and destination `airport`, so we need to specify which one we want to join to: ```{r} flights2 |> left_join(airports, c("dest" = "faa")) flights2 |> left_join(airports, c("origin" = "faa")) ``` ### Types of join There are four types of mutating join, which differ in their behaviour when a match is not found. We'll illustrate each with a simple example: ```{r} df1 <- tibble(x = c(1, 2), y = 2:1) df2 <- tibble(x = c(3, 1), a = 10, b = "a") ``` * `inner_join(x, y)` only includes observations that match in both `x` and `y`. ```{r} df1 |> inner_join(df2) |> knitr::kable() ``` * `left_join(x, y)` includes all observations in `x`, regardless of whether they match or not. This is the most commonly used join because it ensures that you don't lose observations from your primary table. ```{r} df1 |> left_join(df2) ``` * `right_join(x, y)` includes all observations in `y`. It's equivalent to `left_join(y, x)`, but the columns and rows will be ordered differently. ```{r} df1 |> right_join(df2) df2 |> left_join(df1) ``` * `full_join()` includes all observations from `x` and `y`. ```{r} df1 |> full_join(df2) ``` The left, right and full joins are collectively know as __outer joins__. When a row doesn't match in an outer join, the new variables are filled in with missing values. ### Observations While mutating joins are primarily used to add new variables, they can also generate new observations. If a match is not unique, a join will add all possible combinations (the Cartesian product) of the matching observations: ```{r} df1 <- tibble(x = c(1, 1, 2), y = 1:3) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) df1 |> left_join(df2) ``` ## Filtering joins Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables. There are two types: * `semi_join(x, y)` __keeps__ all observations in `x` that have a match in `y`. * `anti_join(x, y)` __drops__ all observations in `x` that have a match in `y`. These are most useful for diagnosing join mismatches. For example, there are many flights in the nycflights13 dataset that don't have a matching tail number in the planes table: ```{r} library("nycflights13") flights |> anti_join(planes, by = "tailnum") |> count(tailnum, sort = TRUE) ``` If you're worried about what observations your joins will match, start with a `semi_join()` or `anti_join()`. `semi_join()` and `anti_join()` never duplicate; they only ever remove observations. ```{r} df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4) df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a")) # Four rows to start with: df1 |> nrow() # And we get four rows after the join df1 |> inner_join(df2, by = "x") |> nrow() # But only two rows actually match df1 |> semi_join(df2, by = "x") |> nrow() ``` ## Set operations The final type of two-table verb is set operations. These expect the `x` and `y` inputs to have the same variables, and treat the observations like sets: * `intersect(x, y)`: return only observations in both `x` and `y` * `union(x, y)`: return unique observations in `x` and `y` * `setdiff(x, y)`: return observations in `x`, but not in `y`. Given this simple data: ```{r} (df1 <- tibble(x = 1:2, y = c(1L, 1L))) (df2 <- tibble(x = 1:2, y = 1:2)) ``` The four possibilities are: ```{r} intersect(df1, df2) # Note that we get 3 rows, not 4 union(df1, df2) setdiff(df1, df2) setdiff(df2, df1) ``` ## Multiple-table verbs dplyr does not provide any functions for working with three or more tables. Instead use `purrr::reduce()` or `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need. dplyr/inst/doc/in-packages.Rmd0000644000176200001440000002003415106134104015746 0ustar liggesusers--- title: "Using dplyr in packages" description: > A guide for package authors who use dplyr. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Using dplyr in packages} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, message = FALSE} library(dplyr) ``` This vignette is aimed at package authors who use dplyr in their packages. We will discuss best practices learned over the years to avoid `R CMD check` notes and warnings, and how to handle when dplyr deprecates functions. ## Join helpers As of dplyr 1.1.0, we've introduced `join_by()` along 4 helpers for performing various types of joins: - `closest()` - `between()` - `within()` - `overlaps()` `join_by()` implements a domain specific language (DSL) for joins, and internally interprets calls to these functions. You'll notice that `dplyr::closest()` isn't an exported function from dplyr (`dplyr::between()` and `base::within()` do happen to be preexisting functions). If you use `closest()` in your package, then this will cause an `R CMD check` note letting you know that you've used a symbol that doesn't belong to any package. To silence this, place `utils::globalVariables("closest")` in a source file in your package (but outside of any function). dbplyr does a similar thing for SQL functions, so you can see an example of that [here](https://github.com/tidyverse/dbplyr/blob/7edf5d607fd6b0b897721ea96d1c9ca9401f0f9b/R/backend-redshift.R#L144). You may also have to add utils to your package Imports, even though it is a base package. You can do that easily with `usethis::use_package("utils")`. ## Data masking and tidy selection NOTEs If you're writing a package and you have a function that uses data masking or tidy selection: ```{r} my_summary_function <- function(data) { data |> select(grp, x, y) |> filter(x > 0) |> group_by(grp) |> summarise(y = mean(y), n = n()) } ``` You'll get an `NOTE` because `R CMD check` doesn't know that dplyr functions use tidy evaluation: N checking R code for possible problems my_summary_function: no visible binding for global variable ‘grp’, ‘x’, ‘y’ Undefined global functions or variables: grp x y To eliminate this note: - For data masking, import `.data` from [rlang](https://rlang.r-lib.org/) and then use `.data$var` instead of `var`. - For tidy selection, use `"var"` instead of `var`. That yields: ```{r} #' @importFrom rlang .data my_summary_function <- function(data) { data |> select("grp", "x", "y") |> filter(.data$x > 0) |> group_by(.data$grp) |> summarise(y = mean(.data$y), n = n()) } ``` For more about programming with dplyr, see `vignette("programming", package = "dplyr")`. ## Deprecation This section is focused on updating package code to deal with backwards incompatible changes in dplyr. We do try and minimize backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future. We will start with some general advice about supporting multiple versions of dplyr at once, and then we will discuss some specific changes in dplyr. ### Multiple dplyr versions Ideally, when we introduce a breaking change you'll want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages: - It's more convenient for your users, since your package will work for them regardless of what version of dplyr they have installed. - It's easier on CRAN since it doesn't require a massive coordinated release of multiple packages. If we break your package, we will typically send you a pull request that implements a patch before releasing the next version of dplyr. Most of the time, this patch will be backwards compatible with older versions of dplyr as well. Ideally, you'll accept this patch and submit a new version of your package to CRAN before the new version of dplyr is released. To make code work with multiple versions of a package, your first tool is the simple if statement: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "0.5.0") { # code for new version } else { # code for old version } ``` Always condition on `> current-version`, not `>= next-version` because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version `"0.5.0"`, the development version will be `"0.5.0.9000"`. This typically works well if the branch for the "new version" introduces a new argument or has a slightly different return value. This *doesn't* work if we've introduced a new function that you need to switch to, like: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "1.0.10") { dplyr::reframe(df, x = unique(x)) } else { dplyr::summarise(df, x = unique(x)) } ``` In this case, when checks are run with dplyr 1.0.10 you'll get a warning about using a function from dplyr that doesn't exist (`reframe()`) even though that branch will never run. You can get around this by using `utils::getFromNamespace()` to indirectly call the new dplyr function: ```{r, eval=FALSE} if (utils::packageVersion("dplyr") > "1.0.10") { utils::getFromNamespace("reframe", "dplyr")(df, x = unique(x)) } else { dplyr::summarise(df, x = unique(x)) } ``` As soon as the next version of dplyr is actually on CRAN (1.1.0 in this case), you should feel free to remove this code and unconditionally use `reframe()` as long as you also require `dplyr (>= 1.1.0)` in your `DESCRIPTION` file. This is typically not very painful for users, because they'd already be updating your package when they run into this requirement, so updating one more package along the way is generally easy. It also helps them get the latest bug fixes and features from dplyr. Sometimes, it isn't possible to avoid a call to `@importFrom`. For example you might be importing a generic so that you can define a method for it, but that generic has moved between packages. In this case, you can take advantage of a little-known feature in the `NAMESPACE` file: you can include raw `if` statements. ```{r, eval=FALSE} #' @rawNamespace #' if (utils::packageVersion("dplyr") > "0.5.0") { #' importFrom("dbplyr", "build_sql") #' } else { #' importFrom("dplyr", "build_sql") #' } ``` ### Deprecation of `mutate_*()` and `summarise_*()` The following `mutate()` and `summarise()` variants were deprecated in dplyr 0.7.0: - `mutate_each()`, `summarise_each()` and the following variants were superseded in dplyr 1.0.0: - `mutate_all()`, `summarise_all()` - `mutate_if()`, `summarise_if()` - `mutate_at()`, `summarise_at()` These have all been replaced by using `mutate()` or `summarise()` in combination with `across()`, which was introduced in dplyr 1.0.0. If you used `mutate_all()` or `mutate_each()` without supplying a selection, you should update to use `across(everything())`: ```{r, eval=FALSE} starwars |> mutate_each(funs(as.character)) starwars |> mutate_all(funs(as.character)) starwars |> mutate(across(everything(), as.character)) ``` If you provided a selection through `mutate_at()` or `mutate_each()`, then you can switch to `across()` with a selection: ```{r, eval = FALSE} starwars |> mutate_each(funs(as.character), height, mass) starwars |> mutate_at(vars(height, mass), as.character) starwars |> mutate(across(c(height, mass), as.character)) ``` If you used predicates with `mutate_if()`, you can switch to using `across()` in combination with `where()`: ```{r, eval=FALSE} starwars |> mutate_if(is.factor, as.character) starwars |> mutate(across(where(is.factor), as.character)) ``` ## Data frame subclasses If you are a package author that is *extending* dplyr to work with a new data frame subclass, then we encourage you to read the documentation in `?dplyr_extending`. This contains advice on how to implement the minimal number of extension generics possible to get maximal compatibility across dplyr's verbs. dplyr/inst/doc/colwise.R0000644000176200001440000001242615137234450014727 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ## ----eval = FALSE------------------------------------------------------------- # df |> # group_by(g1, g2) |> # summarise(a = mean(a), b = mean(b), c = mean(c), d = mean(d)) ## ----eval = FALSE------------------------------------------------------------- # df |> # group_by(g1, g2) |> # summarise(across(a:d, mean)) ## ----setup-------------------------------------------------------------------- library(dplyr, warn.conflicts = FALSE) ## ----------------------------------------------------------------------------- starwars |> summarise(across(where(is.character), n_distinct)) starwars |> group_by(species) |> filter(n() > 1) |> summarise(across(c(sex, gender, homeworld), n_distinct)) starwars |> group_by(homeworld) |> filter(n() > 1) |> summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) ## ----------------------------------------------------------------------------- df <- data.frame(g = c(1, 1, 2), x = c(-1, 1, 3), y = c(-1, -4, -9)) df |> group_by(g) |> summarise(across(where(is.numeric), sum)) ## ----------------------------------------------------------------------------- min_max <- list( min = ~min(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE) ) starwars |> summarise(across(where(is.numeric), min_max)) starwars |> summarise(across(c(height, mass, birth_year), min_max)) ## ----------------------------------------------------------------------------- starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) starwars |> summarise(across(c(height, mass, birth_year), min_max, .names = "{.fn}.{.col}")) ## ----------------------------------------------------------------------------- starwars |> summarise( across(c(height, mass, birth_year), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(c(height, mass, birth_year), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ## ----------------------------------------------------------------------------- starwars |> summarise( tibble( across(where(is.numeric), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(where(is.numeric), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ) ## ----------------------------------------------------------------------------- starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) |> relocate(starts_with("min")) ## ----------------------------------------------------------------------------- df <- tibble(x = 1:3, y = 3:5, z = 5:7) mult <- list(x = 1, y = 10, z = 100) df |> mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]])) ## ----------------------------------------------------------------------------- df <- data.frame(x = c(1, 2, 3), y = c(1, 4, 9)) df |> summarise(n = n(), across(where(is.numeric), sd)) ## ----------------------------------------------------------------------------- df |> summarise(across(where(is.numeric), sd), n = n()) ## ----------------------------------------------------------------------------- df |> summarise(n = n(), across(where(is.numeric) & !n, sd)) ## ----------------------------------------------------------------------------- df |> summarise( tibble(n = n(), across(where(is.numeric), sd)) ) ## ----------------------------------------------------------------------------- rescale01 <- function(x) { rng <- range(x, na.rm = TRUE) (x - rng[1]) / (rng[2] - rng[1]) } df <- tibble(x = 1:4, y = rnorm(4)) df |> mutate(across(where(is.numeric), rescale01)) ## ----------------------------------------------------------------------------- starwars |> distinct(pick(contains("color"))) ## ----------------------------------------------------------------------------- starwars |> count(pick(contains("color")), sort = TRUE) ## ----------------------------------------------------------------------------- starwars |> filter_out(if_any(everything(), is.na)) ## ----------------------------------------------------------------------------- starwars |> filter_out(if_all(everything(), is.na)) ## ----eval = FALSE------------------------------------------------------------- # df |> # group_by(g1, g2) |> # summarise( # across(where(is.numeric), mean), # across(where(is.factor), nlevels), # n = n(), # ) ## ----results = FALSE---------------------------------------------------------- df |> mutate_if(is.numeric, ~mean(.x, na.rm = TRUE)) # -> df |> mutate(across(where(is.numeric), ~mean(.x, na.rm = TRUE))) df |> mutate_at(vars(c(x, starts_with("y"))), mean) # -> df |> mutate(across(c(x, starts_with("y")), mean)) df |> mutate_all(mean) # -> df |> mutate(across(everything(), mean)) ## ----------------------------------------------------------------------------- df <- tibble(x = c("a", "b"), y = c(1, 1), z = c(-1, 1)) # Find all rows where EVERY numeric variable is greater than zero df |> filter(if_all(where(is.numeric), ~ .x > 0)) # Find all rows where ANY numeric variable is greater than zero df |> filter(if_any(where(is.numeric), ~ .x > 0)) ## ----------------------------------------------------------------------------- df <- tibble(x = 2, y = 4, z = 8) df |> mutate_all(~ .x / y) df |> mutate(across(everything(), ~ .x / y)) dplyr/inst/doc/grouping.Rmd0000644000176200001440000001626115137161765015446 0ustar liggesusers--- title: "Grouped data" description: > To unlock the full potential of dplyr, you need to understand how each verb interacts with grouping. This vignette shows you how to manipulate grouping, how each verb changes its behaviour when working with grouped data, and how you can access data about the "current" group from within a verb. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Grouped data} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE, warning = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ``` dplyr verbs are particularly powerful when you apply them to grouped data frames (`grouped_df` objects). This vignette shows you: * How to group, inspect, and ungroup with `group_by()` and friends. * How individual dplyr verbs changes their behaviour when applied to grouped data frame. * How to access data about the "current" group from within a verb. We'll start by loading dplyr: ```{r, message = FALSE} library(dplyr) ``` ## `group_by()` The most important grouping verb is `group_by()`: it takes a data frame and one or more variables to group by: ```{r} by_species <- starwars |> group_by(species) by_sex_gender <- starwars |> group_by(sex, gender) ``` You can see the grouping when you print the data: ```{r} by_species by_sex_gender ``` Or use `tally()` to count the number of rows in each group. The `sort` argument is useful if you want to see the largest groups up front. ```{r} by_species |> tally() by_sex_gender |> tally(sort = TRUE) ``` As well as grouping by existing variables, you can group by any function of existing variables. This is equivalent to performing a `mutate()` **before** the `group_by()`: ```{r group_by_with_expression} bmi_breaks <- c(0, 18.5, 25, 30, Inf) starwars |> group_by(bmi_cat = cut(mass/(height/100)^2, breaks=bmi_breaks)) |> tally() ``` ## Group metadata You can see underlying group data with `group_keys()`. It has one row for each group and one column for each grouping variable: ```{r group_vars} by_species |> group_keys() by_sex_gender |> group_keys() ``` You can see which group each row belongs to with `group_indices()`: ```{r} by_species |> group_indices() ``` And which rows each group contains with `group_rows()`: ```{r} by_species |> group_rows() |> head() ``` Use `group_vars()` if you just want the names of the grouping variables: ```{r} by_species |> group_vars() by_sex_gender |> group_vars() ``` ### Changing and adding to grouping variables If you apply `group_by()` to an already grouped dataset, will overwrite the existing grouping variables. For example, the following code groups by `homeworld` instead of `species`: ```{r} by_species |> group_by(homeworld) |> tally() ``` To **augment** the grouping, using `.add = TRUE`[^add]. For example, the following code groups by species and homeworld: ```{r} by_species |> group_by(homeworld, .add = TRUE) |> tally() ``` [^add]: Note that the argument changed from `add = TRUE` to `.add = TRUE` in dplyr 1.0.0. ### Removing grouping variables To remove all grouping variables, use `ungroup()`: ```{r} by_species |> ungroup() |> tally() ``` You can also choose to selectively ungroup by listing the variables you want to remove: ```{r} by_sex_gender |> ungroup(sex) |> tally() ``` ## Verbs The following sections describe how grouping affects the main dplyr verbs. ### `summarise()` `summarise()` computes a summary for each group. This means that it starts from `group_keys()`, adding summary variables to the right hand side: ```{r summarise} by_species |> summarise( n = n(), height = mean(height, na.rm = TRUE) ) ``` The `.groups=` argument controls the grouping structure of the output. The historical behaviour of removing the right hand side grouping variable corresponds to `.groups = "drop_last"` without a message or `.groups = NULL` with a message (the default). ```{r} by_sex_gender |> summarise(n = n()) |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop_last") |> group_vars() ``` Since version 1.0.0 the groups may also be kept (`.groups = "keep"`) or dropped (`.groups = "drop"`). ```{r} by_sex_gender |> summarise(n = n(), .groups = "keep") |> group_vars() by_sex_gender |> summarise(n = n(), .groups = "drop") |> group_vars() ``` When the output no longer have grouping variables, it becomes ungrouped (i.e. a regular tibble). ### `select()`, `rename()`, and `relocate()` `rename()` and `relocate()` behave identically with grouped and ungrouped data because they only affect the name or position of existing columns. Grouped `select()` is almost identical to ungrouped select, except that it always includes the grouping variables: ```{r select} by_species |> select(mass) ``` If you don't want the grouping variables, you'll have to first `ungroup()`. (This design is possibly a mistake, but we're stuck with it for now.) ### `arrange()` Grouped `arrange()` is the same as ungrouped `arrange()`, unless you set `.by_group = TRUE`, in which case it will order first by the grouping variables. ```{r} by_species |> arrange(desc(mass)) |> relocate(species, mass) by_species |> arrange(desc(mass), .by_group = TRUE) |> relocate(species, mass) ``` Note that second example is sorted by `species` (from the `group_by()` statement) and then by `mass` (within species). ### `mutate()` In simple cases with vectorised functions, grouped and ungrouped `mutate()` give the same results. They differ when used with summary functions: ```{r by_homeworld} # Subtract off global mean starwars |> select(name, homeworld, mass) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) # Subtract off homeworld mean starwars |> select(name, homeworld, mass) |> group_by(homeworld) |> mutate(standard_mass = mass - mean(mass, na.rm = TRUE)) ``` Or with window functions like `min_rank()`: ```{r} # Overall rank starwars |> select(name, homeworld, height) |> mutate(rank = min_rank(height)) # Rank per homeworld starwars |> select(name, homeworld, height) |> group_by(homeworld) |> mutate(rank = min_rank(height)) ``` ### `filter()` A grouped `filter()` effectively does a `mutate()` to generate a logical variable, and then only keeps the rows where the variable is `TRUE`. This means that grouped filters can be used with summary functions. For example, we can find the tallest character of each species: ```{r filter} by_species |> select(name, species, height) |> filter(height == max(height)) ``` You can also use `filter_out()` to remove entire groups. For example, the following code eliminates all groups that only have a single member: ```{r filter_group} by_species |> filter_out(n() == 1) |> tally() ``` ### `slice()` and friends `slice()` and friends (`slice_head()`, `slice_tail()`, `slice_sample()`, `slice_min()` and `slice_max()`) select rows within a group. For example, we can select the first observation within each species: ```{r slice} by_species |> relocate(species) |> slice(1) ``` Similarly, we can use `slice_min()` to select the smallest `n` values of a variable: ```{r slice_min} by_species |> filter_out(is.na(height)) |> slice_min(height, n = 2) ``` dplyr/inst/doc/dplyr.R0000644000176200001440000001216315137234453014415 0ustar liggesusers## ----echo = FALSE, message = FALSE-------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) ## ----------------------------------------------------------------------------- dim(starwars) starwars ## ----------------------------------------------------------------------------- starwars |> filter(skin_color == "light", eye_color == "brown") ## ----eval = FALSE------------------------------------------------------------- # starwars[starwars$skin_color == "light" & starwars$eye_color == "brown", ] ## ----------------------------------------------------------------------------- starwars |> arrange(height, mass) ## ----------------------------------------------------------------------------- starwars |> arrange(desc(height)) ## ----------------------------------------------------------------------------- starwars |> slice(5:10) ## ----------------------------------------------------------------------------- starwars |> slice_head(n = 3) ## ----------------------------------------------------------------------------- starwars |> slice_sample(n = 5) starwars |> slice_sample(prop = 0.1) ## ----------------------------------------------------------------------------- starwars |> slice_max(height, n = 3) ## ----------------------------------------------------------------------------- # Select columns by name starwars |> select(hair_color, skin_color, eye_color) # Select all columns between hair_color and eye_color (inclusive) starwars |> select(hair_color:eye_color) # Select all columns except those from hair_color to eye_color (inclusive) starwars |> select(!(hair_color:eye_color)) # Select all columns ending with color starwars |> select(ends_with("color")) ## ----------------------------------------------------------------------------- starwars |> select(home_world = homeworld) ## ----------------------------------------------------------------------------- starwars |> rename(home_world = homeworld) ## ----------------------------------------------------------------------------- starwars |> mutate(height_m = height / 100) ## ----------------------------------------------------------------------------- starwars |> mutate(height_m = height / 100) |> select(height_m, height, everything()) ## ----------------------------------------------------------------------------- starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2) ) |> select(BMI, everything()) ## ----------------------------------------------------------------------------- starwars |> mutate( height_m = height / 100, BMI = mass / (height_m^2), .keep = "none" ) ## ----------------------------------------------------------------------------- starwars |> relocate(sex:homeworld, .before = height) ## ----------------------------------------------------------------------------- starwars |> summarise(height = mean(height, na.rm = TRUE)) ## ----eval = FALSE------------------------------------------------------------- # a1 <- group_by(starwars, species, sex) # a2 <- select(a1, height, mass) # a3 <- summarise(a2, # height = mean(height, na.rm = TRUE), # mass = mean(mass, na.rm = TRUE) # ) ## ----------------------------------------------------------------------------- summarise( select( group_by(starwars, species, sex), height, mass ), height = mean(height, na.rm = TRUE), mass = mean(mass, na.rm = TRUE) ) ## ----eval = FALSE------------------------------------------------------------- # starwars |> # group_by(species, sex) |> # select(height, mass) |> # summarise( # height = mean(height, na.rm = TRUE), # mass = mean(mass, na.rm = TRUE) # ) ## ----------------------------------------------------------------------------- # `name` represents the integer 1 select(starwars, name) select(starwars, 1) ## ----------------------------------------------------------------------------- height <- 5 select(starwars, height) ## ----------------------------------------------------------------------------- name <- "color" select(starwars, ends_with(name)) ## ----------------------------------------------------------------------------- name <- 5 select(starwars, name, identity(name)) ## ----------------------------------------------------------------------------- vars <- c("name", "height") select(starwars, all_of(vars), "mass") ## ----------------------------------------------------------------------------- df <- starwars |> select(name, height, mass) ## ----------------------------------------------------------------------------- mutate(df, "height", 2) ## ----------------------------------------------------------------------------- mutate(df, height + 10) ## ----------------------------------------------------------------------------- var <- seq(1, nrow(df)) mutate(df, new = var) ## ----------------------------------------------------------------------------- group_by(starwars, sex) group_by(starwars, sex = as.factor(sex)) group_by(starwars, height_binned = cut(height, 3)) ## ----------------------------------------------------------------------------- group_by(df, "month") dplyr/inst/doc/two-table.html0000644000176200001440000012442615137234470015731 0ustar liggesusers Two-table verbs

Two-table verbs

It’s rare that a data analysis involves only a single table of data. In practice, you’ll normally have many tables that contribute to an analysis, and you need flexible tools to combine them. In dplyr, there are three families of verbs that work with two tables at a time:

  • Mutating joins, which add new variables to one table from matching rows in another.

  • Filtering joins, which filter observations from one table based on whether or not they match an observation in the other table.

  • Set operations, which combine the observations in the data sets as if they were set elements.

(This discussion assumes that you have tidy data, where the rows are observations and the columns are variables. If you’re not familiar with that framework, I’d recommend reading up on it first.)

All two-table verbs work similarly. The first two arguments are x and y, and provide the tables to combine. The output is always a new table with the same type as x.

Mutating joins

Mutating joins allow you to combine variables from multiple tables. For example, consider the flights and airlines data from the nycflights13 package. In one table we have flight information with an abbreviation for carrier, and in another we have a mapping between abbreviations and full names. You can use a join to add the carrier names to the flight data:

library(nycflights13)
# Drop unimportant variables so it's easier to understand the join results.
flights2 <- flights |> select(year:day, hour, origin, dest, tailnum, carrier)

flights2 |>
  left_join(airlines)
#> Joining with `by = join_by(carrier)`
#> # A tibble: 336,776 × 9
#>    year month   day  hour origin dest  tailnum carrier name                  
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                 
#> 1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc. 
#> 2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc. 
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways       
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.  
#> # ℹ 336,771 more rows

Controlling how the tables are matched

As well as x and y, each mutating join takes an argument by that controls which variables are used to match observations in the two tables. There are a few ways to specify it, as I illustrate below with various tables from nycflights13:

  • NULL, the default. dplyr will will use all variables that appear in both tables, a natural join. For example, the flights and weather tables match on their common variables: year, month, day, hour and origin.

    flights2 |> left_join(weather)
    #> Joining with `by = join_by(year, month, day, hour, origin)`
    #> # A tibble: 336,776 × 18
    #>    year month   day  hour origin dest  tailnum carrier  temp  dewp humid
    #>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
    #> 1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
    #> 2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
    #> 3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
    #> 4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
    #> 5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
    #> # ℹ 336,771 more rows
    #> # ℹ 7 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
    #> #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
  • A character vector, by = "x". Like a natural join, but uses only some of the common variables. For example, flights and planes have year columns, but they mean different things so we only want to join by tailnum.

    flights2 |> left_join(planes, by = "tailnum")
    #> # A tibble: 336,776 × 16
    #>   year.x month   day  hour origin dest  tailnum carrier year.y type             
    #>    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>            
    #> 1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing multi…
    #> 2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing multi…
    #> 3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing multi…
    #> 4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing multi…
    #> 5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing multi…
    #> # ℹ 336,771 more rows
    #> # ℹ 6 more variables: manufacturer <chr>, model <chr>, engines <int>,
    #> #   seats <int>, speed <int>, engine <chr>

    Note that the year columns in the output are disambiguated with a suffix.

  • A named character vector: by = c("x" = "a"). This will match variable x in table x to variable a in table y. The variables from use will be used in the output.

    Each flight has an origin and destination airport, so we need to specify which one we want to join to:

    flights2 |> left_join(airports, c("dest" = "faa"))
    #> # A tibble: 336,776 × 15
    #>    year month   day  hour origin dest  tailnum carrier name      lat   lon   alt
    #>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
    #> 1  2013     1     1     5 EWR    IAH   N14228  UA      George…  30.0 -95.3    97
    #> 2  2013     1     1     5 LGA    IAH   N24211  UA      George…  30.0 -95.3    97
    #> 3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami …  25.8 -80.3     8
    #> 4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>     NA    NA      NA
    #> 5  2013     1     1     6 LGA    ATL   N668DN  DL      Hartsf…  33.6 -84.4  1026
    #> # ℹ 336,771 more rows
    #> # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>
    flights2 |> left_join(airports, c("origin" = "faa"))
    #> # A tibble: 336,776 × 15
    #>    year month   day  hour origin dest  tailnum carrier name      lat   lon   alt
    #>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
    #> 1  2013     1     1     5 EWR    IAH   N14228  UA      Newark…  40.7 -74.2    18
    #> 2  2013     1     1     5 LGA    IAH   N24211  UA      La Gua…  40.8 -73.9    22
    #> 3  2013     1     1     5 JFK    MIA   N619AA  AA      John F…  40.6 -73.8    13
    #> 4  2013     1     1     5 JFK    BQN   N804JB  B6      John F…  40.6 -73.8    13
    #> 5  2013     1     1     6 LGA    ATL   N668DN  DL      La Gua…  40.8 -73.9    22
    #> # ℹ 336,771 more rows
    #> # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>

Types of join

There are four types of mutating join, which differ in their behaviour when a match is not found. We’ll illustrate each with a simple example:

df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(3, 1), a = 10, b = "a")
  • inner_join(x, y) only includes observations that match in both x and y.

    df1 |> inner_join(df2) |> knitr::kable()
    #> Joining with `by = join_by(x)`
    x y a b
    1 2 10 a
  • left_join(x, y) includes all observations in x, regardless of whether they match or not. This is the most commonly used join because it ensures that you don’t lose observations from your primary table.

    df1 |> left_join(df2)
    #> Joining with `by = join_by(x)`
    #> # A tibble: 2 × 4
    #>       x     y     a b    
    #>   <dbl> <int> <dbl> <chr>
    #> 1     1     2    10 a    
    #> 2     2     1    NA <NA>
  • right_join(x, y) includes all observations in y. It’s equivalent to left_join(y, x), but the columns and rows will be ordered differently.

    df1 |> right_join(df2)
    #> Joining with `by = join_by(x)`
    #> # A tibble: 2 × 4
    #>       x     y     a b    
    #>   <dbl> <int> <dbl> <chr>
    #> 1     1     2    10 a    
    #> 2     3    NA    10 a
    df2 |> left_join(df1)
    #> Joining with `by = join_by(x)`
    #> # A tibble: 2 × 4
    #>       x     a b         y
    #>   <dbl> <dbl> <chr> <int>
    #> 1     3    10 a        NA
    #> 2     1    10 a         2
  • full_join() includes all observations from x and y.

    df1 |> full_join(df2)
    #> Joining with `by = join_by(x)`
    #> # A tibble: 3 × 4
    #>       x     y     a b    
    #>   <dbl> <int> <dbl> <chr>
    #> 1     1     2    10 a    
    #> 2     2     1    NA <NA> 
    #> 3     3    NA    10 a

The left, right and full joins are collectively know as outer joins. When a row doesn’t match in an outer join, the new variables are filled in with missing values.

Observations

While mutating joins are primarily used to add new variables, they can also generate new observations. If a match is not unique, a join will add all possible combinations (the Cartesian product) of the matching observations:

df1 <- tibble(x = c(1, 1, 2), y = 1:3)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))

df1 |> left_join(df2)
#> Joining with `by = join_by(x)`
#> Warning in left_join(df1, df2): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#> # A tibble: 5 × 3
#>       x     y z    
#>   <dbl> <int> <chr>
#> 1     1     1 a    
#> 2     1     1 b    
#> 3     1     2 a    
#> 4     1     2 b    
#> 5     2     3 a

Filtering joins

Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables. There are two types:

  • semi_join(x, y) keeps all observations in x that have a match in y.
  • anti_join(x, y) drops all observations in x that have a match in y.

These are most useful for diagnosing join mismatches. For example, there are many flights in the nycflights13 dataset that don’t have a matching tail number in the planes table:

library("nycflights13")
flights |>
  anti_join(planes, by = "tailnum") |>
  count(tailnum, sort = TRUE)
#> # A tibble: 722 × 2
#>   tailnum     n
#>   <chr>   <int>
#> 1 <NA>     2512
#> 2 N725MQ    575
#> 3 N722MQ    513
#> 4 N723MQ    507
#> 5 N713MQ    483
#> # ℹ 717 more rows

If you’re worried about what observations your joins will match, start with a semi_join() or anti_join(). semi_join() and anti_join() never duplicate; they only ever remove observations.

df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))

# Four rows to start with:
df1 |> nrow()
#> [1] 4
# And we get four rows after the join
df1 |> inner_join(df2, by = "x") |> nrow()
#> Warning in inner_join(df1, df2, by = "x"): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#> [1] 4
# But only two rows actually match
df1 |> semi_join(df2, by = "x") |> nrow()
#> [1] 2

Set operations

The final type of two-table verb is set operations. These expect the x and y inputs to have the same variables, and treat the observations like sets:

  • intersect(x, y): return only observations in both x and y
  • union(x, y): return unique observations in x and y
  • setdiff(x, y): return observations in x, but not in y.

Given this simple data:

(df1 <- tibble(x = 1:2, y = c(1L, 1L)))
#> # A tibble: 2 × 2
#>       x     y
#>   <int> <int>
#> 1     1     1
#> 2     2     1
(df2 <- tibble(x = 1:2, y = 1:2))
#> # A tibble: 2 × 2
#>       x     y
#>   <int> <int>
#> 1     1     1
#> 2     2     2

The four possibilities are:

intersect(df1, df2)
#> # A tibble: 1 × 2
#>       x     y
#>   <int> <int>
#> 1     1     1
# Note that we get 3 rows, not 4
union(df1, df2)
#> # A tibble: 3 × 2
#>       x     y
#>   <int> <int>
#> 1     1     1
#> 2     2     1
#> 3     2     2
setdiff(df1, df2)
#> # A tibble: 1 × 2
#>       x     y
#>   <int> <int>
#> 1     2     1
setdiff(df2, df1)
#> # A tibble: 1 × 2
#>       x     y
#>   <int> <int>
#> 1     2     2

Multiple-table verbs

dplyr does not provide any functions for working with three or more tables. Instead use purrr::reduce() or Reduce(), as described in Advanced R, to iteratively combine the two-table verbs to handle as many tables as you need.

dplyr/inst/doc/in-packages.html0000644000176200001440000005737315137234456016227 0ustar liggesusers Using dplyr in packages

Using dplyr in packages

library(dplyr)

This vignette is aimed at package authors who use dplyr in their packages. We will discuss best practices learned over the years to avoid R CMD check notes and warnings, and how to handle when dplyr deprecates functions.

Join helpers

As of dplyr 1.1.0, we’ve introduced join_by() along 4 helpers for performing various types of joins:

  • closest()

  • between()

  • within()

  • overlaps()

join_by() implements a domain specific language (DSL) for joins, and internally interprets calls to these functions.

You’ll notice that dplyr::closest() isn’t an exported function from dplyr (dplyr::between() and base::within() do happen to be preexisting functions). If you use closest() in your package, then this will cause an R CMD check note letting you know that you’ve used a symbol that doesn’t belong to any package.

To silence this, place utils::globalVariables("closest") in a source file in your package (but outside of any function). dbplyr does a similar thing for SQL functions, so you can see an example of that here.

You may also have to add utils to your package Imports, even though it is a base package. You can do that easily with usethis::use_package("utils").

Data masking and tidy selection NOTEs

If you’re writing a package and you have a function that uses data masking or tidy selection:

my_summary_function <- function(data) {
  data |>
    select(grp, x, y) |>
    filter(x > 0) |>
    group_by(grp) |>
    summarise(y = mean(y), n = n())
}

You’ll get an NOTE because R CMD check doesn’t know that dplyr functions use tidy evaluation:

N  checking R code for possible problems
   my_summary_function: no visible binding for global variable ‘grp’, ‘x’, ‘y’
   Undefined global functions or variables:
     grp x y

To eliminate this note:

  • For data masking, import .data from rlang and then use .data$var instead of var.
  • For tidy selection, use "var" instead of var.

That yields:

#' @importFrom rlang .data
my_summary_function <- function(data) {
  data |>
    select("grp", "x", "y") |>
    filter(.data$x > 0) |>
    group_by(.data$grp) |>
    summarise(y = mean(.data$y), n = n())
}

For more about programming with dplyr, see vignette("programming", package = "dplyr").

Deprecation

This section is focused on updating package code to deal with backwards incompatible changes in dplyr. We do try and minimize backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future.

We will start with some general advice about supporting multiple versions of dplyr at once, and then we will discuss some specific changes in dplyr.

Multiple dplyr versions

Ideally, when we introduce a breaking change you’ll want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages:

  • It’s more convenient for your users, since your package will work for them regardless of what version of dplyr they have installed.

  • It’s easier on CRAN since it doesn’t require a massive coordinated release of multiple packages.

If we break your package, we will typically send you a pull request that implements a patch before releasing the next version of dplyr. Most of the time, this patch will be backwards compatible with older versions of dplyr as well. Ideally, you’ll accept this patch and submit a new version of your package to CRAN before the new version of dplyr is released.

To make code work with multiple versions of a package, your first tool is the simple if statement:

if (utils::packageVersion("dplyr") > "0.5.0") {
  # code for new version
} else {
  # code for old version
}

Always condition on > current-version, not >= next-version because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version "0.5.0", the development version will be "0.5.0.9000".

This typically works well if the branch for the “new version” introduces a new argument or has a slightly different return value.

This doesn’t work if we’ve introduced a new function that you need to switch to, like:

if (utils::packageVersion("dplyr") > "1.0.10") {
  dplyr::reframe(df, x = unique(x))
} else {
  dplyr::summarise(df, x = unique(x))
}

In this case, when checks are run with dplyr 1.0.10 you’ll get a warning about using a function from dplyr that doesn’t exist (reframe()) even though that branch will never run. You can get around this by using utils::getFromNamespace() to indirectly call the new dplyr function:

if (utils::packageVersion("dplyr") > "1.0.10") {
  utils::getFromNamespace("reframe", "dplyr")(df, x = unique(x))
} else {
  dplyr::summarise(df, x = unique(x))
}

As soon as the next version of dplyr is actually on CRAN (1.1.0 in this case), you should feel free to remove this code and unconditionally use reframe() as long as you also require dplyr (>= 1.1.0) in your DESCRIPTION file. This is typically not very painful for users, because they’d already be updating your package when they run into this requirement, so updating one more package along the way is generally easy. It also helps them get the latest bug fixes and features from dplyr.

Sometimes, it isn’t possible to avoid a call to @importFrom. For example you might be importing a generic so that you can define a method for it, but that generic has moved between packages. In this case, you can take advantage of a little-known feature in the NAMESPACE file: you can include raw if statements.

#' @rawNamespace
#' if (utils::packageVersion("dplyr") > "0.5.0") {
#'   importFrom("dbplyr", "build_sql")
#' } else {
#'   importFrom("dplyr", "build_sql")
#' }

Deprecation of mutate_*() and summarise_*()

The following mutate() and summarise() variants were deprecated in dplyr 0.7.0:

  • mutate_each(), summarise_each()

and the following variants were superseded in dplyr 1.0.0:

  • mutate_all(), summarise_all()

  • mutate_if(), summarise_if()

  • mutate_at(), summarise_at()

These have all been replaced by using mutate() or summarise() in combination with across(), which was introduced in dplyr 1.0.0.

If you used mutate_all() or mutate_each() without supplying a selection, you should update to use across(everything()):

starwars |> mutate_each(funs(as.character))
starwars |> mutate_all(funs(as.character))
starwars |> mutate(across(everything(), as.character))

If you provided a selection through mutate_at() or mutate_each(), then you can switch to across() with a selection:

starwars |> mutate_each(funs(as.character), height, mass)
starwars |> mutate_at(vars(height, mass), as.character)
starwars |> mutate(across(c(height, mass), as.character))

If you used predicates with mutate_if(), you can switch to using across() in combination with where():

starwars |> mutate_if(is.factor, as.character)
starwars |> mutate(across(where(is.factor), as.character))

Data frame subclasses

If you are a package author that is extending dplyr to work with a new data frame subclass, then we encourage you to read the documentation in ?dplyr_extending. This contains advice on how to implement the minimal number of extension generics possible to get maximal compatibility across dplyr’s verbs.

dplyr/inst/doc/programming.html0000644000176200001440000014400115137234460016343 0ustar liggesusers Programming with dplyr

Programming with dplyr

Introduction

Most dplyr verbs use tidy evaluation in some way. Tidy evaluation is a special type of non-standard evaluation used throughout the tidyverse. There are two basic forms found in dplyr:

  • arrange(), count(), filter(), group_by(), mutate(), and summarise() use data masking so that you can use data variables as if they were variables in the environment (i.e. you write my_variable not df$my_variable).

  • across(), relocate(), rename(), select(), and pull() use tidy selection so you can easily choose variables based on their position, name, or type (e.g. starts_with("x") or is.numeric).

To determine whether a function argument uses data masking or tidy selection, look at the documentation: in the arguments list, you’ll see <data-masking> or <tidy-select>.

Data masking and tidy selection make interactive data exploration fast and fluid, but they add some new challenges when you attempt to use them indirectly such as in a for loop or a function. This vignette shows you how to overcome those challenges. We’ll first go over the basics of data masking and tidy selection, talk about how to use them indirectly, and then show you a number of recipes to solve common problems.

This vignette will give you the minimum knowledge you need to be an effective programmer with tidy evaluation. If you’d like to learn more about the underlying theory, or precisely how it’s different from non-standard evaluation, we recommend that you read the Metaprogramming chapters in Advanced R.

library(dplyr)

Data masking

Data masking makes data manipulation faster because it requires less typing. In most (but not all1) base R functions you need to refer to variables with $, leading to code that repeats the name of the data frame many times:

starwars[starwars$homeworld == "Naboo" & starwars$species == "Human", ,]

The dplyr equivalent of this code is more concise because data masking allows you to need to type starwars once:

starwars |> filter(homeworld == "Naboo", species == "Human")

Data- and env-variables

The key idea behind data masking is that it blurs the line between the two different meanings of the word “variable”:

  • env-variables are “programming” variables that live in an environment. They are usually created with <-.

  • data-variables are “statistical” variables that live in a data frame. They usually come from data files (e.g. .csv, .xls), or are created manipulating existing variables.

To make those definitions a little more concrete, take this piece of code:

df <- data.frame(x = runif(3), y = runif(3))
df$x
#> [1] 0.08075014 0.83433304 0.60076089

It creates a env-variable, df, that contains two data-variables, x and y. Then it extracts the data-variable x out of the env-variable df using $.

I think this blurring of the meaning of “variable” is a really nice feature for interactive data analysis because it allows you to refer to data-vars as is, without any prefix. And this seems to be fairly intuitive since many newer R users will attempt to write diamonds[x == 0 | y == 0, ].

Unfortunately, this benefit does not come for free. When you start to program with these tools, you’re going to have to grapple with the distinction. This will be hard because you’ve never had to think about it before, so it’ll take a while for your brain to learn these new concepts and categories. However, once you’ve teased apart the idea of “variable” into data-variable and env-variable, I think you’ll find it fairly straightforward to use.

Indirection

The main challenge of programming with functions that use data masking arises when you introduce some indirection, i.e. when you want to get the data-variable from an env-variable instead of directly typing the data-variable’s name. There are two main cases:

  • When you have the data-variable in a function argument (i.e. an env-variable that holds a promise2), you need to embrace the argument by surrounding it in doubled braces, like filter(df, {{ var }}).

    The following function uses embracing to create a wrapper around summarise() that computes the minimum and maximum values of a variable, as well as the number of observations that were summarised:

    var_summary <- function(data, var) {
      data |>
        summarise(n = n(), min = min({{ var }}), max = max({{ var }}))
    }
    mtcars |>
      group_by(cyl) |>
      var_summary(mpg)
  • When you have an env-variable that is a character vector, you need to index into the .data pronoun with [[, like summarise(df, mean = mean(.data[[var]])).

    The following example uses .data to count the number of unique values in each variable of mtcars:

    for (var in names(mtcars)) {
      mtcars |> count(.data[[var]]) |> print()
    }

    Note that .data is not a data frame; it’s a special construct, a pronoun, that allows you to access the current variables either directly, with .data$x or indirectly with .data[[var]]. Don’t expect other functions to work with it.

Name injection

Many data masking functions also use dynamic dots, which gives you another useful feature: generating names programmatically by using := instead of =. There are two basics forms, as illustrated below with tibble():

  • If you have the name in an env-variable, you can use glue syntax to interpolate in:

    name <- "susan"
    tibble("{name}" := 2)
    #> # A tibble: 1 × 1
    #>   susan
    #>   <dbl>
    #> 1     2
  • If the name should be derived from a data-variable in an argument, you can use embracing syntax:

    my_df <- function(x) {
      tibble("{{x}}_2" := x * 2)
    }
    my_var <- 10
    my_df(my_var)
    #> # A tibble: 1 × 1
    #>   my_var_2
    #>      <dbl>
    #> 1       20

Learn more in ?rlang::`dyn-dots`.

Tidy selection

Data masking makes it easy to compute on values within a dataset. Tidy selection is a complementary tool that makes it easy to work with the columns of a dataset.

The tidyselect DSL

Underneath all functions that use tidy selection is the tidyselect package. It provides a miniature domain specific language that makes it easy to select columns by name, position, or type. For example:

  • select(df, 1) selects the first column; select(df, last_col()) selects the last column.

  • select(df, c(a, b, c)) selects columns a, b, and c.

  • select(df, starts_with("a")) selects all columns whose name starts with “a”; select(df, ends_with("z")) selects all columns whose name ends with “z”.

  • select(df, where(is.numeric)) selects all numeric columns.

You can see more details in ?dplyr_tidy_select.

Indirection

As with data masking, tidy selection makes a common task easier at the cost of making a less common task harder. When you want to use tidy select indirectly with the column specification stored in an intermediate variable, you’ll need to learn some new tools. Again, there are two forms of indirection:

  • When you have the data-variable in an env-variable that is a function argument, you use the same technique as data masking: you embrace the argument by surrounding it in doubled braces.

    The following function summarises a data frame by computing the mean of all variables selected by the user:

    summarise_mean <- function(data, vars) {
      data |> summarise(n = n(), across({{ vars }}, mean))
    }
    mtcars |>
      group_by(cyl) |>
      summarise_mean(where(is.numeric))
  • When you have an env-variable that is a character vector, you need to use all_of() or any_of() depending on whether you want the function to error if a variable is not found.

    The following code uses all_of() to select all of the variables found in a character vector; then ! plus all_of() to select all of the variables not found in a character vector:

    vars <- c("mpg", "vs")
    mtcars |> select(all_of(vars))
    mtcars |> select(!all_of(vars))

How-tos

The following examples solve a grab bag of common problems. We show you the minimum amount of code so that you can get the basic idea; most real problems will require more code or combining multiple techniques.

User-supplied data

If you check the documentation, you’ll see that .data never uses data masking or tidy select. That means you don’t need to do anything special in your function:

mutate_y <- function(data) {
  mutate(data, y = a + x)
}

One or more user-supplied expressions

If you want the user to supply an expression that’s passed onto an argument which uses data masking or tidy select, embrace the argument:

my_summarise <- function(data, group_var) {
  data |>
    group_by({{ group_var }}) |>
    summarise(mean = mean(mass))
}

This generalises in a straightforward way if you want to use one user-supplied expression in multiple places:

my_summarise2 <- function(data, expr) {
  data |> summarise(
    mean = mean({{ expr }}),
    sum = sum({{ expr }}),
    n = n()
  )
}

If you want the user to provide multiple expressions, embrace each of them:

my_summarise3 <- function(data, mean_var, sd_var) {
  data |>
    summarise(mean = mean({{ mean_var }}), sd = sd({{ sd_var }}))
}

If you want to use the name of a variable in the output, you can embrace the variable name on the left-hand side of := with {{:

my_summarise4 <- function(data, expr) {
  data |> summarise(
    "mean_{{expr}}" := mean({{ expr }}),
    "sum_{{expr}}" := sum({{ expr }}),
    "n_{{expr}}" := n()
  )
}
my_summarise5 <- function(data, mean_var, sd_var) {
  data |>
    summarise(
      "mean_{{mean_var}}" := mean({{ mean_var }}),
      "sd_{{sd_var}}" := sd({{ sd_var }})
    )
}

Any number of user-supplied expressions

If you want to take an arbitrary number of user supplied expressions, use .... This is most often useful when you want to give the user full control over a single part of the pipeline, like a group_by() or a mutate().

my_summarise <- function(.data, ...) {
  .data |>
    group_by(...) |>
    summarise(mass = mean(mass, na.rm = TRUE), height = mean(height, na.rm = TRUE))
}

starwars |> my_summarise(homeworld)
#> # A tibble: 49 × 3
#>   homeworld    mass height
#>   <chr>       <dbl>  <dbl>
#> 1 Alderaan       64   176.
#> 2 Aleen Minor    15    79 
#> 3 Bespin         79   175 
#> 4 Bestine IV    110   180 
#> # ℹ 45 more rows
starwars |> my_summarise(sex, gender)
#> `summarise()` has regrouped the output.
#> ℹ Summaries were computed grouped by sex and gender.
#> ℹ Output is grouped by sex.
#> ℹ Use `summarise(.groups = "drop_last")` to silence this message.
#> ℹ Use `summarise(.by = c(sex, gender))` for per-operation grouping
#>   (`?dplyr::dplyr_by`) instead.
#> # A tibble: 6 × 4
#> # Groups:   sex [5]
#>   sex            gender      mass height
#>   <chr>          <chr>      <dbl>  <dbl>
#> 1 female         feminine    54.7   172.
#> 2 hermaphroditic masculine 1358     175 
#> 3 male           masculine   80.2   179.
#> 4 none           feminine   NaN      96 
#> # ℹ 2 more rows

When you use ... in this way, make sure that any other arguments start with . to reduce the chances of argument clashes; see https://design.tidyverse.org/dots-prefix.html for more details.

Creating multiple columns

Sometimes it can be useful for a single expression to return multiple columns. You can do this by returning an unnamed data frame:

quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
  tibble(
    val = quantile(x, probs),
    quant = probs
  )
}

x <- 1:5
quantile_df(x)
#> # A tibble: 3 × 2
#>     val quant
#>   <dbl> <dbl>
#> 1     2  0.25
#> 2     3  0.5 
#> 3     4  0.75

This sort of function is useful inside summarise() and mutate() which allow you to add multiple columns by returning a data frame:

df <- tibble(
  grp = rep(1:3, each = 10),
  x = runif(30),
  y = rnorm(30)
)

df |>
  group_by(grp) |>
  summarise(quantile_df(x, probs = .5))
#> # A tibble: 3 × 3
#>     grp   val quant
#>   <int> <dbl> <dbl>
#> 1     1 0.361   0.5
#> 2     2 0.541   0.5
#> 3     3 0.456   0.5

df |>
  group_by(grp) |>
  summarise(across(x:y, ~ quantile_df(.x, probs = .5), .unpack = TRUE))
#> # A tibble: 3 × 5
#>     grp x_val x_quant   y_val y_quant
#>   <int> <dbl>   <dbl>   <dbl>   <dbl>
#> 1     1 0.361     0.5  0.174      0.5
#> 2     2 0.541     0.5 -0.0110     0.5
#> 3     3 0.456     0.5  0.0583     0.5

Notice that we set .unpack = TRUE inside across(). This tells across() to unpack the data frame returned by quantile_df() into its respective columns, combining the column names of the original columns (x and y) with the column names returned from the function (val and quant).

If your function returns multiple rows per group, then you’ll need to switch from summarise() to reframe(). summarise() is restricted to returning 1 row summaries per group, but reframe() lifts this restriction:

df |>
  group_by(grp) |>
  reframe(across(x:y, quantile_df, .unpack = TRUE))
#> # A tibble: 9 × 5
#>     grp x_val x_quant  y_val y_quant
#>   <int> <dbl>   <dbl>  <dbl>   <dbl>
#> 1     1 0.219    0.25 -0.710    0.25
#> 2     1 0.361    0.5   0.174    0.5 
#> 3     1 0.674    0.75  0.524    0.75
#> 4     2 0.315    0.25 -0.690    0.25
#> # ℹ 5 more rows

Transforming user-supplied variables

If you want the user to provide a set of data-variables that are then transformed, use across() and pick():

my_summarise <- function(data, summary_vars) {
  data |>
    summarise(across({{ summary_vars }}, ~ mean(., na.rm = TRUE)))
}
starwars |>
  group_by(species) |>
  my_summarise(c(mass, height))
#> # A tibble: 38 × 3
#>   species   mass height
#>   <chr>    <dbl>  <dbl>
#> 1 Aleena      15     79
#> 2 Besalisk   102    198
#> 3 Cerean      82    198
#> 4 Chagrian   NaN    196
#> # ℹ 34 more rows

You can use this same idea for multiple sets of input data-variables:

my_summarise <- function(data, group_var, summarise_var) {
  data |>
    group_by(pick({{ group_var }})) |>
    summarise(across({{ summarise_var }}, mean))
}

Use the .names argument to across() to control the names of the output.

my_summarise <- function(data, group_var, summarise_var) {
  data |>
    group_by(pick({{ group_var }})) |>
    summarise(across({{ summarise_var }}, mean, .names = "mean_{.col}"))
}

Loop over multiple variables

If you have a character vector of variable names, and want to operate on them with a for loop, index into the special .data pronoun:

for (var in names(mtcars)) {
  mtcars |> count(.data[[var]]) |> print()
}

This same technique works with for loop alternatives like the base R apply() family and the purrr map() family:

mtcars |>
  names() |>
  purrr::map(~ count(mtcars, .data[[.x]]))

(Note that the x in .data[[x]] is always treated as an env-variable; it will never come from the data.)

Use a variable from an Shiny input

Many Shiny input controls return character vectors, so you can use the same approach as above: .data[[input$var]].

library(shiny)
ui <- fluidPage(
  selectInput("var", "Variable", choices = names(diamonds)),
  tableOutput("output")
)
server <- function(input, output, session) {
  data <- reactive(filter(diamonds, .data[[input$var]] > 0))
  output$output <- renderTable(head(data()))
}

See https://mastering-shiny.org/action-tidy.html for more details and case studies.


  1. dplyr’s filter() is inspired by base R’s subset(). subset() provides data masking, but not with tidy evaluation, so the techniques described in this chapter don’t apply to it.↩︎

  2. In R, arguments are lazily evaluated which means that until you attempt to use, they don’t hold a value, just a promise that describes how to compute the value. You can learn more at https://adv-r.hadley.nz/functions.html#lazy-evaluation↩︎

dplyr/inst/doc/colwise.html0000644000176200001440000015744115137234451015502 0ustar liggesusers Column-wise operations

Column-wise operations

It’s often useful to perform the same operation on multiple columns, but copying and pasting is both tedious and error prone:

df |>
  group_by(g1, g2) |>
  summarise(a = mean(a), b = mean(b), c = mean(c), d = mean(d))

(If you’re trying to compute mean(a, b, c, d) for each row, instead see vignette("rowwise"))

This vignette will introduce you to the across() function, which lets you rewrite the previous code more succinctly:

df |>
  group_by(g1, g2) |>
  summarise(across(a:d, mean))

We’ll start by discussing the basic usage of across(), particularly as it applies to summarise(), and show how to use it with multiple functions. We’ll then show a few uses with other verbs. We’ll finish off with a bit of history, showing why we prefer across() to our last approach (the _if(), _at() and _all() functions) and how to translate your old code to the new syntax.

library(dplyr, warn.conflicts = FALSE)

Basic usage

across() has two primary arguments:

  • The first argument, .cols, selects the columns you want to operate on. It uses tidy selection (like select()) so you can pick variables by position, name, and type.

  • The second argument, .fns, is a function or list of functions to apply to each column. This can also be a purrr style formula (or list of formulas) like ~ .x / 2. (This argument is optional, and you can omit it if you just want to get the underlying data; you’ll see that technique used in vignette("rowwise").)

Here are a couple of examples of across() in conjunction with its favourite verb, summarise(). But you can use across() with any dplyr verb, as you’ll see a little later.

starwars |>
  summarise(across(where(is.character), n_distinct))
#> # A tibble: 1 × 8
#>    name hair_color skin_color eye_color   sex gender homeworld species
#>   <int>      <int>      <int>     <int> <int>  <int>     <int>   <int>
#> 1    87         12         31        15     5      3        49      38

starwars |>
  group_by(species) |>
  filter(n() > 1) |>
  summarise(across(c(sex, gender, homeworld), n_distinct))
#> # A tibble: 9 × 4
#>   species    sex gender homeworld
#>   <chr>    <int>  <int>     <int>
#> 1 Droid        1      2         3
#> 2 Gungan       1      1         1
#> 3 Human        2      2        15
#> 4 Kaminoan     2      2         1
#> # ℹ 5 more rows

starwars |>
  group_by(homeworld) |>
  filter(n() > 1) |>
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
#> # A tibble: 10 × 4
#>   homeworld height  mass birth_year
#>   <chr>      <dbl> <dbl>      <dbl>
#> 1 Alderaan    176.  64         43  
#> 2 Corellia    175   78.5       25  
#> 3 Coruscant   174.  50         91  
#> 4 Kamino      208.  83.1       31.5
#> # ℹ 6 more rows

Because across() is usually used in combination with summarise() and mutate(), it doesn’t select grouping variables in order to avoid accidentally modifying them:

df <- data.frame(g = c(1, 1, 2), x = c(-1, 1, 3), y = c(-1, -4, -9))
df |>
  group_by(g) |>
  summarise(across(where(is.numeric), sum))
#> # A tibble: 2 × 3
#>       g     x     y
#>   <dbl> <dbl> <dbl>
#> 1     1     0    -5
#> 2     2     3    -9

Multiple functions

You can transform each variable with more than one function by supplying a named list of functions or lambda functions in the second argument:

min_max <- list(
  min = ~min(.x, na.rm = TRUE),
  max = ~max(.x, na.rm = TRUE)
)
starwars |> summarise(across(where(is.numeric), min_max))
#> # A tibble: 1 × 6
#>   height_min height_max mass_min mass_max birth_year_min birth_year_max
#>        <int>      <int>    <dbl>    <dbl>          <dbl>          <dbl>
#> 1         66        264       15     1358              8            896
starwars |> summarise(across(c(height, mass, birth_year), min_max))
#> # A tibble: 1 × 6
#>   height_min height_max mass_min mass_max birth_year_min birth_year_max
#>        <int>      <int>    <dbl>    <dbl>          <dbl>          <dbl>
#> 1         66        264       15     1358              8            896

Control how the names are created with the .names argument which takes a glue spec:

starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}"))
#> # A tibble: 1 × 6
#>   min.height max.height min.mass max.mass min.birth_year max.birth_year
#>        <int>      <int>    <dbl>    <dbl>          <dbl>          <dbl>
#> 1         66        264       15     1358              8            896
starwars |> summarise(across(c(height, mass, birth_year), min_max, .names = "{.fn}.{.col}"))
#> # A tibble: 1 × 6
#>   min.height max.height min.mass max.mass min.birth_year max.birth_year
#>        <int>      <int>    <dbl>    <dbl>          <dbl>          <dbl>
#> 1         66        264       15     1358              8            896

If you’d prefer all summaries with the same function to be grouped together, you’ll have to expand the calls yourself:

starwars |> summarise(
  across(c(height, mass, birth_year), ~min(.x, na.rm = TRUE), .names = "min_{.col}"),
  across(c(height, mass, birth_year), ~max(.x, na.rm = TRUE), .names = "max_{.col}")
)
#> # A tibble: 1 × 6
#>   min_height min_mass min_birth_year max_height max_mass max_birth_year
#>        <int>    <dbl>          <dbl>      <int>    <dbl>          <dbl>
#> 1         66       15              8        264     1358            896

(One day this might become an argument to across() but we’re not yet sure how it would work.)

We cannot however use where(is.numeric) in that last case because the second across() would pick up the variables that were newly created (“min_height”, “min_mass” and “min_birth_year”).

We can work around this by combining both calls to across() into a single expression that returns a tibble:

starwars |> summarise(
  tibble(
    across(where(is.numeric), ~min(.x, na.rm = TRUE), .names = "min_{.col}"),
    across(where(is.numeric), ~max(.x, na.rm = TRUE), .names = "max_{.col}")
  )
)
#> # A tibble: 1 × 6
#>   min_height min_mass min_birth_year max_height max_mass max_birth_year
#>        <int>    <dbl>          <dbl>      <int>    <dbl>          <dbl>
#> 1         66       15              8        264     1358            896

Alternatively we could reorganize results with relocate():

starwars |>
  summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) |>
  relocate(starts_with("min"))
#> # A tibble: 1 × 6
#>   min.height min.mass min.birth_year max.height max.mass max.birth_year
#>        <int>    <dbl>          <dbl>      <int>    <dbl>          <dbl>
#> 1         66       15              8        264     1358            896

Current column

If you need to, you can access the name of the “current” column inside by calling cur_column(). This can be useful if you want to perform some sort of context dependent transformation that’s already encoded in a vector:

df <- tibble(x = 1:3, y = 3:5, z = 5:7)
mult <- list(x = 1, y = 10, z = 100)

df |> mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]]))
#> # A tibble: 3 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1    30   500
#> 2     2    40   600
#> 3     3    50   700

Gotchas

Be careful when combining numeric summaries with where(is.numeric):

df <- data.frame(x = c(1, 2, 3), y = c(1, 4, 9))

df |>
  summarise(n = n(), across(where(is.numeric), sd))
#>    n x        y
#> 1 NA 1 4.041452

Here n becomes NA because n is numeric, so the across() computes its standard deviation, and the standard deviation of 3 (a constant) is NA. You probably want to compute n() last to avoid this problem:

df |>
  summarise(across(where(is.numeric), sd), n = n())
#>   x        y n
#> 1 1 4.041452 3

Alternatively, you could explicitly exclude n from the columns to operate on:

df |>
  summarise(n = n(), across(where(is.numeric) & !n, sd))
#>   n x        y
#> 1 3 1 4.041452

Another approach is to combine both the call to n() and across() in a single expression that returns a tibble:

df |>
  summarise(
    tibble(n = n(), across(where(is.numeric), sd))
  )
#>   n x        y
#> 1 3 1 4.041452

Other verbs

So far we’ve focused on the use of across() with summarise(), but it works with any other dplyr verb that uses data masking:

  • Rescale all numeric variables to range 0-1:

    rescale01 <- function(x) {
      rng <- range(x, na.rm = TRUE)
      (x - rng[1]) / (rng[2] - rng[1])
    }
    df <- tibble(x = 1:4, y = rnorm(4))
    df |> mutate(across(where(is.numeric), rescale01))
    #> # A tibble: 4 × 2
    #>       x     y
    #>   <dbl> <dbl>
    #> 1 0     0.385
    #> 2 0.333 1    
    #> 3 0.667 0    
    #> 4 1     0.903

For some verbs, like group_by(), count() and distinct(), you don’t need to supply a summary function, but it can be useful to use tidy-selection to dynamically select a set of columns. In those cases, we recommend using the complement to across(), pick(), which works like across() but doesn’t apply any functions and instead returns a data frame containing the selected columns.

  • Find all distinct

    starwars |> distinct(pick(contains("color")))
    #> # A tibble: 67 × 3
    #>   hair_color skin_color  eye_color
    #>   <chr>      <chr>       <chr>    
    #> 1 blond      fair        blue     
    #> 2 <NA>       gold        yellow   
    #> 3 <NA>       white, blue red      
    #> 4 none       white       yellow   
    #> # ℹ 63 more rows
  • Count all combinations of variables with a given pattern:

    starwars |> count(pick(contains("color")), sort = TRUE)
    #> # A tibble: 67 × 4
    #>   hair_color skin_color eye_color     n
    #>   <chr>      <chr>      <chr>     <int>
    #> 1 brown      light      brown         6
    #> 2 brown      fair       blue          4
    #> 3 none       grey       black         4
    #> 4 black      dark       brown         3
    #> # ℹ 63 more rows

across() doesn’t work with select() or rename() because they already use tidy select syntax; if you want to transform column names with a function, you can use rename_with().

filter() and filter_out()

We cannot directly use across() in filter() or filter_out() because we need an extra step to combine the results into a single logical vector. To that end, filter() and filter_out() have two special purpose companion functions:

  • if_any() keeps the rows where the predicate is true for at least one selected column:
starwars |>
  filter_out(if_any(everything(), is.na))
#> # A tibble: 29 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
#> 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
#> # ℹ 25 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
  • if_all() keeps the rows where the predicate is true for all selected columns:
starwars |>
  filter_out(if_all(everything(), is.na))
#> # A tibble: 87 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

_if, _at, _all

Prior versions of dplyr allowed you to apply a function to multiple columns in a different way: using functions with _if, _at, and _all() suffixes. These functions solved a pressing need and are used by many people, but are now superseded. That means that they’ll stay around, but won’t receive any new features and will only get critical bug fixes.

Why do we like across()?

Why did we decide to move away from these functions in favour of across()?

  1. across() makes it possible to express useful summaries that were previously impossible:

    df |>
      group_by(g1, g2) |>
      summarise(
        across(where(is.numeric), mean),
        across(where(is.factor), nlevels),
        n = n(),
      )
  2. across() reduces the number of functions that dplyr needs to provide. This makes dplyr easier for you to use (because there are fewer functions to remember) and easier for us to implement new verbs (since we only need to implement one function, not four).

  3. across() unifies _if and _at semantics so that you can select by position, name, and type, and you can now create compound selections that were previously impossible. For example, you can now transform all numeric columns whose name begins with “x”: across(where(is.numeric) & starts_with("x")).

  4. across() doesn’t need to use vars(). The _at() functions are the only place in dplyr where you have to manually quote variable names, which makes them a little weird and hence harder to remember.

Why did it take so long to discover across()?

It’s disappointing that we didn’t discover across() earlier, and instead worked through several false starts (first not realising that it was a common problem, then with the _each() functions, and most recently with the _if()/_at()/_all() functions). But across() couldn’t work without three recent discoveries:

  • You can have a column of a data frame that is itself a data frame. This is something provided by base R, but it’s not very well documented, and it took a while to see that it was useful, not just a theoretical curiosity.

  • We can use data frames to allow summary functions to return multiple columns.

  • We can use the absence of an outer name as a convention that you want to unpack a data frame column into individual columns.

How do you convert existing code?

Fortunately, it’s generally straightforward to translate your existing code to use across():

  • Strip the _if(), _at() and _all() suffix off the function.

  • Call across(). The first argument will be:

    1. For _if(), the old second argument wrapped in where().
    2. For _at(), the old second argument, with the call to vars() removed.
    3. For _all(), everything().

    The subsequent arguments can be copied as is.

For example:

df |> mutate_if(is.numeric, ~mean(.x, na.rm = TRUE))
# ->
df |> mutate(across(where(is.numeric), ~mean(.x, na.rm = TRUE)))

df |> mutate_at(vars(c(x, starts_with("y"))), mean)
# ->
df |> mutate(across(c(x, starts_with("y")), mean))

df |> mutate_all(mean)
# ->
df |> mutate(across(everything(), mean))

There are a few exceptions to this rule:

  • rename_*() and select_*() follow a different pattern. They already have select semantics, so are generally used in a different way that doesn’t have a direct equivalent with across(); use the new rename_with() instead.

  • Previously, filter_*() were paired with the all_vars() and any_vars() helpers. The new helpers if_any() and if_all() can be used inside filter() to keep rows for which the predicate is true for at least one, or all selected columns:

    df <- tibble(x = c("a", "b"), y = c(1, 1), z = c(-1, 1))
    
    # Find all rows where EVERY numeric variable is greater than zero
    df |> filter(if_all(where(is.numeric), ~ .x > 0))
    #> # A tibble: 1 × 3
    #>   x         y     z
    #>   <chr> <dbl> <dbl>
    #> 1 b         1     1
    
    # Find all rows where ANY numeric variable is greater than zero
    df |> filter(if_any(where(is.numeric), ~ .x > 0))
    #> # A tibble: 2 × 3
    #>   x         y     z
    #>   <chr> <dbl> <dbl>
    #> 1 a         1    -1
    #> 2 b         1     1
  • When used in a mutate(), all transformations performed by an across() are applied at once. This is different to the behaviour of mutate_if(), mutate_at(), and mutate_all(), which apply the transformations one at a time. We expect that you’ll generally find the new behaviour less surprising:

    df <- tibble(x = 2, y = 4, z = 8)
    df |> mutate_all(~ .x / y)
    #> # A tibble: 1 × 3
    #>       x     y     z
    #>   <dbl> <dbl> <dbl>
    #> 1   0.5     1     8
    
    df |> mutate(across(everything(), ~ .x / y))
    #> # A tibble: 1 × 3
    #>       x     y     z
    #>   <dbl> <dbl> <dbl>
    #> 1   0.5     1     2
dplyr/inst/doc/programming.Rmd0000644000176200001440000003527615106134104016124 0ustar liggesusers--- title: "Programming with dplyr" description: > Most dplyr verbs use "tidy evaluation", a special type of non-standard evaluation. In this vignette, you'll learn the two basic forms, data masking and tidy selection, and how you can program with them using either functions or for loops. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Programming with dplyr} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ``` ## Introduction Most dplyr verbs use **tidy evaluation** in some way. Tidy evaluation is a special type of non-standard evaluation used throughout the tidyverse. There are two basic forms found in dplyr: - `arrange()`, `count()`, `filter()`, `group_by()`, `mutate()`, and `summarise()` use **data masking** so that you can use data variables as if they were variables in the environment (i.e. you write `my_variable` not `df$my_variable`). - `across()`, `relocate()`, `rename()`, `select()`, and `pull()` use **tidy selection** so you can easily choose variables based on their position, name, or type (e.g. `starts_with("x")` or `is.numeric`). To determine whether a function argument uses data masking or tidy selection, look at the documentation: in the arguments list, you'll see `` or ``. Data masking and tidy selection make interactive data exploration fast and fluid, but they add some new challenges when you attempt to use them indirectly such as in a for loop or a function. This vignette shows you how to overcome those challenges. We'll first go over the basics of data masking and tidy selection, talk about how to use them indirectly, and then show you a number of recipes to solve common problems. This vignette will give you the minimum knowledge you need to be an effective programmer with tidy evaluation. If you'd like to learn more about the underlying theory, or precisely how it's different from non-standard evaluation, we recommend that you read the Metaprogramming chapters in [*Advanced R*](https://adv-r.hadley.nz). ```{r setup, message = FALSE} library(dplyr) ``` ## Data masking Data masking makes data manipulation faster because it requires less typing. In most (but not all[^1]) base R functions you need to refer to variables with `$`, leading to code that repeats the name of the data frame many times: [^1]: dplyr's `filter()` is inspired by base R's `subset()`. `subset()` provides data masking, but not with tidy evaluation, so the techniques described in this chapter don't apply to it. ```{r, results = FALSE} starwars[starwars$homeworld == "Naboo" & starwars$species == "Human", ,] ``` The dplyr equivalent of this code is more concise because data masking allows you to need to type `starwars` once: ```{r, results = FALSE} starwars |> filter(homeworld == "Naboo", species == "Human") ``` ### Data- and env-variables The key idea behind data masking is that it blurs the line between the two different meanings of the word "variable": - **env-variables** are "programming" variables that live in an environment. They are usually created with `<-`. - **data-variables** are "statistical" variables that live in a data frame. They usually come from data files (e.g. `.csv`, `.xls`), or are created manipulating existing variables. To make those definitions a little more concrete, take this piece of code: ```{r} df <- data.frame(x = runif(3), y = runif(3)) df$x ``` It creates a env-variable, `df`, that contains two data-variables, `x` and `y`. Then it extracts the data-variable `x` out of the env-variable `df` using `$`. I think this blurring of the meaning of "variable" is a really nice feature for interactive data analysis because it allows you to refer to data-vars as is, without any prefix. And this seems to be fairly intuitive since many newer R users will attempt to write `diamonds[x == 0 | y == 0, ]`. Unfortunately, this benefit does not come for free. When you start to program with these tools, you're going to have to grapple with the distinction. This will be hard because you've never had to think about it before, so it'll take a while for your brain to learn these new concepts and categories. However, once you've teased apart the idea of "variable" into data-variable and env-variable, I think you'll find it fairly straightforward to use. ### Indirection The main challenge of programming with functions that use data masking arises when you introduce some indirection, i.e. when you want to get the data-variable from an env-variable instead of directly typing the data-variable's name. There are two main cases: - When you have the data-variable in a function argument (i.e. an env-variable that holds a promise[^2]), you need to **embrace** the argument by surrounding it in doubled braces, like `filter(df, {{ var }})`. The following function uses embracing to create a wrapper around `summarise()` that computes the minimum and maximum values of a variable, as well as the number of observations that were summarised: ```{r, results = FALSE} var_summary <- function(data, var) { data |> summarise(n = n(), min = min({{ var }}), max = max({{ var }})) } mtcars |> group_by(cyl) |> var_summary(mpg) ``` - When you have an env-variable that is a character vector, you need to index into the `.data` pronoun with `[[`, like `summarise(df, mean = mean(.data[[var]]))`. The following example uses `.data` to count the number of unique values in each variable of `mtcars`: ```{r, results = FALSE} for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ``` Note that `.data` is not a data frame; it's a special construct, a pronoun, that allows you to access the current variables either directly, with `.data$x` or indirectly with `.data[[var]]`. Don't expect other functions to work with it. [^2]: In R, arguments are lazily evaluated which means that until you attempt to use, they don't hold a value, just a **promise** that describes how to compute the value. You can learn more at ### Name injection Many data masking functions also use dynamic dots, which gives you another useful feature: generating names programmatically by using `:=` instead of `=`. There are two basics forms, as illustrated below with `tibble()`: - If you have the name in an env-variable, you can use glue syntax to interpolate in: ```{r} name <- "susan" tibble("{name}" := 2) ``` - If the name should be derived from a data-variable in an argument, you can use embracing syntax: ```{r} my_df <- function(x) { tibble("{{x}}_2" := x * 2) } my_var <- 10 my_df(my_var) ``` Learn more in `` ?rlang::`dyn-dots` ``. ## Tidy selection Data masking makes it easy to compute on values within a dataset. Tidy selection is a complementary tool that makes it easy to work with the columns of a dataset. ### The tidyselect DSL Underneath all functions that use tidy selection is the [tidyselect](https://tidyselect.r-lib.org/) package. It provides a miniature domain specific language that makes it easy to select columns by name, position, or type. For example: - `select(df, 1)` selects the first column; `select(df, last_col())` selects the last column. - `select(df, c(a, b, c))` selects columns `a`, `b`, and `c`. - `select(df, starts_with("a"))` selects all columns whose name starts with "a"; `select(df, ends_with("z"))` selects all columns whose name ends with "z". - `select(df, where(is.numeric))` selects all numeric columns. You can see more details in `?dplyr_tidy_select`. ### Indirection As with data masking, tidy selection makes a common task easier at the cost of making a less common task harder. When you want to use tidy select indirectly with the column specification stored in an intermediate variable, you'll need to learn some new tools. Again, there are two forms of indirection: - When you have the data-variable in an env-variable that is a function argument, you use the same technique as data masking: you **embrace** the argument by surrounding it in doubled braces. The following function summarises a data frame by computing the mean of all variables selected by the user: ```{r, results = FALSE} summarise_mean <- function(data, vars) { data |> summarise(n = n(), across({{ vars }}, mean)) } mtcars |> group_by(cyl) |> summarise_mean(where(is.numeric)) ``` - When you have an env-variable that is a character vector, you need to use `all_of()` or `any_of()` depending on whether you want the function to error if a variable is not found. The following code uses `all_of()` to select all of the variables found in a character vector; then `!` plus `all_of()` to select all of the variables *not* found in a character vector: ```{r, results = FALSE} vars <- c("mpg", "vs") mtcars |> select(all_of(vars)) mtcars |> select(!all_of(vars)) ``` ## How-tos The following examples solve a grab bag of common problems. We show you the minimum amount of code so that you can get the basic idea; most real problems will require more code or combining multiple techniques. ### User-supplied data If you check the documentation, you'll see that `.data` never uses data masking or tidy select. That means you don't need to do anything special in your function: ```{r} mutate_y <- function(data) { mutate(data, y = a + x) } ``` ### One or more user-supplied expressions If you want the user to supply an expression that's passed onto an argument which uses data masking or tidy select, embrace the argument: ```{r} my_summarise <- function(data, group_var) { data |> group_by({{ group_var }}) |> summarise(mean = mean(mass)) } ``` This generalises in a straightforward way if you want to use one user-supplied expression in multiple places: ```{r} my_summarise2 <- function(data, expr) { data |> summarise( mean = mean({{ expr }}), sum = sum({{ expr }}), n = n() ) } ``` If you want the user to provide multiple expressions, embrace each of them: ```{r} my_summarise3 <- function(data, mean_var, sd_var) { data |> summarise(mean = mean({{ mean_var }}), sd = sd({{ sd_var }})) } ``` If you want to use the name of a variable in the output, you can embrace the variable name on the left-hand side of `:=` with `{{`: ```{r} my_summarise4 <- function(data, expr) { data |> summarise( "mean_{{expr}}" := mean({{ expr }}), "sum_{{expr}}" := sum({{ expr }}), "n_{{expr}}" := n() ) } my_summarise5 <- function(data, mean_var, sd_var) { data |> summarise( "mean_{{mean_var}}" := mean({{ mean_var }}), "sd_{{sd_var}}" := sd({{ sd_var }}) ) } ``` ### Any number of user-supplied expressions If you want to take an arbitrary number of user supplied expressions, use `...`. This is most often useful when you want to give the user full control over a single part of the pipeline, like a `group_by()` or a `mutate()`. ```{r} my_summarise <- function(.data, ...) { .data |> group_by(...) |> summarise(mass = mean(mass, na.rm = TRUE), height = mean(height, na.rm = TRUE)) } starwars |> my_summarise(homeworld) starwars |> my_summarise(sex, gender) ``` When you use `...` in this way, make sure that any other arguments start with `.` to reduce the chances of argument clashes; see for more details. ### Creating multiple columns Sometimes it can be useful for a single expression to return multiple columns. You can do this by returning an unnamed data frame: ```{r} quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { tibble( val = quantile(x, probs), quant = probs ) } x <- 1:5 quantile_df(x) ``` This sort of function is useful inside `summarise()` and `mutate()` which allow you to add multiple columns by returning a data frame: ```{r} df <- tibble( grp = rep(1:3, each = 10), x = runif(30), y = rnorm(30) ) df |> group_by(grp) |> summarise(quantile_df(x, probs = .5)) df |> group_by(grp) |> summarise(across(x:y, ~ quantile_df(.x, probs = .5), .unpack = TRUE)) ``` Notice that we set `.unpack = TRUE` inside `across()`. This tells `across()` to _unpack_ the data frame returned by `quantile_df()` into its respective columns, combining the column names of the original columns (`x` and `y`) with the column names returned from the function (`val` and `quant`). If your function returns multiple _rows_ per group, then you'll need to switch from `summarise()` to `reframe()`. `summarise()` is restricted to returning 1 row summaries per group, but `reframe()` lifts this restriction: ```{r} df |> group_by(grp) |> reframe(across(x:y, quantile_df, .unpack = TRUE)) ``` ### Transforming user-supplied variables If you want the user to provide a set of data-variables that are then transformed, use `across()` and `pick()`: ```{r} my_summarise <- function(data, summary_vars) { data |> summarise(across({{ summary_vars }}, ~ mean(., na.rm = TRUE))) } starwars |> group_by(species) |> my_summarise(c(mass, height)) ``` You can use this same idea for multiple sets of input data-variables: ```{r} my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean)) } ``` Use the `.names` argument to `across()` to control the names of the output. ```{r} my_summarise <- function(data, group_var, summarise_var) { data |> group_by(pick({{ group_var }})) |> summarise(across({{ summarise_var }}, mean, .names = "mean_{.col}")) } ``` ### Loop over multiple variables If you have a character vector of variable names, and want to operate on them with a for loop, index into the special `.data` pronoun: ```{r, results = FALSE} for (var in names(mtcars)) { mtcars |> count(.data[[var]]) |> print() } ``` This same technique works with for loop alternatives like the base R `apply()` family and the purrr `map()` family: ```{r, results = FALSE} mtcars |> names() |> purrr::map(~ count(mtcars, .data[[.x]])) ``` (Note that the `x` in `.data[[x]]` is always treated as an env-variable; it will never come from the data.) ### Use a variable from an Shiny input Many Shiny input controls return character vectors, so you can use the same approach as above: `.data[[input$var]]`. ```{r, eval = FALSE} library(shiny) ui <- fluidPage( selectInput("var", "Variable", choices = names(diamonds)), tableOutput("output") ) server <- function(input, output, session) { data <- reactive(filter(diamonds, .data[[input$var]] > 0)) output$output <- renderTable(head(data())) } ``` See for more details and case studies. dplyr/inst/doc/colwise.Rmd0000644000176200001440000003001715137161765015254 0ustar liggesusers--- title: "Column-wise operations" description: > Learn how to easily repeat the same operation across multiple columns using `across()`. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Column-wise operations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) set.seed(1014) ``` It's often useful to perform the same operation on multiple columns, but copying and pasting is both tedious and error prone: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise(a = mean(a), b = mean(b), c = mean(c), d = mean(d)) ``` (If you're trying to compute `mean(a, b, c, d)` for each row, instead see `vignette("rowwise")`) This vignette will introduce you to the `across()` function, which lets you rewrite the previous code more succinctly: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise(across(a:d, mean)) ``` We'll start by discussing the basic usage of `across()`, particularly as it applies to `summarise()`, and show how to use it with multiple functions. We'll then show a few uses with other verbs. We'll finish off with a bit of history, showing why we prefer `across()` to our last approach (the `_if()`, `_at()` and `_all()` functions) and how to translate your old code to the new syntax. ```{r setup} library(dplyr, warn.conflicts = FALSE) ``` ## Basic usage `across()` has two primary arguments: * The first argument, `.cols`, selects the columns you want to operate on. It uses tidy selection (like `select()`) so you can pick variables by position, name, and type. * The second argument, `.fns`, is a function or list of functions to apply to each column. This can also be a purrr style formula (or list of formulas) like `~ .x / 2`. (This argument is optional, and you can omit it if you just want to get the underlying data; you'll see that technique used in `vignette("rowwise")`.) Here are a couple of examples of `across()` in conjunction with its favourite verb, `summarise()`. But you can use `across()` with any dplyr verb, as you'll see a little later. ```{r} starwars |> summarise(across(where(is.character), n_distinct)) starwars |> group_by(species) |> filter(n() > 1) |> summarise(across(c(sex, gender, homeworld), n_distinct)) starwars |> group_by(homeworld) |> filter(n() > 1) |> summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) ``` Because `across()` is usually used in combination with `summarise()` and `mutate()`, it doesn't select grouping variables in order to avoid accidentally modifying them: ```{r} df <- data.frame(g = c(1, 1, 2), x = c(-1, 1, 3), y = c(-1, -4, -9)) df |> group_by(g) |> summarise(across(where(is.numeric), sum)) ``` ### Multiple functions You can transform each variable with more than one function by supplying a named list of functions or lambda functions in the second argument: ```{r} min_max <- list( min = ~min(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE) ) starwars |> summarise(across(where(is.numeric), min_max)) starwars |> summarise(across(c(height, mass, birth_year), min_max)) ``` Control how the names are created with the `.names` argument which takes a [glue](https://glue.tidyverse.org/) spec: ```{r} starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) starwars |> summarise(across(c(height, mass, birth_year), min_max, .names = "{.fn}.{.col}")) ``` If you'd prefer all summaries with the same function to be grouped together, you'll have to expand the calls yourself: ```{r} starwars |> summarise( across(c(height, mass, birth_year), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(c(height, mass, birth_year), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ``` (One day this might become an argument to `across()` but we're not yet sure how it would work.) We cannot however use `where(is.numeric)` in that last case because the second `across()` would pick up the variables that were newly created ("min_height", "min_mass" and "min_birth_year"). We can work around this by combining both calls to `across()` into a single expression that returns a tibble: ```{r} starwars |> summarise( tibble( across(where(is.numeric), ~min(.x, na.rm = TRUE), .names = "min_{.col}"), across(where(is.numeric), ~max(.x, na.rm = TRUE), .names = "max_{.col}") ) ) ``` Alternatively we could reorganize results with `relocate()`: ```{r} starwars |> summarise(across(where(is.numeric), min_max, .names = "{.fn}.{.col}")) |> relocate(starts_with("min")) ``` ### Current column If you need to, you can access the name of the "current" column inside by calling `cur_column()`. This can be useful if you want to perform some sort of context dependent transformation that's already encoded in a vector: ```{r} df <- tibble(x = 1:3, y = 3:5, z = 5:7) mult <- list(x = 1, y = 10, z = 100) df |> mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]])) ``` ### Gotchas Be careful when combining numeric summaries with `where(is.numeric)`: ```{r} df <- data.frame(x = c(1, 2, 3), y = c(1, 4, 9)) df |> summarise(n = n(), across(where(is.numeric), sd)) ``` Here `n` becomes `NA` because `n` is numeric, so the `across()` computes its standard deviation, and the standard deviation of 3 (a constant) is `NA`. You probably want to compute `n()` last to avoid this problem: ```{r} df |> summarise(across(where(is.numeric), sd), n = n()) ``` Alternatively, you could explicitly exclude `n` from the columns to operate on: ```{r} df |> summarise(n = n(), across(where(is.numeric) & !n, sd)) ``` Another approach is to combine both the call to `n()` and `across()` in a single expression that returns a tibble: ```{r} df |> summarise( tibble(n = n(), across(where(is.numeric), sd)) ) ``` ### Other verbs So far we've focused on the use of `across()` with `summarise()`, but it works with any other dplyr verb that uses data masking: * Rescale all numeric variables to range 0-1: ```{r} rescale01 <- function(x) { rng <- range(x, na.rm = TRUE) (x - rng[1]) / (rng[2] - rng[1]) } df <- tibble(x = 1:4, y = rnorm(4)) df |> mutate(across(where(is.numeric), rescale01)) ``` For some verbs, like `group_by()`, `count()` and `distinct()`, you don't need to supply a summary function, but it can be useful to use tidy-selection to dynamically select a set of columns. In those cases, we recommend using the complement to `across()`, `pick()`, which works like `across()` but doesn't apply any functions and instead returns a data frame containing the selected columns. * Find all distinct ```{r} starwars |> distinct(pick(contains("color"))) ``` * Count all combinations of variables with a given pattern: ```{r} starwars |> count(pick(contains("color")), sort = TRUE) ``` `across()` doesn't work with `select()` or `rename()` because they already use tidy select syntax; if you want to transform column names with a function, you can use `rename_with()`. ### filter() and filter_out() We cannot directly use `across()` in `filter()` or `filter_out()` because we need an extra step to combine the results into a single logical vector. To that end, `filter()` and `filter_out()` have two special purpose companion functions: * `if_any()` keeps the rows where the predicate is true for *at least one* selected column: ```{r} starwars |> filter_out(if_any(everything(), is.na)) ``` * `if_all()` keeps the rows where the predicate is true for *all* selected columns: ```{r} starwars |> filter_out(if_all(everything(), is.na)) ``` ## `_if`, `_at`, `_all` Prior versions of dplyr allowed you to apply a function to multiple columns in a different way: using functions with `_if`, `_at`, and `_all()` suffixes. These functions solved a pressing need and are used by many people, but are now superseded. That means that they'll stay around, but won't receive any new features and will only get critical bug fixes. ### Why do we like `across()`? Why did we decide to move away from these functions in favour of `across()`? 1. `across()` makes it possible to express useful summaries that were previously impossible: ```{r, eval = FALSE} df |> group_by(g1, g2) |> summarise( across(where(is.numeric), mean), across(where(is.factor), nlevels), n = n(), ) ``` 1. `across()` reduces the number of functions that dplyr needs to provide. This makes dplyr easier for you to use (because there are fewer functions to remember) and easier for us to implement new verbs (since we only need to implement one function, not four). 1. `across()` unifies `_if` and `_at` semantics so that you can select by position, name, and type, and you can now create compound selections that were previously impossible. For example, you can now transform all numeric columns whose name begins with "x": `across(where(is.numeric) & starts_with("x"))`. 1. `across()` doesn't need to use `vars()`. The `_at()` functions are the only place in dplyr where you have to manually quote variable names, which makes them a little weird and hence harder to remember. ### Why did it take so long to discover `across()`? It's disappointing that we didn't discover `across()` earlier, and instead worked through several false starts (first not realising that it was a common problem, then with the `_each()` functions, and most recently with the `_if()`/`_at()`/`_all()` functions). But `across()` couldn't work without three recent discoveries: * You can have a column of a data frame that is itself a data frame. This is something provided by base R, but it's not very well documented, and it took a while to see that it was useful, not just a theoretical curiosity. * We can use data frames to allow summary functions to return multiple columns. * We can use the absence of an outer name as a convention that you want to unpack a data frame column into individual columns. ### How do you convert existing code? Fortunately, it's generally straightforward to translate your existing code to use `across()`: * Strip the `_if()`, `_at()` and `_all()` suffix off the function. * Call `across()`. The first argument will be: 1. For `_if()`, the old second argument wrapped in `where()`. 1. For `_at()`, the old second argument, with the call to `vars()` removed. 1. For `_all()`, `everything()`. The subsequent arguments can be copied as is. For example: ```{r, results = FALSE} df |> mutate_if(is.numeric, ~mean(.x, na.rm = TRUE)) # -> df |> mutate(across(where(is.numeric), ~mean(.x, na.rm = TRUE))) df |> mutate_at(vars(c(x, starts_with("y"))), mean) # -> df |> mutate(across(c(x, starts_with("y")), mean)) df |> mutate_all(mean) # -> df |> mutate(across(everything(), mean)) ``` There are a few exceptions to this rule: * `rename_*()` and `select_*()` follow a different pattern. They already have select semantics, so are generally used in a different way that doesn't have a direct equivalent with `across()`; use the new `rename_with()` instead. * Previously, `filter_*()` were paired with the `all_vars()` and `any_vars()` helpers. The new helpers `if_any()` and `if_all()` can be used inside `filter()` to keep rows for which the predicate is true for at least one, or all selected columns: ```{r} df <- tibble(x = c("a", "b"), y = c(1, 1), z = c(-1, 1)) # Find all rows where EVERY numeric variable is greater than zero df |> filter(if_all(where(is.numeric), ~ .x > 0)) # Find all rows where ANY numeric variable is greater than zero df |> filter(if_any(where(is.numeric), ~ .x > 0)) ``` * When used in a `mutate()`, all transformations performed by an `across()` are applied at once. This is different to the behaviour of `mutate_if()`, `mutate_at()`, and `mutate_all()`, which apply the transformations one at a time. We expect that you'll generally find the new behaviour less surprising: ```{r} df <- tibble(x = 2, y = 4, z = 8) df |> mutate_all(~ .x / y) df |> mutate(across(everything(), ~ .x / y)) ``` dplyr/inst/doc/base.html0000644000176200001440000025571015137234447014752 0ustar liggesusers dplyr <-> base R

dplyr <-> base R

This vignette compares dplyr functions to their base R equivalents. This helps those familiar with base R understand better what dplyr does, and shows dplyr users how you might express the same ideas in base R code. We’ll start with a rough overview of the major differences, then discuss the one table verbs in more detail, followed by the two table verbs.

Overview

  1. The code dplyr verbs input and output data frames. This contrasts with base R functions which more frequently work with individual vectors.

  2. dplyr relies heavily on “non-standard evaluation” so that you don’t need to use $ to refer to columns in the “current” data frame. This behaviour is inspired by the base functions subset() and transform().

  3. dplyr solutions tend to use a variety of single purpose verbs, while base R solutions typically tend to use [ in a variety of ways, depending on the task at hand.

  4. Multiple dplyr verbs are often strung together into a pipeline by |>. In base R, you’ll typically save intermediate results to a variable that you either discard, or repeatedly overwrite.

  5. All dplyr verbs handle “grouped” data frames so that the code to perform a computation per-group looks very similar to code that works on a whole data frame. In base R, per-group operations tend to have varied forms.

One table verbs

The following table shows a condensed translation between dplyr verbs and their base R equivalents. The following sections describe each operation in more detail. You’ll learn more about the dplyr verbs in their documentation and in vignette("dplyr").

dplyr base
arrange(df, x) df[order(x), , drop = FALSE]
distinct(df, x) df[!duplicated(x), , drop = FALSE], unique()
filter(df, x) df[which(x), , drop = FALSE], subset()
mutate(df, z = x + y) df$z <- df$x + df$y, transform()
pull(df, 1) df[[1]]
pull(df, x) df$x
rename(df, y = x) names(df)[names(df) == "x"] <- "y"
relocate(df, y) df[union("y", names(df))]
select(df, x, y) df[c("x", "y")], subset()
select(df, starts_with("x")) df[grepl("^x", names(df))]
summarise(df, mean(x)) mean(df$x), tapply(), aggregate(), by()
slice(df, c(1, 2, 5)) df[c(1, 2, 5), , drop = FALSE]

To begin, we’ll load dplyr and convert mtcars and iris to tibbles so that we can easily show only abbreviated output for each operation.

library(dplyr)
mtcars <- as_tibble(mtcars)
iris <- as_tibble(iris)

arrange(): Arrange rows by variables

dplyr::arrange() orders the rows of a data frame by the values of one or more columns:

mtcars |> arrange(cyl, disp)
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  33.9     4  71.1    65  4.22  1.84  19.9     1     1     4     1
#> 2  30.4     4  75.7    52  4.93  1.62  18.5     1     1     4     2
#> 3  32.4     4  78.7    66  4.08  2.2   19.5     1     1     4     1
#> 4  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1
#> # ℹ 28 more rows

The desc() helper allows you to order selected variables in descending order:

mtcars |> arrange(desc(cyl), desc(disp))
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  10.4     8   472   205  2.93  5.25  18.0     0     0     3     4
#> 2  10.4     8   460   215  3     5.42  17.8     0     0     3     4
#> 3  14.7     8   440   230  3.23  5.34  17.4     0     0     3     4
#> 4  19.2     8   400   175  3.08  3.84  17.0     0     0     3     2
#> # ℹ 28 more rows

We can replicate in base R by using [ with order():

mtcars[order(mtcars$cyl, mtcars$disp), , drop = FALSE]
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  33.9     4  71.1    65  4.22  1.84  19.9     1     1     4     1
#> 2  30.4     4  75.7    52  4.93  1.62  18.5     1     1     4     2
#> 3  32.4     4  78.7    66  4.08  2.2   19.5     1     1     4     1
#> 4  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1
#> # ℹ 28 more rows

Note the use of drop = FALSE. If you forget this, and the input is a data frame with a single column, the output will be a vector, not a data frame. This is a source of subtle bugs.

Base R does not provide a convenient and general way to sort individual variables in descending order, so you have two options:

  • For numeric variables, you can use -x.
  • You can request order() to sort all variables in descending order.
mtcars[order(mtcars$cyl, mtcars$disp, decreasing = TRUE), , drop = FALSE]
mtcars[order(-mtcars$cyl, -mtcars$disp), , drop = FALSE]

distinct(): Select distinct/unique rows

dplyr::distinct() selects unique rows:

df <- tibble(
  x = sample(10, 100, rep = TRUE),
  y = sample(10, 100, rep = TRUE)
)

df |> distinct(x) # selected columns
#> # A tibble: 10 × 1
#>       x
#>   <int>
#> 1     5
#> 2     6
#> 3     4
#> 4     1
#> # ℹ 6 more rows
df |> distinct(x, .keep_all = TRUE) # whole data frame
#> # A tibble: 10 × 2
#>       x     y
#>   <int> <int>
#> 1     5     6
#> 2     6     4
#> 3     4     7
#> 4     1     3
#> # ℹ 6 more rows

There are two equivalents in base R, depending on whether you want the whole data frame, or just selected variables:

unique(df["x"]) # selected columns
#> # A tibble: 10 × 1
#>       x
#>   <int>
#> 1     5
#> 2     6
#> 3     4
#> 4     1
#> # ℹ 6 more rows
df[!duplicated(df$x), , drop = FALSE] # whole data frame
#> # A tibble: 10 × 2
#>       x     y
#>   <int> <int>
#> 1     5     6
#> 2     6     4
#> 3     4     7
#> 4     1     3
#> # ℹ 6 more rows

filter(): Return rows with matching conditions

dplyr::filter() selects rows where an expression is TRUE:

starwars |> filter(species == "Human")
#> # A tibble: 35 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
#> 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
#> # ℹ 31 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
starwars |> filter(mass > 1000)
#> # A tibble: 1 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Jabba De…    175  1358 <NA>       green-tan… orange           600 herm… mascu…
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
starwars |> filter(hair_color == "none" & eye_color == "black")
#> # A tibble: 9 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Nien Nunb    160    68 none       grey       black             NA male  mascu…
#> 2 Gasgano      122    NA none       white, bl… black             NA male  mascu…
#> 3 Kit Fisto    196    87 none       green      black             NA male  mascu…
#> 4 Plo Koon     188    80 none       orange     black             22 male  mascu…
#> # ℹ 5 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

The closest base equivalent (and the inspiration for filter()) is subset():

subset(starwars, species == "Human")
#> # A tibble: 35 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
#> 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
#> # ℹ 31 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
subset(starwars, mass > 1000)
#> # A tibble: 1 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Jabba De…    175  1358 <NA>       green-tan… orange           600 herm… mascu…
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
subset(starwars, hair_color == "none" & eye_color == "black")
#> # A tibble: 9 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Nien Nunb    160    68 none       grey       black             NA male  mascu…
#> 2 Gasgano      122    NA none       white, bl… black             NA male  mascu…
#> 3 Kit Fisto    196    87 none       green      black             NA male  mascu…
#> 4 Plo Koon     188    80 none       orange     black             22 male  mascu…
#> # ℹ 5 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

You can also use [ but this also requires the use of which() to remove NAs:

starwars[which(starwars$species == "Human"), , drop = FALSE]
#> # A tibble: 35 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
#> 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
#> # ℹ 31 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
starwars[which(starwars$mass > 1000), , drop = FALSE]
#> # A tibble: 1 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Jabba De…    175  1358 <NA>       green-tan… orange           600 herm… mascu…
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
starwars[which(starwars$hair_color == "none" & starwars$eye_color == "black"), , drop = FALSE]
#> # A tibble: 9 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Nien Nunb    160    68 none       grey       black             NA male  mascu…
#> 2 Gasgano      122    NA none       white, bl… black             NA male  mascu…
#> 3 Kit Fisto    196    87 none       green      black             NA male  mascu…
#> 4 Plo Koon     188    80 none       orange     black             22 male  mascu…
#> # ℹ 5 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

mutate(): Create or transform variables

dplyr::mutate() creates new variables from existing variables:

df |> mutate(z = x + y, z2 = z ^ 2)
#> # A tibble: 100 × 4
#>       x     y     z    z2
#>   <int> <int> <int> <dbl>
#> 1     5     6    11   121
#> 2     6     4    10   100
#> 3     5     6    11   121
#> 4     4     7    11   121
#> # ℹ 96 more rows

The closest base equivalent is transform(), but note that it cannot use freshly created variables:

head(transform(df, z = x + y, z2 = (x + y) ^ 2))
#>   x  y  z  z2
#> 1 5  6 11 121
#> 2 6  4 10 100
#> 3 5  6 11 121
#> 4 4  7 11 121
#> 5 1  3  4  16
#> 6 1 10 11 121

Alternatively, you can use $<-:

mtcars$cyl2 <- mtcars$cyl * 2
mtcars$cyl4 <- mtcars$cyl2 * 2

When applied to a grouped data frame, dplyr::mutate() computes new variable once per group:

gf <- tibble(g = c(1, 1, 2, 2), x = c(0.5, 1.5, 2.5, 3.5))
gf |>
  group_by(g) |>
  mutate(x_mean = mean(x), x_rank = rank(x))
#> # A tibble: 4 × 4
#> # Groups:   g [2]
#>       g     x x_mean x_rank
#>   <dbl> <dbl>  <dbl>  <dbl>
#> 1     1   0.5      1      1
#> 2     1   1.5      1      2
#> 3     2   2.5      3      1
#> 4     2   3.5      3      2

To replicate this in base R, you can use ave():

transform(gf,
  x_mean = ave(x, g, FUN = mean),
  x_rank = ave(x, g, FUN = rank)
)
#>   g   x x_mean x_rank
#> 1 1 0.5      1      1
#> 2 1 1.5      1      2
#> 3 2 2.5      3      1
#> 4 2 3.5      3      2

pull(): Pull out a single variable

dplyr::pull() extracts a variable either by name or position:

mtcars |> pull(1)
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
mtcars |> pull(cyl)
#>  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

This equivalent to [[ for positions and $ for names:

mtcars[[1]]
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
mtcars$cyl
#>  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

relocate(): Change column order

dplyr::relocate() makes it easy to move a set of columns to a new position (by default, the front):

# to front
mtcars |> relocate(gear, carb)
#> # A tibble: 32 × 13
#>    gear  carb   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  cyl2  cyl4
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     4     4  21       6   160   110  3.9   2.62  16.5     0     1    12    24
#> 2     4     4  21       6   160   110  3.9   2.88  17.0     0     1    12    24
#> 3     4     1  22.8     4   108    93  3.85  2.32  18.6     1     1     8    16
#> 4     3     1  21.4     6   258   110  3.08  3.22  19.4     1     0    12    24
#> # ℹ 28 more rows

# to back
mtcars |> relocate(mpg, cyl, .after = last_col())
#> # A tibble: 32 × 13
#>    disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4   mpg   cyl
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1   160   110  3.9   2.62  16.5     0     1     4     4    12    24  21       6
#> 2   160   110  3.9   2.88  17.0     0     1     4     4    12    24  21       6
#> 3   108    93  3.85  2.32  18.6     1     1     4     1     8    16  22.8     4
#> 4   258   110  3.08  3.22  19.4     1     0     3     1    12    24  21.4     6
#> # ℹ 28 more rows

We can replicate this in base R with a little set manipulation:

mtcars[union(c("gear", "carb"), names(mtcars))]
#> # A tibble: 32 × 13
#>    gear  carb   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  cyl2  cyl4
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     4     4  21       6   160   110  3.9   2.62  16.5     0     1    12    24
#> 2     4     4  21       6   160   110  3.9   2.88  17.0     0     1    12    24
#> 3     4     1  22.8     4   108    93  3.85  2.32  18.6     1     1     8    16
#> 4     3     1  21.4     6   258   110  3.08  3.22  19.4     1     0    12    24
#> # ℹ 28 more rows

to_back <- c("mpg", "cyl")
mtcars[c(setdiff(names(mtcars), to_back), to_back)]
#> # A tibble: 32 × 13
#>    disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4   mpg   cyl
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1   160   110  3.9   2.62  16.5     0     1     4     4    12    24  21       6
#> 2   160   110  3.9   2.88  17.0     0     1     4     4    12    24  21       6
#> 3   108    93  3.85  2.32  18.6     1     1     4     1     8    16  22.8     4
#> 4   258   110  3.08  3.22  19.4     1     0     3     1    12    24  21.4     6
#> # ℹ 28 more rows

Moving columns to somewhere in the middle requires a little more set twiddling.

rename(): Rename variables by name

dplyr::rename() allows you to rename variables by name or position:

iris |> rename(sepal_length = Sepal.Length, sepal_width = 2)
#> # A tibble: 150 × 5
#>   sepal_length sepal_width Petal.Length Petal.Width Species
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa 
#> 4          4.6         3.1          1.5         0.2 setosa 
#> # ℹ 146 more rows

Renaming variables by position is straight forward in base R:

iris2 <- iris
names(iris2)[2] <- "sepal_width"

Renaming variables by name requires a bit more work:

names(iris2)[names(iris2) == "Sepal.Length"] <- "sepal_length"

rename_with(): Rename variables with a function

dplyr::rename_with() transform column names with a function:

iris |> rename_with(toupper)
#> # A tibble: 150 × 5
#>   SEPAL.LENGTH SEPAL.WIDTH PETAL.LENGTH PETAL.WIDTH SPECIES
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa 
#> 4          4.6         3.1          1.5         0.2 setosa 
#> # ℹ 146 more rows

A similar effect can be achieved with setNames() in base R:

setNames(iris, toupper(names(iris)))
#> # A tibble: 150 × 5
#>   SEPAL.LENGTH SEPAL.WIDTH PETAL.LENGTH PETAL.WIDTH SPECIES
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa 
#> 4          4.6         3.1          1.5         0.2 setosa 
#> # ℹ 146 more rows

select(): Select variables by name

dplyr::select() subsets columns by position, name, function of name, or other property:

iris |> select(1:3)
#> # A tibble: 150 × 3
#>   Sepal.Length Sepal.Width Petal.Length
#>          <dbl>       <dbl>        <dbl>
#> 1          5.1         3.5          1.4
#> 2          4.9         3            1.4
#> 3          4.7         3.2          1.3
#> 4          4.6         3.1          1.5
#> # ℹ 146 more rows
iris |> select(Species, Sepal.Length)
#> # A tibble: 150 × 2
#>   Species Sepal.Length
#>   <fct>          <dbl>
#> 1 setosa           5.1
#> 2 setosa           4.9
#> 3 setosa           4.7
#> 4 setosa           4.6
#> # ℹ 146 more rows
iris |> select(starts_with("Petal"))
#> # A tibble: 150 × 2
#>   Petal.Length Petal.Width
#>          <dbl>       <dbl>
#> 1          1.4         0.2
#> 2          1.4         0.2
#> 3          1.3         0.2
#> 4          1.5         0.2
#> # ℹ 146 more rows
iris |> select(where(is.factor))
#> # A tibble: 150 × 1
#>   Species
#>   <fct>  
#> 1 setosa 
#> 2 setosa 
#> 3 setosa 
#> 4 setosa 
#> # ℹ 146 more rows

Subsetting variables by position is straightforward in base R:

iris[1:3] # single argument selects columns; never drops
#> # A tibble: 150 × 3
#>   Sepal.Length Sepal.Width Petal.Length
#>          <dbl>       <dbl>        <dbl>
#> 1          5.1         3.5          1.4
#> 2          4.9         3            1.4
#> 3          4.7         3.2          1.3
#> 4          4.6         3.1          1.5
#> # ℹ 146 more rows
iris[1:3, , drop = FALSE]
#> # A tibble: 3 × 5
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa

You have two options to subset by name:

iris[c("Species", "Sepal.Length")]
#> # A tibble: 150 × 2
#>   Species Sepal.Length
#>   <fct>          <dbl>
#> 1 setosa           5.1
#> 2 setosa           4.9
#> 3 setosa           4.7
#> 4 setosa           4.6
#> # ℹ 146 more rows
subset(iris, select = c(Species, Sepal.Length))
#> # A tibble: 150 × 2
#>   Species Sepal.Length
#>   <fct>          <dbl>
#> 1 setosa           5.1
#> 2 setosa           4.9
#> 3 setosa           4.7
#> 4 setosa           4.6
#> # ℹ 146 more rows

Subsetting by function of name requires a bit of work with grep():

iris[grep("^Petal", names(iris))]
#> # A tibble: 150 × 2
#>   Petal.Length Petal.Width
#>          <dbl>       <dbl>
#> 1          1.4         0.2
#> 2          1.4         0.2
#> 3          1.3         0.2
#> 4          1.5         0.2
#> # ℹ 146 more rows

And you can use Filter() to subset by type:

Filter(is.factor, iris)
#> # A tibble: 150 × 1
#>   Species
#>   <fct>  
#> 1 setosa 
#> 2 setosa 
#> 3 setosa 
#> 4 setosa 
#> # ℹ 146 more rows

summarise(): Reduce multiple values down to a single value

dplyr::summarise() computes one or more summaries for each group:

mtcars |>
  group_by(cyl) |>
  summarise(mean = mean(disp), n = n())
#> # A tibble: 3 × 3
#>     cyl  mean     n
#>   <dbl> <dbl> <int>
#> 1     4  105.    11
#> 2     6  183.     7
#> 3     8  353.    14

I think the closest base R equivalent uses by(). Unfortunately by() returns a list of data frames, but you can combine them back together again with do.call() and rbind():

mtcars_by <- by(mtcars, mtcars$cyl, function(df) {
  with(df, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(df)))
})
do.call(rbind, mtcars_by)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14

aggregate() comes very close to providing an elegant answer:

agg <- aggregate(disp ~ cyl, mtcars, function(x) c(mean = mean(x), n = length(x)))
agg
#>   cyl disp.mean   disp.n
#> 1   4  105.1364  11.0000
#> 2   6  183.3143   7.0000
#> 3   8  353.1000  14.0000

But unfortunately while it looks like there are disp.mean and disp.n columns, it’s actually a single matrix column:

str(agg)
#> 'data.frame':    3 obs. of  2 variables:
#>  $ cyl : num  4 6 8
#>  $ disp: num [1:3, 1:2] 105 183 353 11 7 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2] "mean" "n"

You can see a variety of other options at https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec.

slice(): Choose rows by position

slice() selects rows with their location:

slice(mtcars, 25:n())
#> # A tibble: 8 × 13
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  19.2     8 400     175  3.08  3.84  17.0     0     0     3     2    16    32
#> 2  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1     8    16
#> 3  26       4 120.     91  4.43  2.14  16.7     0     1     5     2     8    16
#> 4  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2     8    16
#> # ℹ 4 more rows

This is straightforward to replicate with [:

mtcars[25:nrow(mtcars), , drop = FALSE]
#> # A tibble: 8 × 13
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb  cyl2  cyl4
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  19.2     8 400     175  3.08  3.84  17.0     0     0     3     2    16    32
#> 2  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1     8    16
#> 3  26       4 120.     91  4.43  2.14  16.7     0     1     5     2     8    16
#> 4  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2     8    16
#> # ℹ 4 more rows

Two-table verbs

When we want to merge two data frames, x and y), we have a variety of different ways to bring them together. Various base R merge() calls are replaced by a variety of dplyr join() functions.

dplyr base
inner_join(df1, df2) merge(df1, df2)
left_join(df1, df2) merge(df1, df2, all.x = TRUE)
right_join(df1, df2) merge(df1, df2, all.y = TRUE)
full_join(df1, df2) merge(df1, df2, all = TRUE)
semi_join(df1, df2) df1[df1$x %in% df2$x, , drop = FALSE]
anti_join(df1, df2) df1[!df1$x %in% df2$x, , drop = FALSE]

For more information about two-table verbs, see vignette("two-table").

Mutating joins

dplyr’s inner_join(), left_join(), right_join(), and full_join() add new columns from y to x, matching rows based on a set of “keys”, and differ only in how missing matches are handled. They are equivalent to calls to merge() with various settings of the all, all.x, and all.y arguments. The main difference is the order of the rows:

  • dplyr preserves the order of the x data frame.
  • merge() sorts the key columns.

Filtering joins

dplyr’s semi_join() and anti_join() affect only the rows, not the columns:

band_members |> semi_join(band_instruments)
#> Joining with `by = join_by(name)`
#> # A tibble: 2 × 2
#>   name  band   
#>   <chr> <chr>  
#> 1 John  Beatles
#> 2 Paul  Beatles
band_members |> anti_join(band_instruments)
#> Joining with `by = join_by(name)`
#> # A tibble: 1 × 2
#>   name  band  
#>   <chr> <chr> 
#> 1 Mick  Stones

They can be replicated in base R with [ and %in%:

band_members[band_members$name %in% band_instruments$name, , drop = FALSE]
#> # A tibble: 2 × 2
#>   name  band   
#>   <chr> <chr>  
#> 1 John  Beatles
#> 2 Paul  Beatles
band_members[!band_members$name %in% band_instruments$name, , drop = FALSE]
#> # A tibble: 1 × 2
#>   name  band  
#>   <chr> <chr> 
#> 1 Mick  Stones

Semi and anti joins with multiple key variables are considerably more challenging to implement.

dplyr/inst/doc/base.Rmd0000644000176200001440000002734115106134104014506 0ustar liggesusers--- title: "dplyr <-> base R" output: rmarkdown::html_vignette description: > How does dplyr compare to base R? This vignette describes the main differences in philosophy, and shows the base R code most closely equivalent to each dplyr verb. vignette: > %\VignetteIndexEntry{dplyr <-> base R} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4, tibble.print_max = 4) ``` This vignette compares dplyr functions to their base R equivalents. This helps those familiar with base R understand better what dplyr does, and shows dplyr users how you might express the same ideas in base R code. We'll start with a rough overview of the major differences, then discuss the one table verbs in more detail, followed by the two table verbs. # Overview 1. The code dplyr verbs input and output data frames. This contrasts with base R functions which more frequently work with individual vectors. 1. dplyr relies heavily on "non-standard evaluation" so that you don't need to use `$` to refer to columns in the "current" data frame. This behaviour is inspired by the base functions `subset()` and `transform()`. 1. dplyr solutions tend to use a variety of single purpose verbs, while base R solutions typically tend to use `[` in a variety of ways, depending on the task at hand. 1. Multiple dplyr verbs are often strung together into a pipeline by `|>`. In base R, you'll typically save intermediate results to a variable that you either discard, or repeatedly overwrite. 1. All dplyr verbs handle "grouped" data frames so that the code to perform a computation per-group looks very similar to code that works on a whole data frame. In base R, per-group operations tend to have varied forms. # One table verbs The following table shows a condensed translation between dplyr verbs and their base R equivalents. The following sections describe each operation in more detail. You'll learn more about the dplyr verbs in their documentation and in `vignette("dplyr")`. | dplyr | base | |------------------------------- |--------------------------------------------------| | `arrange(df, x)` | `df[order(x), , drop = FALSE]` | | `distinct(df, x)` | `df[!duplicated(x), , drop = FALSE]`, `unique()` | | `filter(df, x)` | `df[which(x), , drop = FALSE]`, `subset()` | | `mutate(df, z = x + y)` | `df$z <- df$x + df$y`, `transform()` | | `pull(df, 1)` | `df[[1]]` | | `pull(df, x)` | `df$x` | | `rename(df, y = x)` | `names(df)[names(df) == "x"] <- "y"` | | `relocate(df, y)` | `df[union("y", names(df))]` | | `select(df, x, y)` | `df[c("x", "y")]`, `subset()` | | `select(df, starts_with("x"))` | `df[grepl("^x", names(df))]` | | `summarise(df, mean(x))` | `mean(df$x)`, `tapply()`, `aggregate()`, `by()` | | `slice(df, c(1, 2, 5))` | `df[c(1, 2, 5), , drop = FALSE]` | To begin, we'll load dplyr and convert `mtcars` and `iris` to tibbles so that we can easily show only abbreviated output for each operation. ```{r setup, message = FALSE} library(dplyr) mtcars <- as_tibble(mtcars) iris <- as_tibble(iris) ``` ## `arrange()`: Arrange rows by variables `dplyr::arrange()` orders the rows of a data frame by the values of one or more columns: ```{r} mtcars |> arrange(cyl, disp) ``` The `desc()` helper allows you to order selected variables in descending order: ```{r} mtcars |> arrange(desc(cyl), desc(disp)) ``` We can replicate in base R by using `[` with `order()`: ```{r} mtcars[order(mtcars$cyl, mtcars$disp), , drop = FALSE] ``` Note the use of `drop = FALSE`. If you forget this, and the input is a data frame with a single column, the output will be a vector, not a data frame. This is a source of subtle bugs. Base R does not provide a convenient and general way to sort individual variables in descending order, so you have two options: * For numeric variables, you can use `-x`. * You can request `order()` to sort all variables in descending order. ```{r, results = FALSE} mtcars[order(mtcars$cyl, mtcars$disp, decreasing = TRUE), , drop = FALSE] mtcars[order(-mtcars$cyl, -mtcars$disp), , drop = FALSE] ``` ## `distinct()`: Select distinct/unique rows `dplyr::distinct()` selects unique rows: ```{r} df <- tibble( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) df |> distinct(x) # selected columns df |> distinct(x, .keep_all = TRUE) # whole data frame ``` There are two equivalents in base R, depending on whether you want the whole data frame, or just selected variables: ```{r} unique(df["x"]) # selected columns df[!duplicated(df$x), , drop = FALSE] # whole data frame ``` ## `filter()`: Return rows with matching conditions `dplyr::filter()` selects rows where an expression is `TRUE`: ```{r} starwars |> filter(species == "Human") starwars |> filter(mass > 1000) starwars |> filter(hair_color == "none" & eye_color == "black") ``` The closest base equivalent (and the inspiration for `filter()`) is `subset()`: ```{r} subset(starwars, species == "Human") subset(starwars, mass > 1000) subset(starwars, hair_color == "none" & eye_color == "black") ``` You can also use `[` but this also requires the use of `which()` to remove `NA`s: ```{r} starwars[which(starwars$species == "Human"), , drop = FALSE] starwars[which(starwars$mass > 1000), , drop = FALSE] starwars[which(starwars$hair_color == "none" & starwars$eye_color == "black"), , drop = FALSE] ``` ## `mutate()`: Create or transform variables `dplyr::mutate()` creates new variables from existing variables: ```{r} df |> mutate(z = x + y, z2 = z ^ 2) ``` The closest base equivalent is `transform()`, but note that it cannot use freshly created variables: ```{r} head(transform(df, z = x + y, z2 = (x + y) ^ 2)) ``` Alternatively, you can use `$<-`: ```{r} mtcars$cyl2 <- mtcars$cyl * 2 mtcars$cyl4 <- mtcars$cyl2 * 2 ``` When applied to a grouped data frame, `dplyr::mutate()` computes new variable once per group: ```{r} gf <- tibble(g = c(1, 1, 2, 2), x = c(0.5, 1.5, 2.5, 3.5)) gf |> group_by(g) |> mutate(x_mean = mean(x), x_rank = rank(x)) ``` To replicate this in base R, you can use `ave()`: ```{r} transform(gf, x_mean = ave(x, g, FUN = mean), x_rank = ave(x, g, FUN = rank) ) ``` ## `pull()`: Pull out a single variable `dplyr::pull()` extracts a variable either by name or position: ```{r} mtcars |> pull(1) mtcars |> pull(cyl) ``` This equivalent to `[[` for positions and `$` for names: ```{r} mtcars[[1]] mtcars$cyl ``` ## `relocate()`: Change column order `dplyr::relocate()` makes it easy to move a set of columns to a new position (by default, the front): ```{r} # to front mtcars |> relocate(gear, carb) # to back mtcars |> relocate(mpg, cyl, .after = last_col()) ``` We can replicate this in base R with a little set manipulation: ```{r} mtcars[union(c("gear", "carb"), names(mtcars))] to_back <- c("mpg", "cyl") mtcars[c(setdiff(names(mtcars), to_back), to_back)] ``` Moving columns to somewhere in the middle requires a little more set twiddling. ## `rename()`: Rename variables by name `dplyr::rename()` allows you to rename variables by name or position: ```{r} iris |> rename(sepal_length = Sepal.Length, sepal_width = 2) ``` Renaming variables by position is straight forward in base R: ```{r} iris2 <- iris names(iris2)[2] <- "sepal_width" ``` Renaming variables by name requires a bit more work: ```{r} names(iris2)[names(iris2) == "Sepal.Length"] <- "sepal_length" ``` ## `rename_with()`: Rename variables with a function `dplyr::rename_with()` transform column names with a function: ```{r} iris |> rename_with(toupper) ``` A similar effect can be achieved with `setNames()` in base R: ```{r} setNames(iris, toupper(names(iris))) ``` ## `select()`: Select variables by name `dplyr::select()` subsets columns by position, name, function of name, or other property: ```{r} iris |> select(1:3) iris |> select(Species, Sepal.Length) iris |> select(starts_with("Petal")) iris |> select(where(is.factor)) ``` Subsetting variables by position is straightforward in base R: ```{r} iris[1:3] # single argument selects columns; never drops iris[1:3, , drop = FALSE] ``` You have two options to subset by name: ```{r} iris[c("Species", "Sepal.Length")] subset(iris, select = c(Species, Sepal.Length)) ``` Subsetting by function of name requires a bit of work with `grep()`: ```{r} iris[grep("^Petal", names(iris))] ``` And you can use `Filter()` to subset by type: ```{r} Filter(is.factor, iris) ``` ## `summarise()`: Reduce multiple values down to a single value `dplyr::summarise()` computes one or more summaries for each group: ```{r} mtcars |> group_by(cyl) |> summarise(mean = mean(disp), n = n()) ``` I think the closest base R equivalent uses `by()`. Unfortunately `by()` returns a list of data frames, but you can combine them back together again with `do.call()` and `rbind()`: ```{r} mtcars_by <- by(mtcars, mtcars$cyl, function(df) { with(df, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(df))) }) do.call(rbind, mtcars_by) ``` `aggregate()` comes very close to providing an elegant answer: ```{r} agg <- aggregate(disp ~ cyl, mtcars, function(x) c(mean = mean(x), n = length(x))) agg ``` But unfortunately while it looks like there are `disp.mean` and `disp.n` columns, it's actually a single matrix column: ```{r} str(agg) ``` You can see a variety of other options at . ## `slice()`: Choose rows by position `slice()` selects rows with their location: ```{r} slice(mtcars, 25:n()) ``` This is straightforward to replicate with `[`: ```{r} mtcars[25:nrow(mtcars), , drop = FALSE] ``` # Two-table verbs When we want to merge two data frames, `x` and `y`), we have a variety of different ways to bring them together. Various base R `merge()` calls are replaced by a variety of dplyr `join()` functions. | dplyr | base | |------------------------|-----------------------------------------| | `inner_join(df1, df2)` |`merge(df1, df2)` | | `left_join(df1, df2) ` |`merge(df1, df2, all.x = TRUE)` | | `right_join(df1, df2)` |`merge(df1, df2, all.y = TRUE)` | | `full_join(df1, df2)` |`merge(df1, df2, all = TRUE)` | | `semi_join(df1, df2)` |`df1[df1$x %in% df2$x, , drop = FALSE]` | | `anti_join(df1, df2)` |`df1[!df1$x %in% df2$x, , drop = FALSE]` | For more information about two-table verbs, see `vignette("two-table")`. ### Mutating joins dplyr's `inner_join()`, `left_join()`, `right_join()`, and `full_join()` add new columns from `y` to `x`, matching rows based on a set of "keys", and differ only in how missing matches are handled. They are equivalent to calls to `merge()` with various settings of the `all`, `all.x`, and `all.y` arguments. The main difference is the order of the rows: * dplyr preserves the order of the `x` data frame. * `merge()` sorts the key columns. ### Filtering joins dplyr's `semi_join()` and `anti_join()` affect only the rows, not the columns: ```{r} band_members |> semi_join(band_instruments) band_members |> anti_join(band_instruments) ``` They can be replicated in base R with `[` and `%in%`: ```{r} band_members[band_members$name %in% band_instruments$name, , drop = FALSE] band_members[!band_members$name %in% band_instruments$name, , drop = FALSE] ``` Semi and anti joins with multiple key variables are considerably more challenging to implement. dplyr/inst/doc/window-functions.R0000644000176200001440000000702315137234471016577 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) library(dplyr) set.seed(1014) if (!rlang::is_installed("Lahman")) { knitr::opts_chunk$set(eval = FALSE) } ## ----------------------------------------------------------------------------- library(Lahman) batting <- Lahman::Batting |> as_tibble() |> select(playerID, yearID, teamID, G, AB:H) |> arrange(playerID, yearID, teamID) |> semi_join(Lahman::AwardsPlayers, by = "playerID") players <- batting |> group_by(playerID) ## ----eval = FALSE------------------------------------------------------------- # # For each player, find the two years with most hits # filter(players, min_rank(desc(H)) <= 2 & H > 0) # # Within each player, rank each year by the number of games played # mutate(players, G_rank = min_rank(G)) # # # For each player, find every year that was better than the previous year # filter(players, G > lag(G)) # # For each player, compute avg change in games played per year # mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID))) # # # For each player, find all years where they played more games than they did on average # filter(players, G > mean(G)) # # For each, player compute a z score based on number of games played # mutate(players, G_z = (G - mean(G)) / sd(G)) ## ----------------------------------------------------------------------------- x <- c(1, 1, 2, 2, 2) row_number(x) min_rank(x) dense_rank(x) ## ----------------------------------------------------------------------------- cume_dist(x) percent_rank(x) ## ----------------------------------------------------------------------------- filter(players, cume_dist(desc(G)) < 0.1) ## ----------------------------------------------------------------------------- by_team_player <- group_by(batting, teamID, playerID) by_team <- summarise(by_team_player, G = sum(G)) by_team_quartile <- group_by(by_team, quartile = ntile(G, 4)) summarise(by_team_quartile, mean(G)) ## ----------------------------------------------------------------------------- x <- 1:5 lead(x) lag(x) ## ----results = "hide"--------------------------------------------------------- # Compute the relative change in games played mutate(players, G_delta = G - lag(G)) ## ----results = "hide"--------------------------------------------------------- # Find when a player changed teams filter(players, teamID != lag(teamID)) ## ----------------------------------------------------------------------------- df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) scrambled <- df[sample(nrow(df)), ] wrong <- mutate(scrambled, prev_value = lag(value)) arrange(wrong, year) right <- mutate(scrambled, prev_value = lag(value, order_by = year)) arrange(right, year) ## ----eval = FALSE------------------------------------------------------------- # filter(players, cumany(G > 150)) ## ----------------------------------------------------------------------------- x <- 1:10 y <- 10:1 order_by(y, cumsum(x)) ## ----eval = FALSE------------------------------------------------------------- # filter(players, G > mean(G)) # filter(players, G < median(G)) ## ----eval = FALSE------------------------------------------------------------- # filter(players, ntile(G, 2) == 2) ## ----------------------------------------------------------------------------- mutate(players, career_year = yearID - min(yearID) + 1) ## ----------------------------------------------------------------------------- mutate(players, G_z = (G - mean(G)) / sd(G)) dplyr/inst/doc/recoding-replacing.Rmd0000644000176200001440000003560215137161765017350 0ustar liggesusers--- title: "Recoding columns and replacing values" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Recoding columns and replacing values} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} editor: markdown: wrap: sentence canonical: true --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, message = FALSE} library(dplyr) ``` ## Introduction dplyr provides a family of functions for *recoding* columns and *replacing* values within a column. These are extremely common operations, so mastering this family can be a big productivity boost! Before we begin, it'll be helpful to define exactly what we mean by recoding vs replacing: - *Recoding* a column creates an entirely new column using values from an existing column. The new column may have a different type from the original column. - *Replacing* values within a column partially updates an existing column with new values. The result has the same type as the original column. The family of functions can be summarized by the following table: | | **Recoding** | **Replacing** | |---------------------------|-------------------|--------------------| | **Match with conditions** | `case_when()` | `replace_when()` | | **Match with values** | `recode_values()` | `replace_values()` | This vignette walks through use cases for each of these functions, which should help you build some intuition about when to use them. ## `case_when()` `case_when()` is the most general function in the family. It works by evaluating each case sequentially and using the first match for each element to determine the corresponding value in the output. To demonstrate, we'll look at a dataset of some 5k times in minutes: ```{r} set.seed(123) racers <- tibble( id = seq_len(100), time = round(sample(1200:2100, size = 100, replace = TRUE) / 60, 2) ) racers ``` We can use `case_when()` to categorize these times into tiers: ```{r} tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D" ) ) tiers ``` There's a few things to note here: - The *first* condition that is `TRUE` is used, i.e. a time of 21 minutes meets all of the conditions, but would be placed in tier `A` because `time < 23` is listed first. - Unmatched values fall through as `NA`. We have some racers above 33 minutes that aren't captured here! There are a few options for dealing with unmatched locations. You can leave them as `NA` if that makes sense for your use case, or you can specify a `.default` value: ```{r} racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) ) ``` If you are confident that you've captured every case, you can supply `.unmatched = "error"` rather than `.default` and `case_when()` will error if that assertion doesn't hold. This is great for defensive programming! ```{r, error = TRUE} racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .unmatched = "error" ) ) ``` Note that missing values must be explicitly handled when setting `.unmatched = "error"`, even if that's just `is.na(time) ~ NA`, otherwise they will trigger the unmatched error. ## `replace_when()` Let's assume that some of our racers used banned running shoes and are disqualified. Also, some racers had a false start and need to incur a 20 second (1/3 minute) penalty. ```{r} id_banned_shoes <- c(2, 10, 15, 32, 65) id_false_start <- c(1, 2, 5, 20, 55, 74, 91) ``` We could add this information in a few ways. With `case_when()`: ```{r} racers |> mutate( time = case_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3, .default = time ) ) ``` Or in two steps with `if_else()`: ```{r} racers |> mutate(time = if_else(id %in% id_banned_shoes, NA, time)) |> mutate(time = if_else(id %in% id_false_start, time + 1 / 3, time)) ``` Neither of these feel particularly elegant at expressing the *intent* of this operation. All you're trying to do is replace a few values of `time`! We like to think of `time` as the *primary* input: `time` goes in, and `time` comes out (slightly adjusted). But both `case_when()` and `if_else()` have `time` as their last input, making the intent a bit hard to understand at first glance. `replace_when()` lets you pull the primary input to the front (which also makes it compatible with the pipe!), making the intent more clear: ```{r} racers |> mutate( time = time |> replace_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3 ) ) ``` As a side note, you might have been tempted to reach for `base::replace()` here, i.e. as: ```{r, eval = FALSE} racers |> mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> mutate(time = base::replace(time, id %in% id_false_start, time + 1 / 3)) ``` This actually doesn't work! Replacing with `NA` does work, but `replace()` requires that the result of `time + 1 / 3` must be preemptively subset to the places where the condition is true. You'd have to do something more complicated to mimic `replace_when()`: ```{r} racers |> mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> mutate(time = { loc <- id %in% id_false_start base::replace(time, loc, time[loc] + 1 / 3) }) ``` ### Type stability Beyond readability, an important benefit of `replace_when()` (and `replace_values()`, which we'll see later) is that it is *type stable* on the column you are modifying, which means that it can't change types out from under you. Type stability is particularly useful with factors. Taking another look at our `tiers` of race times, imagine that some of the race times were discovered to be faulty due to malfunctioning timers, and you need to replace a few `id`s with the `unknown` level. ```{r} id_with_malfunction <- c(1, 5, 20, 50) tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) |> factor(levels = c("A", "B", "C", "D", "unknown")) ) tiers ``` Note that the following `case_when()` solution results in `tier` becoming a *character* column, losing its factor class. This is due to the fact that `case_when()` is a *recoding* function, it creates an entirely new column and doesn't know that you're trying to retain existing type information. ```{r} tiers |> mutate( tier = case_when(id %in% id_with_malfunction ~ "unknown", .default = tier) ) ``` As a *replacing* function, `replace_when()` knows to be type stable on `tier`, and casts `"unknown"` to `tier`'s factor type before performing the replacement: ```{r} tiers |> mutate( tier = tier |> replace_when(id %in% id_with_malfunction ~ "unknown") ) ``` ## `recode_values()` `case_when()` and `replace_when()` both take *logical* vectors on the left-hand side of the formula. This is very flexible, but sometimes these functions require a large amount of repetition. Consider the following [Likert scale](https://en.wikipedia.org/wiki/Likert_scale) scores. We'd like to recode these from their numeric values to their character counterparts. ```{r} likert <- tibble( score = c(1, 2, 3, 4, 5, 2, 3, 1, 4) ) ``` We could certainly use a `case_when()`: ```{r} likert |> mutate( score = case_when( score == 1 ~ "Strongly disagree", score == 2 ~ "Disagree", score == 3 ~ "Neutral", score == 4 ~ "Agree", score == 5 ~ "Strongly agree" ) ) ``` But `score ==` is repeated many times! If you find yourself using `==` or `%in%` on the left-hand side in this manner, you likely want to use `recode_values()` instead. Rather than taking logical vectors, `recode_values()` takes *values* on the left-hand side to match against a single input that you'll provide as the first argument. ```{r} likert |> mutate( score = score |> recode_values( 1 ~ "Strongly disagree", 2 ~ "Disagree", 3 ~ "Neutral", 4 ~ "Agree", 5 ~ "Strongly agree" ) ) ``` This removes all of the repetition, allowing you to focus on the mapping. If you squint, the mapping should look roughly like a lookup table between the numeric value and the likert encoding. `recode_values()` actually has a second interface that allows us to make this lookup table representation even more explicit. Using a `tribble()`, we can extract out the lookup table into its own standalone data frame. ```{r} lookup <- tribble( ~from , ~to , 1 , "Strongly disagree" , 2 , "Disagree" , 3 , "Neutral" , 4 , "Agree" , 5 , "Strongly agree" ) ``` We can then utilize the alternative `from` and `to` arguments of `recode_values()` rather than supplying formulas to specify how the values should be recoded: ```{r} likert |> mutate(score = recode_values(score, from = lookup$from, to = lookup$to)) ``` Lifting the lookup table to the top of the file is particularly nice when you have a long pipe chain. The details of the mapping get some room to breathe, and in the pipe chain you can focus on the actual manipulations you are trying to perform. It's also very common for your `lookup` table to exist in a CSV file that you have to read in separately. In that case, you can replace the `tribble()` call with: ```{r, eval = FALSE} lookup <- readr::read_csv("lookup.csv") ``` But everything else works the same. This would be quite hard to specify with just the formula interface! Like `case_when()`, `recode_values()` also has `default` and `unmatched` arguments to handle unmatched locations: ```{r, error = TRUE} likert <- tibble( score = c(0, 1, 2, 2, 4, 5, 2, 3, 1, 4) ) # Missed the `0` likert |> mutate( score = score |> recode_values( from = lookup$from, to = lookup$to, unmatched = "error" ) ) ``` ## `replace_values()` As seen above, when replacing a few locations in a column using *logical conditions*, we reached for `replace_when()` rather than `case_when()`. Similarly, when replacing a few locations using *values* to match against, it's best to use `replace_values()` over `recode_values()`. Imagine we'd like to collapse some, but not all, of these school names into common buckets: ```{r} schools <- tibble( name = c( "UNC", "Chapel Hill", NA, "Duke", "Duke University", "UNC", "NC State", "ECU" ) ) ``` We could use `recode_values()`: ```{r} schools |> mutate( name = recode_values( name, c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke", default = name ) ) ``` But this "partial update by value" is so common that it really deserves its own name that doesn't require you to specify `default`. For that, we have `replace_values()`: ```{r} schools |> mutate( name = name |> replace_values( c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke" ) ) ``` Like `recode_values()`, `replace_values()` has an alternative `from` and `to` API that works well with lookup tables and allows you to move your mapping out of the pipe chain: ```{r} lookup <- tribble( ~from , ~to , "UNC" , "UNC Chapel Hill" , "Chapel Hill" , "UNC Chapel Hill" , "Duke" , "Duke" , "Duke University" , "Duke" ) schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ``` An extremely neat feature of the `from` and `to` API is that they also take *lists* of vectors that describe the mapping, which has been designed to work elegantly with the fact that `tribble()` can create list columns, allowing you to further collapse this lookup table: ```{r} # Condensed lookup table with a `many:1` mapping per row lookup <- tribble( ~from , ~to , c("UNC", "Chapel Hill") , "UNC Chapel Hill" , c("Duke", "Duke University") , "Duke" ) # Note that `from` is a list column lookup lookup$from # Works the same as before schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ``` ## Comparisons We'll end this vignette with some comparisons of the recoding and replacing family to other dplyr functions and to other technologies, like SQL. ### `if_else()` `if_else()` is a type of recoding function, as it creates an entirely new column. In fact, it's closely tied to `case_when()`: ```{r, eval = FALSE} if_else(condition, true, false, missing) case_when( condition ~ true, !condition ~ false, is.na(condition) ~ missing ) ``` Similar to `case_when()`, `if_else()` doesn't offer type stability on any particular input. The output's type is computed as the common type of `true`, `false`, and `missing`. If you find yourself writing an `if_else()` where the purpose is to partially update an existing column, consider using `replace_when()` instead for clarity and type stability: ```{r, eval = FALSE} x <- if_else(x > 5, new, x) # Type stable on `x`. # Intent of "partially updating" `x` is clear. # Pipe friendly. x <- x |> replace_when(x > 5 ~ new) ``` ### `coalesce()` For converting from `NA` to some other value, the most common cases of `coalesce()` are often a `replace_values()` call in disguise: ```{r} x <- c(1, 2, NA, 3, NA, 5) y <- c(0, 3, 1, 4, 6, 7) coalesce(x, 0) replace_values(x, NA ~ 0) coalesce(x, y) replace_values(x, NA ~ y) ``` And with `replace_values()` you can replace any value, not just `NA`. ### `na_if()` For converting from a problematic value to `NA`, `replace_values()` is a more flexible (and likely more intuitive) alternative to `na_if()`: ```{r} x <- c(1, 2, 0, -99, 12) # To convert `0` and `-99` to `NA`, you have to do it in two calls x |> na_if(0) |> na_if(-99) x |> replace_values(from = c(0, -99), to = NA) ``` ### SQL `case_when()` is an R equivalent of SQL's [Searched `CASE`](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/case-transact-sql?view=sql-server-ver17#syntax) statement: ``` r case_when( x < 100 ~ this, x < 20 ~ that, .default = default ) ``` ``` sql CASE WHEN x < 100 THEN this WHEN x < 20 THEN that ELSE default END ``` And dbplyr will translate a `case_when()` to this form! `recode_values()` is an R equivalent of SQL's [Simple `CASE`](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/case-transact-sql?view=sql-server-ver17#syntax) statement: ``` r recode_values( x, "E" ~ "East", "W" ~ "West", "N" ~ "North", "S" ~ "South", .default = "Unknown" ) ``` ``` sql CASE x WHEN 'E' THEN 'East' WHEN 'W' THEN 'West' WHEN 'N' THEN 'North' WHEN 'S' THEN 'South' ELSE 'Unknown' END ``` As of dbplyr 2.5.1, we don't currently have a translation for `recode_values()` since it is so new, but we expect to have one soon. dplyr/inst/doc/base.R0000644000176200001440000001377615137234446014212 0ustar liggesusers## ----echo = FALSE, message = FALSE-------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4, tibble.print_max = 4) ## ----setup, message = FALSE--------------------------------------------------- library(dplyr) mtcars <- as_tibble(mtcars) iris <- as_tibble(iris) ## ----------------------------------------------------------------------------- mtcars |> arrange(cyl, disp) ## ----------------------------------------------------------------------------- mtcars |> arrange(desc(cyl), desc(disp)) ## ----------------------------------------------------------------------------- mtcars[order(mtcars$cyl, mtcars$disp), , drop = FALSE] ## ----results = FALSE---------------------------------------------------------- mtcars[order(mtcars$cyl, mtcars$disp, decreasing = TRUE), , drop = FALSE] mtcars[order(-mtcars$cyl, -mtcars$disp), , drop = FALSE] ## ----------------------------------------------------------------------------- df <- tibble( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) df |> distinct(x) # selected columns df |> distinct(x, .keep_all = TRUE) # whole data frame ## ----------------------------------------------------------------------------- unique(df["x"]) # selected columns df[!duplicated(df$x), , drop = FALSE] # whole data frame ## ----------------------------------------------------------------------------- starwars |> filter(species == "Human") starwars |> filter(mass > 1000) starwars |> filter(hair_color == "none" & eye_color == "black") ## ----------------------------------------------------------------------------- subset(starwars, species == "Human") subset(starwars, mass > 1000) subset(starwars, hair_color == "none" & eye_color == "black") ## ----------------------------------------------------------------------------- starwars[which(starwars$species == "Human"), , drop = FALSE] starwars[which(starwars$mass > 1000), , drop = FALSE] starwars[which(starwars$hair_color == "none" & starwars$eye_color == "black"), , drop = FALSE] ## ----------------------------------------------------------------------------- df |> mutate(z = x + y, z2 = z ^ 2) ## ----------------------------------------------------------------------------- head(transform(df, z = x + y, z2 = (x + y) ^ 2)) ## ----------------------------------------------------------------------------- mtcars$cyl2 <- mtcars$cyl * 2 mtcars$cyl4 <- mtcars$cyl2 * 2 ## ----------------------------------------------------------------------------- gf <- tibble(g = c(1, 1, 2, 2), x = c(0.5, 1.5, 2.5, 3.5)) gf |> group_by(g) |> mutate(x_mean = mean(x), x_rank = rank(x)) ## ----------------------------------------------------------------------------- transform(gf, x_mean = ave(x, g, FUN = mean), x_rank = ave(x, g, FUN = rank) ) ## ----------------------------------------------------------------------------- mtcars |> pull(1) mtcars |> pull(cyl) ## ----------------------------------------------------------------------------- mtcars[[1]] mtcars$cyl ## ----------------------------------------------------------------------------- # to front mtcars |> relocate(gear, carb) # to back mtcars |> relocate(mpg, cyl, .after = last_col()) ## ----------------------------------------------------------------------------- mtcars[union(c("gear", "carb"), names(mtcars))] to_back <- c("mpg", "cyl") mtcars[c(setdiff(names(mtcars), to_back), to_back)] ## ----------------------------------------------------------------------------- iris |> rename(sepal_length = Sepal.Length, sepal_width = 2) ## ----------------------------------------------------------------------------- iris2 <- iris names(iris2)[2] <- "sepal_width" ## ----------------------------------------------------------------------------- names(iris2)[names(iris2) == "Sepal.Length"] <- "sepal_length" ## ----------------------------------------------------------------------------- iris |> rename_with(toupper) ## ----------------------------------------------------------------------------- setNames(iris, toupper(names(iris))) ## ----------------------------------------------------------------------------- iris |> select(1:3) iris |> select(Species, Sepal.Length) iris |> select(starts_with("Petal")) iris |> select(where(is.factor)) ## ----------------------------------------------------------------------------- iris[1:3] # single argument selects columns; never drops iris[1:3, , drop = FALSE] ## ----------------------------------------------------------------------------- iris[c("Species", "Sepal.Length")] subset(iris, select = c(Species, Sepal.Length)) ## ----------------------------------------------------------------------------- iris[grep("^Petal", names(iris))] ## ----------------------------------------------------------------------------- Filter(is.factor, iris) ## ----------------------------------------------------------------------------- mtcars |> group_by(cyl) |> summarise(mean = mean(disp), n = n()) ## ----------------------------------------------------------------------------- mtcars_by <- by(mtcars, mtcars$cyl, function(df) { with(df, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(df))) }) do.call(rbind, mtcars_by) ## ----------------------------------------------------------------------------- agg <- aggregate(disp ~ cyl, mtcars, function(x) c(mean = mean(x), n = length(x))) agg ## ----------------------------------------------------------------------------- str(agg) ## ----------------------------------------------------------------------------- slice(mtcars, 25:n()) ## ----------------------------------------------------------------------------- mtcars[25:nrow(mtcars), , drop = FALSE] ## ----------------------------------------------------------------------------- band_members |> semi_join(band_instruments) band_members |> anti_join(band_instruments) ## ----------------------------------------------------------------------------- band_members[band_members$name %in% band_instruments$name, , drop = FALSE] band_members[!band_members$name %in% band_instruments$name, , drop = FALSE] dplyr/inst/doc/dplyr.html0000644000176200001440000024211715137234453015164 0ustar liggesusers Introduction to dplyr

Introduction to dplyr

When working with data you must:

  • Figure out what you want to do.

  • Describe those tasks in the form of a computer program.

  • Execute the program.

The dplyr package makes these steps fast and easy:

  • By constraining your options, it helps you think about your data manipulation challenges.

  • It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code.

  • It uses efficient backends, so you spend less time waiting for the computer.

This document introduces you to dplyr’s basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package, once you’ve installed, read vignette("dbplyr") to learn more.

Data: starwars

To explore the basic data manipulation verbs of dplyr, we’ll use the dataset starwars. This dataset contains 87 characters and comes from the Star Wars API, and is documented in ?starwars

dim(starwars)
#> [1] 87 14
starwars
#> # A tibble: 87 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Note that starwars is a tibble, a modern reimagining of the data frame. It’s particularly useful for large datasets because it only prints the first few rows. You can learn more about tibbles at https://tibble.tidyverse.org; in particular you can convert data frames to tibbles with as_tibble().

Single table verbs

dplyr aims to provide a function for each basic verb of data manipulation. These verbs can be organised into three categories based on the component of the dataset that they work with:

  • Rows:
    • filter() chooses rows based on column values.
    • slice() chooses rows based on location.
    • arrange() changes the order of the rows.
  • Columns:
    • select() changes whether or not a column is included.
    • rename() changes the name of columns.
    • mutate() changes the values of columns and creates new columns.
    • relocate() changes the order of the columns.
  • Groups of rows:
    • summarise() collapses a group into a single row.

The pipe

All of the dplyr functions take a data frame (or tibble) as the first argument. Rather than forcing the user to either save intermediate objects or nest functions, dplyr provides the |> operator from magrittr. x |> f(y) turns into f(x, y) so the result from one step is then “piped” into the next step. You can use the pipe to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as “then”).

Filter rows with filter()

filter() allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is TRUE.

For example, we can select all character with light skin color and brown eyes with:

starwars |> filter(skin_color == "light", eye_color == "brown")
#> # A tibble: 7 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Leia Org…    150    49 brown      light      brown             19 fema… femin…
#> 2 Biggs Da…    183    84 black      light      brown             24 male  mascu…
#> 3 Padmé Am…    185    45 brown      light      brown             46 fema… femin…
#> 4 Cordé        157    NA brown      light      brown             NA <NA>  <NA>  
#> # ℹ 3 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

This is roughly equivalent to this base R code:

starwars[starwars$skin_color == "light" & starwars$eye_color == "brown", ]

Arrange rows with arrange()

arrange() works similarly to filter() except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:

starwars |> arrange(height, mass)
#> # A tibble: 87 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Yoda          66    17 white      green      brown            896 male  mascu…
#> 2 Ratts Ty…     79    15 none       grey, blue unknown           NA male  mascu…
#> 3 Wicket S…     88    20 brown      brown      brown              8 male  mascu…
#> 4 Dud Bolt      94    45 none       blue, grey yellow            NA male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Use desc() to order a column in descending order:

starwars |> arrange(desc(height))
#> # A tibble: 87 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Yarael P…    264    NA none       white      yellow            NA male  mascu…
#> 2 Tarfful      234   136 brown      brown      blue              NA male  mascu…
#> 3 Lama Su      229    88 none       grey       black             NA male  mascu…
#> 4 Chewbacca    228   112 brown      unknown    blue             200 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Choose rows using their position with slice()

slice() lets you index rows by their (integer) locations. It allows you to select, remove, and duplicate rows.

We can get characters from row numbers 5 through 10.

starwars |> slice(5:10)
#> # A tibble: 6 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Leia Org…    150    49 brown      light      brown             19 fema… femin…
#> 2 Owen Lars    178   120 brown, gr… light      blue              52 male  mascu…
#> 3 Beru Whi…    165    75 brown      light      blue              47 fema… femin…
#> 4 R5-D4         97    32 <NA>       white, red red               NA none  mascu…
#> # ℹ 2 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

It is accompanied by a number of helpers for common use cases:

  • slice_head() and slice_tail() select the first or last rows.
starwars |> slice_head(n = 3)
#> # A tibble: 3 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue              19 male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow           112 none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red               33 none  mascu…
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
  • slice_sample() randomly selects rows. Use the option prop to choose a certain proportion of the cases.
starwars |> slice_sample(n = 5)
#> # A tibble: 5 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Ayla Sec…    178  55   none       blue       hazel             48 fema… femin…
#> 2 Bossk        190 113   none       green      red               53 male  mascu…
#> 3 San Hill     191  NA   none       grey       gold              NA male  mascu…
#> 4 Luminara…    170  56.2 black      yellow     blue              58 fema… femin…
#> # ℹ 1 more row
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
starwars |> slice_sample(prop = 0.1)
#> # A tibble: 8 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Qui-Gon …    193    89 brown      fair       blue              92 male  mascu…
#> 2 Jango Fe…    183    79 black      tan        brown             66 male  mascu…
#> 3 Jocasta …    167    NA white      fair       blue              NA fema… femin…
#> 4 Zam Wese…    168    55 blonde     fair, gre… yellow            NA fema… femin…
#> # ℹ 4 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Use replace = TRUE to perform a bootstrap sample. If needed, you can weight the sample with the weight argument.

  • slice_min() and slice_max() select the rows with the smallest or largest values of the selected column. By default, they return a single minimum or maximum, but you can supply n to control how many rows remain.
starwars |> slice_max(height, n = 3)
#> # A tibble: 3 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Yarael P…    264    NA none       white      yellow            NA male  mascu…
#> 2 Tarfful      234   136 brown      brown      blue              NA male  mascu…
#> 3 Lama Su      229    88 none       grey       black             NA male  mascu…
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Select columns with select()

Often you work with large datasets with many columns but only a few are actually of interest to you. select() allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:

# Select columns by name
starwars |> select(hair_color, skin_color, eye_color)
#> # A tibble: 87 × 3
#>   hair_color skin_color  eye_color
#>   <chr>      <chr>       <chr>    
#> 1 blond      fair        blue     
#> 2 <NA>       gold        yellow   
#> 3 <NA>       white, blue red      
#> 4 none       white       yellow   
#> # ℹ 83 more rows
# Select all columns between hair_color and eye_color (inclusive)
starwars |> select(hair_color:eye_color)
#> # A tibble: 87 × 3
#>   hair_color skin_color  eye_color
#>   <chr>      <chr>       <chr>    
#> 1 blond      fair        blue     
#> 2 <NA>       gold        yellow   
#> 3 <NA>       white, blue red      
#> 4 none       white       yellow   
#> # ℹ 83 more rows
# Select all columns except those from hair_color to eye_color (inclusive)
starwars |> select(!(hair_color:eye_color))
#> # A tibble: 87 × 11
#>   name     height  mass birth_year sex   gender homeworld species films vehicles
#>   <chr>     <int> <dbl>      <dbl> <chr> <chr>  <chr>     <chr>   <lis> <list>  
#> 1 Luke Sk…    172    77       19   male  mascu… Tatooine  Human   <chr> <chr>   
#> 2 C-3PO       167    75      112   none  mascu… Tatooine  Droid   <chr> <chr>   
#> 3 R2-D2        96    32       33   none  mascu… Naboo     Droid   <chr> <chr>   
#> 4 Darth V…    202   136       41.9 male  mascu… Tatooine  Human   <chr> <chr>   
#> # ℹ 83 more rows
#> # ℹ 1 more variable: starships <list>
# Select all columns ending with color
starwars |> select(ends_with("color"))
#> # A tibble: 87 × 3
#>   hair_color skin_color  eye_color
#>   <chr>      <chr>       <chr>    
#> 1 blond      fair        blue     
#> 2 <NA>       gold        yellow   
#> 3 <NA>       white, blue red      
#> 4 none       white       yellow   
#> # ℹ 83 more rows

There are a number of helper functions you can use within select(), like starts_with(), ends_with(), matches() and contains(). These let you quickly match larger blocks of variables that meet some criterion. See ?select for more details.

You can rename variables with select() by using named arguments:

starwars |> select(home_world = homeworld)
#> # A tibble: 87 × 1
#>   home_world
#>   <chr>     
#> 1 Tatooine  
#> 2 Tatooine  
#> 3 Naboo     
#> 4 Tatooine  
#> # ℹ 83 more rows

But because select() drops all the variables not explicitly mentioned, it’s not that useful. Instead, use rename():

starwars |> rename(home_world = homeworld)
#> # A tibble: 87 × 14
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: home_world <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Add new columns with mutate()

Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. This is the job of mutate():

starwars |> mutate(height_m = height / 100)
#> # A tibble: 87 × 15
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 6 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>, height_m <dbl>

We can’t see the height in meters we just calculated, but we can fix that using a select command.

starwars |>
  mutate(height_m = height / 100) |>
  select(height_m, height, everything())
#> # A tibble: 87 × 15
#>   height_m height name     mass hair_color skin_color eye_color birth_year sex  
#>      <dbl>  <int> <chr>   <dbl> <chr>      <chr>      <chr>          <dbl> <chr>
#> 1     1.72    172 Luke S…    77 blond      fair       blue            19   male 
#> 2     1.67    167 C-3PO      75 <NA>       gold       yellow         112   none 
#> 3     0.96     96 R2-D2      32 <NA>       white, bl… red             33   none 
#> 4     2.02    202 Darth …   136 none       white      yellow          41.9 male 
#> # ℹ 83 more rows
#> # ℹ 6 more variables: gender <chr>, homeworld <chr>, species <chr>,
#> #   films <list>, vehicles <list>, starships <list>

dplyr::mutate() is similar to the base transform(), but allows you to refer to columns that you’ve just created:

starwars |>
  mutate(
    height_m = height / 100,
    BMI = mass / (height_m^2)
  ) |>
  select(BMI, everything())
#> # A tibble: 87 × 16
#>     BMI name       height  mass hair_color skin_color eye_color birth_year sex  
#>   <dbl> <chr>       <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>
#> 1  26.0 Luke Skyw…    172    77 blond      fair       blue            19   male 
#> 2  26.9 C-3PO         167    75 <NA>       gold       yellow         112   none 
#> 3  34.7 R2-D2          96    32 <NA>       white, bl… red             33   none 
#> 4  33.3 Darth Vad…    202   136 none       white      yellow          41.9 male 
#> # ℹ 83 more rows
#> # ℹ 7 more variables: gender <chr>, homeworld <chr>, species <chr>,
#> #   films <list>, vehicles <list>, starships <list>, height_m <dbl>

If you only want to keep the new variables, use .keep = "none":

starwars |>
  mutate(
    height_m = height / 100,
    BMI = mass / (height_m^2),
    .keep = "none"
  )
#> # A tibble: 87 × 2
#>   height_m   BMI
#>      <dbl> <dbl>
#> 1     1.72  26.0
#> 2     1.67  26.9
#> 3     0.96  34.7
#> 4     2.02  33.3
#> # ℹ 83 more rows

Change column order with relocate()

Use a similar syntax as select() to move blocks of columns at once

starwars |> relocate(sex:homeworld, .before = height)
#> # A tibble: 87 × 14
#>   name       sex   gender homeworld height  mass hair_color skin_color eye_color
#>   <chr>      <chr> <chr>  <chr>      <int> <dbl> <chr>      <chr>      <chr>    
#> 1 Luke Skyw… male  mascu… Tatooine     172    77 blond      fair       blue     
#> 2 C-3PO      none  mascu… Tatooine     167    75 <NA>       gold       yellow   
#> 3 R2-D2      none  mascu… Naboo         96    32 <NA>       white, bl… red      
#> 4 Darth Vad… male  mascu… Tatooine     202   136 none       white      yellow   
#> # ℹ 83 more rows
#> # ℹ 5 more variables: birth_year <dbl>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Summarise values with summarise()

The last verb is summarise(). It collapses a data frame to a single row.

starwars |> summarise(height = mean(height, na.rm = TRUE))
#> # A tibble: 1 × 1
#>   height
#>    <dbl>
#> 1   175.

It’s not that useful until we learn the group_by() verb below.

Commonalities

You may have noticed that the syntax and function of all these verbs are very similar:

  • The first argument is a data frame.

  • The subsequent arguments describe what to do with the data frame. You can refer to columns in the data frame directly without using $.

  • The result is a new data frame

Together these properties make it easy to chain together multiple simple steps to achieve a complex result.

These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (arrange()), pick observations and variables of interest (filter() and select()), add new variables that are functions of existing variables (mutate()), or collapse many values to a summary (summarise()).

Combining functions with |>

The dplyr API is functional in the sense that function calls don’t have side-effects. You must always save their results. This doesn’t lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:

a1 <- group_by(starwars, species, sex)
a2 <- select(a1, height, mass)
a3 <- summarise(a2,
  height = mean(height, na.rm = TRUE),
  mass = mean(mass, na.rm = TRUE)
)

Or if you don’t want to name the intermediate results, you need to wrap the function calls inside each other:

summarise(
  select(
    group_by(starwars, species, sex),
    height, mass
  ),
  height = mean(height, na.rm = TRUE),
  mass = mean(mass, na.rm = TRUE)
)
#> Adding missing grouping variables: `species`, `sex`
#> `summarise()` has regrouped the output.
#> # A tibble: 41 × 4
#> # Groups:   species [38]
#>   species  sex   height  mass
#>   <chr>    <chr>  <dbl> <dbl>
#> 1 Aleena   male      79    15
#> 2 Besalisk male     198   102
#> 3 Cerean   male     198    82
#> 4 Chagrian male     196   NaN
#> # ℹ 37 more rows

This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the |> operator from magrittr. x |> f(y) turns into f(x, y) so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom (reading the pipe operator as “then”):

starwars |>
  group_by(species, sex) |>
  select(height, mass) |>
  summarise(
    height = mean(height, na.rm = TRUE),
    mass = mean(mass, na.rm = TRUE)
  )

Patterns of operations

The dplyr verbs can be classified by the type of operations they accomplish (we sometimes speak of their semantics, i.e., their meaning). It’s helpful to have a good grasp of the difference between select and mutate operations.

Selecting operations

One of the appealing features of dplyr is that you can refer to columns from the tibble as if they were regular variables. However, the syntactic uniformity of referring to bare column names hides semantical differences across the verbs. A column symbol supplied to select() does not have the same meaning as the same symbol supplied to mutate().

Selecting operations expect column names and positions. Hence, when you call select() with bare variable names, they actually represent their own positions in the tibble. The following calls are completely equivalent from dplyr’s point of view:

# `name` represents the integer 1
select(starwars, name)
#> # A tibble: 87 × 1
#>   name          
#>   <chr>         
#> 1 Luke Skywalker
#> 2 C-3PO         
#> 3 R2-D2         
#> 4 Darth Vader   
#> # ℹ 83 more rows
select(starwars, 1)
#> # A tibble: 87 × 1
#>   name          
#>   <chr>         
#> 1 Luke Skywalker
#> 2 C-3PO         
#> 3 R2-D2         
#> 4 Darth Vader   
#> # ℹ 83 more rows

By the same token, this means that you cannot refer to variables from the surrounding context if they have the same name as one of the columns. In the following example, height still represents 2, not 5:

height <- 5
select(starwars, height)
#> # A tibble: 87 × 1
#>   height
#>    <int>
#> 1    172
#> 2    167
#> 3     96
#> 4    202
#> # ℹ 83 more rows

One useful subtlety is that this only applies to bare names and to selecting calls like c(height, mass) or height:mass. In all other cases, the columns of the data frame are not put in scope. This allows you to refer to contextual variables in selection helpers:

name <- "color"
select(starwars, ends_with(name))
#> # A tibble: 87 × 3
#>   hair_color skin_color  eye_color
#>   <chr>      <chr>       <chr>    
#> 1 blond      fair        blue     
#> 2 <NA>       gold        yellow   
#> 3 <NA>       white, blue red      
#> 4 none       white       yellow   
#> # ℹ 83 more rows

These semantics are usually intuitive. But note the subtle difference:

name <- 5
select(starwars, name, identity(name))
#> # A tibble: 87 × 2
#>   name           skin_color 
#>   <chr>          <chr>      
#> 1 Luke Skywalker fair       
#> 2 C-3PO          gold       
#> 3 R2-D2          white, blue
#> 4 Darth Vader    white      
#> # ℹ 83 more rows

In the first argument, name represents its own position 1. In the second argument, name is evaluated in the surrounding context and represents the fifth column.

For a long time, select() used to only understand column positions. Counting from dplyr 0.6, it now understands column names as well. This makes it a bit easier to program with select():

vars <- c("name", "height")
select(starwars, all_of(vars), "mass")
#> # A tibble: 87 × 3
#>   name           height  mass
#>   <chr>           <int> <dbl>
#> 1 Luke Skywalker    172    77
#> 2 C-3PO             167    75
#> 3 R2-D2              96    32
#> 4 Darth Vader       202   136
#> # ℹ 83 more rows

Mutating operations

Mutate semantics are quite different from selection semantics. Whereas select() expects column names or positions, mutate() expects column vectors. We will set up a smaller tibble to use for our examples.

df <- starwars |> select(name, height, mass)

When we use select(), the bare column names stand for their own positions in the tibble. For mutate() on the other hand, column symbols represent the actual column vectors stored in the tibble. Consider what happens if we give a string or a number to mutate():

mutate(df, "height", 2)
#> # A tibble: 87 × 5
#>   name           height  mass `"height"`   `2`
#>   <chr>           <int> <dbl> <chr>      <dbl>
#> 1 Luke Skywalker    172    77 height         2
#> 2 C-3PO             167    75 height         2
#> 3 R2-D2              96    32 height         2
#> 4 Darth Vader       202   136 height         2
#> # ℹ 83 more rows

mutate() gets length-1 vectors that it interprets as new columns in the data frame. These vectors are recycled so they match the number of rows. That’s why it doesn’t make sense to supply expressions like "height" + 10 to mutate(). This amounts to adding 10 to a string! The correct expression is:

mutate(df, height + 10)
#> # A tibble: 87 × 4
#>   name           height  mass `height + 10`
#>   <chr>           <int> <dbl>         <dbl>
#> 1 Luke Skywalker    172    77           182
#> 2 C-3PO             167    75           177
#> 3 R2-D2              96    32           106
#> 4 Darth Vader       202   136           212
#> # ℹ 83 more rows

In the same way, you can unquote values from the context if these values represent a valid column. They must be either length 1 (they then get recycled) or have the same length as the number of rows. In the following example we create a new vector that we add to the data frame:

var <- seq(1, nrow(df))
mutate(df, new = var)
#> # A tibble: 87 × 4
#>   name           height  mass   new
#>   <chr>           <int> <dbl> <int>
#> 1 Luke Skywalker    172    77     1
#> 2 C-3PO             167    75     2
#> 3 R2-D2              96    32     3
#> 4 Darth Vader       202   136     4
#> # ℹ 83 more rows

A case in point is group_by(). While you might think it has select semantics, it actually has mutate semantics. This is quite handy as it allows to group by a modified column:

group_by(starwars, sex)
#> # A tibble: 87 × 14
#> # Groups:   sex [5]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
group_by(starwars, sex = as.factor(sex))
#> # A tibble: 87 × 14
#> # Groups:   sex [5]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <fct> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
group_by(starwars, height_binned = cut(height, 3))
#> # A tibble: 87 × 15
#> # Groups:   height_binned [4]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 6 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>, height_binned <fct>

This is why you can’t supply a column name to group_by(). This amounts to creating a new column containing the string recycled to the number of rows:

group_by(df, "month")
#> # A tibble: 87 × 4
#> # Groups:   "month" [1]
#>   name           height  mass `"month"`
#>   <chr>           <int> <dbl> <chr>    
#> 1 Luke Skywalker    172    77 month    
#> 2 C-3PO             167    75 month    
#> 3 R2-D2              96    32 month    
#> 4 Darth Vader       202   136 month    
#> # ℹ 83 more rows
dplyr/inst/doc/rowwise.Rmd0000644000176200001440000003307415137161765015314 0ustar liggesusers--- title: "Row-wise operations" description: > In R, it's usually easier to do something for each column than for each row. In this vignette you will learn how to use the `rowwise()` function to perform operations by row. Along the way, you'll learn about list-columns, and see how you might perform simulations and modelling within dplyr verbs. output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Row-wise operations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ``` dplyr, and R in general, are particularly well suited to performing operations over columns, and performing operations over rows is much harder. In this vignette, you'll learn dplyr's approach centred around the row-wise data frame created by `rowwise()`. There are three common use cases that we discuss in this vignette: * Row-wise aggregates (e.g. compute the mean of x, y, z). * Calling a function multiple times with varying arguments. * Working with list-columns. These types of problems are often easily solved with a for loop, but it's nice to have a solution that fits naturally into a pipeline. > Of course, someone has to write loops. It doesn't have to be you. > --- Jenny Bryan ```{r setup} library(dplyr, warn.conflicts = FALSE) ``` ```{r include = FALSE} nest_by <- function(df, ...) { df |> group_by(...) |> summarise(data = list(pick(everything()))) |> rowwise(...) } # mtcars |> nest_by(cyl) ``` ## Creating Row-wise operations require a special type of grouping where each group consists of a single row. You create this with `rowwise()`: ```{r} df <- tibble(x = 1:2, y = 3:4, z = 5:6) df |> rowwise() ``` Like `group_by()`, `rowwise()` doesn't really do anything itself; it just changes how the other verbs work. For example, compare the results of `mutate()` in the following code: ```{r} df |> mutate(m = mean(c(x, y, z))) df |> rowwise() |> mutate(m = mean(c(x, y, z))) ``` If you use `mutate()` with a regular data frame, it computes the mean of `x`, `y`, and `z` across all rows. If you apply it to a row-wise data frame, it computes the mean for each row. You can optionally supply "identifier" variables in your call to `rowwise()`. These variables are preserved when you call `summarise()`, so they behave somewhat similarly to the grouping variables passed to `group_by()`: ```{r} df <- tibble(name = c("Mara", "Hadley"), x = 1:2, y = 3:4, z = 5:6) df |> rowwise() |> summarise(m = mean(c(x, y, z))) df |> rowwise(name) |> summarise(m = mean(c(x, y, z))) ``` `rowwise()` is just a special form of grouping, so if you want to remove it from a data frame, just call `ungroup()`. ## Per row summary statistics `dplyr::summarise()` makes it really easy to summarise values across rows within one column. When combined with `rowwise()` it also makes it easy to summarise values across columns within one row. To see how, we'll start by making a little dataset: ```{r} df <- tibble(id = 1:6, w = 10:15, x = 20:25, y = 30:35, z = 40:45) df ``` Let's say we want compute the sum of `w`, `x`, `y`, and `z` for each row. We start by making a row-wise data frame: ```{r} rf <- df |> rowwise(id) ``` We can then use `mutate()` to add a new column to each row, or `summarise()` to return just that one summary: ```{r} rf |> mutate(total = sum(c(w, x, y, z))) rf |> summarise(total = sum(c(w, x, y, z))) ``` Of course, if you have a lot of variables, it's going to be tedious to type in every variable name. Instead, you can use `c_across()` which uses tidy selection syntax so you can to succinctly select many variables: ```{r} rf |> mutate(total = sum(c_across(w:z))) rf |> mutate(total = sum(c_across(where(is.numeric)))) ``` You could combine this with column-wise operations (see `vignette("colwise")` for more details) to compute the proportion of the total for each column: ```{r} rf |> mutate(total = sum(c_across(w:z))) |> ungroup() |> mutate(across(w:z, ~ . / total)) ``` ### Row-wise summary functions The `rowwise()` approach will work for any summary function. But if you need greater speed, it's worth looking for a built-in row-wise variant of your summary function. These are more efficient because they operate on the data frame as whole; they don't split it into rows, compute the summary, and then join the results back together again. ```{r} df |> mutate(total = rowSums(pick(where(is.numeric), -id))) df |> mutate(mean = rowMeans(pick(where(is.numeric), -id))) ``` **NB**: I use `df` (not `rf`) and `pick()` (not `c_across()`) here because `rowMeans()` and `rowSums()` take a multi-row data frame as input. Also note that `-id` is needed to avoid selecting `id` in `pick()`. This wasn't required with the rowwise data frame because we had specified `id` as an identifier in our original call to `rowwise()`, preventing it from being selected as a grouping column. ## List-columns `rowwise()` operations are a natural pairing when you have list-columns. They allow you to avoid explicit loops and/or functions from the `apply()` or `purrr::map()` families. ### Motivation Imagine you have this data frame, and you want to count the lengths of each element: ```{r} df <- tibble( x = list(1, 2:3, 4:6) ) ``` You might try calling `length()`: ```{r} df |> mutate(l = length(x)) ``` But that returns the length of the column, not the length of the individual values. If you're an R documentation aficionado, you might know there's already a base R function just for this purpose: ```{r} df |> mutate(l = lengths(x)) ``` Or if you're an experienced R programmer, you might know how to apply a function to each element of a list using `sapply()`, `vapply()`, or one of the purrr `map()` functions: ```{r} df |> mutate(l = sapply(x, length)) df |> mutate(l = purrr::map_int(x, length)) ``` But wouldn't it be nice if you could just write `length(x)` and dplyr would figure out that you wanted to compute the length of the element inside of `x`? Since you're here, you might already be guessing at the answer: this is just another application of the row-wise pattern. ```{r} df |> rowwise() |> mutate(l = length(x)) ``` ### Subsetting Before we continue on, I wanted to briefly mention the magic that makes this work. This isn't something you'll generally need to think about (it'll just work), but it's useful to know about when something goes wrong. There's an important difference between a grouped data frame where each group happens to have one row, and a row-wise data frame where every group always has one row. Take these two data frames: ```{r} df <- tibble(g = 1:2, y = list(1:3, "a")) gf <- df |> group_by(g) rf <- df |> rowwise(g) ``` If we compute some properties of `y`, you'll notice the results look different: ```{r} gf |> mutate(type = typeof(y), length = length(y)) rf |> mutate(type = typeof(y), length = length(y)) ``` They key difference is that when `mutate()` slices up the columns to pass to `length(y)` the grouped mutate uses `[` and the row-wise mutate uses `[[`. The following code gives a flavour of the differences if you used a for loop: ```{r} # grouped out1 <- integer(2) for (i in 1:2) { out1[[i]] <- length(df$y[i]) } out1 # rowwise out2 <- integer(2) for (i in 1:2) { out2[[i]] <- length(df$y[[i]]) } out2 ``` Note that this magic only applies when you're referring to existing columns, not when you're creating new rows. This is potentially confusing, but we're fairly confident it's the least worst solution, particularly given the hint in the error message. ```{r, error = TRUE} gf |> mutate(y2 = y) rf |> mutate(y2 = y) rf |> mutate(y2 = list(y)) ``` ### Modelling `rowwise()` data frames allow you to solve a variety of modelling problems in what I think is a particularly elegant way. We'll start by creating a nested data frame: ```{r} by_cyl <- mtcars |> nest_by(cyl) by_cyl ``` This is a little different to the usual `group_by()` output: we have visibly changed the structure of the data. Now we have three rows (one for each group), and we have a list-col, `data`, that stores the data for that group. Also note that the output is `rowwise()`; this is important because it's going to make working with that list of data frames much easier. Once we have one data frame per row, it's straightforward to make one model per row: ```{r} mods <- by_cyl |> mutate(mod = list(lm(mpg ~ wt, data = data))) mods ``` And supplement that with one set of predictions per row: ```{r} mods <- mods |> mutate(pred = list(predict(mod, data))) mods ``` You could then summarise the model in a variety of ways: ```{r} mods |> summarise(rmse = sqrt(mean((pred - data$mpg) ^ 2))) mods |> summarise(rsq = summary(mod)$r.squared) mods |> summarise(broom::glance(mod)) ``` Or easily access the parameters of each model: ```{r} mods |> reframe(broom::tidy(mod)) ``` ## Repeated function calls `rowwise()` doesn't just work with functions that return a length-1 vector (aka summary functions); it can work with any function if the result is a list. This means that `rowwise()` and `mutate()` provide an elegant way to call a function many times with varying arguments, storing the outputs alongside the inputs. ### Simulations I think this is a particularly elegant way to perform simulations, because it lets you store simulated values along with the parameters that generated them. For example, imagine you have the following data frame that describes the properties of 3 samples from the uniform distribution: ```{r} df <- tribble( ~ n, ~ min, ~ max, 1, 0, 1, 2, 10, 100, 3, 100, 1000, ) ``` You can supply these parameters to `runif()` by using `rowwise()` and `mutate()`: ```{r} df |> rowwise() |> mutate(data = list(runif(n, min, max))) ``` Note the use of `list()` here - `runif()` returns multiple values and a `mutate()` expression has to return something of length 1. `list()` means that we'll get a list column where each row is a list containing multiple values. If you forget to use `list()`, dplyr will give you a hint: ```{r, error = TRUE} df |> rowwise() |> mutate(data = runif(n, min, max)) ``` ### Multiple combinations What if you want to call a function for every combination of inputs? You can use `expand.grid()` (or `tidyr::expand_grid()`) to generate the data frame and then repeat the same pattern as above: ```{r} df <- expand.grid(mean = c(-1, 0, 1), sd = c(1, 10, 100)) df |> rowwise() |> mutate(data = list(rnorm(10, mean, sd))) ``` ### Varying functions In more complicated problems, you might also want to vary the function being called. This tends to be a bit more of an awkward fit with this approach because the columns in the input tibble will be less regular. But it's still possible, and it's a natural place to use `do.call()`: ```{r} df <- tribble( ~rng, ~params, "runif", list(n = 10), "rnorm", list(n = 20), "rpois", list(n = 10, lambda = 5), ) |> rowwise() df |> mutate(data = list(do.call(rng, params))) ``` ```{r, include = FALSE, eval = FALSE} df <- rowwise(tribble( ~rng, ~params, "runif", list(min = -1, max = 1), "rnorm", list(), "rpois", list(lambda = 5), )) # Has to happen in separate function to avoid eager unquoting f <- function(rng, params) purrr::exec(rng, n = 10, !!!params) df |> mutate(data = list(f(rng, params))) ``` ## Previously ### `rowwise()` `rowwise()` was also questioning for quite some time, partly because I didn't appreciate how many people needed the native ability to compute summaries across multiple variables for each row. As an alternative, we recommended performing row-wise operations with the purrr `map()` functions. However, this was challenging because you needed to pick a map function based on the number of arguments that were varying and the type of result, which required quite some knowledge of purrr functions. I was also resistant to `rowwise()` because I felt like automatically switching between `[` to `[[` was too magical in the same way that automatically `list()`-ing results made `do()` too magical. I've now persuaded myself that the row-wise magic is good magic partly because most people find the distinction between `[` and `[[` mystifying and `rowwise()` means that you don't need to think about it. Since `rowwise()` clearly is useful it is not longer questioning, and we expect it to be around for the long term. ### `do()` We've questioned the need for `do()` for quite some time, because it never felt very similar to the other dplyr verbs. It had two main modes of operation: * Without argument names: you could call functions that input and output data frames using `.` to refer to the "current" group. For example, the following code gets the first row of each group: ```{r} mtcars |> group_by(cyl) |> do(head(., 1)) ``` This has been superseded by `pick()` plus `reframe()`, a variant of `summarise()` that can create multiple rows and columns per group. ```{r} mtcars |> group_by(cyl) |> reframe(head(pick(everything()), 1)) ``` * With arguments: it worked like `mutate()` but automatically wrapped every element in a list: ```{r} mtcars |> group_by(cyl) |> do(nrows = nrow(.)) ``` I now believe that behaviour is both too magical and not very useful, and it can be replaced by `summarise()` and `pick()`. ```{r} mtcars |> group_by(cyl) |> summarise(nrows = nrow(pick(everything()))) ``` If needed (unlike here), you can wrap the results in a list yourself. The addition of `pick()`/`across()` and the increased scope of `summarise()`/`reframe()` means that `do()` is no longer needed, so it is now superseded. dplyr/inst/doc/in-packages.R0000644000176200001440000000452115137234456015447 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message = FALSE--------------------------------------------------- library(dplyr) ## ----------------------------------------------------------------------------- my_summary_function <- function(data) { data |> select(grp, x, y) |> filter(x > 0) |> group_by(grp) |> summarise(y = mean(y), n = n()) } ## ----------------------------------------------------------------------------- #' @importFrom rlang .data my_summary_function <- function(data) { data |> select("grp", "x", "y") |> filter(.data$x > 0) |> group_by(.data$grp) |> summarise(y = mean(.data$y), n = n()) } ## ----eval=FALSE--------------------------------------------------------------- # if (utils::packageVersion("dplyr") > "0.5.0") { # # code for new version # } else { # # code for old version # } ## ----eval=FALSE--------------------------------------------------------------- # if (utils::packageVersion("dplyr") > "1.0.10") { # dplyr::reframe(df, x = unique(x)) # } else { # dplyr::summarise(df, x = unique(x)) # } ## ----eval=FALSE--------------------------------------------------------------- # if (utils::packageVersion("dplyr") > "1.0.10") { # utils::getFromNamespace("reframe", "dplyr")(df, x = unique(x)) # } else { # dplyr::summarise(df, x = unique(x)) # } ## ----eval=FALSE--------------------------------------------------------------- # #' @rawNamespace # #' if (utils::packageVersion("dplyr") > "0.5.0") { # #' importFrom("dbplyr", "build_sql") # #' } else { # #' importFrom("dplyr", "build_sql") # #' } ## ----eval=FALSE--------------------------------------------------------------- # starwars |> mutate_each(funs(as.character)) # starwars |> mutate_all(funs(as.character)) # starwars |> mutate(across(everything(), as.character)) ## ----eval = FALSE------------------------------------------------------------- # starwars |> mutate_each(funs(as.character), height, mass) # starwars |> mutate_at(vars(height, mass), as.character) # starwars |> mutate(across(c(height, mass), as.character)) ## ----eval=FALSE--------------------------------------------------------------- # starwars |> mutate_if(is.factor, as.character) # starwars |> mutate(across(where(is.factor), as.character)) dplyr/inst/doc/grouping.html0000644000176200001440000016471615137234455015676 0ustar liggesusers Grouped data

Grouped data

dplyr verbs are particularly powerful when you apply them to grouped data frames (grouped_df objects). This vignette shows you:

  • How to group, inspect, and ungroup with group_by() and friends.

  • How individual dplyr verbs changes their behaviour when applied to grouped data frame.

  • How to access data about the “current” group from within a verb.

We’ll start by loading dplyr:

library(dplyr)

group_by()

The most important grouping verb is group_by(): it takes a data frame and one or more variables to group by:

by_species <- starwars |> group_by(species)
by_sex_gender <- starwars |> group_by(sex, gender)

You can see the grouping when you print the data:

by_species
#> # A tibble: 87 × 14
#> # Groups:   species [38]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>
by_sex_gender
#> # A tibble: 87 × 14
#> # Groups:   sex, gender [6]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
#> 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
#> 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
#> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
#> # ℹ 83 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Or use tally() to count the number of rows in each group. The sort argument is useful if you want to see the largest groups up front.

by_species |> tally()
#> # A tibble: 38 × 2
#>   species      n
#>   <chr>    <int>
#> 1 Aleena       1
#> 2 Besalisk     1
#> 3 Cerean       1
#> 4 Chagrian     1
#> # ℹ 34 more rows

by_sex_gender |> tally(sort = TRUE)
#> # A tibble: 6 × 3
#> # Groups:   sex [5]
#>   sex    gender        n
#>   <chr>  <chr>     <int>
#> 1 male   masculine    60
#> 2 female feminine     16
#> 3 none   masculine     5
#> 4 <NA>   <NA>          4
#> # ℹ 2 more rows

As well as grouping by existing variables, you can group by any function of existing variables. This is equivalent to performing a mutate() before the group_by():

bmi_breaks <- c(0, 18.5, 25, 30, Inf)

starwars |>
  group_by(bmi_cat = cut(mass/(height/100)^2, breaks=bmi_breaks)) |>
  tally()
#> # A tibble: 5 × 2
#>   bmi_cat       n
#>   <fct>     <int>
#> 1 (0,18.5]     10
#> 2 (18.5,25]    24
#> 3 (25,30]      13
#> 4 (30,Inf]     12
#> # ℹ 1 more row

Group metadata

You can see underlying group data with group_keys(). It has one row for each group and one column for each grouping variable:

by_species |> group_keys()
#> # A tibble: 38 × 1
#>   species 
#>   <chr>   
#> 1 Aleena  
#> 2 Besalisk
#> 3 Cerean  
#> 4 Chagrian
#> # ℹ 34 more rows

by_sex_gender |> group_keys()
#> # A tibble: 6 × 2
#>   sex            gender   
#>   <chr>          <chr>    
#> 1 female         feminine 
#> 2 hermaphroditic masculine
#> 3 male           masculine
#> 4 none           feminine 
#> # ℹ 2 more rows

You can see which group each row belongs to with group_indices():

by_species |> group_indices()
#>  [1] 11  6  6 11 11 11 11  6 11 11 11 11 34 11 24 12 11 38 36 11 11  6 31 11 11
#> [26] 18 11 11  8 26 11 21 11 11 10 10 10 11 30  7 11 11 37 32 32  1 33 35 29 11
#> [51]  3 20 37 27 13 23 16  4 38 38 11  9 17 17 11 11 11 11  5  2 15 15 11  6 25
#> [76] 19 28 14 34 11 38 22 11 11 11  6 11

And which rows each group contains with group_rows():

by_species |> group_rows() |> head()
#> <list_of<integer>[6]>
#> [[1]]
#> [1] 46
#> 
#> [[2]]
#> [1] 70
#> 
#> [[3]]
#> [1] 51
#> 
#> [[4]]
#> [1] 58
#> 
#> [[5]]
#> [1] 69
#> 
#> [[6]]
#> [1]  2  3  8 22 74 86

Use group_vars() if you just want the names of the grouping variables:

by_species |> group_vars()
#> [1] "species"
by_sex_gender |> group_vars()
#> [1] "sex"    "gender"

Changing and adding to grouping variables

If you apply group_by() to an already grouped dataset, will overwrite the existing grouping variables. For example, the following code groups by homeworld instead of species:

by_species |>
  group_by(homeworld) |>
  tally()
#> # A tibble: 49 × 2
#>   homeworld       n
#>   <chr>       <int>
#> 1 Alderaan        3
#> 2 Aleen Minor     1
#> 3 Bespin          1
#> 4 Bestine IV      1
#> # ℹ 45 more rows

To augment the grouping, using .add = TRUE1. For example, the following code groups by species and homeworld:

by_species |>
  group_by(homeworld, .add = TRUE) |>
  tally()
#> # A tibble: 57 × 3
#> # Groups:   species [38]
#>   species  homeworld       n
#>   <chr>    <chr>       <int>
#> 1 Aleena   Aleen Minor     1
#> 2 Besalisk Ojom            1
#> 3 Cerean   Cerea           1
#> 4 Chagrian Champala        1
#> # ℹ 53 more rows

Removing grouping variables

To remove all grouping variables, use ungroup():

by_species |>
  ungroup() |>
  tally()
#> # A tibble: 1 × 1
#>       n
#>   <int>
#> 1    87

You can also choose to selectively ungroup by listing the variables you want to remove:

by_sex_gender |>
  ungroup(sex) |>
  tally()
#> # A tibble: 3 × 2
#>   gender        n
#>   <chr>     <int>
#> 1 feminine     17
#> 2 masculine    66
#> 3 <NA>          4

Verbs

The following sections describe how grouping affects the main dplyr verbs.

summarise()

summarise() computes a summary for each group. This means that it starts from group_keys(), adding summary variables to the right hand side:

by_species |>
  summarise(
    n = n(),
    height = mean(height, na.rm = TRUE)
  )
#> # A tibble: 38 × 3
#>   species      n height
#>   <chr>    <int>  <dbl>
#> 1 Aleena       1     79
#> 2 Besalisk     1    198
#> 3 Cerean       1    198
#> 4 Chagrian     1    196
#> # ℹ 34 more rows

The .groups= argument controls the grouping structure of the output. The historical behaviour of removing the right hand side grouping variable corresponds to .groups = "drop_last" without a message or .groups = NULL with a message (the default).

by_sex_gender |>
  summarise(n = n()) |>
  group_vars()
#> `summarise()` has regrouped the output.
#> ℹ Summaries were computed grouped by sex and gender.
#> ℹ Output is grouped by sex.
#> ℹ Use `summarise(.groups = "drop_last")` to silence this message.
#> ℹ Use `summarise(.by = c(sex, gender))` for per-operation grouping
#>   (`?dplyr::dplyr_by`) instead.
#> [1] "sex"

by_sex_gender |>
  summarise(n = n(), .groups = "drop_last") |>
  group_vars()
#> [1] "sex"

Since version 1.0.0 the groups may also be kept (.groups = "keep") or dropped (.groups = "drop").

by_sex_gender |>
  summarise(n = n(), .groups = "keep") |>
  group_vars()
#> [1] "sex"    "gender"

by_sex_gender |>
  summarise(n = n(), .groups = "drop") |>
  group_vars()
#> character(0)

When the output no longer have grouping variables, it becomes ungrouped (i.e. a regular tibble).

select(), rename(), and relocate()

rename() and relocate() behave identically with grouped and ungrouped data because they only affect the name or position of existing columns. Grouped select() is almost identical to ungrouped select, except that it always includes the grouping variables:

by_species |> select(mass)
#> Adding missing grouping variables: `species`
#> # A tibble: 87 × 2
#> # Groups:   species [38]
#>   species  mass
#>   <chr>   <dbl>
#> 1 Human      77
#> 2 Droid      75
#> 3 Droid      32
#> 4 Human     136
#> # ℹ 83 more rows

If you don’t want the grouping variables, you’ll have to first ungroup(). (This design is possibly a mistake, but we’re stuck with it for now.)

arrange()

Grouped arrange() is the same as ungrouped arrange(), unless you set .by_group = TRUE, in which case it will order first by the grouping variables.

by_species |>
  arrange(desc(mass)) |>
  relocate(species, mass)
#> # A tibble: 87 × 14
#> # Groups:   species [38]
#>   species  mass name     height hair_color skin_color eye_color birth_year sex  
#>   <chr>   <dbl> <chr>     <int> <chr>      <chr>      <chr>          <dbl> <chr>
#> 1 Hutt     1358 Jabba D…    175 <NA>       green-tan… orange         600   herm…
#> 2 Kaleesh   159 Grievous    216 none       brown, wh… green, y…       NA   male 
#> 3 Droid     140 IG-88       200 none       metal      red             15   none 
#> 4 Human     136 Darth V…    202 none       white      yellow          41.9 male 
#> # ℹ 83 more rows
#> # ℹ 5 more variables: gender <chr>, homeworld <chr>, films <list>,
#> #   vehicles <list>, starships <list>

by_species |>
  arrange(desc(mass), .by_group = TRUE) |>
  relocate(species, mass)
#> # A tibble: 87 × 14
#> # Groups:   species [38]
#>   species   mass name    height hair_color skin_color eye_color birth_year sex  
#>   <chr>    <dbl> <chr>    <int> <chr>      <chr>      <chr>          <dbl> <chr>
#> 1 Aleena      15 Ratts …     79 none       grey, blue unknown           NA male 
#> 2 Besalisk   102 Dexter…    198 none       brown      yellow            NA male 
#> 3 Cerean      82 Ki-Adi…    198 white      pale       yellow            92 male 
#> 4 Chagrian    NA Mas Am…    196 none       blue       blue              NA male 
#> # ℹ 83 more rows
#> # ℹ 5 more variables: gender <chr>, homeworld <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Note that second example is sorted by species (from the group_by() statement) and then by mass (within species).

mutate()

In simple cases with vectorised functions, grouped and ungrouped mutate() give the same results. They differ when used with summary functions:

# Subtract off global mean
starwars |>
  select(name, homeworld, mass) |>
  mutate(standard_mass = mass - mean(mass, na.rm = TRUE))
#> # A tibble: 87 × 4
#>   name           homeworld  mass standard_mass
#>   <chr>          <chr>     <dbl>         <dbl>
#> 1 Luke Skywalker Tatooine     77         -20.3
#> 2 C-3PO          Tatooine     75         -22.3
#> 3 R2-D2          Naboo        32         -65.3
#> 4 Darth Vader    Tatooine    136          38.7
#> # ℹ 83 more rows

# Subtract off homeworld mean
starwars |>
  select(name, homeworld, mass) |>
  group_by(homeworld) |>
  mutate(standard_mass = mass - mean(mass, na.rm = TRUE))
#> # A tibble: 87 × 4
#> # Groups:   homeworld [49]
#>   name           homeworld  mass standard_mass
#>   <chr>          <chr>     <dbl>         <dbl>
#> 1 Luke Skywalker Tatooine     77         -8.38
#> 2 C-3PO          Tatooine     75        -10.4 
#> 3 R2-D2          Naboo        32        -32.2 
#> 4 Darth Vader    Tatooine    136         50.6 
#> # ℹ 83 more rows

Or with window functions like min_rank():

# Overall rank
starwars |>
  select(name, homeworld, height) |>
  mutate(rank = min_rank(height))
#> # A tibble: 87 × 4
#>   name           homeworld height  rank
#>   <chr>          <chr>      <int> <int>
#> 1 Luke Skywalker Tatooine     172    28
#> 2 C-3PO          Tatooine     167    20
#> 3 R2-D2          Naboo         96     5
#> 4 Darth Vader    Tatooine     202    72
#> # ℹ 83 more rows

# Rank per homeworld
starwars |>
  select(name, homeworld, height) |>
  group_by(homeworld) |>
  mutate(rank = min_rank(height))
#> # A tibble: 87 × 4
#> # Groups:   homeworld [49]
#>   name           homeworld height  rank
#>   <chr>          <chr>      <int> <int>
#> 1 Luke Skywalker Tatooine     172     5
#> 2 C-3PO          Tatooine     167     4
#> 3 R2-D2          Naboo         96     1
#> 4 Darth Vader    Tatooine     202    10
#> # ℹ 83 more rows

filter()

A grouped filter() effectively does a mutate() to generate a logical variable, and then only keeps the rows where the variable is TRUE. This means that grouped filters can be used with summary functions. For example, we can find the tallest character of each species:

by_species |>
  select(name, species, height) |>
  filter(height == max(height))
#> # A tibble: 36 × 3
#> # Groups:   species [36]
#>   name                  species        height
#>   <chr>                 <chr>           <int>
#> 1 Greedo                Rodian            173
#> 2 Jabba Desilijic Tiure Hutt              175
#> 3 Yoda                  Yoda's species     66
#> 4 Bossk                 Trandoshan        190
#> # ℹ 32 more rows

You can also use filter_out() to remove entire groups. For example, the following code eliminates all groups that only have a single member:

by_species |>
  filter_out(n() == 1) |>
  tally()
#> # A tibble: 9 × 2
#>   species      n
#>   <chr>    <int>
#> 1 Droid        6
#> 2 Gungan       3
#> 3 Human       35
#> 4 Kaminoan     2
#> # ℹ 5 more rows

slice() and friends

slice() and friends (slice_head(), slice_tail(), slice_sample(), slice_min() and slice_max()) select rows within a group. For example, we can select the first observation within each species:

by_species |>
  relocate(species) |>
  slice(1)
#> # A tibble: 38 × 14
#> # Groups:   species [38]
#>   species  name    height  mass hair_color skin_color eye_color birth_year sex  
#>   <chr>    <chr>    <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>
#> 1 Aleena   Ratts …     79    15 none       grey, blue unknown           NA male 
#> 2 Besalisk Dexter…    198   102 none       brown      yellow            NA male 
#> 3 Cerean   Ki-Adi…    198    82 white      pale       yellow            92 male 
#> 4 Chagrian Mas Am…    196    NA none       blue       blue              NA male 
#> # ℹ 34 more rows
#> # ℹ 5 more variables: gender <chr>, homeworld <chr>, films <list>,
#> #   vehicles <list>, starships <list>

Similarly, we can use slice_min() to select the smallest n values of a variable:

by_species |>
  filter_out(is.na(height)) |>
  slice_min(height, n = 2)
#> # A tibble: 47 × 14
#> # Groups:   species [38]
#>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
#>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#> 1 Ratts Ty…     79    15 none       grey, blue unknown           NA male  mascu…
#> 2 Dexter J…    198   102 none       brown      yellow            NA male  mascu…
#> 3 Ki-Adi-M…    198    82 white      pale       yellow            92 male  mascu…
#> 4 Mas Amed…    196    NA none       blue       blue              NA male  mascu…
#> # ℹ 43 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

  1. Note that the argument changed from add = TRUE to .add = TRUE in dplyr 1.0.0.↩︎

dplyr/inst/doc/recoding-replacing.R0000644000176200001440000001651015137234462017017 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message = FALSE--------------------------------------------------- library(dplyr) ## ----------------------------------------------------------------------------- set.seed(123) racers <- tibble( id = seq_len(100), time = round(sample(1200:2100, size = 100, replace = TRUE) / 60, 2) ) racers ## ----------------------------------------------------------------------------- tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D" ) ) tiers ## ----------------------------------------------------------------------------- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) ) ## ----error = TRUE------------------------------------------------------------- try({ racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .unmatched = "error" ) ) }) ## ----------------------------------------------------------------------------- id_banned_shoes <- c(2, 10, 15, 32, 65) id_false_start <- c(1, 2, 5, 20, 55, 74, 91) ## ----------------------------------------------------------------------------- racers |> mutate( time = case_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3, .default = time ) ) ## ----------------------------------------------------------------------------- racers |> mutate(time = if_else(id %in% id_banned_shoes, NA, time)) |> mutate(time = if_else(id %in% id_false_start, time + 1 / 3, time)) ## ----------------------------------------------------------------------------- racers |> mutate( time = time |> replace_when( id %in% id_banned_shoes ~ NA, id %in% id_false_start ~ time + 1 / 3 ) ) ## ----eval = FALSE------------------------------------------------------------- # racers |> # mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> # mutate(time = base::replace(time, id %in% id_false_start, time + 1 / 3)) ## ----------------------------------------------------------------------------- racers |> mutate(time = base::replace(time, id %in% id_banned_shoes, NA)) |> mutate(time = { loc <- id %in% id_false_start base::replace(time, loc, time[loc] + 1 / 3) }) ## ----------------------------------------------------------------------------- id_with_malfunction <- c(1, 5, 20, 50) tiers <- racers |> mutate( tier = case_when( time < 23 ~ "A", time < 27 ~ "B", time < 30 ~ "C", time < 33 ~ "D", .default = "unknown" ) |> factor(levels = c("A", "B", "C", "D", "unknown")) ) tiers ## ----------------------------------------------------------------------------- tiers |> mutate( tier = case_when(id %in% id_with_malfunction ~ "unknown", .default = tier) ) ## ----------------------------------------------------------------------------- tiers |> mutate( tier = tier |> replace_when(id %in% id_with_malfunction ~ "unknown") ) ## ----------------------------------------------------------------------------- likert <- tibble( score = c(1, 2, 3, 4, 5, 2, 3, 1, 4) ) ## ----------------------------------------------------------------------------- likert |> mutate( score = case_when( score == 1 ~ "Strongly disagree", score == 2 ~ "Disagree", score == 3 ~ "Neutral", score == 4 ~ "Agree", score == 5 ~ "Strongly agree" ) ) ## ----------------------------------------------------------------------------- likert |> mutate( score = score |> recode_values( 1 ~ "Strongly disagree", 2 ~ "Disagree", 3 ~ "Neutral", 4 ~ "Agree", 5 ~ "Strongly agree" ) ) ## ----------------------------------------------------------------------------- lookup <- tribble( ~from , ~to , 1 , "Strongly disagree" , 2 , "Disagree" , 3 , "Neutral" , 4 , "Agree" , 5 , "Strongly agree" ) ## ----------------------------------------------------------------------------- likert |> mutate(score = recode_values(score, from = lookup$from, to = lookup$to)) ## ----eval = FALSE------------------------------------------------------------- # lookup <- readr::read_csv("lookup.csv") ## ----error = TRUE------------------------------------------------------------- try({ likert <- tibble( score = c(0, 1, 2, 2, 4, 5, 2, 3, 1, 4) ) # Missed the `0` likert |> mutate( score = score |> recode_values( from = lookup$from, to = lookup$to, unmatched = "error" ) ) }) ## ----------------------------------------------------------------------------- schools <- tibble( name = c( "UNC", "Chapel Hill", NA, "Duke", "Duke University", "UNC", "NC State", "ECU" ) ) ## ----------------------------------------------------------------------------- schools |> mutate( name = recode_values( name, c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke", default = name ) ) ## ----------------------------------------------------------------------------- schools |> mutate( name = name |> replace_values( c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", c("Duke", "Duke University") ~ "Duke" ) ) ## ----------------------------------------------------------------------------- lookup <- tribble( ~from , ~to , "UNC" , "UNC Chapel Hill" , "Chapel Hill" , "UNC Chapel Hill" , "Duke" , "Duke" , "Duke University" , "Duke" ) schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ## ----------------------------------------------------------------------------- # Condensed lookup table with a `many:1` mapping per row lookup <- tribble( ~from , ~to , c("UNC", "Chapel Hill") , "UNC Chapel Hill" , c("Duke", "Duke University") , "Duke" ) # Note that `from` is a list column lookup lookup$from # Works the same as before schools |> mutate(name = replace_values(name, from = lookup$from, to = lookup$to)) ## ----eval = FALSE------------------------------------------------------------- # if_else(condition, true, false, missing) # # case_when( # condition ~ true, # !condition ~ false, # is.na(condition) ~ missing # ) ## ----eval = FALSE------------------------------------------------------------- # x <- if_else(x > 5, new, x) # # # Type stable on `x`. # # Intent of "partially updating" `x` is clear. # # Pipe friendly. # x <- x |> replace_when(x > 5 ~ new) ## ----------------------------------------------------------------------------- x <- c(1, 2, NA, 3, NA, 5) y <- c(0, 3, 1, 4, 6, 7) coalesce(x, 0) replace_values(x, NA ~ 0) coalesce(x, y) replace_values(x, NA ~ y) ## ----------------------------------------------------------------------------- x <- c(1, 2, 0, -99, 12) # To convert `0` and `-99` to `NA`, you have to do it in two calls x |> na_if(0) |> na_if(-99) x |> replace_values(from = c(0, -99), to = NA) dplyr/inst/doc/window-functions.html0000644000176200001440000010530515137234471017344 0ustar liggesusers Window functions

Window functions

A window function is a variation on an aggregation function. Where an aggregation function, like sum() and mean(), takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don’t include functions that work element-wise, like + or round(). Window functions include variations on aggregate functions, like cumsum() and cummean(), functions for ranking and ordering, like rank(), and functions for taking offsets, like lead() and lag().

In this vignette, we’ll use a small sample of the Lahman batting dataset, including the players that have won an award.

library(Lahman)

batting <- Lahman::Batting |>
  as_tibble() |>
  select(playerID, yearID, teamID, G, AB:H) |>
  arrange(playerID, yearID, teamID) |>
  semi_join(Lahman::AwardsPlayers, by = "playerID")

players <- batting |> group_by(playerID)

Window functions are used in conjunction with mutate() and filter() to solve a wide range of problems. Here’s a selection:

# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
mutate(players, G_rank = min_rank(G))

# For each player, find every year that was better than the previous year
filter(players, G > lag(G))
# For each player, compute avg change in games played per year
mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))

# For each player, find all years where they played more games than they did on average
filter(players, G > mean(G))
# For each, player compute a z score based on number of games played
mutate(players, G_z = (G - mean(G)) / sd(G))

Before reading this vignette, you should be familiar with mutate() and filter().

Types of window functions

There are five main families of window functions. Two families are unrelated to aggregation functions:

  • Ranking and ordering functions: row_number(), min_rank(), dense_rank(), cume_dist(), percent_rank(), and ntile(). These functions all take a vector to order by, and return various types of ranks.

  • Offsets lead() and lag() allow you to access the previous and next values in a vector, making it easy to compute differences and trends.

The other three families are variations on familiar aggregate functions:

  • Cumulative aggregates: cumsum(), cummin(), cummax() (from base R), and cumall(), cumany(), and cummean() (from dplyr).

  • Rolling aggregates operate in a fixed width window. You won’t find them in base R or in dplyr, but there are many implementations in other packages, such as RcppRoll.

  • Recycled aggregates, where an aggregate is repeated to match the length of the input. These are not needed in R because vector recycling automatically recycles aggregates where needed. They are important in SQL, because the presence of an aggregation function usually tells the database to return only one row per group.

Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation.

Ranking functions

The ranking functions are variations on a theme, differing in how they handle ties:

x <- c(1, 1, 2, 2, 2)

row_number(x)
#> [1] 1 2 3 4 5
min_rank(x)
#> [1] 1 1 3 3 3
dense_rank(x)
#> [1] 1 1 2 2 2

If you’re familiar with R, you may recognise that row_number() and min_rank() can be computed with the base rank() function and various values of the ties.method argument. These functions are provided to save a little typing, and to make it easier to convert between R and SQL.

Two other ranking functions return numbers between 0 and 1. percent_rank() gives the percentage of the rank; cume_dist() gives the proportion of values less than or equal to the current value.

cume_dist(x)
#> [1] 0.4 0.4 1.0 1.0 1.0
percent_rank(x)
#> [1] 0.0 0.0 0.5 0.5 0.5

These are useful if you want to select (for example) the top 10% of records within each group. For example:

filter(players, cume_dist(desc(G)) < 0.1)
#> # A tibble: 1,604 × 7
#> # Groups:   playerID [1,487]
#>   playerID  yearID teamID     G    AB     R     H
#>   <chr>      <int> <fct>  <int> <int> <int> <int>
#> 1 aaronha01   1963 ML1      161   631   121   201
#> 2 aaronha01   1968 ATL      160   606    84   174
#> 3 aasedo01    1986 BAL       66     0     0     0
#> 4 abbotji01   1991 CAL       34     0     0     0
#> # ℹ 1,600 more rows

Finally, ntile() divides the data up into n evenly sized buckets. It’s a coarse ranking, and it can be used in with mutate() to divide the data into buckets for further summary. For example, we could use ntile() to divide the players within a team into four ranked groups, and calculate the average number of games within each group.

by_team_player <- group_by(batting, teamID, playerID)
by_team <- summarise(by_team_player, G = sum(G))
#> `summarise()` has regrouped the output.
#> ℹ Summaries were computed grouped by teamID and playerID.
#> ℹ Output is grouped by teamID.
#> ℹ Use `summarise(.groups = "drop_last")` to silence this message.
#> ℹ Use `summarise(.by = c(teamID, playerID))` for per-operation grouping
#>   (`?dplyr::dplyr_by`) instead.
by_team_quartile <- group_by(by_team, quartile = ntile(G, 4))
summarise(by_team_quartile, mean(G))
#> # A tibble: 4 × 2
#>   quartile `mean(G)`
#>      <int>     <dbl>
#> 1        1      18.1
#> 2        2      70.6
#> 3        3     191. 
#> 4        4     764.

All ranking functions rank from lowest to highest so that small input values get small ranks. Use desc() to rank from highest to lowest.

Lead and lag

lead() and lag() produce offset versions of a input vector that is either ahead of or behind the original vector.

x <- 1:5
lead(x)
#> [1]  2  3  4  5 NA
lag(x)
#> [1] NA  1  2  3  4

You can use them to:

  • Compute differences or percent changes.

    # Compute the relative change in games played
    mutate(players, G_delta = G - lag(G))

    Using lag() is more convenient than diff() because for n inputs diff() returns n - 1 outputs.

  • Find out when a value changes.

    # Find when a player changed teams
    filter(players, teamID != lag(teamID))

lead() and lag() have an optional argument order_by. If set, instead of using the row order to determine which value comes before another, they will use another variable. This is important if you have not already sorted the data, or you want to sort one way and lag another.

Here’s a simple example of what happens if you don’t specify order_by when you need it:

df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
scrambled <- df[sample(nrow(df)), ]

wrong <- mutate(scrambled, prev_value = lag(value))
arrange(wrong, year)
#>   year value prev_value
#> 1 2000     0          4
#> 2 2001     1          0
#> 3 2002     4          9
#> 4 2003     9         16
#> 5 2004    16         NA
#> 6 2005    25          1

right <- mutate(scrambled, prev_value = lag(value, order_by = year))
arrange(right, year)
#>   year value prev_value
#> 1 2000     0         NA
#> 2 2001     1          0
#> 3 2002     4          1
#> 4 2003     9          4
#> 5 2004    16          9
#> 6 2005    25         16

Cumulative aggregates

Base R provides cumulative sum (cumsum()), cumulative min (cummin()), and cumulative max (cummax()). (It also provides cumprod() but that is rarely useful). Other common accumulating functions are cumany() and cumall(), cumulative versions of || and &&, and cummean(), a cumulative mean. These are not included in base R, but efficient versions are provided by dplyr.

cumany() and cumall() are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use cumany() to find all records for a player after they played a year with 150 games:

filter(players, cumany(G > 150))

Like lead and lag, you may want to control the order in which the accumulation occurs. None of the built in functions have an order_by argument so dplyr provides a helper: order_by(). You give it the variable you want to order by, and then the call to the window function:

x <- 1:10
y <- 10:1
order_by(y, cumsum(x))
#>  [1] 55 54 52 49 45 40 34 27 19 10

This function uses a bit of non-standard evaluation, so I wouldn’t recommend using it inside another function; use the simpler but less concise with_order() instead.

Recycled aggregates

R’s vector recycling makes it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median:

filter(players, G > mean(G))
filter(players, G < median(G))

While most SQL databases don’t have an equivalent of median() or quantile(), when filtering you can achieve the same effect with ntile(). For example, x > median(x) is equivalent to ntile(x, 2) == 2; x > quantile(x, 75) is equivalent to ntile(x, 100) > 75 or ntile(x, 4) > 3.

filter(players, ntile(G, 2) == 2)

You can also use this idea to select the records with the highest (x == max(x)) or lowest value (x == min(x)) for a field, but the ranking functions give you more control over ties, and allow you to select any number of records.

Recycled aggregates are also useful in conjunction with mutate(). For example, with the batting data, we could compute the “career year”, the number of years a player has played since they entered the league:

mutate(players, career_year = yearID - min(yearID) + 1)
#> # A tibble: 32,112 × 8
#> # Groups:   playerID [2,466]
#>   playerID  yearID teamID     G    AB     R     H career_year
#>   <chr>      <int> <fct>  <int> <int> <int> <int>       <dbl>
#> 1 aaronha01   1954 ML1      122   468    58   131           1
#> 2 aaronha01   1955 ML1      153   602   105   189           2
#> 3 aaronha01   1956 ML1      153   609   106   200           3
#> 4 aaronha01   1957 ML1      151   615   118   198           4
#> # ℹ 32,108 more rows

Or, as in the introductory example, we could compute a z-score:

mutate(players, G_z = (G - mean(G)) / sd(G))
#> # A tibble: 32,112 × 8
#> # Groups:   playerID [2,466]
#>   playerID  yearID teamID     G    AB     R     H    G_z
#>   <chr>      <int> <fct>  <int> <int> <int> <int>  <dbl>
#> 1 aaronha01   1954 ML1      122   468    58   131 -1.16 
#> 2 aaronha01   1955 ML1      153   602   105   189  0.519
#> 3 aaronha01   1956 ML1      153   609   106   200  0.519
#> 4 aaronha01   1957 ML1      151   615   118   198  0.411
#> # ℹ 32,108 more rows
dplyr/inst/doc/rowwise.R0000644000176200001440000001406615137234465014771 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set(collapse = T, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) ## ----setup-------------------------------------------------------------------- library(dplyr, warn.conflicts = FALSE) ## ----include = FALSE---------------------------------------------------------- nest_by <- function(df, ...) { df |> group_by(...) |> summarise(data = list(pick(everything()))) |> rowwise(...) } # mtcars |> nest_by(cyl) ## ----------------------------------------------------------------------------- df <- tibble(x = 1:2, y = 3:4, z = 5:6) df |> rowwise() ## ----------------------------------------------------------------------------- df |> mutate(m = mean(c(x, y, z))) df |> rowwise() |> mutate(m = mean(c(x, y, z))) ## ----------------------------------------------------------------------------- df <- tibble(name = c("Mara", "Hadley"), x = 1:2, y = 3:4, z = 5:6) df |> rowwise() |> summarise(m = mean(c(x, y, z))) df |> rowwise(name) |> summarise(m = mean(c(x, y, z))) ## ----------------------------------------------------------------------------- df <- tibble(id = 1:6, w = 10:15, x = 20:25, y = 30:35, z = 40:45) df ## ----------------------------------------------------------------------------- rf <- df |> rowwise(id) ## ----------------------------------------------------------------------------- rf |> mutate(total = sum(c(w, x, y, z))) rf |> summarise(total = sum(c(w, x, y, z))) ## ----------------------------------------------------------------------------- rf |> mutate(total = sum(c_across(w:z))) rf |> mutate(total = sum(c_across(where(is.numeric)))) ## ----------------------------------------------------------------------------- rf |> mutate(total = sum(c_across(w:z))) |> ungroup() |> mutate(across(w:z, ~ . / total)) ## ----------------------------------------------------------------------------- df |> mutate(total = rowSums(pick(where(is.numeric), -id))) df |> mutate(mean = rowMeans(pick(where(is.numeric), -id))) ## ----------------------------------------------------------------------------- df <- tibble( x = list(1, 2:3, 4:6) ) ## ----------------------------------------------------------------------------- df |> mutate(l = length(x)) ## ----------------------------------------------------------------------------- df |> mutate(l = lengths(x)) ## ----------------------------------------------------------------------------- df |> mutate(l = sapply(x, length)) df |> mutate(l = purrr::map_int(x, length)) ## ----------------------------------------------------------------------------- df |> rowwise() |> mutate(l = length(x)) ## ----------------------------------------------------------------------------- df <- tibble(g = 1:2, y = list(1:3, "a")) gf <- df |> group_by(g) rf <- df |> rowwise(g) ## ----------------------------------------------------------------------------- gf |> mutate(type = typeof(y), length = length(y)) rf |> mutate(type = typeof(y), length = length(y)) ## ----------------------------------------------------------------------------- # grouped out1 <- integer(2) for (i in 1:2) { out1[[i]] <- length(df$y[i]) } out1 # rowwise out2 <- integer(2) for (i in 1:2) { out2[[i]] <- length(df$y[[i]]) } out2 ## ----error = TRUE------------------------------------------------------------- try({ gf |> mutate(y2 = y) rf |> mutate(y2 = y) rf |> mutate(y2 = list(y)) }) ## ----------------------------------------------------------------------------- by_cyl <- mtcars |> nest_by(cyl) by_cyl ## ----------------------------------------------------------------------------- mods <- by_cyl |> mutate(mod = list(lm(mpg ~ wt, data = data))) mods ## ----------------------------------------------------------------------------- mods <- mods |> mutate(pred = list(predict(mod, data))) mods ## ----------------------------------------------------------------------------- mods |> summarise(rmse = sqrt(mean((pred - data$mpg) ^ 2))) mods |> summarise(rsq = summary(mod)$r.squared) mods |> summarise(broom::glance(mod)) ## ----------------------------------------------------------------------------- mods |> reframe(broom::tidy(mod)) ## ----------------------------------------------------------------------------- df <- tribble( ~ n, ~ min, ~ max, 1, 0, 1, 2, 10, 100, 3, 100, 1000, ) ## ----------------------------------------------------------------------------- df |> rowwise() |> mutate(data = list(runif(n, min, max))) ## ----error = TRUE------------------------------------------------------------- try({ df |> rowwise() |> mutate(data = runif(n, min, max)) }) ## ----------------------------------------------------------------------------- df <- expand.grid(mean = c(-1, 0, 1), sd = c(1, 10, 100)) df |> rowwise() |> mutate(data = list(rnorm(10, mean, sd))) ## ----------------------------------------------------------------------------- df <- tribble( ~rng, ~params, "runif", list(n = 10), "rnorm", list(n = 20), "rpois", list(n = 10, lambda = 5), ) |> rowwise() df |> mutate(data = list(do.call(rng, params))) ## ----include = FALSE, eval = FALSE-------------------------------------------- # df <- rowwise(tribble( # ~rng, ~params, # "runif", list(min = -1, max = 1), # "rnorm", list(), # "rpois", list(lambda = 5), # )) # # # Has to happen in separate function to avoid eager unquoting # f <- function(rng, params) purrr::exec(rng, n = 10, !!!params) # df |> # mutate(data = list(f(rng, params))) ## ----------------------------------------------------------------------------- mtcars |> group_by(cyl) |> do(head(., 1)) ## ----------------------------------------------------------------------------- mtcars |> group_by(cyl) |> reframe(head(pick(everything()), 1)) ## ----------------------------------------------------------------------------- mtcars |> group_by(cyl) |> do(nrows = nrow(.)) ## ----------------------------------------------------------------------------- mtcars |> group_by(cyl) |> summarise(nrows = nrow(pick(everything()))) dplyr/README.md0000644000176200001440000001471715137161765012711 0ustar liggesusers # dplyr [![CRAN status](https://www.r-pkg.org/badges/version/dplyr)](https://cran.r-project.org/package=dplyr) [![R-CMD-check](https://github.com/tidyverse/dplyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidyverse/dplyr/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/tidyverse/dplyr/graph/badge.svg)](https://app.codecov.io/gh/tidyverse/dplyr) ## Overview dplyr is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges: - `mutate()` adds new variables that are functions of existing variables - `select()` picks variables based on their names. - `filter()` picks cases based on their values. - `summarise()` reduces multiple values down to a single summary. - `arrange()` changes the ordering of the rows. These all combine naturally with `group_by()` which allows you to perform any operation “by group”. You can learn more about them in `vignette("dplyr")`. As well as these single-table verbs, dplyr also provides a variety of two-table verbs, which you can learn about in `vignette("two-table")`. If you are new to dplyr, the best place to start is the [data transformation chapter](https://r4ds.hadley.nz/data-transform) in R for Data Science. ## Backends In addition to data frames/tibbles, dplyr makes working with other computational backends accessible and efficient. Below is a list of alternative backends: - [arrow](https://arrow.apache.org/docs/r/) for larger-than-memory datasets, including on remote cloud storage like AWS S3, using the Apache Arrow C++ engine, [Acero](https://arrow.apache.org/docs/cpp/acero/overview.html). - [dbplyr](https://dbplyr.tidyverse.org/) for data stored in a relational database. Translates your dplyr code to SQL. - [dtplyr](https://dtplyr.tidyverse.org/) for large, in-memory datasets. Translates your dplyr code to high performance [data.table](https://rdatatable.gitlab.io/data.table/) code. - [duckplyr](https://duckplyr.tidyverse.org/) for large, in-memory datasets. Translates your dplyr code to high performance [duckdb](https://duckdb.org) queries with zero extra copies and an automatic R fallback when translation isn’t possible. - [sparklyr](https://spark.posit.co/) for very large datasets stored in [Apache Spark](https://spark.apache.org). ## Installation ``` r # The easiest way to get dplyr is to install the whole tidyverse: install.packages("tidyverse") # Alternatively, install just dplyr: install.packages("dplyr") ``` ### Development version To get a bug fix or to use a feature from the development version, you can install the development version of dplyr from GitHub. ``` r # install.packages("pak") pak::pak("tidyverse/dplyr") ``` ## Cheat Sheet ## Usage ``` r library(dplyr) starwars |> filter(species == "Droid") #> # A tibble: 6 × 14 #> name height mass hair_color skin_color eye_color birth_year sex gender #> #> 1 C-3PO 167 75 gold yellow 112 none masculi… #> 2 R2-D2 96 32 white, blue red 33 none masculi… #> 3 R5-D4 97 32 white, red red NA none masculi… #> 4 IG-88 200 140 none metal red 15 none masculi… #> 5 R4-P17 96 NA none silver, red red, blue NA none feminine #> # ℹ 1 more row #> # ℹ 5 more variables: homeworld , species , films , #> # vehicles , starships starwars |> select(name, ends_with("color")) #> # A tibble: 87 × 4 #> name hair_color skin_color eye_color #> #> 1 Luke Skywalker blond fair blue #> 2 C-3PO gold yellow #> 3 R2-D2 white, blue red #> 4 Darth Vader none white yellow #> 5 Leia Organa brown light brown #> # ℹ 82 more rows starwars |> mutate(name, bmi = mass / ((height / 100)^2)) |> select(name:mass, bmi) #> # A tibble: 87 × 4 #> name height mass bmi #> #> 1 Luke Skywalker 172 77 26.0 #> 2 C-3PO 167 75 26.9 #> 3 R2-D2 96 32 34.7 #> 4 Darth Vader 202 136 33.3 #> 5 Leia Organa 150 49 21.8 #> # ℹ 82 more rows starwars |> arrange(desc(mass)) #> # A tibble: 87 × 14 #> name height mass hair_color skin_color eye_color birth_year sex gender #> #> 1 Jabba De… 175 1358 green-tan… orange 600 herm… mascu… #> 2 Grievous 216 159 none brown, wh… green, y… NA male mascu… #> 3 IG-88 200 140 none metal red 15 none mascu… #> 4 Darth Va… 202 136 none white yellow 41.9 male mascu… #> 5 Tarfful 234 136 brown brown blue NA male mascu… #> # ℹ 82 more rows #> # ℹ 5 more variables: homeworld , species , films , #> # vehicles , starships starwars |> group_by(species) |> summarise( n = n(), mass = mean(mass, na.rm = TRUE) ) |> filter( n > 1, mass > 50 ) #> # A tibble: 9 × 3 #> species n mass #> #> 1 Droid 6 69.8 #> 2 Gungan 3 74 #> 3 Human 35 81.3 #> 4 Kaminoan 2 88 #> 5 Mirialan 2 53.1 #> # ℹ 4 more rows ``` ## Getting help If you encounter a clear bug, please file an issue with a minimal reproducible example on [GitHub](https://github.com/tidyverse/dplyr/issues). For questions and other discussion, please use [forum.posit.co](https://forum.posit.co/). ## Code of conduct Please note that this project is released with a [Contributor Code of Conduct](https://dplyr.tidyverse.org/CODE_OF_CONDUCT). By participating in this project you agree to abide by its terms. dplyr/build/0000755000176200001440000000000015137234471012512 5ustar liggesusersdplyr/build/vignette.rds0000644000176200001440000000074015137234471015052 0ustar liggesuserso '͚tYG\vtiU^z%&,cֿf9n0|Aq$i28Io 3 ͘1N|OEA,a4"%ߪ@R.X9YU(J.$F-93%&K D6s-rͤx=Zh.DA Nh%Im"-)Akx!&=ۀQo ۢ*A3hyC],U?BІ%?MȷE#Z 6  !r# eB^}`NۣOt,G!hzn'}vK)w\P@o-.<27[繡%>F*C髙vaE9T󜛟zlޓ_6Ddplyr/build/stage23.rdb0000644000176200001440000036471515137234442014471 0ustar liggesusers |Iv'IL<P $Hl$GwO7fYU U5Y 3#K,Y>$Y>$[H%kmy-][W>ZV{m?_UQJ4,M*V|Eċ/>5 ذZO? c=B\gl0zB7e۳`M}`'&͂k N@,ZalJ/v~<;{/^޳C'IԱ_uj)ڡM_YoMln)/4@/o,[%owk`J? V2JLLv~Eˏ+~'0u(f3Uqkvc9KǃReiƟNp劝;w%g]8xR\KVy<`g6ku3DV?Ԡu?[6U:}M~#)AjVw yL2m'5 '.4 Y2'+ڣLK g (DlgqܒYtF36yss%c6{uknc],&{xMeg̜]t_gy5+2 J-B 3NA؜NUa/{^Z mz^1t'-oIRe_i/^|]ceB+ހ,55X!ǐ+:8_-6|ӝ7bY/\ CjoS#/@Жvf?yKA&i/QQq _R6!3pMС{2m63w_PɥuIϐ[lﵝ+6`wP9Գ'Lcj ^=EAz$R*%?oJy(Xw :iu|]n9RGE>1_;b;B~G fJ~C<5O!?U&>8n#t:enqZ! wb;۴iAr#$TJ`6}hX0ݏY%;z!Y:݇^|YNUULmC [PbѦCdtxi*j^,5k ;jh>tV<.9ڔE?f_~ <Yu ;C}~ƸPtsVьplUls ,ѦRXQmrqǴ9& w@ޡQb+jh+fZilQ<Lz! }wˤد&ElUvqxԤqCgrtT//qѻ_W,qߢ%$^4[●?Ir*Ђl)N*TmD( Į!3<ɍ+V'4M^;b֜ {0Uh<5&W!_eag00߭mcT5 eUnbj5+pTXwk6lid{N*w>a\Y9Ygx@_ p"߅nzC" ;;69?Z":FwBVn[,)*}_U=Sq؉. /cvcc,aChi |iA-5Xh3{#L?Zr;'ܕ !׭hͬ q>ke0CAT|xҦq 5ڸBk·kf=(3t*?e+t!*7!nOLM@ȮAQw3{ |hye5 N`ыn9伡 {Q^P[pvEwhachoC-enYf;ec|V[$D\bIyª Yq ,dga!Esr3f<%O4;.Y$^%N͜l/Pn(QeebE) 09o9Tyhz9[`Zbkue.]ٖat6s'RttsM/,Wl 4EȭiV=QR݊JOUVJ50:C?P"06$*cD~@pGj=^Sɢ- 1(}N$,K0 mkWbqy>̂3?o{am|^|5}r&U㨼qyONkh1h \?Fm.9:9~(TCQ[Oz9aީs†gGUl$,Z?i[m r 53ato8ZieIpwcTs[Ua+x4=q^QQ‘ 1 B0T4&;zyl`w)>K2[fES.NxD| DÛ. { ./EU藅Lg/Hq Bs8 9wx əYl: e;We*jSD4PRٶ @N BC+Q)jpW`dW&g79,Fg!KOdTNi1 '<3E6yEډRU>]&I?8N.if0 W\[myS_7GG=y ZbT5*š^ΔĨC/rKfoHRIS?p?dв,F4q'mCQ=Qi "DAz7Ap1pQ <Y_đ& ##L@lG2|&}B hs50YWTh UƀS^C}kqH)IClrB QP{.GH=(Hu~8<[5?Pbc#2Qq ?SO*9dp`)˾ kղog+Uj{TᔖڳKn`ê v%ǝo+6w!;Qt}8v[eίyCwc[p3aNV ^:F9\ҤI2,mIۢZWv1I[eZbc9;Yo}CuKZ;|D p=g\Q.0ae5dB/%*>|UVzpp3E]: Pg=;`~.gοfLczT M7eʺ#XX.[%'?SdwMɎ=*HJ6j(uq(K)xwQ0m$N3#bģX-R;lѢ;YsbV*d%Y@vwCޭ" {_8=>YV S0O;m?M4?9~ai[M;ԯ``$F"f``݆  6 F[Hlu%l`w]7"̑0ns,6hE#ns,6hE#z4UV95̱i,Kf Xi`FE;2pQaݖY5 /T-QU4$ 1,F7he2r!Wf=~T<N颚eI:]X6B-JuHAr^C<ޞbol-J^KWZoDmx$[gsܱrOߗhёdr%H(y0(*x*`ŷZk2Ih V;h_{2|Wu6Vu[[|g+yj_Le*:}cS˟aw改ػu9,/|b7~B}dnYzWw5}e\k|csoloƹr#ԾlλMw=VJkz12K3r[rVݍJ|]oi[,RZpޞ,f[+(կ6|7lՒY Ƶŏqlbwj_ڸ"X+Bz\Uo,e1W,s~%}mfj.ªͳγQެj7HMsUZmL0Msł ir!Ƶu'bS:Da9Avb zEh!Dxv.5G8,o">FEjPB,}nϩǕp0W5ZjynswUHOsiV Uݷ2Msfx®1^NPs8iu"s1}Sv0NT{_-WAXT _JErri%HF呡C_k1/<eInՠg.GnjЈ~3H^ƺO_lv g??m.NγڕW|鏟^OE˳ǟƣY븿쇝M6?oi.xsBx<(UƟ__l)S•+vܕu,eݬ[wskL-2ʶ䩔rj5}ӑ- [%^V7$5FsCEɿqLR uk_!/+_J׾Hhg^\F>My."L:϶x)&<( McGsH-5Wϒxx*ډ&0p4jk~;I_Qy $Iot~*}e1*8vXht'L1lNZP`vuw g!j1ЏC^n6ma1[U|wm-;Pϣ^L/S_jV˗q/ꨡ5D'Q| I0X>?S>4!>[Вָ nWAGƄ= ᆹyﱷm’r oFxMeu^#++fϹ7t- )2l{j`O%P擢M u6frӸjn"fOM ' O(SA*1PQI]oDqieH,5(hY:V[W ?!Ϊze0@ %lbģO'"tzؘNtF|/F:L:*a JElBlAڿ %гCd~1A`]?Vc}w2T|x$ĈGLjAZnbt66ѹ>pw&`مl8N.WG32gˢULAw>q#[2 9vgH4̵KYc|"s8 yRT\8 rjYlxUeȘ9([.苐۰Mۄ -}8$n~j5-{Y' tt8`^$Zh^~sc6J招[q)j5:D+|Q?d g=d& ~nxW5o|G7uB}g/R=GqQXi\.FvX.C-х jRbчCI,8ݲϻ}J<ƄGwq%Mp$'Ѱ).ƦR kQz.-[@HnipߌD|L[;_?*p78ʊjpי `|3gߥ# >a tW}ΙJ?~S3HwH;U+T/DHk$,Q\)p t|[$k 1V$͓JפZ[̓W ߇~xj#3`iB-WrٍSlT\7- \;- ;r8LFΰ-w \eW"Y 5]*mt}AcŽjf?Ct/pGmp)Z:.7!M8#W)%TVGO: Qhp'HJh[8lp[q43oxg?J602 e&-w4vsjʼ,e@pdN*  M<H@i]@11avmvZbqD- LM #n*j@vSMQvzyY9ھ7sVl>f5T*ز &Yr:Ы(;F= =*ԳM:?ab8txю\¿];a|^8Vd^?{%)pZ_UroKK~vuTa;S3y* xU*o|nhmm³O*CIq7h3FMuɛ S"DoڈV @>^yHKP,v򗻁&dSce$Nr1E gGCn4sM/ނ|+5ކ|[YMXr`bDlN kt>f L s(p-X:ڲծmn.ZV;xmCnJ'k_gT9YU"/-|N>EEbv33 i>_rVΒґY!3* x$u֥6ɨkLZލ> d|mG@9}{: UE[͙Jfν ]*<)Mksd5l׀*æLTl߆XY/lQg֒ی/< Fw#!XϿw͘79ē6 >Z^| I*I`qnO^%mC-[>2]>7bힴl?anOZ4ucK :Y_7fd%xO@>~3&y)e?TxҦMh=5S6rXmGЖ/?Dyt}hv(ӗC$l їv@VdS,rlop2S^,Oރ|OF RbHZR5UFD"*}ܶ&muE*%kܳ͸erhB6QG2,zk֨=/t")M CFCFN5i`}'E]ޅ|WYQ=Ul hA~LuJu%v;۸b[dM˫{8ē6IɹԟZWq4X4la^Wp_7 Yߪ[<Qg8ģWϯۃNȯV|N~]A9Գe<Ķ]eE䊢^ [~ m2N+d2GZNM5in*V[UEBC-xL4A#Z{b̲LqZd]nd^1t$k*J-<y7Yphw} !(Z Ui}2sg̢Sr1!fr9"hju%l?ZO%҂aXUrx򯦥#Xѡ~Q 2jүJ[9h4Wld_w!+]W1v&Q wH!'l}5.O˹|6kE5ĕJ)X2cނ|K=T]YbC>_66*&䛲]NO]\];NJ4꫘ߖΪ0pؔک]|i*aa_I-t==Io` sllxM-Dp'kaSّ~ڪ3Ei YtD8n-]D7>YԡI:FIj9r(?DǣѲ"cʼn~E9HKP8/Fy/wMcʼnv@rņsM[̕a^[%ǖ_FEQ&C!L+ gPg`Sm@Y,J;kt 8TϊO Wo Y#ǫ?dFϊS&>+~KNq/?ujc{\uUq"kYHh\fS%6cŕo+yC!oO_+Tt\+퐥eown|0*~'MXjMj1WS[ oZhxղQM-]ʋUKV˵zy11 [.Λ웡9]U; b_٢$eQs\ТT7Gɪ}ag7FfrO4en6SYwlMMy&cD~P9Quf"=<>xOX*:.f2!fB~WF=0[#{"#Tm6M&{OVP?=VJ]TpB&,T?z ?P~gnX`Y%;lhEsBtpBmp+ui3TiP|KO"*O*A#j1 8<]3:`bKLTl!Ͷ 8xeSqG' PV՘9*;PԂV+h|^-_&I3ϔ)V<\>ogeC1y/Mg!atB6Z>qͨM45e}=f:[4Ҧe߱UvslcCZXK"9Li-;Է)벂vN/AAx\Oڹ vdȻ@n,*(&ܶD"Qqv4ٌ"<~Ðb]CԘ)/NA Gz+Ϝr603ȊŚ^o-ǧ׻J.qC͞ o[ԄÑ|*d%nrNA kY'Ŏ3vhePmec fAM]VhZbc9nrȂLh4FJ@d79(Unrw[ v@,[ ɶ379g 9 !.,QOMNx -PɁK[9 mȷeJ-79yMϫ]&A8wh\;Ũv hI\ظhD16 r7lɛOVli|IbH uHtzcؼCiN|Ml{=̆#H*n[\sG0[aedr狧ꚿ,])$ʎQHw1_-FQQa62aWA*f͋IE[?D |"6L'THn@mnYyĪF*CzP?ˡ|L>(aw&MQN7­*͈[͌=5X2~>1vn A&p U i)xZv6 p+][]ps---vѕ`=eVM@6 p770=FvdJ^>:EuކE6RBm~F t{O5|W庵ӟ{Rnō | [IrP)V渆XuJ[ZnoqV >[\bxLK'tX'ԋƗ) @eJ ՆRѤ]1c͌WΗѡCAJ&dSn%*~C ީn(d%LhLAhfCO͆3%;1hmz;os?rN9 7rJ@nBX`񅽩[r.H]̃ej]3HhN֐Cz;(BmX+EQ*4MYӢ5c{ڤ^3,w2jhH]oD?txTTnGIГ 6=ذbO[(26ʊ?Aޅ|WǮ [fӐ+lWbXV$ 8<PMmԮ*cTWPGz/l/sM;YGUBbY݄'+J$1f'W,G!mg!Mov771T*8#ٕ(@Q9Գq Ð:}C5q9snRqGZ6^EuilArէ} 74 j[\R(hATrtD!:UH&,S;`LZg t9iU*KS`釅}ކ,JUa*<~R[pم>A ug%&ӐV4L*n,O>]˴)1WsSVM;;54 Y} ,m<~YjE]\'BPY'TAHvrQ9)aL&[ y5"è/숰݆BB>~lPDV|{FIEr :YjQ\AJ+da)7@9]Y_)#.||Iz77ﱫVqs#W=GP%GVω(H4fs;N-1wDyvE|*vhi H"6+(]b5!ZY@P;` MwN ;%h Ź@ oAC1Nȝ`Q[;jeYVjDbMZA_6QRZrQH606:ܟl9+}`{V.lHd+j;ѡP1ߔWrx22 }!/ E_Cr~A"ӯb+_%/ "ӯɬ6e%njzncFC_I; #<.go7.;?I?/AKw Lo@J|ŗʿ <-Uj{>bwIlԚQPqR1wQ-JI\j=cD16eyb:! Y}E`4)_2r=1Ϟ9CU¿ sL'+22ՙ 峓S)'rG\ |={ԽOEgvlsffM\'<ؾdsv5VZ.mi,NO,va[e6mՎADa7_ŢRkP%Cן9B}~#eDuDŭs5ZD*)7&;XۍyTR*i  SHl4͒C [99&5fL 쀞ɤ^=UIɿewYv{"X),Esg?%g/wˁ \ohTBw@E6t:է(VDD'XS{(;F=u8mk63vp1'K!wRCޛFzg Jj7GBx'0>D"'/s# fAiV;G(;F=ĽaAY R=z|sAa5 5ؿ~xBS|kc$ ~eǨGCuS.O'l!Na~R p}PW9>O\:qD&7Bv^-]b3(V[^8#W+zNچ̫7~Uה}Ռ3_l2+w4_k8ul'?7}tσr1W*x$hH_Yw@!ì6Y[l,*~xր^73,UA-K}-r*~7x4)ǷKΪ%hd goʦ*~x4ieWM+oMF=RPP<6rV=8ģX@tC)765ljZo6\6TM|6h*1m63 z@p=ut{J!M_7liCu1AelJν[%~p;d%^s~Z"IM]@\P:aȇ[x,S hlnl e̡dS  :coٚЖcD ;lC1\?w43TA\vS?8W{Ap$aIאŷP6jzdPC?am,: dg̨mu{ڍd}@sfib9%kl 2̱1 yV5F뿎O̝<4N#v3\GE=mdɁ$4r̨'B&]IV8x( 3Κfi,:~RoOED,gDgXYt;.dIUӯV(o[|C0<'nf>, f|T=cȏw'Qg=gn>iUsL)4p}C^۶Qa(1bGC3wp%W9ēF.T$ה59]Q-)wuыh} CkQ{!(Ky/? 1ػ@CsG9rS+G#g2&2nC95eZRbwGg!mPq'ԽhE+Ȩ:Դ1KF5w KG57@~GY5L}CRU|,'g_YWVxZ*1>(f 8?mbj1pξ>t{DGh]ށ,H W8R"]w5VFBNg ˭joKp?2Z߰3Rse3ZfU(6X3OXv)M7o_Ǹg]-_ng!rk@ij*j-ZuZd7p/dSS*0Y# Cm5$:Uur xttrxz f2< Yj=$YU%󐥖OU^|AY%yX5,_.?6͉/SOv""ϐ;3܉(yA]IW]J0Vnr :Oofj6%s|Yn:M~=jm "{/ K-Tfoa$N2$**N_E0PnCػڢ Az<$<F#iJQr4Q!khs6 UtPM 6KWˮ!K/!wtK,LR nzCΠYeȗv$'YY-mR ﹍]s] s'$>ʬ6M׻,WBoߝ\r:C}i}gWzqSӳ(;N9е$8!s7: 2]uǝ D;FlUh޶,C|nH!%ZiR+bkFIx˨]ĩ eTtx\Gk <>4 |Y=dYJQ^|m cQ.wV<NJ/D4M9edVYO1!wMs/qC,7_mI5,UٷLyK, ◻'ZPFˤ>Dp(}N'jobG!Kd*xҦq5x98) h9852gG""tB/~BBwQ !IC# S1HW Hw^f8lIH 7@ޠtVln5Ŭ,׭5ݜ,m@ْɵ/d?d8xq|XY',18bCo#!4tr8yLY'YW`:%5vD+ dх+j 37ILHے M*j toA~`}\)iכS-b ^ ǎ\O,P,hZ+ "ǀv{Ta RK䔶]R+'YC!uD#둌Z./AZƁ!m4KVd1l=rij7eD 1⑤cD# cF@\oB<+36CT ok⺁ۍE͊ ߔ+?{T|΍x/"_b!"F=ۄ!y2󐥎krZQ]C1 Y*A\-S 7{!3;rg=!Q_/w)U #b #ϰzH|"*+pFZiy*D0N7%U<6M1B:S:EYL3կEٔٞ%cF=Rn̐n`K=,d-)S%N5#ȏ1sǐT{Ko>s3ilBn,:~N)Pq]F.`E)Lbn $@fG!vlC,J=BnQ\$[]'Q*d+*7]"k.k }z6$~ 6!rC˦M6ڸ ;%hyƭ/B"za!yk,MX9˷J?*z _@pNNFSλ%jcS$0a4dM[C2Sw;=nfYiJo^׸b w^NůoQ6ӷsaUpKZ+f(>R!h0ںV*qC<<#F=}CFimӭj`.ZՊ.[љƊGJcEDO߃,?ܢjaʻyr ~oO)(-ӯ|rQgZz_1E+؞wZVGZ,_;PrC'ܱ sC^:yWk0Iy2 Ц1%zӐ7+Eh=ϭVMze3%uÃ50)9xj)G3ys!RYZt.TL$999=o5C-šg۲MslM!WcTs{TV(_c%$g6=rBdzPQRےz:1¡ =75jɍ5dҲa3<~cUSᶆ0GfVN"9n{ QVM Yn[Riaʳ0Hy>mB<?tMVOZQ?P9۷* T{C1zevXM~P$Rqc"!4ttx Td|og4sdzC9dz\1xh-/s:0R_~ 򗴽j EcF>*C<*o`yϩPn?‹l%(] R{߈&JSy~ot>5Zbc9;k|tO'Cnau N^A4`*ppf[C-%*S+Z}vUs0<|Yުv W=AQvZe/Rywpu'ڪUSebXisUH@0!:|c`ZKSdz6]\6U 5^ÞS3HwHHV/)a 5@%/5YRd).qPq@}K]2_ǶrAG@OsmRed ;x_6Y$F/ dkN6^*cR^bd'DQ㰣E#DC|šګ, tebét{p6~LjGZ2P-Bm!jlNpÉb=w S.U)i˒MfeoxqT v{)G_u9.L\o ]ӯV(/:jS pjZ90RL?þi?-"`Nx 巸61M{]8*nބ|S5N'/nЭN%T"yƩQ=sЊb{!߻%>5$?5,CV_Kv y6M}>FB9$Z@] N.\؂apa HzʶZn:C YT^RuV`4uBmqĸ_$.BVw< z/.VQ" YII-JP:u{1⑫zZhQf?mȘ#}yMn+݆G} ^չmWѼFrܼ] nh oĤYpY8}G _ZĪ1~9+zgƐӷ8avEĕ7VWBO?a>-1PosaWۖ_=#6t-&,^+Rxb\6z|R)3`l>լv-(QeGşV㦨~8( V7Eњ[BqޟFF ,U5-K4gar=(-PѤH%/W9PvSP 69?(MP;9l"VT&|[N6Іe>v8(Զ_ߚ&(pFNPlս$gW=g9# ZPAM$e"j#YJ1lÇ ff᳻[@C g"c,_zJa‹/jlUL:*kJnT|x$c*֏$z+6̞Cin!<#V+,rQ.O; ǝh%a­3]lgqe%M84N9CUDQKg'p8yJcsZΤ^*/LZނ Ǎ)&`/^|]c$/RqӫͻoX0;eHS^VOnB~Lzh$撝\/C:e9^ɏ=B%R,Ou:mi 8Y!.w!o} ]eun̚ǐ[VZ Jӱm/V v]H*rX,23ws۪֝v’g D -D=):܍M>5 Rs!<>ciPq!W^m+l=$>pk)E*/ _~I8r=S1k Aj-6KO_B~lYq,OɜrlPz8){7G  ϟ+ oGT\xuC 0G] Rz.Br(Zg3vVUU tT4|:V W !=nMqHB ^aEt?<YJxA|B.I^@Z?͉zd\ =Dn o|b*ǀ _ؚVwL͗D4A)R1-y'2cQlӮ?B=r|rTrnc%^3Q-pE;b9 [loK(O~x'!L-!TX-t~$FmmȷOًjѩ1X\Hkj!Mw6dm>ݔ5gķe?vYzz&-3?*z=7C&JxڑQȣZRK.?O_O}^,9TeMj =HAztv8y@ګ> % ,uFv!͍mS,lI*pAHf ` K+3-n_+9slER;/ qayZ FNuVozv*&wh7Fo%A176CQ7iwrhmڢDE mFuޅa WE C<޾wE}xÆاTT|x$;nU:GGz.nS;ıP`bShB[&vq(dip"<Y,nhEyb}x6ENCLPH? !~F7dD'~blJ}&H Q-2PKvi2jηk9u3UؖB%^ x0w Iގim,R5RV[pcFaU?Q'ǵ(UsI|2,YfB5O8YR+Y7!Nfda 76LF-C%TS[ Q>$]?*ܻJz]NwG pIJ8n OCkYjP3'x.g<-K4G|^^y?WQmwTMVrGVfLVx7v3ЦL,J\YȳƇUN%λ=Bxu;F:,tND,_t^Du dD:.Pq3<ӁQm^Dʤ6H XR3i2h lp=7*n#p/ʺp>ϐ ?`1ߙՒ.E7t= "]j%)=Sdup\|E&7 |;oC8C[;?BH<9GnGM'8KdgDmHYn*,#rbnru,|DIGzKojY; P98l/NCNǂM-ȷ-Xf>*6dySald5lC OJunCtH&RN'8ybXbd=T}*g?0v36϶shZF)k7|jW.?qނ,7ʴ#.r#elFw\P)9R! ً֒zl*V>0[ ['I -9ݙXF's.2BVs)/Ea9 YB|BJ9?Z\Tu%$&Nǀg!2l/Pai6t>NzEosWDƦ|D8;>#c51|Ty`|a V|>0)woʷSܑL y 'ʼ5D$P8jv9}[MaQSB}1LZjBN#,KЄ!_O2=uLC+[d9*װ@}_azGV7aUi~I\v' ,7VV~Zl[xh%*G!Lm0Rq' JN34 D%H/@VO$5IȓZ˴Q gr aTU  T=;[ѱk QS]IM@zjc[)92YMt 55ثGpշMiIح |T(rB+Oح6.*xi[ʻD2bAȷ`Za;<Yok~cAdNCr\Ԭ˧\pNkID K%V*n#rFз,Ҿ@ܥ/v,ؾPFSQw!ER&@>:0y Kk(^'ۘ22֚wkͻ!Nہy7TZ05# ?aLb\; h{y7T`j PݻԤFm@bM!@NLx8_D K %cx8/AvBg>Os,T ɹ\Ԣkq\.ESsrP+a].*~\y4De YTH/_&|J8 yXz$Bu)i)3Sd0{]vVz0nꉶsb:ܲZ */YCbC}!2[:Iwf ꆈBD{$Q: oOAVYE۾*;$PINoA=ۍQS@s\tEp5z %=*.JQ}#<~AK;CQy' fَNqT}[8(Ya^ĸv#Z#հ{Ri[:0BUw9ͯ-[_I;-K* Ē+xe%"H\,DI;!%2HU ,sU,EkgIp- >`jK.J%A*~- VLb3Œg|_Fk.qE+Rep'4EǨ:1M+Bפ\`|8/F8yHPs`1 w ]i0TYjwGR)|BV,J95f!g߉sj*$p򸲺6W3s%F_wTF!IC!W!KcWEkIYTP_/Ŝ. 68Dd/pr]Rqہ]zΜ@֗FJh8 Yg'^މT<SOM3YЄ;x*9x4jHkC_/\T|xҦ5xm_tEvlK֫W0e/'aȇ;%2VӡnVsoӝa- RmoB̨PZOڍP^z[ںOcQxOE*nCłk品\D0099y#T+'\"*d9/h.hpLGwbp.d}Y)/ HG)C~_Y)[V!#:dKm'1eW-מ:c^t֪%Ҝ Z "`dd1N6ػT vF>,5)UsgIսAⲆ_ ԟX)Ɠ(m&Bm,NWS)"xTJ^ |hWCw{&KM^4߾B?msTY`=1Nyh8zIW!U1JeNwF7!J׃Cm.P!y&Z,<ϼV~w(N -aP:/J7oTղύ#YE8rY.{,=FZqTZ08Z ;GHr'm $Ƶf# 58#LϙҶSsf3 PݙkR?kLXD'!Km< אg[Ap y*ߧgiqiQZ?*?Vؒ@㷡옶Ne)A#ڻ2]o5EPڍ^;R"~.UZr!h CZkm:CPaz!*t臠VNAC-K2"l}- :*:aTx4 ݝߍQh@}"ܘudĈ]Mn[0xVbA(Q4f;e*/s7ҞwU,Y5n_kBU~Б}!]-†.:Ò=㐏KaUc&Zg)1]u`-X⑬$kjJN.Ȼ NUBݐx &uPq=ZX^ɲUa] IO@>1 OBzW1G!K5\./A5^|9k85l4 /:bIu1<1 3b;!Ak'x*~.:VAuwCV_#=Ґ4J:!408d*ưx8NG EJg IƁ!_LF Ky2 dq{5cM>' ռ31;,|1ISOQwơA)w'"sfAw'=\wBR7M1wANMCVn0,S8vLak՞pk%Z!gjEk<wCޯ/O@Na܉3*ؾųMe[)b,"pt&|xdq#=N C0]c-F>oj3 ,d qcA~B?_-7#6CFT|xU@r`g+c55c{ytJC?iD!HJVZ񤭺? u!C#(hq(T0* 4*C1?CAxfZ66x+λPѽwtfĸ*&S,9\G)-EM;K~M) k0az9`a&[y?, ːکXXkCOqֆvNS j%NrءϠĨghbd{$?_~Rz3j)Qq ӽlUl)-˨&TF*5*%W4jꧫl+HE9QyʔfqBYE [`#.NW^tkӷxe&\\f(9-o+JBڌl 葹 ϡxBmOTom)+Z8E谾Um.gAU1|Yjǿ .c4V1_-+)ddOa/~6ōں Gxԡe4'E"sxTx-RqWw!˅EdELfA:hu>C?_l{xb7'{pmE֙=/.gCAxCwq'm5c{g`-da4k4欺uxwKPS2A?N[ 2njji.'/A p{<7tEÐOO7ђ,jUp(ClVl-ڲmU{)TٝßD%-yEXH+h2 7 Ky!&wPvzLbv|^(`_$t|)I0xB}`[XHw ~l!~pt%.xne-D3x {<$!ݟKBԴPvz( <l'WKβ{~vpvVKN3Ҳ[5_Ӟ,6c~Z?Uh*%|{]z Է0h*z^8O-.).7/Z兰S>w]ixC;Q~ +<;w=sLsz^m_ .6pҲi΄W _@QvzaջvC-Te~_wi:k ;)- R8>ǺkX?A]*j+|jjЦP+S&+n I\N$ Ceb:O%R@<ʎQOu9JQ4Ws sEDصD3 @VOSeX> w}>_y[ouZ/w<.Λ̮9r,jrk:(Pߊa"[ ɡ#gÖ_ 5lƯ3_mo=thgwku\1Y A5PrRx/jrG٢J*/$!ND]<yVRh MB'l dyV<;W\_eeFpm`*e Nɵcc#E' E+$5U'Y-Yr8e}V,J Az ÑIԸS۬{f K_JmZ=7k__%ey0lɋCƕ(k]rH_%&oAz0[[[d߂!eٕ} 妕^A߁(ar}˿G1[ GC?*T)G"<2ŝF/ɘg/g+`C_ǵlaJ >}ʔz؝%DP2cf??l~1_8I&?eoax T5?^Auj'a]`J8 LS [*@etj|h[^,}0VyarT/~ IJөa h?cz3EV|W"%wFP)bTc"F-`K'MQPMdN>NaylLZduwEU;0=1IiCV:^؄] 8h}HLEmn*U+5&ihi4{"e[iDmn7fZ:h=Q`~wb~ 'nzaFf/lp"MvlaM.&a斨M0LNv[&}&.@[L/]iu4`}&rv4aLs_tf?,p;,s{-,SAZ&L 1 NaR B2p/Z_eR_::eI::F -unPέNAX*Echz{:mӖj7ZpT ͭr5>A~^H!n9oWZd] "Է% Ρ:Ə\VVɞuچm7S mo-Cui?+h~Y{<(UƟ__li WعsWrօώ*eodǣ͸l\UUZņ%76#sܼG{Y^f5n`'(!o?UawcJ++:gƐpӷ]Cn9Wfb*fg&_g? C1ǠCͅݔ;2\ܖ=#t%{hNA Ol)ץ>n>)t93ƌDN|խv/a m&Lc2avl}In7%8签DuH_'tX'ԋn@njǿ %P9ģI)H}/Fvd;P6{1m(مP!KC!KdܒtCxq تOGO*zNrGRUޖNbZ7T#j#,*cf?V`N3 LW_ҟ_yZ '!oy<{yêTz0@A$U}kUCO-UOu>*CI66*suá_Һ={tMNjߺnz1K0NA؜ۆ}j7_n/qowCnnb oH?uIno@dzkQQ=><BY5±@@ǃZuQ"%hj斳m:h_[˾Y`&Jzy0lqV/;Y:Y=%%#&OTNz#^ݒoK$N *ڤw.e#(_AA l%;#Tv찭Mi^ % ' )D [6m T]wyPiܢl6%.JumuZ)!>'㐥=7*( 30*,p\?]/Mr[#^Yjh*n |跤^F2#rd$Yˠd JJXC)Zx8ē6 \Q?6F 3 }$Ypds$el7ldl]>9Xvw5T!ȇw [f Rp695R}ƀSMkXhum]9چr!M2͍i7}̲`d Ug+ڂvR*j_$ m_Q j3puæ VHޞ[H{ws'm\6:Y h;[;u+3Y"|zf^ȓVp'pb[Pfu^a+]*+{7-`i1RtAjCFmnj#=@p 1^s%*Gc̅UVyeȗNx&d.ZtMwt]Iu5-MA~n86K*HpnJA - ?)]@ӨۀW7IE!II{[G|&MV#ӨUvL|8 ,09) F* ,ETl!|| {{T!iA g~AZ91t仚AM >-&\Q5CBt@> Kw/CX =+<X13!KQ?sN:'r>j]JU@wqGoUC}f2͕aI<#_{Yr5'/ۅy ~c/W]QI%ȗ߇ | /C+RW'y쯖ݩhwE,[4G^%vmiv,-Gew!/g_#RpW׋N>,sQctYgUl#8Ű3tưl[VZBY':9T|jNN*rGVϺ~7 8L?{)G?)a6 CRa4pmFQq\ Xl˫EV6>]f7.b'7>v.@f&{YPhP$]GoL%NB쏊BagJZakGi=B֕ծ,CšDZk2^6A\s܁p]ev1JɨĨrv>_CnZγYuIJ 5]nj(:F=} d9EP7| k.(.E*XlމM&x)!V"rt #K8 䲸)X Og#3yOc*^etMepXqI$me,9~Wc!kGG 3`۽ЄzۮԖ[xЮKu+npQݲ[j{=md<WT!'tvv 2ݡz[-&e2,|c,RtN`Q4WMn=w8=Y<|$fHKP,vniα{ #*{?k' mͬ6;u ; Yg6U3Y=վ~^֜a{|md @)s+%3:VaVp}!?,М'Mt.Gx׺ԹpPms f3B(i2 뉆.o@ܞv+yFިTьz4Ukiba#oFe8/Ð+sF0)sG{OxUi;pd5>^R;"}pd6O4E3Q#Z ;D"CX蔜-7d^.(Mri6 k禴,.x#0>,VKV@JSw>.^r?5U+ǰy$ixԲ>#7jD0xn7]ip'NfylD ad{j$㺺r|| ˺¸N7P_f`]@ުo=F rr^l_ꌳR*?~X: iߩQI  {nYz 8ȾԲ@X#?hHb;ہXpز~)!(ΝIM j F='^PVll[Fk{ wΨz%) :H[E7gͨ4>EB%a'lyg/l-ȷ4 kT)mȷRBEw Ӧ"D=北wvEE?jQ.=*'f(%>,F"`Gj$@& `D[hG+EQ@̎B>@ ' O(mk8򅃍jYԔ9MCZg~v"U =V #ڠ%< YjTLi/`Ceގ:7%lG >CiJvj0 'ʜG^\ /G5o4ߠ`s-m/L}39IDY8h}4 KI0U&opEɽŠ_6C풊<>Rw¦gVϭxN8T@bE`9PfR‚|>@3#,NTW| YnKPq[ ʘN6vKV 4兰. A~h0B]/_<5-)ېvѹT]C~ΖnLq@=K/“)*C<`\W_l|/t5nύ5ȿC<2[~_.R]n_辎2PTu_.pRD{| (n=:I?ML|ծ??펌R'`@ Y*SY(;F=waؼۚV>z(5Oƿd{+sK%0P6t3aRVkmǯaz$&HVVnLsl"۠bCUN< '사>ռS lo$ReXc֛HT@d#r"ʎQO7riؼk GkZ{w6 w0N PKK,*S0$< aljvCXvyDD_"!;W&RLnE})KTY @>ҁV]F1idž_~kDa.{b ae>Ya$`B8y@>S{DYf>ʎQS|k~uz=ߑ7݂-3K)9/„ A\1ʐ[!=hpof*ޠaPL\;l[˗ˉ!LP-T]iV1[饚W+ZN |L&5 nSfD7Ro*rBLFBnE -FmKo(85v,T bQ6^ ׄ :$8BV\{ vv@C+'ZSfd ~X,uE|'ʎQO;<$, G,jֲݫH׃2!OuN=/kCQʯZGDKxAS|reBlFB)hP߂Bo[@pM6d6GZ52<B>s^Q#~!ԗB#[vNy-.Qr<6f>0](& dS;iA0!737a흝<{F| AQ-u2]\)EH~dʘCьiVP.Kvѧ֍o5f;˴.{vAR&M'.0 u{˙6,=.o$㹯ed~L~Xk7ʎQknYlr-բ~S]ҍ^U:?H 񨮒Ԯ3 o$ S> h4R> VҲT:"e[EM\Pm,wq#8ӫYAy+ҋ8Bx@ qGV|i,~m&}.6w-ۉ7<+h~Y{<(UƟ__li WعsWrօώ2%</5e㚭n߽K \O<Ѽwܼ2v<]4 Yv.8}@' }GhZmj1~9+gư[MWp$`s&_g??m.쎔MO]״ψvh3aw<~W4(ͬ!Kc;y|R-ߎLwl>>7.p%l)WڨHh鑷$f 7)k7p2\Uj­.^`ZzwqtcBfNRԲMsVѱZ-o266I׈bѤ Al4귌l%l[l]*C<2wF7z&p=)& Bޛn!MuʡSRtC!oOA-oPB&쮫El枃ul6C+ J_CTahPOўdM(#lVz 퐥ګVx4ieWM+ͦJ . ղRZ qش:x¢TܲT.N7TN57u7>h76ps'*0Ǜj(n~#&^%xCCMM+f+BxJ!xCPxuVihiunPnR  jm Yjʫ*~/x$U[:;یZOn>z6 8Mx< K\Źp˾5e>z8'nœ9m3ͼ[Yer?h.ΛXB I.IzVM3͊Zd4_v%LKzÏ$m p.]9ypvbb:M$W/6rz;r8ּ>\naRvϕ`mEJ]&k#*0G!ЈʹQQQ9]# RQuaDg5&|HQ,[/KV_qb̎l%ۓ%zՑlIx1p*n!XSԸݿrF5HԤ Do"-iJNŸš:qj9-q?a,5 \Ϋ Qi\.t- |6c Y"켟0kf}D;:> ;SO3 .]Mn:>&> KmCNAG|*784!Pϣ";=r8̀z<Ѐ6T.EPڙxO 58 nkl} t{ :]JN ;D1xi -OiGKҷf;A.tf@=0 \33w@3(](;6y~uM wkgϛJ"<H'Dvz!*#,ʟEmD}khD} *5:>)Ώڙz?kp 9 "\sSA}$P8a'QvkGF/a?Bk'́*_4֨%"켟 !Bᄝs(;6ȳ nވ&ۄkiϛ睇;MM{TaTZ APY0耺RP0 \;Opx756.։m ;:-pB%e檾`(wr UTD׎KvXNňvcģo&c. 2u{]ƍM͊xt$9&s ~ev9bg+F3V4F?p+Z-o[2 9ʱr5 IYm=B.RSB-{yȢi)$&:svPy4iu0c:CQ|:f,g)PtSv)[QM#@ ؞h^50N)4rc;;(vYNhJ@H㺄nX%n@~կhVS -A &䛲W9C5VF}-Tpо/Zlᑁ<~ iK6]< !g.xx˥504!wQtzfϛDޠpB.&wkEglDYxЬ ,k3oz K-TSѶh(@|Zl/Pa`:ŢY.=` Ud]LƬ*E8>\z/vyT +ﹾ/j[ˑE0lq-m{#xaz.1d}KV8x( 3Κfi,:~ೞޞ|Wg) \vf ELOSyCt {re g0<'nGDf K4wD* xio*E'Eq+0 ޜUsL)Ѵ۞]!gm,ڨjebGC3wp%W9ēF.A.ۡ -)wuыh} CkQ{!(Ky/? 1ػ@CsG9z*UEP-|bԆ,sj%X B>~' O{S 7Q9u׵icjAjn߁j6JBZoc9<"߇JgT!K bj}YC=3kXڄCґv#<q5R~^zQ^|E#Eޅ|;,i E1Ğk~V(J,B.bo1aW؝ބ|ENCVY]-Q"&uZiD(={گB qTvV[;jn^WpdY# Cm5$/cOmu4iRIͤ yR˧*/@Y<@˻jlYH/}¿CD/?6͉/z#ebn~H Jx}B*j2CA7;!cC nA%ZQj(Y8Y[kޗ!l`:y],/]Ð+뤛^<&G!#% x 1e Fgrv^q$ڵ,p2>_1-;"m3T_";9nJ .k)*>ÉCX(M?g\5("xTp'2+n`'L1)mlCTQpYބ|䊝~QxzGO4~j 27Ewr!#[AMMƣCE'ye>*%_>#<6xRqׁw!˭F7GsRQS' 3w!Aoa1t# yquЅ@>~,[?!i^@m%K(P}t4+{ُCs}.Dxڏ¢b.1߀A>[yeK5qC$xCǚP|{l/ٞwBN!p8Ч0!&&d"r89 *(ph&1,d&12pd:j^|UY-GIL.+BD;z"w df 733plvD ..s6Dw)9EˋOt/DFdCkV`,:RsH`Ex=Gd:[CIh:%LlD6vzdacDGe eⱗC<6*n;pdʻ&E_ZH'h;Ɣw +F m!vB\!KE*e7m{vPʾIeL8 v`حv!w2xaJQT-}CW,_44qyd8=~Zrg=tl{ I)I栗P{W^K?0ΏԺ;lwvDn',d}+!4t5gY5gf}_/:EM?L(N$U]6-s,|Fu;w p TϠ^|]YR_ mȷePђT˻ǐ;' C<4:,짔IlGit윔60EfqE/"K`xn gQ$a]?ۄz] S⺁"ל  Qwnr>WxONC/R!Phg"2`gPog"9L~R31ˤ⺁HW(yA鱆C+! KԕICa7 @WBNCg6WlZ*~=۳S-GAz22&Zq_~t.0ofx_-#ϭTofhEBڲm "!#&TG:RP S K4 u4d> Pq@}>I֌_.j(,D+g?Κ4yg`?cߟ EYeUT53]taK&w!;ӺG0ܿl:# F,ZT|xҦ_bTs̈́]W`ʄJ>Xk+R rwr`qILs/- 1睂]mdOo>,u}/T=bdlo% Uka; .>9yrtE_翆C:{ߨYU wb *bgj[FƘsP,dўZtqhB6uB$xQqG#). QSl2ןVs=DUן(U8ga01BN OCO#l?9z٫S.J{c8<DxqFKŝf ]M8dN=1!44rxIm/Ln-ΚCl@;)wE|?,u$Y\ICXYIhoO /!TfUh&pҗw{m(io#`.wkf~L;Nc xmTF`ddFT4B}IC#;Gл*Z)eaBr[J_iWl.e5P9={}]XE"_7Y0f[MiT r8,m˝iKucJ7 ӥ%qxO yF.dzR苺>8wtNVW໐սΣNy*:+2Zؿ ;'< Y~jh>Ɣ6j hjRw~6#DT\7p+S閜;LQ-[Qæ|yؿʌ1!Qvx% =0]4Rq[Y5eEȰ-NѽNd Y X."\?}Ᏸl]8y^->h-4D|_ ?i[onճhNAag;, b`bq̢뾢CywU)__ !W~c QtaUCzMAA֗0loC=: 9TqW' IOBVvQgEz6?my0 TP[Y^ _ptTYL(8 Yj )2;&UrpP)RS;|MxLׄz'<}CPLVpK "Ou!*+S_8<,~N@)4w>; pc5`vC_Y3kMCyVAM!ଓx\OjTjTܧ!_O~r؆~D@ )#TNZG`x\0~d?BO#s:*Q]I#GxscQ#BGJuCVH'2dnvhc_@[c8 Zp@~%Ɣ*n'p-: CqÔ*?LǑBwqOt@Iv@+TL⺁k!kFtĘh+(9 a^bLvעh;ka%x\0%BOu@i>3:P]39aքz"YKkT\7ʪVloJ+cU3We*Td,lqCLe4xsǐ0KvDbK"vM]=1.Rq@}VS_$.tcC֑X+l|-/2V١Z #4;}8YiwFD\هnBFHV86>su42<ɒOfhpv/ySͨHְ1_Ύ7#`$K(6AF?izDBXgTq*q~e:ah9ݷM~܁ ?}x'Gx']ņwznfNf VY$ۯrE)V/`nhux#‡xot7~Bz _)*[eߦԲvު6Nr]ӶeU⢯ BxrWUép[N0En9&R;@ᎀc2Bm6|Q[$H @&"?im7Զ[?x8IjwmYObQQAoH(B{BBr'O V`޾Urn/(VP9԰[Ut*)Z7 eޜ6a͑,yEsh9[EXq=K:y3jh3!"vx!厲i9ku4t8r\gBTuH&L#AHHs#QIz( !p: D[54$D b$w1UǔN.N$T H&L'9aG{%Ϧ StIy0\. KB_cZ. w% [# ׁ"Y2RON<1~m໑LzN?>dЖP҆t画氵YD$CCCm=P l%F%VUWHr5\&`zĈ։#Y4:9Z}DD'm-0&t4jֲYj1ȘJq3W])YJ.UteͲ!wU^tWer(36|=ge3M<-h-0ކa.xchW}6Mz) ŌG񠤸N<1~%p/ԦXSN|t)IJѮHil$K&u44ɒGOŝy*x5B:NmDxcZȉ b'BЉrD `';*ԉԷdݩ:v*bg16fXś-W<^sЛΰK(J- qKnE6$.|Q eF]e&;!dL4 fBCZt Iї9!K2N)$T3f? |hVeNeNk}Gv8B8ZcXf*ǕOE퐻;X<+WK9;;eN&y5Îw wcH/JɂFVks< O0Zj|`2r#k͜MM(N2J'(퀝}$v.oj)s!Qχ8^dmw1χɄ{>TeH 7ʳZe:*PTAyGnR%pEM} ρa݄@E-(MF\teSq3'e^{IQԗBC?VSWw&>\,~!: NnkuLk3q$Yz(ZDzo @ ٴMӖd*PY=:ig۵Z#r+iY]I]>k5{睗'˪Be5TU2Ή'"Ny1(\^˴1xܪbZTu |:F~l'#Y[zy+l`Ɗw!?~x3WdKN0O]tu,4JUGJhԝp4|{KB:yk8̊V8Q ] ߕ} x'Bsӷ{|wi\>-ʚL9? yNbgpMtt; B(Zvm|at jZn0Dzosos٨j&ZBtL|U0wBzϖ 'eǣWAb3!4QVL֓^9TΦsbeթm'/޲c/ <tD[OÑDN3_'+֪jgY"G"@0g=_'"\ZoLu&o*i阊b#Fh̻y}Ʉ6tv1O*k x{vq\, n>GW0qr " <8PH60L .Ҽ3 k9,Y"Gw#@:igT"g! ¹[8c9&,߷!ȎBy7We}xOm"ry'ڲU`x Q8gGr5gykq˞@&Gsɚ;$`Ps -@ UV5G ע^yKG.rGYmb;Mhn-{?F \D ; [:{"@6܎1=l.BwY.X>dBڑG2aZ>ZhZKNG%܍ƫgh6^*po$Hx>Lz?>dND>@}ik}P^FD Ku9R{Kl9R0;6޷kЫSuSzN`;^*z>`;^*C۰f>DU Z7&|ڨ-V Qꨢ^F*+y]qU^zz[Yz4P~U:.㴱Y-3V1lU>&TGEi qPssuO]P&y1ŋΚVe~Tl `̚q1Zϣ#ˉv%|8=VfKP{ZJ>d{ytR,cltrݑ 爊7oA8~cleJ_*  Kx d?OZڐнI?V>7asAmS҉?Hc~__!FaKjHv(q5ȯz6&1SpqS1Ldi'KFn|P2Pk_mnr *IgmMm={,syJW*^\t_u /Q-QB{/)t4*xZ_a |Bpb?_ZtDoP f*諸u;t@&:8#~}ܨǘ,n*4"!繾\e ;#h>S|Pl'pW$vo${"9KfAf1#+kT>H&T4U i)7ې$o@i& rvM7l߄]|ӈ]$u[닡k} FAF3o(w( i[hT8Q?ޞڴaPp~Gm*FMUQ$^ݑ$$Gڥ U'}HGV|a~eV(^;bww=#{9PC1\E+?OΔB~tԞ93:c;{0pqx*/D.P\|WMIzrߠ3Qv?<}-usR@<ڤGuѬmQ:ˆL8UiVX"(;ƺy:2k~Ȕ`M 3h B[CN>Onk p/dN_4:QP*>_j~g,jR|626Z&NFm[&0Uj.ܹ#OJz [C!)Zm/6?P.Rȓ6:oHz+դ=K2[!o&9$f^Ҵ6ZY޶pBa6l*ӱLTڊ׎CڣKT[v(p#*85uF'(dJk~{hnu˲]YG\ l> i JM,[8.h_ZNuNf8yXtg/ *:좋 R"ʚQޅ|W߱)CvJ"i i߬rg"*G@teWؽp\ e,Ǔ3 |6smvZQG7JEBk6HSbȍcq ; R F3-b4Gs>)8Mvb?,I'#;@ 6Hf)6\[8 vq.1p`[<v@ ^MD4ZoVkȧiE`s[.~|qi-|F2 x;̲r3>j%쁬*iHfTòy"14\MѰIA.fț[>Pq-h+dw:$^p+,mj+p@zzzب^gbb"Q0Y.[ a߄1j1TmMFNsb; wQo'mܖNg%-n@ z 8$`k&'<ۅ'Q 8B:|3"L J>6E VKIuG!j+~1A*;2H_Y-Ȗ`39BK&Tyڟ?"J#M_O4$0-R3OC~ImuxnQ[ vPtwIb x @>fY<ЇMSks PlI4rG%IM|Șm5YN,bI]β8dMmmeeK0rx %eTS^.hgPd ʯ}e qnk7T\'p+3>O3xm,9U) d&^ܧC&RMJsDjx:{;;lXF'd͕tBVjrJuf3c$Wogqeaϧ;o󧠳sզ\(brxtv8yB[a=Zc^WU=IMf}_h<-cncK"KdJN1Z$&Lz>8NCki&Mv|;y{pfyP'B\FFmÇl.>%).v[)Pqwlr/@A6{]ٵG"qx„CT)(Qm=WTkmT-mȷi?ׁw ֋Jd?67UgR6TX8nY6SXe8g{Gv^mNE±М1 f-DQFīa' +{uKz3xboF53Fn`AD8pX=4g: LioPlaQީ7l^ . Hy=$I]v&:K4v G33CVǘвZHTAV#o@.4̛l_[l@, ^r!ϬRɶpl_:p7Rт6L_/Ɣ]A  w!+b*?5Ɖ^AJ\A>#%[C=s=#ƀ!띪UZM{(>J_HGJ}DPldCtA-Hy{G)=!ϐİ9& VL f3& yEPឰs(ը3shǕO>rNZ$4{ըSe(ܼl[Rhe(zi/yj7cQ^|m<7_km-4o9:X2;T|zFIt(=$Y6dB\[wDO@'Ft@116j'ޮC<}vm1Ww0( h*h`tGVvwAehOpxta7cm2~zn(s%䉖ĊͫC t]W0*PX4ѡ,_Bo̢#VE9k@p0}makKjx(c~I^3Nn4Z6. Tniu:!nC 6Hy,(AV:4X#<<Y95_>Zt7#ː/w ?. G9sl`[3lE~Y͕1I&7V5iǫXo|H_ [jN 6 ,Tpqݰ&+~ uQ9s)!g?1C}8y>C t ;m2_@~C^*W߄4 z%Kj+~~ 1!p G? G[bVQVzb7R4ԐU՘~ːBUc_+m2ԿUȿC- ?ijo3#Bچ*x4X w";uؚGgzJ7V"xF7ƺvSV RVJu IT#B0Mw\`9:NCN)~%Iyp7dAtЫ3O@OE)*`)F,^f ֲ{| _e.Yޙ=HWB3"}-dwCQZ1knγb i+N;SA\([$ZYP@].j&Iߊc"dV3A1#_=,.ҐXfyg nCJw^K`xĻqOrPF`?dsxGW g7,o|+D !+m=ϢW)Yb6$pKvαyTa3tEDY8'ˆyx<~dj`@ !7fq]Oz6dna ̢h\vtWxwp_vN[;!lyDnpd$!9B!]"-~*=O4Lb4dx7t@'eg*4Z H5GDm}fB6zJ#4bQlf_\wAVK)~mp1+f^7'(-4*m05'I&'E3ь1E[=F݌Up|[`TqN|%'* 23f+%4Ð>0{w*0k9-۞].kNDxA5u`/+0>B5f+|4h;MG ؚ Q];w+{HW|C6UO*+CM11m|yIXa$2U+0x]f]='@j?rxnq.>,W 5?%]t < 6igTzO+?!x l7X(A h]3ʕYwYNC~)Nނ25EXͅJ {Ztܲ'=L})jyFOe.FзBta-Yvb1OSڬ7]Ab^p6iӢ' Ov].Lr3 ¢ceIlQMN1ykJ'qLfc쥰K՛XXNB~D_F|Vs)}47AޤMiwxnHi"n P4j[dF 6u1AP/"~[!%Ƴ'\ yuzD?c4\c>[ M.pth{˲e̽FX%7҇+9twq٨ͨĿ #G- 4u'yۙyw^rB^a$< YRS=zK} ]*I(nEH!3'ةSgF.x[\@m&JxvvehӾǞ}\b~=3JSXYE?Z. g7<^|A}_Q\(T^β-+ ǵzoX}ɯ"w@ct0ir%o/Hd`F5/лq=J)"4_w;?c4c:ż$=Osrt{3k&$G | gAg6來;\36p"y. 䡩g|]75 (t4 u磎3WsoR&;7%r _aϷ#(;FcϵnfЉjEPT*Xr|fGޮWWМ/?.pˆU"Y\ĭӐKv-kDh.:w1,6ը<׿~G!?՚=- #<Y)qf(;F3Jmk^v)lBzoNTX ,F@ +/1C7^7fH7 9KUzNP`}тx i? C/5Pտc?=t$ [t__2Z}/eg#AW4_]Bsa7Lx#3d{֗k}v; Nh_ 3Pjǒ__5&fB]3ՍizN^3(ξ4;% p#< t虝D 9]{(;F3>neO'7үA*گ? BsK]Z?|oomג#0"|]O & L^矠n,lO\F{h-EmF-g~ʭx3u|>D3ۻ薋?}_.*{nÉwo0]-"y6I4CR.*^~Hoᇧ? N{sp'r[Fѱ߇5>*iʝUekϑ`jR&9XF:3F yP㐳?E剟J1s ۠K`&jSģ:[֕1>I~ +s9(sB)VDZVF 6^;b|ww=ۆ7m eG"Wn(iGt鬏~xͺaR5ܜ$mnQ$?zkV)K]S"Z5LӐjtYdqg3L|@@UFfȌ' O|BgW!_aWw)K'e͊LBOI<_?$IIذ^ at.FPݽ֢!T0׬C#t $j:!Wηe:V%ts^#qSueh6(؟{ÐU`/o^|`$taN@V~j+V)U 7Wֆ9Y>8L@~[*໐q3t뢃ѡp݈]g`.,Y>@Q<8ކ|[T:CJGj_Yke`-ɧh3ނ;o5To,Zm[p+ƚ8el&:{ 2k}C>-찭D)qȦ lnZQ5Qu׵$G8yJ-J9vC-4kޖV[z蝴aʞګwx 68YQ! w1F}ЯmT^\V샊[^YS/Y!)z8j#Qz%orpM%ȗ VRB#yP!_֮$?. ilG ؚjZdF{%;Bl[1Rh=܂ IlR25R[evF2*xҦ5cklx _ȓ K(SO£y\•36{al\&J F.ktC\ZEO oxZ_Aq6- W\XݠKf41$yFJViUˊ%?]Z?nfC`cM8n@Ž yNFn-u>ZZHgW(`(ܦH!-9#Ʈg.F91$dd]a+HSQФ}T r1o{eRd=rVk$BO6cx xҢgKRǫXN@Pe◇ 3X b*nx}B:lB? Ⱥ9ko-fίFXPJrB ,{U;4Iy@6t(UATAݨnQsf.2̘sFѕ=?~-$d- Kmr]9xVvFmog+idxUc:Ro:L~> x4iφ*-_<"xlE th,)!Ta@fjcj3(;& M+|R)gbt W&J/ <j-F3蘢%FFe_L*#nN<@`9E!*' !VܨqAY65h17n691QC1%BA@ epG (cg- 7Xpf\on'(M2출hKz3oJ;oCV_V^iPg3"-хɱ2˖ad[8[!$iL,Z;8pe x^|%u6JԊmZ)m#}9,\^-{9zhPW&D{!+K}ꖃR7!sx4 ^ Q1EY]I4&8Y4vS)0;";i2;X%|?VD^S4f.ZStC4Yob .jb"$FU+t FQkO ش\{y8(e{EΖ0]w+5WZ *iTMc $F3+fy%\\$ϔa$QUd#e_z6x @V5/{cc;ȇhI(Um[-XntbwRza3+*ךgc%Fv66zej:A& vl\AZhE'<q#[qpz5s#LoCm6ӟH2y\NBn]v{4J}v rկf^7Ns۩MP+av*~n{R=DG2&3&=u DLjpC]'\+I/l%8h[=Ȕ :eXކkY(>_:'-̣mm%kntgeE؋mpS;aՖ[_*YU̳3;k{v1*%'+aaaRΜ{wQtfkk2ŕJse/̬p" ]ॣ0i+XD9$VPoaMT>? jWY&Kb$Rm;ŨzU:mfcA8V\4+9;I&I3O'z4qЅ>*9/| N{ +Bk-Y^ G_"<@C"wPvfƩ7TZcڣ n.x̚,:Ph.>NgBL@{ChnVVrFDb L$6.,,/ؗR;89 ȷ@p6Hl>B[S}M F=P}џ#=OʡGWGK(l+74̲~{6C\w7¿}g>_'ˁ[byN\,feL4csOn8w^O@nk;d>4P*A_p! ̴aebD.Q R,EHbKz&iNqV䴙vҴAL=f&=>+qАf%ԏAVF/y6_h_agk*[zHzyn7ÐkẊ4{[&lv]a'Nݲo5rtcÉ_^?vtwcP}}֥311쐭&oW.p *85u&'YrYi.ё{hnYCl \O5w!63e @_,$ (eBV̥ˊ*:5O|Bg'- @4g튨\ރ|ObfqdIJ{`%{2_r%+ziwT|xҦ5ckfWPı3S<"حXǧw%>%85m SUUIaCbrKѺ qV< *|xpޅ?o!/#9rg[zd :CX[sͫ?7e P] 'B ޸%Q ܭ8xjnyO6gZBUQOdsGND@’9< A -LB񓭢qPr%2ZAٳ ɲ2]EwN6qb7| 6}^0sBpX9}Y:Y;~5mkKvONI\ K/J^20zee$߅e*P "rxRA RģIcS|keAsVݮVxcJq@f`ɶLl43!MQqLBv D,4"KIC#瀗!_:rm,YbGA/!09gPG-zշA~O_ CΧcS@C G9 69m{mx5nCg3I-);#< t뭇;WIx9mbfZ- Klҧ]ˎ9D<94'>Ĉzs΄{# ]v IC#\^ϻ^`E8Os[ZihXXYpZ[6 NJx-w@"y[+d^l}8  eGbȴ6Br͠MK"!|Ȯ_HQnaN Zb\O^t)'/]OV syR,E+X) 3wgIDft7]7:f4"䋯w$ǁCA4QJqκCtu.y1-ȷY m-{z,] p'6 t(:F3w(>XcEh]]O[ =N$ HAu)f#˓J;Dk8 Y)"+7RqY5״YoAVHLŀC[Ve.oYѤ|m$R'GX7 b >[9àV GjS nV[M6l.aSlP%M i{2&=xbr8y |B[%}<]DUͭzGc{|y}>))N[{d f"O8yF[#鳚#CV\"ڷE-8b&cݲf#$2#3}-?fGQT"p9T\.d.Ʋ#T=}Xb4JǃI*3N\҉ ΁1a lRDŽxZmT(jF#{E&S6,񤡑i],muPpl*&?#UD('QJ;$Ⱦq[uPȤz/wA>z==U|G'j[Pz2B@z==KGē6[c7MJ-Or6Ɠ%SI²ߛ%\G!ِ0JRq ӶutV2 gSQKC exv5hʶ%lhʓj<>ݾģH'#;݌,kG&sҥ²mt7C6Cu@ޢu7.4U:7xs CNw{ hif:"MHl2dqNY~aJxf`~gmVeq@V77UۛL&.d`eo3Q=139+U6ӛ܊lc3|h/ڞtANa Y&I 1dY;'1D80Nk H@h$X75!&34[#LC-#K?C$r_َ]~YfۼgAw ?dTnHw E΂Sx/BoEF6d<ٛX3o㼀+]>X"ccs--dKx M}TRlQq _k#D_ހa(uwO4L#&zM v@wd띊C@ v^z>M%>TWF qFv%.V[[?DRq@l5up7cWNMF]z%ہ%\|d|֡适y|@!la&ՀCÌɀJi sƫ9LGv`;/mDwp %f3RcIJ@w'mQ1f.YbkzS&ZT?PPҔ)v1lKfc9yJO'߸HYT\@bx~r[vŇ辖PѱށS,͑+:@ ~]ACn^ҝػ D ZڋmѿktSvnG {6;Alv 23 >; 쇬_ Qղ)0~n37oxy3?h:9WRqm`'!a^bV|FK<j2:kS,Cg (631񤡑K702Y026d6/dMy% (loO?>L[7% .D#8#$P֢#i<8 5K3?I&tBiGQ@<6u*n pdTFY휅xO yzWyDbPXv;^d%a"E| LiPlm5HGS7T^;oXꆧ7L4c&_XKy&$yZ 4JZ9-!(^|==;vFI'^>.uT,/[;/ d!&gjↀ V|?Y)ڰWۨ`ף̄<(7(%={hc@@<6X*xe+\\/-KN0_;lky##tNYxh <@O$ BVJ(GSF?^Bǀ!'ٙeK;< (U-O> 8#X{ői>S!|ff Ddp#STj&A!g)9]G TlPs rʯП4B&J#Z6~lkі"Bt</w^ۢAg`RdPAbn*,_;y'odM ٨4)_.Dfgvh1m~A|F(Gڶђx\O͎<͑uUKZ 69l88.lpm\4n5,* C^Ӽ4cʻEauGchhOX<#'I$d>8^z@F:BNW*8YmEͧŗ ;<ܑ5Do@6驩r +SNP1imzG:)'pXHRTfF)bW09gWy,C.cwSK_ # Fg\q6iv5_@HnV x/wO@6>RqSsCx F +Mw3W8ނ|˘rX @Ҟ `L)iЅ즣%ȥ6T'@c:l HS+Ho)T\'c q_1]8Rq#@sch TR `\uN!=5<ttYϬp'c"W,RF9Ev#:_@~M4_h4˕bw6W5oж?{i! c۹-0? <^:F У@l'N];wn`|"uE]xVٗ6~Qlۍer;99R{u4Oss oAVS?0vrl/d^Xsn\ԦaG\?rot\cP ]VR]\(T7Ofyh,ac =GW w%AQQv!ڑi.8E*oJxqmF XwqFj$FL=րnAQF]yc7 ^sp#diXڧVʄ0a TR% C 5EYB<_ u8AQ5}259|ģPAu)VK?5! e,,=s'ˆ :9G[s'|0-Y鼂uCI٤Kg}#eWIRc>tfNiMI!nVig]귇2Za2cl cqݍOEW(*?C>3$ D{|$y n *SVs(RܚHdF' kBi>3j+f삻 MfET&w!էz6ہ!IJTT(9/ I8A(rvMHT|xҦ!T0׬C#$j:!Ww$';֢[K~Z`IG QWٓDcR$l "lpPѨtYȳ4Q--FkjX;%wgv_eٜS0Хŀ2V OW[Qq[V5 sRD~%6έބ|lQj-ߦ8^lhlG h6=kQUʵ!X_^เIJnk!itMӊq Tex<=;7ߓA>ꅽU?' N`rd,yt/Jw+V'ew>)B$=hpM[U ܒtsylۙ!G~/@0/#tHo$"Z' w"2ut#71, SơUC,v | ?( i[8Y\]]{efd@u/MpR(Fc}n,36+ toBVr k^峨1Y8y7U&N,8ˣ-o  odUI,!qLoB6l'&oB~ST-i*-oiQē6[hC$6v8aGH/N_ܔò)cr8|AI2m]>x # ^f6U zȵ.3rlfNb<^;\ M6ӭ96]M{Q_G!mLh?MC6!P_ Pk/:Mx 5|'̀l 9MvVȵ BEK>(#ɐB)!+M_wDE7}IC7G -[cna|r!,rӜ\-8QyTEMwN.ӒS,alBKN0<{"`Y{ Jx_Q'މU{'q4ػT_A6fK(<=ymZCKҴJqEz77gVdF$moc]fTv1$Ȝ4̌M-3ͬȌʛdFdFS'ܢhAf4EɌ&kM]ڒm dFKB1\O530^|M=t3K/aM_CW*dF?cwIlQPqҢ1wQ/,U$Nm1eFSuN W ?#NO1S vdejvi^G,\vt^<;߳?);}anF)z,!co?|'W~}j'ˁ[by]fekl!17rӼRBѫqj]OӴ ) y\Dh c!?B?N(2Ld~2Fmdll^<<}*x,^헽}NB~{|4=0\MfcQƣOúzNvAMEvYԃ˫nC^oVz%u{!+wWP g֕Qē: Afԝ}Ahu]!wj;$zH6D\˷5k`MAqMd*C@Qo5tQyFD|O5Mt4,z7;LJ9ӄB%opWN|fJʛl 3>+0$R r |ܝJW%bڧnJuӰWKY2tߋ!yYEmٜK*2˛qY0c{t).]'bJ1c*)T?*xס=}ZiJZvjhh(2A饂bAQL|F*(IYK>k JވSK^TPvTP.CKRAi{qjuTPZ J[9j?#TZ4 P#4I%ٝM |MC.X 'ޞ'p8y`sjaWg`pm+hX=Rk|**p62ҷˬ`y}s[@P̓MlE17;3+g a:Jjk?+ {d-a?*SsU.MP-ȷ%xҦɽ43˒(KW'PߥY1ui&[l o33AhUgD2=8˒J_b* 8 yTu_a+gʎPTUɌ7 [lGaese?='8JV<:`<|N,b# ׍ Ş,'anC`r>F~ <`/l`v"Sj>NJ v1(N{ڦ7*C@ x4iφ^ŷ ˸j, WH\f'{3mQ*O|s^9!fZ7%t>#j|y ݽ/B^ս6\ Ӥ#DprTM.aB[L5{'F_Î$dSI⧣q2YʵAQ -U2rfyJ2M|2Ț8; H;# o3b`\!+MXN2ɝ}x%$;=OD wzAiIS 'Y&IF*]iN|;!lQq뀻 hyMS̻oGynlQ=l ?ЦWgQ_ȇ/׳ORUm2g' { Ť!od46XRSlM'qUݻ6dIKUy.ITbN_A6d'"^3D*. t66R6k%8s*{@<%N*&:ԁz)E6j\4{:AeFB(llu[:;!+yru./ -Uk72 @z}enU['B]˔[Hw5BQiWxn^j)Ta;SutJħ bt \4/Wcpy%H0r(=Azw*\\5Ex)/}~;?KU-QKz"ݐIdl;eG*H\]M$6|PI^r6Vr6g)C$;.'~!O 'ž+cǞWUϙEޅwIU6H g6}v"IʄqLA)okm3kv.H; f6`x?N"wA]yUF5H)~*aRuG21 D;z'xehLJ9f'Q&ؚ[|'b_e5D7⇠E(d)mO;y Og 9h {So054^ц2mheϫW=<~g`hQ ܨ5naRCs*,p!yk9̀a;GʎL+!zXn1g]4iDcKK"´N7Ӈߟ2 *v.]~RSESؓ%,y'/Fe٩! ς3(Qmt$7@VK%~Wm.h y G>R΃&Aq+QdCf41dBުMl5c}P$<*ɞR&Z'N׊b;!7D`p'?ao`l=͵],,0G­&re":U[i~m0TL͍Ω4 iTJW5ͯn +W`_ii~֚gEtM0ǯ4 sjUWl_9~;Hrk`_ki~ tMk0ǯ4sZKMs_pra[ +-Ya_GTmu4*ecruX3Ջe~-ʝX3\za7`ȴou~ P㐳o" @VJǷ1tpĶ ۠/ئxDmx[g}ܺ2'3QБPޘr$ hnåQxoPfkgTlQnu~f{{ԿpsJ_G?~8Sv Q{uRa^X+L?M?;' |^yWud3JGܗsvqd]Vtf着>u^ԽF$3ZiwkVOO‡+T1::b+f򏧇YUGB?^8!?P_UE\*XdV4ΰ:4+;Rxbj}ɣA|;2N=Vh;$EUׁ6cHhxj\CIޙzx72H [UV(vrO:yߋOuB:`diXڧV` *~x )eyZmzy饯5 ^{ 'pBF#$jT~QOǫI bģY-g?Dlhh6ٳt8Νl`{vb_cTF2 iYҺH;JZz]:&vDQk=IR#!y 5NiM엁]'X!fp@;J<'5Ӑn`},zϐMdIQIŽH#Ɔ$aDf8y f/= l&ppH֬dc2B]JfXN$zmtrrun톬T|xҦ!T0׬C# $ 3P;.+$*8Ai"6$ qyF)<* xEՒҍ&E8 yTuo®Wg\UiwvT]M5ckL$IN'p*$-+z/}gx^3 KK9 <}x=)SJ"~ aSTIJߙmZ$ e-If[%[/1CL@>zˡvB>pN]ch;lΒS(Q%nڻ6dAv`|'o_b~9 f`g)2\O #lH*7dPDO oxZBܠx83wšQ}ޅ, h&>~p2#f>GUѽz K6|&Q_(f4GC ^ Z֦I-ȷIn*ynG!NX a#A~u^%ddݨBaPS?C4Ji/5;=2,ܻ<YQ==F+6o`+l7V2]:@ZBRIH{| $3SeC^{Q{3 7Z:A]c7MVxNT;jj'!mTR*anRD~A>dl%u3+2Iv]uVOݝML%nYT3ԜnFM2lT3ll8R7Ƙl("pxa-EĮ@cPA-EG YiD^=vWJscc=ʶ9ƙ[?kL\f*׻CˤZpk*m"YBM '*AL 2.1 =J lyjcF1d%gb3N9[[9}c [ )ёڹ/AdLwo U4v xt4v8yJ[cd_*~Z@<mTM7Ө7ē6 1V+'GYN'ki`rT?1Y=ZwgY[i{sz5<,'!Oj!h-*Y®߂6 =JTMCqjF]-Q5X*~Vu;b iRTxҦ!fVk]gٴ7K땲 - 'bAĆ, "U%w ~z2j鎚[%z3z35۳VpfVoҐ {BaaPl,+L㽍Com7odVnUK674oȝʬ7HR'߰`5 KZzFGZ `]aI:U! fZTZsrh圇cO# ?ҞTs-eRI{XKU׋5ZuI+`8MѪ:!-gꢽtu 8 yp,!NB6*C@ea|}NNC>/@V TJ(Q~mumz8iaFL(0ʯ| %4^P7fNu%iW>.rZ*4 W.~`ofE~eV$2 Z_X6ȯȶ=`{Ӑߥ-AA~崺S(:F3]NO;5״C7Cf%_ʿT2{uF+yy$6zM(u8i8wQo/I߫i8tuUՎ:$?Cf8|`)y{Y (:9$QkeM'!YV [G#bQ!K{𕒈rO`=x_ ϭB2^Fň+FB)XY)']rPX~#=QUQ-=곤y.[ o济1{JFæ;`}9/|@"47``b-r. ]"40CB:{(;F3XV{q@}MmPxh&5,C$lchѶ!O1 'a‖$yk{!;s{ 4ףQT~&Pk1=J/!ӌCD7y7mQ%N ]NNf1&K==mŖ,ϣD!ҹh܊+nZC~ yOz>쀰}y887Snjfv)'#V[[o?@Tm0MM4`2 B9^C0m?L4?9~faKM3ґrOaO3OaO3mџ&fRџf;?MpDWJ ̑0n#G6wm|IQGheZm NNHR]2?5~Ie~1r[F̏agjRne| ܙѹv'2s|Y||jR&9||ɋu>Z:9|$Ym(?pQ{4nSt}K?)I~V&:KPP-RP*`|^) *x*`ŷ_I<Ψ&}Jb#6~ .v}?ė.#Ig_GUX;PH_W2~(h4>Ib WoAQ WǷ nL_T&k>׌/"Ybj7XB.RFR3^b@L"EH}Zz Ǩ^RN>a-i%׶W/k[(^K|emg{5 dSR)jޚ j. zZrsY]oosY&c+] ņ"dj^6@1޻V) "k57!2OF.Ss'(4SFg7R' E5e_N*& ƺ$%xN(rZK}(A6^u.\zP~Q.]ޓ;w9mFuА<^jcJ`䂰ru抮g*<+1&ĴDڜhJ0դƳR15iff{0pDXRot@"U֭#* _fnZZMZ;RϸLkf l{y6㟰@wrqB<ベ  <*2aNP7Uv=xw[hTm﹅NM} <s5=xks@u빅L?#y"kBLSjКYkm!YL ݳS(ci(p_ڡKg5q#@X&v]rFk]d%|*KG.X}@vz.w빾~W%Unԝ1 !Y$RlB٧K}]%uECVPW/Ut5$ 4t#!cFv tmJ}K|^ύbFsgۋ9rD7 ![& &ygYI?LJFי4C*mUy% ~FdFxO9 쇬vxHtpݶ},U(,=I'ogvVnI''hS>5S.C2 {/-*>Dd%OA>F{#j{N/Bli.dx{H%5;+T3|'~VQ7!ƩGǐSr*S"ܚYUGs5Y]H*(lx#J9< d:Bjkg#ؠy ^]kI4&a1!U.Ykг;.I63D!S cPqጩ0,eS5R4ކ|[iyE &Z<\V Ba-ϡckVcƅbG Pgē6[ychH:^XCBG%tC8Sr MSOޠ)vgIShO1]MblAϕH K&ܭl՛ɦWb5ɤWxr A *n~hGͻK̟w˅<O|( !$Kbēd]\\:i<-v!MkZ]j.tzۃىv:˙  yfβu+ 6Y=jȓ%>:t:EKx 6w@@|8}} ӺEcl>LIݪ[9jxdd'nwQ6bp8:[W 6]w:Br)M8_5H KKضU\Vu0i@< ' lB@_Y3uf"%s ygq@@<6*n d%'E[dHt@Ƒ~EdmtCNaJuw@֟*x0ރ(| 8- 44t8yD[CDŽ& U0 =ﵱ1" _;zF5povFsF(7ܑ|C@%GOSn*پhd>q ~+zA%xҦj x+8%}\|J&S?+}J /~`?dU3v hrGVE5''!+-0(!+9@Y@0YWJ |g d5 Gv~mO.>4ͱ)ߖex0oTvo}F1*{f9Nӥ+Jr螡Qň' }M!tGe8Q |QHl" OmӭL{*ƔOPmϝC<ՓEĦ "*}u'=񤡿73=YD͝,:`GxC8" V[ wx.ST1q2 ldסIxfNpx0FTIH Q<|RܥI)>9OG~n V[~N3P\Fs*7},Cw* O;5,uxώߠQnU<4c&2d)Fѭ.XYIu;q]w۬ICgO O[9x4iv|wqxrn#ȆiJ*p]$BB=O|G2,XlFZx CɅm^p=iGQb7=7b]2H֍/~HG)%%_AV "*j 1vYfxm6[.(N>{^D,2 Ż(+|  hiιȀ/eU\F <9}})z={ Tv9Ҥ(_CV?rYIr&cLyW65J1ԀE.Lj{NF;hAǯV>0v;rD<K2j:00Į-WFS>߇1uɟv*9ƔjDyEioP8MDAVKafex ' AVZ\BV(_iqDZbDVG)T\'܈Qg*11<Y}׬DS2Rqǁc(>htB.B^Hح|[ѭ* N]*h[9]+85<~xZ`Lk8yJ[_[B嫨hwēv~ c~N G0kX?,eThys?cCޜ!D 9 [T\'pd%0q,PkeO^Iǀuu#B]%ϡM ރֿ5,toREEO@Vr+Ut.d|OIwD=s0%6v?XS0; 47y?ˢOǭ}|<-ypރIuU& >[acxZmST\7pd-iu!LV~9N`?v^E<ē~2ڬ!6W!_ms) 43 |-d[XmEځ :C~?=⛑J9сfP=qO):T\'МEk>!cR܂Jd +-kix W壢 ]d'ē>\xK@<4Bߊʱrw7C@<*Nmȇ]@ KmvPt\ 4o9{VcTjv޵y7KL!G *ۨ[W?O*1XKf,g[Z~O؏̏gZq ^]='@j?rV<8Pv`d~-;y`[EpPr/,Nd>}O90,0kr |XY\PXfyKu6|JC:[DCX)Ia0?Y¨n&N]0T\'\cLE5!eW?\t\+:m0Qπ!?OTd|[|Uf.`͗RO%6lT3L#ku؊櫺8uSlT\'͗|}/9lggRo?Y;66ߟCWk+Q㕊ɘ*z}NQ^b6^*#ǐzslc:l6%N]_*|}_bs|I}9\n"N]/ lg_Kl|g۰fK߉ =Je6hvMI@zm>'2WV4~; oT$ O@ֿO^ehFj:g"{}/_>6|7Tə_oEARS}|!M##=d!vV,څ,[v|Ib?|wwOhn>sLʄ9[prtD0}/W>>zYwy]wiU_mf9 X#(g7I#~ ]ˑs~L !kxUYzp򄡮Ҵޥ 2H1 _ђwܘV(rdLK YYm)i'p~ C>;u;<ޡE?" 4/hs<) 6Ϳ '!IШ55Gƴ|hk7ȿc4c; vxbN DNmr}b8)*ad-b Ql# ]; FdtFT4rʌ~&FoAxwX޵}Vم!~+[t:<$W'x ;ԪN[OM)|H庈&@ $TO5Բ4S-#<6sMifY#csϮbZ@_.Yg U{`|n7,vSEEeJ! X񭎇eWxʫ7ׇVG?{G$\~WP6K*BiagN!?:jϜΝxz8_*,{ VqP\en͉?jXݳ&,ޝȐn*iwkV'p`;庩bΥ^^GxLЖ3יTGB?^.-1І®(i-nzJWkmkEK [ʬS|p?BOV8kS^Jl@f,]qT (h&$NL=ր y ƣӢhp]yuiiO[5B:`diXڧVy)ՙNR # C*9u۶K"˹Bx 6:,bwٯ/@`Py'aO *S8-QQT^vg-T#jh244$?鄒$6o52xse("Vn."i(kugģn Sʬ#Imp#*85uF'߹5^V)E]귇憘@@MR^8 yZF ֧". yϼHuHވ6B8ypdȌ' Ot|BgW!_aWwe͊LBOiy;?$IiP˵MPN Bmtߨ7-Bks:7¡_G=@fdw.+$*8泡& s<-8чWz8<#ϔ4oۀ!+ ՒpvMp/~ _inImôYm@~MپSvɪDUs‚E;\(T4uOz߲|Z '˲/5#7!~KvX%wfN1J^&7VI| ix9,;ۢ/' t }*J\C&&kOI)m*cC\T>U5nzn͂ÝvR| xOY6GߥI 8 YiafDϴu@w,kSQ6X/pT_Ky1Fn0\«gBv:4ldX40Fx ?X jGk *+ 44yF Գ8EJWiu:)5וKlBC|[z=LZ'hgC=9I'ZܙnΘ;-,gՙ-ֈlu3ޅ:l! i b4S{nٚAp=jl.‘B^/p1+nr;xmb;8>3dmȝ>PzXE`?!W.ݖGK[L9X4 l=R#vëE[A+qos* :5±(<YƊ_|  Ya5& 8Ѻ|AzLdخakAU@Aiyl_|`$j*xR7[.caFRiմns̆pNtg!ϦڅԝlgE8UKݗnČOB67l7Wy+$=mlu"gX^xwK7o"v}m@͛~ 3Nu9z1_: 3B9#܉E9c,jqئ1! iLb4S{j\?QbܼYdacv=U0%<r7٭[(*ck6]5c:PxJ\pZE" ܷ CZ?fQqa{:f-֦n BAS6KͅP'$":W7!+" 1,_ReH+ΟEݮlKnȎ6t)8ϼKi헎/Qp:2qbL'QZ|DT|mLm qB;9+lOIo!ȇW ҡ[Ly6(d3lOs[tFid]"uxrJ[7 . '@}r~ &W*hn16Y*{ʢTs\g%3|bTb=!$ KNn>k!A6XJd}\)Y@hQq xZOi? ukk8{L[*C@eaxiPx^{;FuisSJxxf}ܷmԭx񤭲a-1m mesh$dfLuYaCsvAHf*9] &Y>JZ1Y"f%+š%]tҎ1}ӊr/DO,W|kn\7̝}G=B!.SN1U[ o_~x\n}:\1c[[ǠO(Drt9U*,ɕ=M̬E)𵤰" U _~EWr1/]0!6BSW'T.Vfk& ]4S|%kDC@uIx[f3mKzrYLnEguBv񤭳?EOҸ&%^?.E) }]f g, +i+{|:Sq")~6r \V-Ȭ.ı\˪?SSOi9tw!kun4Ŕ߄ /߃ IQqC~ Co=CkLujTTmOF/K9i4j=RaB~Dd3o ANm}bwx;}Vn^V_Di$d!kD`_!=A@IPt636%a%)nO6 >ݎ g33So8Fu̺)%pʵOZQGYeř\y^q[NJ&%Ri ju8>;TSo'mIfɢҾp/W kU󵨴| PV}Y<} 6Tv&cǐpv>6"gb]^ӲPq朝M!7ƢeT M7b..YF.X&N/2*E8YF'!Fl,#,<SZ7h;v [pppO%0 p;V.%(xK̫ڋ?KVa] E"<6]u}sT:%"Y!wbz .[!o5 )Hwl*sCz\>5kШ,σ:#|._0Kz"|(O= tw@6a$r)te,F"N[酑MH_Z Fئ0WLb4]Gi\R'Y QԜ`Cl]}<*BE.тm/z4@[v`{ N|)ʁ˯4u0F& ~?̾06S }Bh>`m0s%+eY}/z1 YdYn?$4AEC1^} 4`x97ackj24{l 6 So73cxgxN[a׭̄FHǿ~~8óu8r|GG24Q!lcPtfѓM"tYu?zcixts Dh.^whE eIy#<!bN+ IF8=tVAIֆH~ľcdmO$)s$q?S SFI4!Gnѿu-T&`jѿZ ئϢhvk-e:A9ح;̵t 71"bgTؒS(ykQtρ;5;D'+1?/" C!i/__D<_ Qi P?]3cY_ Vc[ojg87+K/͹j{@2 C.+D8=GE"G+쯂 D4l<5i#epU HsGx 1mf|~mZFld&|Y!] Ljo2N.]^ 8ّ |A獓|1__0Nl㚍bL) Fɚj;j rf(h( `[8m6S `2JG@F3;p?q3&W2Q֪_EtZD|Ϟ^u (6{NT4YM @$*J'\ yͪ$sIA<6 P[NAŭnpl%7T%G.oʁ(˓k*UWӷ_ĵ~$~k:G^\Sk: P?N%j jq .dTb-ܜp4k٥wA0Æh5=P{ifw/lUxa{$^~A9®yp]~@y=1RCSy?hYvy44&Q&l0_nE>,6_'N[שMj%l|#`&$M@)J'lxl0 (hnz:-sV/,lI5\$d`&1>s߂暿7-4+暿CzG\0۔n1?njڣB 9ܮGс(^8E^!87lٶ(2svuu홧>!`1b9*aZ7q6GYh$b//i6SplMf uӴk}*54o4??2N&PoS=# qgMPR5r:i ID1uԏ: w.~W~?G^ B憚f6$4 h7 _'䠶j_il۪d+mrp[ qMp 52_A )Hk_u?p}ZW%N[魫Ri1JuU*~xҦC3Qoijj:M&/F#8ߑW+<2Lc qgEJ; 8/Ft%+~}[8m6S P+aTS4xqՈZޙ=H;$Hp $xd$Ahwv*d*:3 ZV>^쮏}JZXo__YCjfUtv'}JV|#ދ׋M/ٶj-KҖ@ ҄j虩&9ģɦE)W7 _WfEM(ނ|KFjzA)#-PĤr3ofH0bL?̈fcgx0gJ`(Cl\v:vQƀ@jQU~xL#SpyiuV?iQ^\v^+\D+/-? 7H30LoOm6j%l0Pz0~uU$w#áqp/Z,:Ə2x\o̷666FrilP1Į:Əb{,؞;|6:i.ެ?@a\CyScc4ش= jyBHqpYF`'pW$kʒfuXrFHmm=HV˘s rki4V1liFddkiGuN}qi4 3M= y^fXhif5ML9oȴ4\3L430LM3443᫞Ufj0ZܣwD0k3 [٣wDȰ=zt N&2Rk6;aFΨN0Livm6;*yfE- I )OW2`] j]Q`դǰXr [fWT#F-k׳NZ*YZg7#[Lb FJ:#u0Tj8ĬI8Jo'(uMyO*)oDlA[4%S1*{6kvE_e7oW,7CAW:Ə\ymЋl2Io(ȸ6E?v5xsse9V`LGFgfNp={Ҭu3Jq-Yh=%k~?lFUƑ=fjw.˦W~,ږoGY(% ,v6q͙`͵ d ).7J~C<4GT {wH|{J >[}aM۲%}P=iBn,U2 SmjOHw0fBI#|/x4iiقm%yIj` oKA-Xdo~TK#j ,'qUZڡg/~ƨW-ZJ+ 46]+Xxb 5i%^n_8ӊBP+ܓV0J4^l[JQ+퐥jb: R }C<s خgJHtJw؁DA]dD'(6r1 pAI[IHj෥xޯ*bay(W+kresܼPxo{vGys 3Sg"guy;+ ͚\ɚ/x*JiǨ^r[rkK4\w]f֓AVjPz2nZ,{{6,N4NdLflehWd 0n?^WaF^QtI:=f<.0M:,pl?xɤ80tz[g^)viƊv[VxS4&t NX~:@4a;  [iF߁N(~W:A u@ߥN|q$NNEc9d; 3ϮmgZU3[,&t)e;[>XYG\r,"[HΙlpu(,Җ"2`\&2W!_xl~j* ɞ^|MZX^ʚseOԬ{XMSx~NJIPإ 'ͯݰS}IF%|7^1QN Yj"i*l$mAX.Z.~5֢ xq|EB+&dSI+ ׼)YsW5ȯiP̖oV-w~gg9ހ|C6k~pL:{#cru 1 '$MQ88/!uBn&N&8I{ƈN7p;d}{6?v nʒvḣόb~Xs./h+1ѝ6<5Y5үFxҦ2J 挧2ex$wшQ=Ffe%_{1wNay$:`(A 2k:{8;?&/;68q>t|Pͱ[)~}rZrm Vf{c+W+2,8>NIg͹_+] lύذ{ȥN1Yjn-b{6m`r!J97M@ىMJ&p򜲵)n0,B.*sًYnvŹٰ^)C%">rFM,=%x_Lhv@%Fz6ԳV1_-Fmxhm(RɅPav9LKoSv%!#<5YCn+\h%2xҦq%cPU#է"շjHVA5\ѦxORJ8vF^^|]cj7b~xƨ5xo:SrՏ"^1gyH|q@Qt+ K"ر~VD+t)~7k.y˵wC^J:Q .LnsɚJ0xҦq%c@ #:3TuuBZ,,>U8dDM+oW2;"6+PZ8Q<֞ UeCђ 8 yZǩlzvP\#rig\ yjA| [|HB&8r䗚oC\2dHۨ#N&@ؤE-N9Qy!j2cⱟC<6JnjAR%pv4(cJRJ ;YlIAj4m]EiNz4{!ֲyCy|(4 >a8sۥ"-_5y)nd qBO3JnԸfS,mV?!=R3,w&DʲMeۓc>PynKDrp-ϳp{.w5p׎q|T| hAұYȳ~p64c8  krWva:nX=T1fqܜwl70ݲ;8- {_0R w]BMTKe׬| W^ꃩ~Jcʮp%@;` ByԴmn[xFz)F2CW4ͬP^AZS1iȧQ!gcƑUh(l؝Jl-lZ*PoY/[pTp/dbKe K\Q{L/l]%ڴ ٗ[bx}W^F> p77a{ Ku="Q,t=X/tsKG7Gus2{,XRJxˎ{Go>Ey&7mC4ϋL.ȻpY'MPmIFQOiH!"q&!Z#.]^##&`>eYl/Dn{Vx TЁwIv8ʪ9<][[V~gBtO_2>NZ9dsâ>rfȴvr!Iqs/{1I)Da^ċ Ga>"i}=GqUe,QtKօ=%.nyZ簋R|aԻ8.nQlDgz6-xDU,CoVȁݼ݆.oUΎ*%PȤNQ9* nŚ%+5(ǽBm:+<:'~/* BOhfΡc,lS52g G ewG 4g^$AϏQ93xI%lEkRz7ұ:o ׀!KJ_c7!KMy[R6aψ=jz\MS׾YN &m#U" 9%YJn3p#K o?-HK"{巈 :n.W=tyQp8yFʮ p~!%Ef4eV`i(>%f0naȇcs$.F#*G9rz.U?Ƣ9+w &dz#q`}8 Yj/߂,[(mo[p¶\_#*O Cfsy˥iYF,Lgˮx|@!#JBI8y_.||T+~Z)WMΤ~L~>Cv"| cT~.3~_0|4K.=sf4&g' ނ竾_*j 쥆nYn"ZlDX9a?~mZxZC~9\V %!U>sL~hDs?]J{ԭZV?Dbē~Bؘ ACWtfN^~CqvMPéIxp+;Oj/D~T4=5oCQ"Gw!_&8CMÖċ9TZav LQg|F~䲰X|o$|#;!Eݽۅ1)vwCT`72hYj0k\n-: ʼn*x@Q+* GΌ(пB /Ή{iǨg@nМbiD-`I /hۭ;&(arNUu%Z!SAPk(&Q ҎQ ^@:jczFM%2wsiTCsmP#4!MiP;;Zi[B1냜6-r|mf-fv~h]g[ϙ}ο !Vx̆rai7P,H;F=-ts\~VX8w B!OW% FoX[d6./t8 X9>/T6 ]jywpr'@S\|:vzt>9hGr%Ž?R.RN-@a|`pݲs6=,>K_,Yg ۝ae4CLn0i~V܂K$<Y=WH;F=<5h`_23L6V=.)p|Z.c#>_#S`ym G$l9j-|4ˎ0@6霄߀*iȧ[`-ҎQ $8w OxuL`h4Cl5l9aEw Ҭp>{p'<;|w 0#g~1"٤̩ZgqO@.@4!w"]J%-*>nB*AfBAG%">X(juf2m'#a*s=£WQ%vxYf$%T7L KH5t(6HFQh,{b+F kVmo3E oP:&b.5`yT$h)a3!$Ce!o'; H OR`6͵ _iC@nRQ?9Coi~s5"#[#TMsԍa; mb9sCCqUsnCغY, 0/Zm_H49~bESM3Y%Qj%K=/T{/a_ѿM6GlW49l~shqWFW0ǯ6_j|e4G} s:]uMk0ǯ[l_nivF_ ,'Nf$XO`?v2z#n?J~1 ,p\_+6E?ێikhs JљKKVb%{ܥYgF 7ZQʙJ?K?zk&2<6W)2;[8]6y=;fX= Vb(̍ModI\+.Zubk'F.F_;efM5tnqiw3V0?m;]eSmq>x4Ft_uNlBNmPS]>-+kMmɄeΡK_ܖ`^ W䚴lm˷#\rqpͶ $ikCy(_vrkM=w$'C+}OEIvp++!V+ZVR0Vѱbo&6(6HV(~hZ MlNY+\ȵVje34^}Ӷ ؑHiPe{CI Ҩ $laCj=ΦoK'q ĈGX١˚\.Pf:C³6O<:vA dT 0"V}kQE n&\)6HV-t4~$Pf5蛙1 RzPZcEgZ85#h  OWԢ2:@-ҀnEU$2W!_8BKdAZ,/e2Q"*{SZqm >XIQѨ=\Ѧݰ5~Ҧ%|7^1t h%)ݜgV+tpmHhP =RN|óq:׮;?fy33 G!`/o^|Ec$,4Rr=(.|3+Cbo$N׀O ?Qv7h=֧m%XdG v1<ۯHQbx] ܝYAƣ0hr1z]-؂)_L8\uC!pRK 愹hw,|@ј-X@8< 6N*8@kH)#䜲ڲ2:|ZN6J\DL^^|)ASVf6deUrl#ڐ\khyXJ;S3hGEG@1xn6OE CVB,ދS |X-~4bŧgЅ t7A~M=wNCѫS6.6mtFMc%c˂hb>XT-6g8mDl25֦ul3"L$䓲vE]Rr=K|=t|P ꘥6]CPmЂl)-.^hyڐYbFv}`FC 7ҮZ$m=~o T7 n/ڐMJ&p򜲵858\b] ɆS T>]:a$:YyӪG !kGa@ _Pw%EW(8l>6KJeR#M_O4bl?Arȏ'VH@rcM A(pp} 7b_?6:Ztrh缑3+{8'X󶟣 }s~Ĭ%,5j2ݦemo-K#<眿BoQt<oOrd9!A#oCmm-`3_4`;Gcӻ^jl:*jTD{߇u^9^^4&֯d [{}QsY*G)*\=CnkdC7h2Ɣ1Mp _~x4>B>;%$P 1[[NRr{rfݗ=͢h^ϣf 7,_4ieٲ[\fq].ps֞Ξɝe|FOvZBqX[YWpDu#7D^iVDu#lyER%U|c!qK来(,hk*mm[zbkGaGZ1-Uϯ ÎBB];!֦uO~!_NG[g@~EY[,(qWkGEzdv*ێtKtpSB|z <58) p&ȓ|S2IGl)~h:A.Q"lDm7sJf,&8P]#%!i@ Ĩ46o !yBDb&< -~)xCn Tq)Ot+ca=};X'v; Kbb4 f:j;<>1Ê1ʤgbiZpq[!omOdvC|@>0x/՜9.DI%ۧE|@~Kх{!t8gjGP̪[ o8%e$ Pm? YF_ܸKTU({mz).y:z$TS Kk$OM}4)Ww IG7Ou"K)Zˉ= OR(3, V7ox.v*O*t2+VyG, K<8eR3Lj#5Jh}S˛g/31toP~LӡJ'H p{v>0B3\$]_wCֹ  3>Ӝz\E'J#Ytrxr /FQ{/A%YtJ x\#,j i#1LdV M`|9=3DwJ{EvC`9ᐛ0SI D1˾9bvx=ju/B*z/Cv>0j3Ck_Kb}UPm:xj^[&d#bƈY!O+ Ex`l-s(ax q0=eG*{)n+a2Ӯ`Ӱ&Ϩ+@I6Kkilzd' O(όyYmD:!ʤvA%k&v-njvFN 3 sC>`M e#TT&Z-(!)RD!W%8 9$^~EQ%W%U' R;uTӽ Y}_rmj=L}x))C}>f=|3[ k,84!wcI6?Æk9D f!$_ir[U_JG#Fm ״He | HLl JMg (Jx!ECsU`EfK*-!wX$IW<.[AɷqGS?eEC/6wQ 0UdnQT* 9T=׏a[ Qmi3IwVM&Lxeab4]>n;RCW;<蕑1DgEӐJ?6Zv,*|$w/4Gv\pz(~x ڪTE]p$tx t43q`|xrR_> 6j>0,6tյ5C]ˣg[U׵l 5Aȃ؍~A>Vŝ6Rr}c[O@vn~mFBǀW Kbi<$,AÜy[aYO#>J!J&(d}ͽ8=; KybՕ|= 8YAGy!eW +zt//Ltx tLtبOb :eEJ:dR]td u)òς*nU{9WZ"u x VqFYGT[,d͝bՖ<҂GiafN1Ƥ0nVR-Lt8YjPD͗ccR3F-T&Y=pBܯws2ImN8ģ|;;?}pQhG Ĩќ2Ѩnn!:j&sL:'YB>b#9ēBc1Ȗ_CүiqBqՏŚ!ܷGhّ̕G&]ʜ0Ab".8XE\k fޢ%Ey &36WX 3A!LBy p49R4YGLA~NjCՏuȯe#_ރ|OOG8|(7!>>uH|K_曍mUz g7çj/yz G y-Y[BHfzNwPm.[UZ"kヲY ޓ={Z]#+l8a77tz;,2@EA;fmd Qzt~CMk8J΋ &2A/V?K|< U8L]S?B2||x|R5R;mGœ-!  Z-X"QmhlJ 2찱.oD%u񤭲gPSz x;k4wV Y> mhvmK<fQ\QMCٰGp +ƅ]BEXށ N9`8HC!f_ԝ狖]fY(竵#XȂNARˤhVD.G/m-tֽٞm696,Ƴi 79y$&:M̕*P%_ B҇Y,[ ~L4p'r>e"*LAւ)?n(zHnBվ?bwIlԪx-摧o}mOb#ƈGX͉8ɪTN Cf0[C gA!$vz$ 2z7%qR 250HVH;F=]AIt\'EsrMvrBl  Cڔ\pv3*gC 2 2RP:cS&I 7ti ʋv?h/Es(lv4ja_ w R{cVc x^Vt *Ap?dp 9@ke޽2gE_BWp7dh+4W޳P\E p1됸- a4-/vz˃'f&%;@ѶS&&݅s ք7VZkP_k+u_~ה(ar5 Ԕ=G,qTh^X+,q +rߥB\u'RgA&d͉+VqH#¤BU&J4:5FYQvHwBOy3O&%Lσ6!^1 'ͻf챉^r,oF%N„$ OBG0Χ ]ɾ$<Cɷq(?=zC/Q#R/؄,菙U,M3^rl\dy$F4OHTָ%]Fe; PVZxx" .O؞˰B}3”>!̑rMC㇨jn|aMC㇆Ꝫj!Ƈ06M?J4?9~jQG0ǏjU/HcF=ǰǏ&?6R?=~lG6Iݳ]qHb '6?1Zl~ݷpOak?mi~ s4]iMSM5~ɋ6 A| *DFBZg*e~k blP #]aͦS R%dW-m>d\]#N U#r$T:|ە ! @} pGּ6ShfߏmC6$^lS[ٟ!^LVۼ=xjѳۿYr_fTI7 :x1c3G w]eWmq>x4Ft_uQ;V^uuӢ?ԘLXc朢+xb krZhۑ}pIh ^)e;[>XYG\r,̒Ld Ii^A\\v5AA%UW5:kv95הuؓ5gby)KGE͊LANI,؁ -xڢGr-wqBMa8ē6 a/HTvBn3 :[8T6$4s[v^ g5Vuv$ِ,(Q+h,u^T,«y?Gb6+㉇p"F׀ ?RfvB9) ky8ဨjPz 7[tzTb`a2i6-H4T[T(tqjG C95\ARK愹h 3gmw>X0}byv~9_Dx_sxJrJo֒U/ZLK&#Cxqi\AVwzXaa sK6[F o0Ka*ģi wJ&iB|AV]ޓ):JÐ&q嬕ۗZo<͉D9lAߊn5zWECх\/l8J~#x4i-\;ՒW+h 6iM4ckD1Y880yr&C#w] Xq1bwbD\!Kb%wxj!;!nLxH6EN!O+R 86~ T҂vV:a4iƏ77qFؓk|n!L:Ir.8EX9_|^sgEgW<#3\=,;eQ:&TZf@dp]7 Jne݈M5˚B}mߘĭ5unX%c2AԽzeW|brx0;ZW^siͱ`U%uCCth-Z'KQSM] N3>Dp %;,74#nG`!_N~aZ*>?a_.QuFmώ"$+gN'|a#Um"M{YьBހ,7jYuX wUqӨйMe>"\[aȇ_[1jcvE5ORj{N%ƩJCK*y"&gDK|9$9k9s]xV(%F=Z sXUS'y-'ZO*ςrWgaw'Ew6Ћ~cWlq4XO9r+JnJXEExv_˝QxjGhK }#AY%ԗ/Cdplyr/man/0000755000176200001440000000000015137161765012173 5ustar liggesusersdplyr/man/rename.Rd0000644000176200001440000000555515106134104013722 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rename.R \name{rename} \alias{rename} \alias{rename_with} \title{Rename columns} \usage{ rename(.data, ...) rename_with(.data, .fn, .cols = everything(), ...) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{For \code{rename()}: <\code{\link[=dplyr_tidy_select]{tidy-select}}> Use \code{new_name = old_name} to rename selected variables. For \code{rename_with()}: additional arguments passed onto \code{.fn}.} \item{.fn}{A function used to transform the selected \code{.cols}. Should return a character vector the same length as the input.} \item{.cols}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Columns to rename; defaults to all columns.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Rows are not affected. \item Column names are changed; column order is preserved. \item Data frame attributes are preserved. \item Groups are updated to reflect new names. } } \description{ \code{rename()} changes the names of individual variables using \code{new_name = old_name} syntax; \code{rename_with()} renames columns using a function. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rename")}. } \examples{ iris <- as_tibble(iris) # so it prints a little nicer rename(iris, petal_length = Petal.Length) # Rename using a named vector and `all_of()` lookup <- c(pl = "Petal.Length", sl = "Sepal.Length") rename(iris, all_of(lookup)) # If your named vector might contain names that don't exist in the data, # use `any_of()` instead lookup <- c(lookup, new = "unknown") try(rename(iris, all_of(lookup))) rename(iris, any_of(lookup)) rename_with(iris, toupper) rename_with(iris, toupper, starts_with("Petal")) rename_with(iris, ~ tolower(gsub(".", "_", .x, fixed = TRUE))) \dontshow{if (getRversion() > "4.0.1") withAutoprint(\{ # examplesIf} # If your renaming function uses `paste0()`, make sure to set # `recycle0 = TRUE` to ensure that empty selections are recycled correctly try(rename_with( iris, ~ paste0("prefix_", .x), starts_with("nonexistent") )) rename_with( iris, ~ paste0("prefix_", .x, recycle0 = TRUE), starts_with("nonexistent") ) \dontshow{\}) # examplesIf} } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{select}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/distinct_all.Rd0000644000176200001440000000517014366556340015136 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-distinct.R \name{distinct_all} \alias{distinct_all} \alias{distinct_at} \alias{distinct_if} \title{Select distinct rows by a selection of variables} \usage{ distinct_all(.tbl, .funs = list(), ..., .keep_all = FALSE) distinct_at(.tbl, .vars, .funs = list(), ..., .keep_all = FALSE) distinct_if(.tbl, .predicate, .funs = list(), ..., .keep_all = FALSE) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}. If a combination of \code{...} is not distinct, this keeps the first row of values.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. These \link{scoped} variants of \code{\link[=distinct]{distinct()}} extract distinct rows by a selection of variables. Like \code{distinct()}, you can modify the variables before ordering with the \code{.funs} argument. } \section{Grouping variables}{ The grouping variables that are part of the selection are taken into account to determine distinct rows. } \examples{ df <- tibble(x = rep(2:5, each = 2) / 2, y = rep(2:3, each = 4) / 2) distinct_all(df) # -> distinct(df, pick(everything())) distinct_at(df, vars(x,y)) # -> distinct(df, pick(x, y)) distinct_if(df, is.numeric) # -> distinct(df, pick(where(is.numeric))) # You can supply a function that will be applied before extracting the distinct values # The variables of the sorted tibble keep their original values. distinct_all(df, round) # -> distinct(df, across(everything(), round)) } \keyword{internal} dplyr/man/arrange_all.Rd0000644000176200001440000000643315106134104014716 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-arrange.R \name{arrange_all} \alias{arrange_all} \alias{arrange_at} \alias{arrange_if} \title{Arrange rows by a selection of variables} \usage{ arrange_all(.tbl, .funs = list(), ..., .by_group = FALSE, .locale = NULL) arrange_at(.tbl, .vars, .funs = list(), ..., .by_group = FALSE, .locale = NULL) arrange_if( .tbl, .predicate, .funs = list(), ..., .by_group = FALSE, .locale = NULL ) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to grouped data frames only.} \item{.locale}{The locale to sort character vectors in. \itemize{ \item If \code{NULL}, the default, uses the \code{"C"} locale unless the deprecated \code{dplyr.legacy_locale} global option escape hatch is active. See the \link{dplyr-locale} help page for more details. \item If a single string from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} is supplied, then this will be used as the locale to sort with. For example, \code{"en"} will sort with the American English locale. This requires the stringi package. \item If \code{"C"} is supplied, then character vectors will always be sorted in the C locale. This does not require stringi and is often much faster than supplying a locale identifier. } The C locale is not the same as English locales, such as \code{"en"}, particularly when it comes to data containing a mix of upper and lower case letters. This is explained in more detail on the \link[=dplyr-locale]{locale} help page under the \verb{Default locale} section.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. These \link{scoped} variants of \code{\link[=arrange]{arrange()}} sort a data frame by a selection of variables. Like \code{\link[=arrange]{arrange()}}, you can modify the variables before ordering with the \code{.funs} argument. } \section{Grouping variables}{ The grouping variables that are part of the selection participate in the sorting of the data frame. } \examples{ df <- as_tibble(mtcars) arrange_all(df) # -> arrange(df, pick(everything())) arrange_all(df, desc) # -> arrange(df, across(everything(), desc)) } \keyword{internal} dplyr/man/same_src.Rd0000644000176200001440000000062314366556340014257 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/src.R \name{same_src} \alias{same_src} \title{Figure out if two sources are the same (or two tbl have the same source)} \usage{ same_src(x, y) } \arguments{ \item{x, y}{src or tbls to test} } \value{ a logical flag } \description{ Figure out if two sources are the same (or two tbl have the same source) } \keyword{internal} dplyr/man/count.Rd0000644000176200001440000000760415106134104013600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/count-tally.R \name{count} \alias{count} \alias{count.data.frame} \alias{tally} \alias{add_count} \alias{add_tally} \title{Count the observations in each group} \usage{ count(x, ..., wt = NULL, sort = FALSE, name = NULL) \method{count}{data.frame}( x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = group_by_drop_default(x) ) tally(x, wt = NULL, sort = FALSE, name = NULL) add_count(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = deprecated()) add_tally(x, wt = NULL, sort = FALSE, name = NULL) } \arguments{ \item{x}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr).} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variables to group by.} \item{wt}{<\code{\link[rlang:args_data_masking]{data-masking}}> Frequency weights. Can be \code{NULL} or a variable: \itemize{ \item If \code{NULL} (the default), counts the number of rows in each group. \item If a variable, computes \code{sum(wt)} for each group. }} \item{sort}{If \code{TRUE}, will show the largest groups at the top.} \item{name}{The name of the new column in the output. If omitted, it will default to \code{n}. If there's already a column called \code{n}, it will use \code{nn}. If there's a column called \code{n} and \code{nn}, it'll use \code{nnn}, and so on, adding \code{n}s until it gets a new name.} \item{.drop}{Handling of factor levels that don't appear in the data, passed on to \code{\link[=group_by]{group_by()}}. For \code{count()}: if \code{FALSE} will include counts for empty groups (i.e. for levels of factors that don't exist in the data). \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#defunct}{\figure{lifecycle-defunct.svg}{options: alt='[Defunct]'}}}{\strong{[Defunct]}} For \code{add_count()}: defunct since it can't actually affect the output.} } \value{ An object of the same type as \code{.data}. \code{count()} and \code{add_count()} group transiently, so the output has the same groups as the input. } \description{ \code{count()} lets you quickly count the unique values of one or more variables: \code{df |> count(a, b)} is roughly equivalent to \code{df |> group_by(a, b) |> summarise(n = n())}. \code{count()} is paired with \code{tally()}, a lower-level helper that is equivalent to \code{df |> summarise(n = n())}. Supply \code{wt} to perform weighted counts, switching the summary from \code{n = n()} to \code{n = sum(wt)}. \code{add_count()} and \code{add_tally()} are equivalents to \code{count()} and \code{tally()} but use \code{mutate()} instead of \code{summarise()} so that they add a new column with group-wise counts. } \examples{ # count() is a convenient way to get a sense of the distribution of # values in a dataset starwars |> count(species) starwars |> count(species, sort = TRUE) starwars |> count(sex, gender, sort = TRUE) starwars |> count(birth_decade = round(birth_year, -1)) # use the `wt` argument to perform a weighted count. This is useful # when the data has already been aggregated once df <- tribble( ~name, ~gender, ~runs, "Max", "male", 10, "Sandra", "female", 1, "Susan", "female", 4 ) # counts rows: df |> count(gender) # counts runs: df |> count(gender, wt = runs) # When factors are involved, `.drop = FALSE` can be used to retain factor # levels that don't appear in the data df2 <- tibble( id = 1:5, type = factor(c("a", "c", "a", NA, "a"), levels = c("a", "b", "c")) ) df2 |> count(type) df2 |> count(type, .drop = FALSE) # Or, using `group_by()`: df2 |> group_by(type, .drop = FALSE) |> count() # tally() is a lower-level function that assumes you've done the grouping starwars |> tally() starwars |> group_by(species) |> tally() # both count() and tally() have add_ variants that work like # mutate() instead of summarise df |> add_count(gender, wt = runs) df |> add_tally(wt = runs) } dplyr/man/check_dbplyr.Rd0000644000176200001440000000152615106134104015076 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compat-dbplyr.R \name{check_dbplyr} \alias{check_dbplyr} \alias{wrap_dbplyr_obj} \title{dbplyr compatibility functions} \usage{ check_dbplyr() wrap_dbplyr_obj(obj_name) } \description{ In dplyr 0.7.0, a number of database and SQL functions moved from dplyr to dbplyr. The generic functions stayed in dplyr (since there is no easy way to conditionally import a generic from different packages), but many other SQL and database helper functions moved. If you have written a backend, these functions generate the code you need to work with both dplyr 0.5.0 dplyr 0.7.0. } \examples{ \dontshow{if (requireNamespace("dbplyr", quietly = TRUE)) withAutoprint(\{ # examplesIf} wrap_dbplyr_obj("build_sql") wrap_dbplyr_obj("base_agg") \dontshow{\}) # examplesIf} } \keyword{internal} dplyr/man/dplyr-package.Rd0000644000176200001440000000203214406402754015174 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dplyr.R \docType{package} \name{dplyr-package} \alias{dplyr} \alias{dplyr-package} \title{dplyr: A Grammar of Data Manipulation} \description{ To learn more about dplyr, start with the vignettes: \code{browseVignettes(package = "dplyr")} } \seealso{ Useful links: \itemize{ \item \url{https://dplyr.tidyverse.org} \item \url{https://github.com/tidyverse/dplyr} \item Report bugs at \url{https://github.com/tidyverse/dplyr/issues} } } \author{ \strong{Maintainer}: Hadley Wickham \email{hadley@posit.co} (\href{https://orcid.org/0000-0003-4757-117X}{ORCID}) Authors: \itemize{ \item Romain François (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) \item Lionel Henry \item Kirill Müller (\href{https://orcid.org/0000-0002-1416-3412}{ORCID}) \item Davis Vaughan \email{davis@posit.co} (\href{https://orcid.org/0000-0003-4777-038X}{ORCID}) } Other contributors: \itemize{ \item Posit Software, PBC [copyright holder, funder] } } \keyword{internal} dplyr/man/summarise.Rd0000644000176200001440000001251415106134104014451 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/summarise.R \name{summarise} \alias{summarise} \alias{summarize} \title{Summarise each group down to one row} \usage{ summarise(.data, ..., .by = NULL, .groups = NULL) summarize(.data, ..., .by = NULL, .groups = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs of summary functions. The name will be the name of the variable in the result. The value can be: \itemize{ \item A vector of length 1, e.g. \code{min(x)}, \code{n()}, or \code{sum(is.na(y))}. \item A data frame with 1 row, to add multiple columns from a single expression. }} \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result. \itemize{ \item \code{"drop_last"}: drops the last level of grouping. This was the only supported option before version 1.0.0. \item \code{"drop"}: All levels of grouping are dropped. \item \code{"keep"}: Same grouping structure as \code{.data}. \item \code{"rowwise"}: Each row is its own group. } When \code{.groups} is not specified, it is set to \code{"drop_last"} for a grouped data frame, and \code{"keep"} for a rowwise data frame. In addition, a message informs you of how the result will be grouped unless the result is ungrouped, the option \code{"dplyr.summarise.inform"} is set to \code{FALSE}, or when \code{summarise()} is called from a function in a package.} } \value{ An object \emph{usually} of the same type as \code{.data}. \itemize{ \item The rows come from the underlying \code{\link[=group_keys]{group_keys()}}. \item The columns are a combination of the grouping keys and the summary expressions that you provide. \item The grouping structure is controlled by the \verb{.groups=} argument, the output may be another \link{grouped_df}, a \link{tibble} or a \link{rowwise} data frame. \item Data frame attributes are \strong{not} preserved, because \code{summarise()} fundamentally creates a new data frame. } } \description{ \code{summarise()} creates a new data frame. It returns one row for each combination of grouping variables; if there are no grouping variables, the output will have a single row summarising all observations in the input. It will contain one column for each grouping variable and one column for each of the summary statistics that you have specified. \code{summarise()} and \code{summarize()} are synonyms. } \section{Useful functions}{ \itemize{ \item Center: \code{\link[=mean]{mean()}}, \code{\link[=median]{median()}} \item Spread: \code{\link[=sd]{sd()}}, \code{\link[=IQR]{IQR()}}, \code{\link[=mad]{mad()}} \item Range: \code{\link[=min]{min()}}, \code{\link[=max]{max()}}, \item Position: \code{\link[=first]{first()}}, \code{\link[=last]{last()}}, \code{\link[=nth]{nth()}}, \item Count: \code{\link[=n]{n()}}, \code{\link[=n_distinct]{n_distinct()}} \item Logical: \code{\link[=any]{any()}}, \code{\link[=all]{all()}} } } \section{Backend variations}{ The data frame backend supports creating a variable and using it in the same summary. This means that previously created summary variables can be further transformed or combined within the summary, as in \code{\link[=mutate]{mutate()}}. However, it also means that summary variables with the same names as previous variables overwrite them, making those variables unavailable to later summary variables. This behaviour may not be supported in other backends. To avoid unexpected results, consider using new names for your summary variables, especially when creating multiple summaries. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. } \examples{ # A summary applied to ungrouped tbl returns a single row mtcars |> summarise(mean = mean(disp), n = n()) # Usually, you'll want to group first mtcars |> group_by(cyl) |> summarise(mean = mean(disp), n = n()) # Each summary call removes one grouping level (since that group # is now just a single row) mtcars |> group_by(cyl, vs) |> summarise(cyl_n = n()) |> group_vars() # BEWARE: reusing variables may lead to unexpected results mtcars |> group_by(cyl) |> summarise(disp = mean(disp), sd = sd(disp)) # Refer to column names stored as strings with the `.data` pronoun: var <- "mass" summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) # Learn more in ?rlang::args_data_masking } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{slice}()} } \concept{single table verbs} dplyr/man/desc.Rd0000644000176200001440000000077415106134104013367 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/desc.R \name{desc} \alias{desc} \title{Descending order} \usage{ desc(x) } \arguments{ \item{x}{vector to transform} } \description{ Transform a vector into a format that will be sorted in descending order. This is useful within \code{\link[=arrange]{arrange()}}. } \examples{ desc(1:10) desc(factor(letters)) first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years") desc(first_day) starwars |> arrange(desc(mass)) } dplyr/man/percent_rank.Rd0000644000176200001440000000321614366556340015137 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rank.R \name{percent_rank} \alias{percent_rank} \alias{cume_dist} \title{Proportional ranking functions} \usage{ percent_rank(x) cume_dist(x) } \arguments{ \item{x}{A vector to rank By default, the smallest values will get the smallest ranks. Use \code{\link[=desc]{desc()}} to reverse the direction so the largest values get the smallest ranks. Missing values will be given rank \code{NA}. Use \code{coalesce(x, Inf)} or \code{coalesce(x, -Inf)} if you want to treat them as the largest or smallest values respectively. To rank by multiple columns at once, supply a data frame.} } \value{ A numeric vector containing a proportion. } \description{ These two ranking functions implement two slightly different ways to compute a percentile. For each \code{x_i} in \code{x}: \itemize{ \item \code{cume_dist(x)} counts the total number of values less than or equal to \code{x_i}, and divides it by the number of observations. \item \code{percent_rank(x)} counts the total number of values less than \code{x_i}, and divides it by the number of observations minus 1. } In both cases, missing values are ignored when counting the number of observations. } \examples{ x <- c(5, 1, 3, 2, 2) cume_dist(x) percent_rank(x) # You can understand what's going on by computing it by hand sapply(x, function(xi) sum(x <= xi) / length(x)) sapply(x, function(xi) sum(x < xi) / (length(x) - 1)) # The real computations are a little more complex in order to # correctly deal with missing values } \seealso{ Other ranking functions: \code{\link{ntile}()}, \code{\link{row_number}()} } \concept{ranking functions} dplyr/man/sql.Rd0000644000176200001440000000070614272553254013261 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compat-dbplyr.R \name{sql} \alias{sql} \title{SQL escaping.} \usage{ sql(...) } \arguments{ \item{...}{Character vectors that will be combined into a single SQL expression.} } \description{ These functions are critical when writing functions that translate R functions to sql functions. Typically a conversion function should escape all its inputs and return an sql object. } dplyr/man/ntile.Rd0000644000176200001440000000260414366556340013577 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rank.R \name{ntile} \alias{ntile} \title{Bucket a numeric vector into \code{n} groups} \usage{ ntile(x = row_number(), n) } \arguments{ \item{x}{A vector to rank By default, the smallest values will get the smallest ranks. Use \code{\link[=desc]{desc()}} to reverse the direction so the largest values get the smallest ranks. Missing values will be given rank \code{NA}. Use \code{coalesce(x, Inf)} or \code{coalesce(x, -Inf)} if you want to treat them as the largest or smallest values respectively. To rank by multiple columns at once, supply a data frame.} \item{n}{Number of groups to bucket into} } \description{ \code{ntile()} is a sort of very rough rank, which breaks the input vector into \code{n} buckets. If \code{length(x)} is not an integer multiple of \code{n}, the size of the buckets will differ by up to one, with larger buckets coming first. Unlike other ranking functions, \code{ntile()} ignores ties: it will create evenly sized buckets even if the same value of \code{x} ends up in different buckets. } \examples{ x <- c(5, 1, 3, 2, 2, NA) ntile(x, 2) ntile(x, 4) # If the bucket sizes are uneven, the larger buckets come first ntile(1:8, 3) # Ties are ignored ntile(rep(1, 8), 3) } \seealso{ Other ranking functions: \code{\link{percent_rank}()}, \code{\link{row_number}()} } \concept{ranking functions} dplyr/man/with_order.Rd0000644000176200001440000000073513663216626014634 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/order-by.R \name{with_order} \alias{with_order} \title{Run a function with one order, translating result back to original order} \usage{ with_order(order_by, fun, x, ...) } \arguments{ \item{order_by}{vector to order by} \item{fun}{window function} \item{x, ...}{arguments to \code{f}} } \description{ This is used to power the ordering parameters of dplyr's window functions } \keyword{internal} dplyr/man/dplyr_by.Rd0000644000176200001440000002311715137161765014312 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/by.R \name{dplyr_by} \alias{dplyr_by} \title{Per-operation grouping with \code{.by}/\code{by}} \description{ There are two ways to group in dplyr: \itemize{ \item Persistent grouping with \code{\link[=group_by]{group_by()}} \item Per-operation grouping with \code{.by}/\code{by} } This help page is dedicated to explaining where and why you might want to use the latter. Depending on the dplyr verb, the per-operation grouping argument may be named \code{.by} or \code{by}. The \emph{Supported verbs} section below outlines this on a case-by-case basis. The remainder of this page will refer to \code{.by} for simplicity. Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of \code{.by} is to allow you to place that grouping specification alongside the code that actually uses it. As an added benefit, with \code{.by} you no longer need to remember to \code{\link[=ungroup]{ungroup()}} after \code{\link[=summarise]{summarise()}}, and \code{summarise()} won't ever message you about how it's handling the groups! This idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, which allows you to specify \code{by} alongside modifications in \code{j}, like: \code{dt[, .(x = mean(x)), by = g]}. \subsection{Supported verbs}{ \itemize{ \item \code{\link[=mutate]{mutate(.by = )}} \item \code{\link[=summarise]{summarise(.by = )}} \item \code{\link[=reframe]{reframe(.by = )}} \item \code{\link[=filter]{filter(.by = )}} \item \code{\link[=filter_out]{filter_out(.by = )}} \item \code{\link[=slice]{slice(.by = )}} \item \code{\link[=slice_head]{slice_head(by = )}} and \code{\link[=slice_tail]{slice_tail(by = )}} \item \code{\link[=slice_min]{slice_min(by = )}} and \code{\link[=slice_max]{slice_max(by = )}} \item \code{\link[=slice_sample]{slice_sample(by = )}} } Note that some dplyr verbs use \code{by} while others use \code{.by}. This is a purely technical difference. } \subsection{Differences between \code{.by} and \code{group_by()}}{\tabular{ll}{ \code{.by} \tab \code{group_by()} \cr Grouping only affects a single verb \tab Grouping is persistent across multiple verbs \cr Selects variables with \link[=dplyr_tidy_select]{tidy-select} \tab Computes expressions with \link[rlang:args_data_masking]{data-masking} \cr Summaries use existing order of group keys \tab Summaries sort group keys in ascending order \cr } } \subsection{Using \code{.by}}{ Let's take a look at the two grouping approaches using this \code{expenses} data set, which tracks costs accumulated across various \code{id}s and \code{region}s: \if{html}{\out{
}}\preformatted{expenses <- tibble( id = c(1, 2, 1, 3, 1, 2, 3), region = c("A", "A", "A", "B", "B", "A", "A"), cost = c(25, 20, 19, 12, 9, 6, 6) ) expenses #> # A tibble: 7 x 3 #> id region cost #> #> 1 1 A 25 #> 2 2 A 20 #> 3 1 A 19 #> 4 3 B 12 #> 5 1 B 9 #> 6 2 A 6 #> 7 3 A 6 }\if{html}{\out{
}} Imagine that you wanted to compute the average cost per region. You'd probably write something like this: \if{html}{\out{
}}\preformatted{expenses |> group_by(region) |> summarise(cost = mean(cost)) #> # A tibble: 2 x 2 #> region cost #> #> 1 A 15.2 #> 2 B 10.5 }\if{html}{\out{
}} Instead, you can now specify the grouping \emph{inline} within the verb: \if{html}{\out{
}}\preformatted{expenses |> summarise(cost = mean(cost), .by = region) #> # A tibble: 2 x 2 #> region cost #> #> 1 A 15.2 #> 2 B 10.5 }\if{html}{\out{
}} \code{.by} applies to a single operation, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, regardless of the number of grouping columns. \if{html}{\out{
}}\preformatted{expenses |> summarise(cost = mean(cost), .by = c(id, region)) #> # A tibble: 5 x 3 #> id region cost #> #> 1 1 A 22 #> 2 2 A 13 #> 3 3 B 12 #> 4 1 B 9 #> 5 3 A 6 }\if{html}{\out{
}} Compare that with \code{group_by() |> summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, typically with a message that it is doing so: \if{html}{\out{
}}\preformatted{expenses |> group_by(id, region) |> summarise(cost = mean(cost)) #> `summarise()` has regrouped the output. #> i Summaries were computed grouped by id and region. #> i Output is grouped by id. #> i Use `summarise(.groups = "drop_last")` to silence this message. #> i Use `summarise(.by = c(id, region))` for per-operation grouping #> (`?dplyr::dplyr_by`) instead. #> # A tibble: 5 x 3 #> # Groups: id [3] #> id region cost #> #> 1 1 A 22 #> 2 1 B 9 #> 3 2 A 13 #> 4 3 A 6 #> 5 3 B 12 }\if{html}{\out{
}} Because \code{.by} grouping applies to a single operation, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. Note that with \code{.by} we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. If you have a character vector of column names you'd like to group by, you can do so with \code{.by = all_of(my_cols)}. It will group by the columns in the order they were provided. To prevent surprising results, you can't use \code{.by} on an existing grouped data frame: \if{html}{\out{
}}\preformatted{expenses |> group_by(id) |> summarise(cost = mean(cost), .by = c(id, region)) #> Error in `summarise()`: #> ! Can't supply `.by` when `.data` is a grouped data frame. }\if{html}{\out{
}} So far we've focused on the usage of \code{.by} with \code{summarise()}, but \code{.by} works with a number of other dplyr verbs. For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: \if{html}{\out{
}}\preformatted{expenses |> mutate(cost_by_region = mean(cost), .by = region) #> # A tibble: 7 x 4 #> id region cost cost_by_region #> #> 1 1 A 25 15.2 #> 2 2 A 20 15.2 #> 3 1 A 19 15.2 #> 4 3 B 12 10.5 #> 5 1 B 9 10.5 #> 6 2 A 6 15.2 #> 7 3 A 6 15.2 }\if{html}{\out{
}} Or you could slice out the maximum cost per combination of id and region: \if{html}{\out{
}}\preformatted{# Note that the argument is named `by` in `slice_max()` expenses |> slice_max(cost, n = 1, by = c(id, region)) #> # A tibble: 5 x 3 #> id region cost #> #> 1 1 A 25 #> 2 2 A 20 #> 3 3 B 12 #> 4 1 B 9 #> 5 3 A 6 }\if{html}{\out{
}} } \subsection{Result ordering}{ When used with \code{.by}, \code{summarise()}, \code{reframe()}, and \code{slice()} all maintain the ordering of the existing data. This is different from \code{group_by()}, which has always sorted the group keys in ascending order. \if{html}{\out{
}}\preformatted{df <- tibble( month = c("jan", "jan", "feb", "feb", "mar"), temp = c(20, 25, 18, 20, 40) ) # Uses ordering by "first appearance" in the original data df |> summarise(average_temp = mean(temp), .by = month) #> # A tibble: 3 x 2 #> month average_temp #> #> 1 jan 22.5 #> 2 feb 19 #> 3 mar 40 # Sorts in ascending order df |> group_by(month) |> summarise(average_temp = mean(temp)) #> # A tibble: 3 x 2 #> month average_temp #> #> 1 feb 19 #> 2 jan 22.5 #> 3 mar 40 }\if{html}{\out{
}} If you need sorted group keys, we recommend that you explicitly use \code{\link[=arrange]{arrange()}} either before or after the call to \code{summarise()}, \code{reframe()}, or \code{slice()}. This also gives you full access to all of \code{arrange()}'s features, such as \code{desc()} and the \code{.locale} argument. } \subsection{Verbs without \code{.by} support}{ If a dplyr verb doesn't support \code{.by}, then that typically means that the verb isn't inherently affected by grouping. For example, \code{\link[=pull]{pull()}} and \code{\link[=rename]{rename()}} don't support \code{.by}, because specifying columns to group by would not affect their implementations. That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support \code{.by}, but \emph{does} have special support for grouped data frames created by \code{\link[=group_by]{group_by()}}. This is typically because the verbs are required to retain the grouping columns, for example: \itemize{ \item \code{\link[=select]{select()}} always retains grouping columns, with a message if any aren't specified in the \code{select()} call. \item \code{\link[=distinct]{distinct()}} and \code{\link[=count]{count()}} place unspecified grouping columns at the front of the data frame before computing their results. \item \code{\link[=arrange]{arrange()}} has a \code{.by_group} argument to optionally order by grouping columns first. } If \code{group_by()} didn't exist, then these verbs would not have special support for grouped data frames. } } dplyr/man/group_trim.Rd0000644000176200001440000000214215106134104014627 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-trim.R \name{group_trim} \alias{group_trim} \title{Trim grouping structure} \usage{ group_trim(.tbl, .drop = group_by_drop_default(.tbl)) } \arguments{ \item{.tbl}{A \link[=grouped_df]{grouped data frame}} \item{.drop}{See \code{\link[=group_by]{group_by()}}} } \value{ A \link[=grouped_df]{grouped data frame} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Drop unused levels of all factors that are used as grouping variables, then recalculates the grouping structure. \code{group_trim()} is particularly useful after a \code{\link[=filter]{filter()}} that is intended to select a subset of groups. } \examples{ iris |> group_by(Species) |> filter(Species == "setosa", .preserve = TRUE) |> group_trim() } \seealso{ Other grouping functions: \code{\link{group_by}()}, \code{\link{group_map}()}, \code{\link{group_nest}()}, \code{\link{group_split}()} } \concept{grouping functions} dplyr/man/mutate-joins.Rd0000644000176200001440000003343615106134104015071 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join.R \name{mutate-joins} \alias{mutate-joins} \alias{join} \alias{join.data.frame} \alias{inner_join} \alias{inner_join.data.frame} \alias{left_join} \alias{left_join.data.frame} \alias{right_join} \alias{right_join.data.frame} \alias{full_join} \alias{full_join.data.frame} \title{Mutating joins} \usage{ inner_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) \method{inner_join}{data.frame}( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) left_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) \method{left_join}{data.frame}( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) right_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) \method{right_join}{data.frame}( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", unmatched = "drop", relationship = NULL ) full_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL ) \method{full_join}{data.frame}( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = c("na", "never"), multiple = "all", relationship = NULL ) } \arguments{ \item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{by}{A join specification created with \code{\link[=join_by]{join_by()}}, or a character vector of variables to join by. If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all variables in common across \code{x} and \code{y}. A message lists the variables so that you can check they're correct; suppress the message by supplying \code{by} explicitly. To join on different variables between \code{x} and \code{y}, use a \code{\link[=join_by]{join_by()}} specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. To join by multiple variables, use a \code{\link[=join_by]{join_by()}} specification with multiple expressions. For example, \code{join_by(a == b, c == d)} will match \code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between \code{x} and \code{y}, you can shorten this by listing only the variable names, like \code{join_by(a, c)}. \code{\link[=join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap joins. See the documentation at \link[=join_by]{?join_by} for details on these types of joins. For simple equality joins, you can alternatively specify a character vector of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. To perform a cross-join, generating all combinations of \code{x} and \code{y}, see \code{\link[=cross_join]{cross_join()}}.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{suffix}{If there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} \item{...}{Other parameters passed onto methods.} \item{keep}{Should the join keys from both \code{x} and \code{y} be preserved in the output? \itemize{ \item If \code{NULL}, the default, joins on equality retain only the keys from \code{x}, while joins on inequality retain the keys from both inputs. \item If \code{TRUE}, all keys from both inputs are retained. \item If \code{FALSE}, only keys from \code{x} are retained. For right and full joins, the data in key columns corresponding to rows that only exist in \code{y} are merged into the key columns from \code{x}. Can't be used when joining on inequality conditions. }} \item{na_matches}{Should two \code{NA} or two \code{NaN} values match? \itemize{ \item \code{"na"}, the default, treats two \code{NA} or two \code{NaN} values as equal, like \code{\%in\%}, \code{\link[=match]{match()}}, and \code{\link[=merge]{merge()}}. \item \code{"never"} treats two \code{NA} or two \code{NaN} values as different, and will never match them together or to any other values. This is similar to joins for database sources and to \code{base::merge(incomparables = NA)}. }} \item{multiple}{Handling of rows in \code{x} with multiple matches in \code{y}. For each row of \code{x}: \itemize{ \item \code{"all"}, the default, returns every match detected in \code{y}. This is the same behavior as SQL. \item \code{"any"} returns one match detected in \code{y}, with no guarantees on which match will be returned. It is often faster than \code{"first"} and \code{"last"} if you just need to detect if there is at least one match. \item \code{"first"} returns the first match detected in \code{y}. \item \code{"last"} returns the last match detected in \code{y}. }} \item{unmatched}{How should unmatched keys that would result in dropped rows be handled? \itemize{ \item \code{"drop"} drops unmatched keys from the result. \item \code{"error"} throws an error if unmatched keys are detected. } \code{unmatched} is intended to protect you from accidentally dropping rows during a join. It only checks for unmatched keys in the input that could potentially drop rows. \itemize{ \item For left joins, it checks \code{y}. \item For right joins, it checks \code{x}. \item For inner joins, it checks both \code{x} and \code{y}. In this case, \code{unmatched} is also allowed to be a character vector of length 2 to specify the behavior for \code{x} and \code{y} independently. }} \item{relationship}{Handling of the expected relationship between the keys of \code{x} and \code{y}. If the expectations chosen from the list below are invalidated, an error is thrown. \itemize{ \item \code{NULL}, the default, doesn't expect there to be any relationship between \code{x} and \code{y}. However, for equality joins it will check for a many-to-many relationship (which is typically unexpected) and will warn if one occurs, encouraging you to either take a closer look at your inputs or make this relationship explicit by specifying \code{"many-to-many"}. See the \emph{Many-to-many relationships} section for more details. \item \code{"one-to-one"} expects: \itemize{ \item Each row in \code{x} matches at most 1 row in \code{y}. \item Each row in \code{y} matches at most 1 row in \code{x}. } \item \code{"one-to-many"} expects: \itemize{ \item Each row in \code{y} matches at most 1 row in \code{x}. } \item \code{"many-to-one"} expects: \itemize{ \item Each row in \code{x} matches at most 1 row in \code{y}. } \item \code{"many-to-many"} doesn't perform any relationship checks, but is provided to allow you to be explicit about this relationship if you know it exists. } \code{relationship} doesn't handle cases where there are zero matches. For that, see \code{unmatched}.} } \value{ An object of the same type as \code{x} (including the same groups). The order of the rows and columns of \code{x} is preserved as much as possible. The output has the following properties: \itemize{ \item The rows are affect by the join type. \itemize{ \item \code{inner_join()} returns matched \code{x} rows. \item \code{left_join()} returns all \code{x} rows. \item \code{right_join()} returns matched of \code{x} rows, followed by unmatched \code{y} rows. \item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. } \item Output columns include all columns from \code{x} and all non-key columns from \code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. \item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added to disambiguate these as well. \item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their common type between \code{x} and \code{y}. } } \description{ Mutating joins add columns from \code{y} to \code{x}, matching observations based on the keys. There are four mutating joins: the inner join, and the three outer joins. \subsection{Inner join}{ An \code{inner_join()} only keeps observations from \code{x} that have a matching key in \code{y}. The most important property of an inner join is that unmatched rows in either input are not included in the result. This means that generally inner joins are not appropriate in most analyses, because it is too easy to lose observations. } \subsection{Outer joins}{ The three outer joins keep observations that appear in at least one of the data frames: \itemize{ \item A \code{left_join()} keeps all observations in \code{x}. \item A \code{right_join()} keeps all observations in \code{y}. \item A \code{full_join()} keeps all observations in \code{x} and \code{y}. } } } \section{Many-to-many relationships}{ By default, dplyr guards against many-to-many relationships in equality joins by throwing a warning. These occur when both of the following are true: \itemize{ \item A row in \code{x} matches multiple rows in \code{y}. \item A row in \code{y} matches multiple rows in \code{x}. } This is typically surprising, as most joins involve a relationship of one-to-one, one-to-many, or many-to-one, and is often the result of an improperly specified join. Many-to-many relationships are particularly problematic because they can result in a Cartesian explosion of the number of rows returned from the join. If a many-to-many relationship is expected, silence this warning by explicitly setting \code{relationship = "many-to-many"}. In production code, it is best to preemptively set \code{relationship} to whatever relationship you expect to exist between the keys of \code{x} and \code{y}, as this forces an error to occur immediately if the data doesn't align with your expectations. Inequality joins typically result in many-to-many relationships by nature, so they don't warn on them by default, but you should still take extra care when specifying an inequality join, because they also have the capability to return a large number of rows. Rolling joins don't warn on many-to-many relationships either, but many rolling joins follow a many-to-one relationship, so it is often useful to set \code{relationship = "many-to-one"} to enforce this. Note that in SQL, most database providers won't let you specify a many-to-many relationship between two tables, instead requiring that you create a third \emph{junction table} that results in two one-to-many relationships instead. } \section{Methods}{ These functions are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. \item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. \item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. \item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. } } \examples{ band_members |> inner_join(band_instruments) band_members |> left_join(band_instruments) band_members |> right_join(band_instruments) band_members |> full_join(band_instruments) # To suppress the message about joining variables, supply `by` band_members |> inner_join(band_instruments, by = join_by(name)) # This is good practice in production code # Use an equality expression if the join variables have different names band_members |> full_join(band_instruments2, by = join_by(name == artist)) # By default, the join keys from `x` and `y` are coalesced in the output; use # `keep = TRUE` to keep the join keys from both `x` and `y` band_members |> full_join(band_instruments2, by = join_by(name == artist), keep = TRUE) # If a row in `x` matches multiple rows in `y`, all the rows in `y` will be # returned once for each matching row in `x`. df1 <- tibble(x = 1:3) df2 <- tibble(x = c(1, 1, 2), y = c("first", "second", "third")) df1 |> left_join(df2) # If a row in `y` also matches multiple rows in `x`, this is known as a # many-to-many relationship, which is typically a result of an improperly # specified join or some kind of messy data. In this case, a warning is # thrown by default: df3 <- tibble(x = c(1, 1, 1, 3)) df3 |> left_join(df2) # In the rare case where a many-to-many relationship is expected, set # `relationship = "many-to-many"` to silence this warning df3 |> left_join(df2, relationship = "many-to-many") # Use `join_by()` with a condition other than `==` to perform an inequality # join. Here we match on every instance where `df1$x > df2$x`. df1 |> left_join(df2, join_by(x > x)) # By default, NAs match other NAs so that there are two # rows in the output of this join: df1 <- data.frame(x = c(1, NA), y = 2) df2 <- data.frame(x = c(1, NA), z = 3) left_join(df1, df2) # You can optionally request that NAs don't match, giving a # a result that more closely resembles SQL joins left_join(df1, df2, na_matches = "never") } \seealso{ Other joins: \code{\link{cross_join}()}, \code{\link{filter-joins}}, \code{\link{nest_join}()} } \concept{joins} dplyr/man/slice.Rd0000644000176200001440000001762415137161765013573 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/slice.R \name{slice} \alias{slice} \alias{slice_head} \alias{slice_tail} \alias{slice_min} \alias{slice_max} \alias{slice_sample} \title{Subset rows using their positions} \usage{ slice(.data, ..., .by = NULL, .preserve = FALSE) slice_head(.data, ..., n, prop, by = NULL) slice_tail(.data, ..., n, prop, by = NULL) slice_min( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) slice_max( .data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE ) slice_sample(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{For \code{slice()}: <\code{\link[rlang:args_data_masking]{data-masking}}> Integer row values. Provide either positive values to keep, or negative values to drop. The values provided must be either all positive or all negative. Indices beyond the number of rows in the input are silently ignored. For \verb{slice_*()}, these arguments are passed on to methods.} \item{.by, by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} \item{n, prop}{Provide either \code{n}, the number of rows, or \code{prop}, the proportion of rows to select. If neither are supplied, \code{n = 1} will be used. If \code{n} is greater than the number of rows in the group (or \code{prop > 1}), the result will be silently truncated to the group size. \code{prop} will be rounded towards zero to generate an integer number of rows. A negative value of \code{n} or \code{prop} will be subtracted from the group size. For example, \code{n = -2} with a group of 5 rows will select 5 - 2 = 3 rows; \code{prop = -0.25} with 8 rows will select 8 * (1 - 0.25) = 6 rows.} \item{order_by}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variable or function of variables to order by. To order by multiple variables, wrap them in a data frame or tibble.} \item{with_ties}{Should ties be kept together? The default, \code{TRUE}, may return more rows than you request. Use \code{FALSE} to ignore ties, and return the first \code{n} rows.} \item{na_rm}{Should missing values in \code{order_by} be removed from the result? If \code{FALSE}, \code{NA} values are sorted to the end (like in \code{\link[=arrange]{arrange()}}), so they will only be included if there are insufficient non-missing values to reach \code{n}/\code{prop}.} \item{weight_by}{<\code{\link[rlang:args_data_masking]{data-masking}}> Sampling weights. This must evaluate to a vector of non-negative numbers the same length as the input. Weights are automatically standardised to sum to 1. See the \code{Details} section for more technical details regarding these weights.} \item{replace}{Should sampling be performed with (\code{TRUE}) or without (\code{FALSE}, the default) replacement.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Each row may appear 0, 1, or many times in the output. \item Columns are not modified. \item Groups are not modified. \item Data frame attributes are preserved. } } \description{ \code{slice()} lets you index rows by their (integer) locations. It allows you to select, remove, and duplicate rows. It is accompanied by a number of helpers for common use cases: \itemize{ \item \code{slice_head()} and \code{slice_tail()} select the first or last rows. \item \code{slice_sample()} randomly selects rows. \item \code{slice_min()} and \code{slice_max()} select rows with the smallest or largest values of a variable. } If \code{.data} is a \link{grouped_df}, the operation will be performed on each group, so that (e.g.) \code{slice_head(df, n = 5)} will select the first five rows in each group. } \details{ Slice does not work with relational databases because they have no intrinsic notion of row order. If you want to perform the equivalent operation, use \code{\link[=filter]{filter()}} and \code{\link[=row_number]{row_number()}}. For \code{slice_sample()}, note that the weights provided in \code{weight_by} are passed through to the \code{prob} argument of \code{\link[base:sample]{base::sample.int()}}. This means they cannot be used to reconstruct summary statistics from the underlying population. See \href{https://stats.stackexchange.com/q/639211/}{this discussion} for more details. } \section{Methods}{ These function are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. \item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. \item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. \item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. \item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. \item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. } } \examples{ # Similar to head(mtcars, 1): mtcars |> slice(1L) # Similar to tail(mtcars, 1): mtcars |> slice(n()) mtcars |> slice(5:n()) # Rows can be dropped with negative indices: slice(mtcars, -(1:4)) # First and last rows based on existing order mtcars |> slice_head(n = 5) mtcars |> slice_tail(n = 5) # Rows with minimum and maximum values of a variable mtcars |> slice_min(mpg, n = 5) mtcars |> slice_max(mpg, n = 5) # slice_min() and slice_max() may return more rows than requested # in the presence of ties. mtcars |> slice_min(cyl, n = 1) # Use with_ties = FALSE to return exactly n matches mtcars |> slice_min(cyl, n = 1, with_ties = FALSE) # Or use additional variables to break the tie: mtcars |> slice_min(tibble(cyl, mpg), n = 1) # slice_sample() allows you to random select with or without replacement mtcars |> slice_sample(n = 5) mtcars |> slice_sample(n = 5, replace = TRUE) # slice_sample() can be used to shuffle rows with `prop = 1` mtcars |> slice_sample(prop = 1) # You can optionally weight by a variable - this code weights by the # physical weight of the cars, so heavy cars are more likely to get # selected. mtcars |> slice_sample(weight_by = wt, n = 5) # Group wise operation ---------------------------------------- df <- tibble( group = rep(c("a", "b", "c"), c(1, 2, 4)), x = runif(7) ) # All slice helpers operate per group, silently truncating to the group # size, so the following code works without error df |> group_by(group) |> slice_head(n = 2) # When specifying the proportion of rows to include non-integer sizes # are rounded down, so group a gets 0 rows df |> group_by(group) |> slice_head(prop = 0.5) # Filter equivalents -------------------------------------------- # slice() expressions can often be written to use `filter()` and # `row_number()`, which can also be translated to SQL. For many databases, # you'll need to supply an explicit variable to use to compute the row number. filter(mtcars, row_number() == 1L) filter(mtcars, row_number() == n()) filter(mtcars, between(row_number(), 5, n())) } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/grouped_df.Rd0000644000176200001440000000156014366556340014602 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/grouped-df.R \name{grouped_df} \alias{grouped_df} \alias{is.grouped_df} \alias{is_grouped_df} \title{A grouped data frame.} \usage{ grouped_df(data, vars, drop = group_by_drop_default(data)) is.grouped_df(x) is_grouped_df(x) } \arguments{ \item{data}{a tbl or data frame.} \item{vars}{A character vector.} \item{drop}{When \code{.drop = TRUE}, empty groups are dropped.} } \description{ The easiest way to create a grouped data frame is to call the \code{group_by()} method on a data frame or tbl: this will take care of capturing the unevaluated expressions for you. These functions are designed for programmatic use. For data analysis purposes see \code{\link[=group_data]{group_data()}} for the accessor functions that retrieve various metadata from a grouped data frames. } \keyword{internal} dplyr/man/rowwise.Rd0000644000176200001440000000531015106134104014137 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rowwise.R \name{rowwise} \alias{rowwise} \title{Group input by rows} \usage{ rowwise(data, ...) } \arguments{ \item{data}{Input data frame.} \item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Variables to be preserved when calling \code{\link[=summarise]{summarise()}}. This is typically a set of variables whose combination uniquely identify each row. \strong{NB}: unlike \code{group_by()} you can not create new variables here but instead you can select multiple variables with (e.g.) \code{everything()}.} } \value{ A row-wise data frame with class \code{rowwise_df}. Note that a \code{rowwise_df} is implicitly grouped by row, but is not a \code{grouped_df}. } \description{ \code{rowwise()} allows you to compute on a data frame a row-at-a-time. This is most useful when a vectorised function doesn't exist. Most dplyr verbs preserve row-wise grouping. The exception is \code{\link[=summarise]{summarise()}}, which return a \link{grouped_df}. You can explicitly ungroup with \code{\link[=ungroup]{ungroup()}} or \code{\link[=as_tibble]{as_tibble()}}, or convert to a \link{grouped_df} with \code{\link[=group_by]{group_by()}}. } \section{List-columns}{ Because a rowwise has exactly one row per group it offers a small convenience for working with list-columns. Normally, \code{summarise()} and \code{mutate()} extract a groups worth of data with \code{[}. But when you index a list in this way, you get back another list. When you're working with a \code{rowwise} tibble, then dplyr will use \code{[[} instead of \code{[} to make your life a little easier. } \examples{ df <- tibble(x = runif(6), y = runif(6), z = runif(6)) # Compute the mean of x, y, z in each row df |> rowwise() |> mutate(m = mean(c(x, y, z))) # use c_across() to more easily select many variables df |> rowwise() |> mutate(m = mean(c_across(x:z))) # Compute the minimum of x and y in each row df |> rowwise() |> mutate(m = min(c(x, y, z))) # In this case you can use an existing vectorised function: df |> mutate(m = pmin(x, y, z)) # Where these functions exist they'll be much faster than rowwise # so be on the lookout for them. # rowwise() is also useful when doing simulations params <- tribble( ~sim, ~n, ~mean, ~sd, 1, 1, 1, 1, 2, 2, 2, 4, 3, 3, -1, 2 ) # Here I supply variables to preserve after the computation params |> rowwise(sim) |> reframe(z = rnorm(n, mean, sd)) # If you want one row per simulation, put the results in a list() params |> rowwise(sim) |> summarise(z = list(rnorm(n, mean, sd)), .groups = "keep") } \seealso{ \code{\link[=nest_by]{nest_by()}} for a convenient way of creating rowwise data frames with nested data. } dplyr/man/with_groups.Rd0000644000176200001440000000242015106134104015011 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/groups-with.R \name{with_groups} \alias{with_groups} \title{Perform an operation with temporary groups} \usage{ with_groups(.data, .groups, .f, ...) } \arguments{ \item{.data}{A data frame} \item{.groups}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> One or more variables to group by. Unlike \code{\link[=group_by]{group_by()}}, you can only group by existing variables, and you can use tidy-select syntax like \code{c(x, y, z)} to select multiple variables. Use \code{NULL} to temporarily \strong{un}group.} \item{.f}{Function to apply to regrouped data. Supports purrr-style \code{~} syntax} \item{...}{Additional arguments passed on to \code{...}.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} This was an experimental function that allows you to modify the grouping variables for a single operation; it is superseded in favour of using the \code{.by} argument to individual verbs. } \examples{ df <- tibble(g = c(1, 1, 2, 2, 3), x = runif(5)) # Old df |> with_groups(g, mutate, x_mean = mean(x)) # New df |> mutate(x_mean = mean(x), .by = g) } \keyword{internal} dplyr/man/defunct.Rd0000644000176200001440000000212615106134104014072 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/defunct.R \name{defunct} \alias{defunct} \alias{combine} \alias{src_mysql} \alias{src_postgres} \alias{src_sqlite} \alias{src_local} \alias{src_df} \alias{tbl_df} \alias{as.tbl} \alias{add_rownames} \title{Defunct functions} \usage{ # Deprecated in 1.0.0 ------------------------------------- combine(...) src_mysql( dbname, host = NULL, port = 0L, username = "root", password = "", ... ) src_postgres( dbname = NULL, host = NULL, port = NULL, user = NULL, password = NULL, ... ) src_sqlite(path, create = FALSE) src_local(tbl, pkg = NULL, env = NULL) src_df(pkg = NULL, env = NULL) tbl_df(data) as.tbl(x, ...) add_rownames(df, var = "rowname") } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#defunct}{\figure{lifecycle-defunct.svg}{options: alt='[Defunct]'}}}{\strong{[Defunct]}} These functions were deprecated for at least two years before being made defunct. If there's a known replacement, calling the function will tell you about it. } \keyword{internal} dplyr/man/vars.Rd0000644000176200001440000000223514366556340013437 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise.R \name{vars} \alias{vars} \title{Select variables} \usage{ vars(...) } \arguments{ \item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Variables to operate on.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{vars()} is superseded because it is only needed for the scoped verbs (i.e. \code{\link[=mutate_at]{mutate_at()}}, \code{\link[=summarise_at]{summarise_at()}}, and friends), which have been been superseded in favour of \code{\link[=across]{across()}}. See \code{vignette("colwise")} for details. This helper is intended to provide tidy-select semantics for scoped verbs like \code{mutate_at()} and \code{summarise_at()}. Note that anywhere you can supply \code{vars()} specification, you can also supply a numeric vector of column positions or a character vector of column names. } \seealso{ \code{\link[=all_vars]{all_vars()}} and \code{\link[=any_vars]{any_vars()}} for other quoting functions that you can use with scoped verbs. } dplyr/man/starwars.Rd0000644000176200001440000000232115016155021014306 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data-starwars.R \docType{data} \name{starwars} \alias{starwars} \title{Starwars characters} \format{ A tibble with 87 rows and 14 variables: \describe{ \item{name}{Name of the character} \item{height}{Height (cm)} \item{mass}{Weight (kg)} \item{hair_color,skin_color,eye_color}{Hair, skin, and eye colors} \item{birth_year}{Year born (BBY = Before Battle of Yavin)} \item{sex}{The biological sex of the character, namely male, female, hermaphroditic, or none (as in the case for Droids).} \item{gender}{The gender role or gender identity of the character as determined by their personality or the way they were programmed (as in the case for Droids).} \item{homeworld}{Name of homeworld} \item{species}{Name of species} \item{films}{List of films the character appeared in} \item{vehicles}{List of vehicles the character has piloted} \item{starships}{List of starships the character has piloted} } } \usage{ starwars } \description{ The original data, from SWAPI, the Star Wars API, \url{https://swapi.py4e.com/}, has been revised to reflect additional research into gender and sex determinations of characters. } \examples{ starwars } \keyword{datasets} dplyr/man/cumall.Rd0000644000176200001440000000263415106134104013723 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/funs.R \name{cumall} \alias{cumall} \alias{cumany} \alias{cummean} \title{Cumulative versions of any, all, and mean} \usage{ cumall(x) cumany(x) cummean(x) } \arguments{ \item{x}{For \code{cumall()} and \code{cumany()}, a logical vector; for \code{cummean()} an integer or numeric vector.} } \value{ A vector the same length as \code{x}. } \description{ dplyr provides \code{cumall()}, \code{cumany()}, and \code{cummean()} to complete R's set of cumulative functions. } \section{Cumulative logical functions}{ These are particularly useful in conjunction with \code{filter()}: \itemize{ \item \code{cumall(x)}: all cases until the first \code{FALSE}. \item \code{cumall(!x)}: all cases until the first \code{TRUE}. \item \code{cumany(x)}: all cases after the first \code{TRUE}. \item \code{cumany(!x)}: all cases after the first \code{FALSE}. } } \examples{ # `cummean()` returns a numeric/integer vector of the same length # as the input vector. x <- c(1, 3, 5, 2, 2) cummean(x) cumsum(x) / seq_along(x) # `cumall()` and `cumany()` return logicals cumall(x < 5) cumany(x == 3) # `cumall()` vs. `cumany()` df <- data.frame( date = as.Date("2020-01-01") + 0:6, balance = c(100, 50, 25, -25, -50, 30, 120) ) # all rows after first overdraft df |> filter(cumany(balance < 0)) # all rows until first overdraft df |> filter(cumall(!(balance < 0))) } dplyr/man/select_all.Rd0000644000176200001440000000567015106134104014560 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-select.R \name{select_all} \alias{select_all} \alias{rename_all} \alias{select_if} \alias{rename_if} \alias{select_at} \alias{rename_at} \title{Select and rename a selection of variables} \usage{ select_all(.tbl, .funs = list(), ...) rename_all(.tbl, .funs = list(), ...) select_if(.tbl, .predicate, .funs = list(), ...) rename_if(.tbl, .predicate, .funs = list(), ...) select_at(.tbl, .vars, .funs = list(), ...) rename_at(.tbl, .vars, .funs = list(), ...) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a purrr style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{rename_if()}, \code{rename_at()}, and \code{rename_all()} have been superseded by \code{rename_with()}. The matching select statements have been superseded by the combination of a \code{select()} + \code{rename_with()}. Any predicate functions passed as arguments to \code{select()} or \code{rename_with()} must be wrapped in \code{\link[=where]{where()}}. These functions were superseded because \code{mutate_if()} and friends were superseded by \code{across()}. \code{select_if()} and \code{rename_if()} already use tidy selection so they can't be replaced by \code{across()} and instead we need a new function. } \examples{ mtcars <- as_tibble(mtcars) # for nicer printing mtcars |> rename_all(toupper) # -> mtcars |> rename_with(toupper) # NB: the transformation comes first in rename_with is_whole <- function(x) all(floor(x) == x) mtcars |> rename_if(is_whole, toupper) # -> mtcars |> rename_with(toupper, where(is_whole)) mtcars |> rename_at(vars(mpg:hp), toupper) # -> mtcars |> rename_with(toupper, mpg:hp) # You now must select() and then rename mtcars |> select_all(toupper) # -> mtcars |> rename_with(toupper) # Selection drops unselected variables: mtcars |> select_if(is_whole, toupper) # -> mtcars |> select(where(is_whole)) |> rename_with(toupper) mtcars |> select_at(vars(-contains("ar"), starts_with("c")), toupper) # -> mtcars |> select(!contains("ar") | starts_with("c")) |> rename_with(toupper) } \keyword{internal} dplyr/man/args_by.Rd0000644000176200001440000000105115106134104014064 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/by.R \name{args_by} \alias{args_by} \title{Helper for consistent documentation of \code{.by}} \arguments{ \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} } \description{ Use \verb{@inheritParams args_by} to consistently document \code{.by}. } \keyword{internal} dplyr/man/pick.Rd0000644000176200001440000000563315106134104013376 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pick.R \name{pick} \alias{pick} \title{Select a subset of columns} \usage{ pick(...) } \arguments{ \item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Columns to pick. You can't pick grouping columns because they are already automatically handled by the verb (i.e. \code{\link[=summarise]{summarise()}} or \code{\link[=mutate]{mutate()}}).} } \value{ A tibble containing the selected columns for the current group. } \description{ \code{pick()} provides a way to easily select a subset of columns from your data using \code{\link[=select]{select()}} semantics while inside a \link[rlang:args_data_masking]{"data-masking"} function like \code{\link[=mutate]{mutate()}} or \code{\link[=summarise]{summarise()}}. \code{pick()} returns a data frame containing the selected columns for the current group. \code{pick()} is complementary to \code{\link[=across]{across()}}: \itemize{ \item With \code{pick()}, you typically apply a function to the full data frame. \item With \code{across()}, you typically apply a function to each column. } } \details{ Theoretically, \code{pick()} is intended to be replaceable with an equivalent call to \code{tibble()}. For example, \code{pick(a, c)} could be replaced with \code{tibble(a = a, c = c)}, and \code{pick(everything())} on a data frame with cols \code{a}, \code{b}, and \code{c} could be replaced with \code{tibble(a = a, b = b, c = c)}. \code{pick()} specially handles the case of an empty selection by returning a 1 row, 0 column tibble, so an exact replacement is more like: \if{html}{\out{
}}\preformatted{size <- vctrs::vec_size_common(..., .absent = 1L) out <- vctrs::vec_recycle_common(..., .size = size) tibble::new_tibble(out, nrow = size) }\if{html}{\out{
}} } \examples{ df <- tibble( x = c(3, 2, 2, 2, 1), y = c(0, 2, 1, 1, 4), z1 = c("a", "a", "a", "b", "a"), z2 = c("c", "d", "d", "a", "c") ) df # `pick()` provides a way to select a subset of your columns using # tidyselect. It returns a data frame. df |> mutate(cols = pick(x, y)) # This is useful for functions that take data frames as inputs. # For example, you can compute a joint rank between `x` and `y`. df |> mutate(rank = dense_rank(pick(x, y))) # `pick()` is also useful as a bridge between data-masking functions (like # `mutate()` or `group_by()`) and functions with tidy-select behavior (like # `select()`). For example, you can use `pick()` to create a wrapper around # `group_by()` that takes a tidy-selection of columns to group on. For more # bridge patterns, see # https://rlang.r-lib.org/reference/topic-data-mask-programming.html#bridge-patterns. my_group_by <- function(data, cols) { group_by(data, pick({{ cols }})) } df |> my_group_by(c(x, starts_with("z"))) # Or you can use it to dynamically select columns to `count()` by df |> count(pick(starts_with("z"))) } \seealso{ \code{\link[=across]{across()}} } dplyr/man/group_by_prepare.Rd0000644000176200001440000000141615106134104016007 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/distinct.R, R/group-by.R \name{distinct_prepare} \alias{distinct_prepare} \alias{group_by_prepare} \title{Prepare for grouping and other operations} \usage{ distinct_prepare( .data, vars, group_vars = character(), .keep_all = FALSE, caller_env = caller_env(2), error_call = caller_env() ) group_by_prepare( .data, ..., .add = FALSE, .dots = deprecated(), add = deprecated(), error_call = caller_env() ) } \value{ A list \item{data}{Modified tbl} \item{groups}{Modified groups} } \description{ \verb{*_prepare()} performs standard manipulation that is needed prior to actual data processing. They are only be needed by packages that implement dplyr backends. } \keyword{internal} dplyr/man/group_cols.Rd0000644000176200001440000000207415106134104014620 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/select-helpers.R \name{group_cols} \alias{group_cols} \title{Select grouping variables} \usage{ group_cols(vars = NULL, data = NULL) } \arguments{ \item{vars}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#defunct}{\figure{lifecycle-defunct.svg}{options: alt='[Defunct]'}}}{\strong{[Defunct]}}} \item{data}{For advanced use only. The default \code{NULL} automatically finds the "current" data frames.} } \description{ This selection helpers matches grouping variables. It can be used in \code{\link[=select]{select()}} or \code{\link[=vars]{vars()}} selections. } \examples{ gdf <- iris |> group_by(Species) gdf |> select(group_cols()) # Remove the grouping variables from mutate selections: gdf |> mutate_at(vars(-group_cols()), `/`, 100) # -> No longer necessary with across() gdf |> mutate(across(everything(), ~ . / 100)) } \seealso{ \code{\link[=groups]{groups()}} and \code{\link[=group_vars]{group_vars()}} for retrieving the grouping variables outside selection contexts. } dplyr/man/transmute.Rd0000644000176200001440000000377714406402754014514 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/transmute.R \name{transmute} \alias{transmute} \title{Create, modify, and delete columns} \usage{ transmute(.data, ...) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. The name gives the name of the column in the output. The value can be: \itemize{ \item A vector of length 1, which will be recycled to the correct length. \item A vector the same length as the current group (or the whole data frame if ungrouped). \item \code{NULL}, to remove the column. \item A data frame or tibble, to create multiple columns in the output. }} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Columns created or modified through \code{...} will be returned in the order specified by \code{...}. \item Unmodified grouping columns will be placed at the front. \item The number of rows is not affected. \item Columns given the value \code{NULL} will be removed. \item Groups will be recomputed if a grouping variable is mutated. \item Data frame attributes are preserved. } } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{transmute()} creates a new data frame containing only the specified computations. It's superseded because you can perform the same job with \code{mutate(.keep = "none")}. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("transmute")}. } \keyword{internal} dplyr/man/consecutive_id.Rd0000644000176200001440000000161515106134104015447 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/consecutive-id.R \name{consecutive_id} \alias{consecutive_id} \title{Generate a unique identifier for consecutive combinations} \usage{ consecutive_id(...) } \arguments{ \item{...}{Unnamed vectors. If multiple vectors are supplied, then they should have the same length.} } \value{ A numeric vector the same length as the longest element of \code{...}. } \description{ \code{consecutive_id()} generates a unique identifier that increments every time a variable (or combination of variables) changes. Inspired by \code{data.table::rleid()}. } \examples{ consecutive_id(c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, NA, NA)) consecutive_id(c(1, 1, 1, 2, 1, 1, 2, 2)) df <- data.frame(x = c(0, 0, 1, 0), y = c(2, 2, 2, 2)) df |> group_by(x, y) |> summarise(n = n()) df |> group_by(id = consecutive_id(x, y), x, y) |> summarise(n = n()) } dplyr/man/case-and-replace-when.Rd0000644000176200001440000002255215137161765016513 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/case-when.R \name{case-and-replace-when} \alias{case-and-replace-when} \alias{case_when} \alias{replace_when} \title{A general vectorised if-else} \usage{ case_when( ..., .default = NULL, .unmatched = "default", .ptype = NULL, .size = NULL ) replace_when(x, ...) } \arguments{ \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> A sequence of two-sided formulas. The left hand side (LHS) determines which values match this case. The right hand side (RHS) provides the replacement value. For \code{case_when()}: \itemize{ \item The LHS inputs must be logical vectors. For backwards compatibility, scalars are \link[vctrs:theory-faq-recycling]{recycled}, but we no longer recommend supplying scalars. \item The RHS inputs will be \link[vctrs:theory-faq-coercion]{cast} to their common type, and will be \link[vctrs:theory-faq-recycling]{recycled} to the common size of the LHS inputs. } For \code{replace_when()}: \itemize{ \item The LHS inputs must be logical vectors the same size as \code{x}. \item The RHS inputs will be \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x} and \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{x}. } \code{NULL} inputs are ignored.} \item{.default}{The value used when all of the LHS inputs return either \code{FALSE} or \code{NA}. \itemize{ \item If \code{NULL}, the default, a missing value will be used. \item If provided, \code{.default} will follow the same type and size rules as the RHS inputs. } \code{NA} values in the LHS conditions are treated like \code{FALSE}, meaning that the result at those locations will be assigned the \code{.default} value. To handle missing values in the conditions differently, you must explicitly catch them with another condition before they fall through to the \code{.default}. This typically involves some variation of \code{is.na(x) ~ value} tailored to your usage of \code{case_when()}.} \item{.unmatched}{Handling of unmatched locations. One of: \itemize{ \item \code{"default"} to use \code{.default} in unmatched locations. \item \code{"error"} to error when there are unmatched locations. }} \item{.ptype}{An optional prototype declaring the desired output type. If supplied, this overrides the common type of the RHS inputs.} \item{.size}{An optional size declaring the desired output size. If supplied, this overrides the common size computed from the LHS inputs.} \item{x}{A vector.} } \value{ For \code{case_when()}, a new vector where the size is the common size of the LHS inputs, the type is the common type of the RHS inputs, and the names correspond to the names of the RHS elements used in the result. For \code{replace_when()}, an updated version of \code{x}, with the same size, type, and names as \code{x}. } \description{ \code{case_when()} and \code{replace_when()} are two forms of vectorized \code{\link[=if_else]{if_else()}}. They work by evaluating each case sequentially and using the first match for each element to determine the corresponding value in the output vector. \itemize{ \item Use \code{case_when()} when creating an entirely new vector. \item Use \code{replace_when()} when partially updating an existing vector. } If you are just replacing a few values within an existing vector, then \code{replace_when()} is always a better choice because it is type stable, size stable, pipes better, and better expresses intent. A major difference between the two functions is what happens when no cases match: \itemize{ \item \code{case_when()} falls through to a \code{.default} as a final "else" statement. \item \code{replace_when()} retains the original values from \code{x}. } See \code{vignette("recoding-replacing")} for more examples. } \examples{ x <- 1:70 case_when( x \%\% 35 == 0 ~ "fizz buzz", x \%\% 5 == 0 ~ "fizz", x \%\% 7 == 0 ~ "buzz", .default = as.character(x) ) # Like an if statement, the arguments are evaluated in order, so you must # proceed from the most specific to the most general. This won't work: case_when( x \%\% 5 == 0 ~ "fizz", x \%\% 7 == 0 ~ "buzz", x \%\% 35 == 0 ~ "fizz buzz", .default = as.character(x) ) # If none of the cases match and no `.default` is supplied, NA is used: case_when( x \%\% 35 == 0 ~ "fizz buzz", x \%\% 5 == 0 ~ "fizz", x \%\% 7 == 0 ~ "buzz" ) # Note that `NA` values on the LHS are treated like `FALSE` and will be # assigned the `.default` value. You must handle them explicitly if you # want to use a different value. The exact way to handle missing values is # dependent on the set of LHS conditions you use. x[2:4] <- NA_real_ case_when( x \%\% 35 == 0 ~ "fizz buzz", x \%\% 5 == 0 ~ "fizz", x \%\% 7 == 0 ~ "buzz", is.na(x) ~ "nope", .default = as.character(x) ) # `case_when()` is not a replacement for basic if/else control flow. When # you have a single scalar condition, using if/else is faster, simpler to # reason about, and is lazy on the branch that isn't run. For example, this # seems to work: x <- "value" case_when(is.character(x) ~ x, .default = "not-a-character") # Until `x` is a non-character type x <- 1 try(case_when(is.character(x) ~ x, .default = "not-a-character")) # Instead, you should use if/else if (is.character(x)) { y <- x } else { y <- "not-a-character" } y # If you believe that you've covered every possible case, then set # `.unmatched = "error"` rather than supplying a `.default`. This adds an # extra layer of safety to `case_when()` and is particularly useful when you # have a series of complex expressions! set.seed(123) x <- sample(50) # Oops, we forgot to handle `50` try(case_when( x < 10 ~ "ten", x < 20 ~ "twenty", x < 30 ~ "thirty", x < 40 ~ "forty", x < 50 ~ "fifty", .unmatched = "error" )) case_when( x < 10 ~ "ten", x < 20 ~ "twenty", x < 30 ~ "thirty", x < 40 ~ "forty", x <= 50 ~ "fifty", .unmatched = "error" ) # Note that `NA` is considered unmatched and must be handled with its own # explicit case, even if that case just propagates the missing value! x[c(2, 5)] <- NA case_when( x < 10 ~ "ten", x < 20 ~ "twenty", x < 30 ~ "thirty", x < 40 ~ "forty", x <= 50 ~ "fifty", is.na(x) ~ NA, .unmatched = "error" ) # `replace_when()` is useful when you're updating an existing vector, # rather than creating an entirely new one. Note the so-far unused "puppy" # factor level: pets <- tibble( name = c("Max", "Bella", "Chuck", "Luna", "Cooper"), type = factor( c("dog", "dog", "cat", "dog", "cat"), levels = c("dog", "cat", "puppy") ), age = c(1, 3, 5, 2, 4) ) # We can replace some values with `"puppy"` based on arbitrary conditions. # Even though we are using a character `"puppy"` value, `replace_when()` will # automatically cast it to the factor type of `type` for us. pets |> mutate( type = replace_when(type, type == "dog" & age <= 2 ~ "puppy") ) # Compare that with this `case_when()` call, which loses the factor class. # It's always better to use `replace_when()` when updating a few values in # an existing vector! pets |> mutate( type = case_when(type == "dog" & age <= 2 ~ "puppy", .default = type) ) # `case_when()` and `replace_when()` evaluate all RHS expressions, and then # construct their result by extracting the selected (via the LHS expressions) # parts. For example, `NaN`s are produced here because `sqrt(y)` is evaluated # on all of `y`, not just where `y >= 0`. y <- seq(-2, 2, by = .5) replace_when(y, y >= 0 ~ sqrt(y)) # These functions are particularly useful inside `mutate()` when you want to # create a new variable that relies on a complex combination of existing # variables starwars |> select(name:mass, gender, species) |> mutate( type = case_when( height > 200 | mass > 200 ~ "large", species == "Droid" ~ "robot", .default = "other" ) ) # `case_when()` is not a tidy eval function. If you'd like to reuse # the same patterns, extract the `case_when()` call into a normal # function: case_character_type <- function(height, mass, species) { case_when( height > 200 | mass > 200 ~ "large", species == "Droid" ~ "robot", .default = "other" ) } case_character_type(150, 250, "Droid") case_character_type(150, 150, "Droid") # Such functions can be used inside `mutate()` as well: starwars |> mutate(type = case_character_type(height, mass, species)) |> pull(type) # `case_when()` ignores `NULL` inputs. This is useful when you'd # like to use a pattern only under certain conditions. Here we'll # take advantage of the fact that `if` returns `NULL` when there is # no `else` clause: case_character_type <- function(height, mass, species, robots = TRUE) { case_when( height > 200 | mass > 200 ~ "large", if (robots) species == "Droid" ~ "robot", .default = "other" ) } starwars |> mutate(type = case_character_type(height, mass, species, robots = FALSE)) |> pull(type) # `replace_when()` can also be used in combination with `pick()` to # conditionally mutate rows within multiple columns using a single condition. # Here `replace_when()` returns a data frame with new `species` and `name` # columns, which `mutate()` then automatically unpacks. starwars |> select(homeworld, species, name) |> mutate(replace_when( pick(species, name), homeworld == "Tatooine" ~ tibble( species = "Tatooinese", name = paste(name, "(Tatooine)") ) )) } \seealso{ \code{\link[=recode_values]{recode_values()}}, \code{\link[vctrs:vec-case-and-replace]{vctrs::vec_case_when()}} } dplyr/man/make_tbl.Rd0000644000176200001440000000100715106134104014215 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tbl.R \name{make_tbl} \alias{make_tbl} \title{Create a "tbl" object} \usage{ make_tbl(subclass, ...) } \arguments{ \item{subclass}{name of subclass. "tbl" is an abstract base class, so you must supply this value. \code{tbl_} is automatically prepended to the class name} \item{...}{For \code{tbl()}, other fields used by class.} } \description{ \code{tbl()} is the standard constructor for tbls. \code{is.tbl()} tests. } \keyword{internal} dplyr/man/rmd/0000755000176200001440000000000015137161765012755 5ustar liggesusersdplyr/man/rmd/overview.Rmd0000644000176200001440000000300314366556340015263 0ustar liggesusers Tidyverse selections implement a dialect of R where operators make it easy to select variables: - `:` for selecting a range of consecutive variables. - `!` for taking the complement of a set of variables. - `&` and `|` for selecting the intersection or the union of two sets of variables. - `c()` for combining selections. In addition, you can use __selection helpers__. Some helpers select specific columns: * [`everything()`][tidyselect::everything]: Matches all variables. * [`last_col()`][tidyselect::last_col]: Select last variable, possibly with an offset. * [group_cols()]: Select all grouping columns. Other helpers select variables by matching patterns in their names: * [`starts_with()`][tidyselect::starts_with]: Starts with a prefix. * [`ends_with()`][tidyselect::ends_with()]: Ends with a suffix. * [`contains()`][tidyselect::contains()]: Contains a literal string. * [`matches()`][tidyselect::matches()]: Matches a regular expression. * [`num_range()`][tidyselect::num_range()]: Matches a numerical range like x01, x02, x03. Or from variables stored in a character vector: * [`all_of()`][tidyselect::all_of()]: Matches variable names in a character vector. All names must be present, otherwise an out-of-bounds error is thrown. * [`any_of()`][tidyselect::any_of()]: Same as `all_of()`, except that no error is thrown for names that don't exist. Or using a predicate function: * [`where()`][tidyselect::where()]: Applies a function to all variables and selects those for which the function returns `TRUE`. dplyr/man/rmd/select.Rmd0000644000176200001440000000310615106134104014657 0ustar liggesusers```{r, include=FALSE} # So the second library() call doesn't show messages library(tidyverse) ``` Here we show the usage for the basic selection operators. See the specific help pages to learn about helpers like [starts_with()]. The selection language can be used in functions like `dplyr::select()`. Let's first attach the tidyverse: ```{r} library(tidyverse) # For better printing iris <- as_tibble(iris) ``` Select variables by name: ```{r} starwars |> select(height) iris |> select(Sepal.Length) ``` Select multiple variables by separating them with commas. Note how the order of columns is determined by the order of inputs: ```{r} starwars |> select(homeworld, height, mass) iris |> select(Sepal.Length, Petal.Length) ``` If you use a named vector to select columns, the output will have its columns renamed: ```{r} selection <- c( new_homeworld = "homeworld", new_height = "height", new_mass = "mass" ) starwars |> select(all_of(selection)) ``` ## Operators: The `:` operator selects a range of consecutive variables: ```{r} starwars |> select(name:mass) ``` The `!` operator negates a selection: ```{r} starwars |> select(!(name:mass)) iris |> select(!c(Sepal.Length, Petal.Length)) iris |> select(!ends_with("Width")) ``` `&` and `|` take the intersection or the union of two selections: ```{r} iris |> select(starts_with("Petal") & ends_with("Width")) iris |> select(starts_with("Petal") | ends_with("Width")) ``` To take the difference between two selections, combine the `&` and `!` operators: ```{r} iris |> select(starts_with("Petal") & !ends_with("Width")) ``` dplyr/man/rmd/by.Rmd0000644000176200001440000001472215137161765014041 0ustar liggesusers--- output: html_document editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` There are two ways to group in dplyr: - Persistent grouping with [group_by()] - Per-operation grouping with `.by`/`by` This help page is dedicated to explaining where and why you might want to use the latter. Depending on the dplyr verb, the per-operation grouping argument may be named `.by` or `by`. The *Supported verbs* section below outlines this on a case-by-case basis. The remainder of this page will refer to `.by` for simplicity. Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of `.by` is to allow you to place that grouping specification alongside the code that actually uses it. As an added benefit, with `.by` you no longer need to remember to [ungroup()] after [summarise()], and `summarise()` won't ever message you about how it's handling the groups! This idea comes from [data.table](https://CRAN.R-project.org/package=data.table), which allows you to specify `by` alongside modifications in `j`, like: `dt[, .(x = mean(x)), by = g]`. ### Supported verbs - [`mutate(.by = )`][mutate()] - [`summarise(.by = )`][summarise()] - [`reframe(.by = )`][reframe()] - [`filter(.by = )`][filter()] - [`filter_out(.by = )`][filter_out()] - [`slice(.by = )`][slice()] - [`slice_head(by = )`][slice_head()] and [`slice_tail(by = )`][slice_tail()] - [`slice_min(by = )`][slice_min()] and [`slice_max(by = )`][slice_max()] - [`slice_sample(by = )`][slice_sample()] Note that some dplyr verbs use `by` while others use `.by`. This is a purely technical difference. ### Differences between `.by` and `group_by()` | `.by` | `group_by()` | |---------------------------------------------------------|--------------------------------------------------------------------| | Grouping only affects a single verb | Grouping is persistent across multiple verbs | | Selects variables with [tidy-select][dplyr_tidy_select] | Computes expressions with [data-masking][rlang::args_data_masking] | | Summaries use existing order of group keys | Summaries sort group keys in ascending order | ### Using `.by` Let's take a look at the two grouping approaches using this `expenses` data set, which tracks costs accumulated across various `id`s and `region`s: ```{r} expenses <- tibble( id = c(1, 2, 1, 3, 1, 2, 3), region = c("A", "A", "A", "B", "B", "A", "A"), cost = c(25, 20, 19, 12, 9, 6, 6) ) expenses ``` Imagine that you wanted to compute the average cost per region. You'd probably write something like this: ```{r} expenses |> group_by(region) |> summarise(cost = mean(cost)) ``` Instead, you can now specify the grouping *inline* within the verb: ```{r} expenses |> summarise(cost = mean(cost), .by = region) ``` `.by` applies to a single operation, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, regardless of the number of grouping columns. ```{r} expenses |> summarise(cost = mean(cost), .by = c(id, region)) ``` Compare that with `group_by() |> summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, typically with a message that it is doing so: ```{r} expenses |> group_by(id, region) |> summarise(cost = mean(cost)) ``` Because `.by` grouping applies to a single operation, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. Note that with `.by` we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`. If you have a character vector of column names you'd like to group by, you can do so with `.by = all_of(my_cols)`. It will group by the columns in the order they were provided. To prevent surprising results, you can't use `.by` on an existing grouped data frame: ```{r, error=TRUE} expenses |> group_by(id) |> summarise(cost = mean(cost), .by = c(id, region)) ``` So far we've focused on the usage of `.by` with `summarise()`, but `.by` works with a number of other dplyr verbs. For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: ```{r} expenses |> mutate(cost_by_region = mean(cost), .by = region) ``` Or you could slice out the maximum cost per combination of id and region: ```{r} # Note that the argument is named `by` in `slice_max()` expenses |> slice_max(cost, n = 1, by = c(id, region)) ``` ### Result ordering When used with `.by`, `summarise()`, `reframe()`, and `slice()` all maintain the ordering of the existing data. This is different from `group_by()`, which has always sorted the group keys in ascending order. ```{r} df <- tibble( month = c("jan", "jan", "feb", "feb", "mar"), temp = c(20, 25, 18, 20, 40) ) # Uses ordering by "first appearance" in the original data df |> summarise(average_temp = mean(temp), .by = month) # Sorts in ascending order df |> group_by(month) |> summarise(average_temp = mean(temp)) ``` If you need sorted group keys, we recommend that you explicitly use [arrange()] either before or after the call to `summarise()`, `reframe()`, or `slice()`. This also gives you full access to all of `arrange()`'s features, such as `desc()` and the `.locale` argument. ### Verbs without `.by` support If a dplyr verb doesn't support `.by`, then that typically means that the verb isn't inherently affected by grouping. For example, [pull()] and [rename()] don't support `.by`, because specifying columns to group by would not affect their implementations. That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support `.by`, but *does* have special support for grouped data frames created by [group_by()]. This is typically because the verbs are required to retain the grouping columns, for example: - [select()] always retains grouping columns, with a message if any aren't specified in the `select()` call. - [distinct()] and [count()] place unspecified grouping columns at the front of the data frame before computing their results. - [arrange()] has a `.by_group` argument to optionally order by grouping columns first. If `group_by()` didn't exist, then these verbs would not have special support for grouped data frames. dplyr/man/when-any-all.Rd0000644000176200001440000001072115137161765014757 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/when.R \name{when-any-all} \alias{when-any-all} \alias{when_any} \alias{when_all} \title{Elementwise \code{any()} and \code{all()}} \usage{ when_any(..., na_rm = FALSE, size = NULL) when_all(..., na_rm = FALSE, size = NULL) } \arguments{ \item{...}{Logical vectors of equal size.} \item{na_rm}{Missing value handling: \itemize{ \item If \code{FALSE}, missing values are propagated according to the same rules as \code{|} and \code{&}. \item If \code{TRUE}, missing values are removed from the elementwise computation. }} \item{size}{An optional output size. Only useful to specify if it is possible for \code{...} to be empty, with no inputs provided.} } \description{ These functions are variants of \code{\link[=any]{any()}} and \code{\link[=all]{all()}} that work elementwise across multiple inputs. You can also think of these functions as generalizing \code{\link{|}} and \code{\link{&}} to any number of inputs, rather than just two, for example: \itemize{ \item \code{when_any(x, y, z)} is equivalent to \code{x | y | z}. \item \code{when_all(x, y, z)} is equivalent to \code{x & y & z}. } \code{when_any()} is particularly useful within \code{\link[=filter]{filter()}} and \code{\link[=filter_out]{filter_out()}} to specify comma separated conditions combined with \code{|} rather than \code{&}. } \details{ \code{when_any()} and \code{when_all()} are "parallel" versions of \code{\link[=any]{any()}} and \code{\link[=all]{all()}} in the same way that \code{\link[=pmin]{pmin()}} and \code{\link[=pmax]{pmax()}} are "parallel" versions of \code{\link[=min]{min()}} and \code{\link[=max]{max()}}. } \examples{ x <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, NA, NA, NA) y <- c(TRUE, FALSE, NA, TRUE, FALSE, NA, TRUE, FALSE, NA) # `any()` and `all()` summarise down to 1 value any(x, y) all(x, y) # `when_any()` and `when_all()` work element by element across all inputs # at the same time. Their defaults are equivalent to calling `|` or `&`. when_any(x, y) x | y when_all(x, y) x & y # `na_rm = TRUE` is useful when you'd like to force these functions to # return only `TRUE` or `FALSE`. This argument does so by removing any `NA` # from the elementwise computation entirely. tibble( x = x, y = y, any_propagate = when_any(x, y), any_remove = when_any(x, y, na_rm = TRUE), all_propagate = when_all(x, y), all_remove = when_all(x, y, na_rm = TRUE) ) # --------------------------------------------------------------------------- # With `filter()` and `filter_out()` # `when_any()` is particularly useful inside of `filter()` and # `filter_out()` as a way to combine comma separated conditions with `|` # instead of with `&`. countries <- tibble( name = c("US", "CA", "PR", "RU", "US", NA, "CA", "PR", "RU"), score = c(200, 100, 150, NA, 50, 100, 300, 250, 120) ) countries # Find rows where any of the following are true: # - "US" and "CA" have a score between 200-300 # - "PR" and "RU" have a score between 100-200 countries |> filter( (name \%in\% c("US", "CA") & between(score, 200, 300)) | (name \%in\% c("PR", "RU") & between(score, 100, 200)) ) # With `when_any()`, you drop the explicit `|`, the extra `()`, and your # conditions are all indented to the same level countries |> filter(when_any( name \%in\% c("US", "CA") & between(score, 200, 300), name \%in\% c("PR", "RU") & between(score, 100, 200) )) # To drop these rows instead, use `filter_out()` countries |> filter_out(when_any( name \%in\% c("US", "CA") & between(score, 200, 300), name \%in\% c("PR", "RU") & between(score, 100, 200) )) # --------------------------------------------------------------------------- # Programming with `when_any()` and `when_all()` # The `size` argument is useful for making these functions size stable when # you aren't sure how many inputs you're going to receive size <- length(x) # Two inputs inputs <- list(x, y) when_all(!!!inputs, size = size) # One input inputs <- list(x) when_all(!!!inputs, size = size) # Zero inputs (without `size`, this would return `logical()`) inputs <- list() when_all(!!!inputs, size = size) # When no inputs are provided, these functions are consistent with `any()` # and `all()` any() when_any(size = 1) all() when_all(size = 1) } \seealso{ \code{\link[base:any]{base::any()}}, \code{\link[base:all]{base::all()}}, \code{\link[=cumany]{cumany()}}, \code{\link[=cumall]{cumall()}}, \code{\link[base:Extremes]{base::pmin()}}, \code{\link[base:Extremes]{base::pmax()}} } dplyr/man/defunct-lazyeval.Rd0000644000176200001440000000427215137161765015744 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/defunct-lazyeval.R \name{defunct-lazyeval} \alias{defunct-lazyeval} \alias{add_count_} \alias{add_tally_} \alias{arrange_} \alias{count_} \alias{distinct_} \alias{do_} \alias{filter_} \alias{funs_} \alias{group_by_} \alias{group_indices_} \alias{mutate_} \alias{tally_} \alias{transmute_} \alias{rename_} \alias{select_} \alias{slice_} \alias{summarise_} \alias{summarize_} \title{Defunct standard evaluation functions} \usage{ add_count_(x, vars, wt = NULL, sort = FALSE) add_tally_(x, wt, sort = FALSE) arrange_(.data, ..., .dots = list()) count_(x, vars, wt = NULL, sort = FALSE, .drop = group_by_drop_default(x)) distinct_(.data, ..., .dots, .keep_all = FALSE) do_(.data, ..., .dots = list()) filter_(.data, ..., .dots = list()) funs_(dots, args = list(), env = base_env()) group_by_(.data, ..., .dots = list(), add = FALSE) group_indices_(.data, ..., .dots = list()) mutate_(.data, ..., .dots = list()) tally_(x, wt, sort = FALSE) transmute_(.data, ..., .dots = list()) rename_(.data, ..., .dots = list()) select_(.data, ..., .dots = list()) slice_(.data, ..., .dots = list()) summarise_(.data, ..., .dots = list()) summarize_(.data, ..., .dots = list()) } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#defunct}{\figure{lifecycle-defunct.svg}{options: alt='[Defunct]'}}}{\strong{[Defunct]}} dplyr used to offer twin versions of each verb suffixed with an underscore. These versions had standard evaluation (SE) semantics: rather than taking arguments by code, like NSE verbs, they took arguments by value. Their purpose was to make it possible to program with dplyr. However, dplyr now uses tidy evaluation semantics. NSE verbs still capture their arguments, but you can now unquote parts of these arguments. This offers full programmability with NSE verbs. Thus, the underscored versions are now superfluous. Unquoting triggers immediate evaluation of its operand and inlines the result within the captured expression. This result can be a value or an expression to be evaluated later with the rest of the argument. See \code{vignette("programming")} for more information. } \keyword{internal} dplyr/man/across.Rd0000644000176200001440000002206415137161765013760 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/across.R \name{across} \alias{across} \alias{if_any} \alias{if_all} \title{Apply a function (or functions) across multiple columns} \usage{ across(.cols, .fns, ..., .names = NULL, .unpack = FALSE) if_any(.cols, .fns, ..., .names = NULL) if_all(.cols, .fns, ..., .names = NULL) } \arguments{ \item{.cols}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Columns to transform. You can't select grouping columns because they are already automatically handled by the verb (i.e. \code{\link[=summarise]{summarise()}} or \code{\link[=mutate]{mutate()}}).} \item{.fns}{Functions to apply to each of the selected columns. Possible values are: \itemize{ \item A function, e.g. \code{mean}. \item A purrr-style lambda, e.g. \code{~ mean(.x, na.rm = TRUE)} \item A named list of functions or lambdas, e.g. \verb{list(mean = mean, n_miss = ~ sum(is.na(.x))}. Each function is applied to each column, and the output is named by combining the function name and the column name using the glue specification in \code{.names}. } Within these functions you can use \code{\link[=cur_column]{cur_column()}} and \code{\link[=cur_group]{cur_group()}} to access the current column and grouping keys respectively.} \item{...}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Additional arguments for the function calls in \code{.fns} are no longer accepted in \code{...} because it's not clear when they should be evaluated: once per \code{across()} or once per group? Instead supply additional arguments directly in \code{.fns} by using a lambda. For example, instead of \code{across(a:b, mean, na.rm = TRUE)} write \code{across(a:b, ~ mean(.x, na.rm = TRUE))}.} \item{.names}{A glue specification that describes how to name the output columns. This can use \code{{.col}} to stand for the selected column name, and \code{{.fn}} to stand for the name of the function being applied. The default (\code{NULL}) is equivalent to \code{"{.col}"} for the single function case and \code{"{.col}_{.fn}"} for the case where a list is used for \code{.fns}.} \item{.unpack}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Optionally \link[tidyr:pack]{unpack} data frames returned by functions in \code{.fns}, which expands the df-columns out into individual columns, retaining the number of rows in the data frame. \itemize{ \item If \code{FALSE}, the default, no unpacking is done. \item If \code{TRUE}, unpacking is done with a default glue specification of \code{"{outer}_{inner}"}. \item Otherwise, a single glue specification can be supplied to describe how to name the unpacked columns. This can use \code{{outer}} to refer to the name originally generated by \code{.names}, and \code{{inner}} to refer to the names of the data frame you are unpacking. }} } \value{ \code{across()} typically returns a tibble with one column for each column in \code{.cols} and each function in \code{.fns}. If \code{.unpack} is used, more columns may be returned depending on how the results of \code{.fns} are unpacked. \code{if_any()} and \code{if_all()} return a logical vector. } \description{ \code{across()} makes it easy to apply the same transformation to multiple columns, allowing you to use \code{\link[=select]{select()}} semantics inside in "data-masking" functions like \code{\link[=summarise]{summarise()}} and \code{\link[=mutate]{mutate()}}. See \code{vignette("colwise")} for more details. \code{if_any()} and \code{if_all()} apply the same predicate function to a selection of columns and combine the results into a single logical vector: \code{if_any()} is \code{TRUE} when the predicate is \code{TRUE} for \emph{any} of the selected columns, \code{if_all()} is \code{TRUE} when the predicate is \code{TRUE} for \emph{all} selected columns. If you just need to select columns without applying a transformation to each of them, then you probably want to use \code{\link[=pick]{pick()}} instead. \code{across()} supersedes the family of "scoped variants" like \code{summarise_at()}, \code{summarise_if()}, and \code{summarise_all()}. } \details{ When there are no selected columns: \itemize{ \item \code{if_any()} will return \code{FALSE}, consistent with the behavior of \code{any()} when called without inputs. \item \code{if_all()} will return \code{TRUE}, consistent with the behavior of \code{all()} when called without inputs. } } \section{Timing of evaluation}{ R code in dplyr verbs is generally evaluated once per group. Inside \code{across()} however, code is evaluated once for each combination of columns and groups. If the evaluation timing is important, for example if you're generating random variables, think about when it should happen and place your code in consequence. \if{html}{\out{
}}\preformatted{gdf <- tibble(g = c(1, 1, 2, 3), v1 = 10:13, v2 = 20:23) |> group_by(g) set.seed(1) # Outside: 1 normal variate n <- rnorm(1) gdf |> mutate(across(v1:v2, ~ .x + n)) #> # A tibble: 4 x 3 #> # Groups: g [3] #> g v1 v2 #> #> 1 1 9.37 19.4 #> 2 1 10.4 20.4 #> 3 2 11.4 21.4 #> 4 3 12.4 22.4 # Inside a verb: 3 normal variates (ngroup) gdf |> mutate(n = rnorm(1), across(v1:v2, ~ .x + n)) #> # A tibble: 4 x 4 #> # Groups: g [3] #> g v1 v2 n #> #> 1 1 10.2 20.2 0.184 #> 2 1 11.2 21.2 0.184 #> 3 2 11.2 21.2 -0.836 #> 4 3 14.6 24.6 1.60 # Inside `across()`: 6 normal variates (ncol * ngroup) gdf |> mutate(across(v1:v2, ~ .x + rnorm(1))) #> # A tibble: 4 x 3 #> # Groups: g [3] #> g v1 v2 #> #> 1 1 10.3 20.7 #> 2 1 11.3 21.7 #> 3 2 11.2 22.6 #> 4 3 13.5 22.7 }\if{html}{\out{
}} } \examples{ # For better printing iris <- as_tibble(iris) # across() ----------------------------------------------------------------- # Using everything() to apply the same function to all columns iris |> mutate(across(everything(), as.character)) # Different ways to select the same set of columns # See for details iris |> mutate(across(c(Sepal.Length, Sepal.Width), round)) iris |> mutate(across(c(1, 2), round)) iris |> mutate(across(1:Sepal.Width, round)) iris |> mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round)) # Using an external vector of names cols <- c("Sepal.Length", "Petal.Width") iris |> mutate(across(all_of(cols), round)) # If the external vector is named, the output columns will be named according # to those names names(cols) <- tolower(cols) iris |> mutate(across(all_of(cols), round)) # A purrr-style formula iris |> group_by(Species) |> summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE))) # A named list of functions iris |> group_by(Species) |> summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd))) # Use the .names argument to control the output names iris |> group_by(Species) |> summarise(across(starts_with("Sepal"), mean, .names = "mean_{.col}")) iris |> group_by(Species) |> summarise( across( starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}.{.fn}" ) ) # If a named external vector is used for column selection, .names will use # those names when constructing the output names iris |> group_by(Species) |> summarise(across(all_of(cols), mean, .names = "mean_{.col}")) # When the list is not named, .fn is replaced by the function's position iris |> group_by(Species) |> summarise( across(starts_with("Sepal"), list(mean, sd), .names = "{.col}.fn{.fn}") ) # When the functions in .fns return a data frame, you typically get a # "packed" data frame back quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { tibble(quantile = probs, value = quantile(x, probs)) } iris |> reframe(across(starts_with("Sepal"), quantile_df)) # Use .unpack to automatically expand these packed data frames into their # individual columns iris |> reframe(across(starts_with("Sepal"), quantile_df, .unpack = TRUE)) # .unpack can utilize a glue specification if you don't like the defaults iris |> reframe( across(starts_with("Sepal"), quantile_df, .unpack = "{outer}.{inner}") ) # This is also useful inside mutate(), for example, with a multi-lag helper multilag <- function(x, lags = 1:3) { names(lags) <- as.character(lags) purrr::map_dfr(lags, lag, x = x) } iris |> group_by(Species) |> mutate(across(starts_with("Sepal"), multilag, .unpack = TRUE)) |> select(Species, starts_with("Sepal")) # if_any() and if_all() ---------------------------------------------------- iris |> filter(if_any(ends_with("Width"), ~ . > 4)) iris |> filter_out(if_any(ends_with("Width"), ~ . > 4)) iris |> filter(if_all(ends_with("Width"), ~ . > 2)) iris |> filter_out(if_all(ends_with("Width"), ~ . > 2)) } \seealso{ \code{\link[=c_across]{c_across()}} for a function that returns a vector } dplyr/man/group_by.Rd0000644000176200001440000001262715106134104014277 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-by.R \name{group_by} \alias{group_by} \alias{ungroup} \title{Group by one or more variables} \usage{ group_by(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) ungroup(x, ...) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> In \code{group_by()}, variables or computations to group by. Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate \code{mutate()} step before the \code{group_by()}. Computations are not allowed in \code{nest_by()}. In \code{ungroup()}, variables to remove from the grouping.} \item{.add}{When \code{FALSE}, the default, \code{group_by()} will override existing groups. To add to the existing groups, use \code{.add = TRUE}.} \item{.drop}{Drop groups formed by factor levels that don't appear in the data? The default is \code{TRUE} except when \code{.data} has been previously grouped with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.} \item{x}{A \code{\link[=tbl]{tbl()}}} } \value{ A grouped data frame with class \code{\link{grouped_df}}, unless the combination of \code{...} and \code{add} yields a empty set of grouping columns, in which case a tibble will be returned. } \description{ Most data operations are done on groups defined by variables. \code{group_by()} takes an existing tbl and converts it into a grouped tbl where operations are performed "by group". \code{ungroup()} removes grouping. } \section{Methods}{ These function are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{group_by()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. \item \code{ungroup()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. } } \section{Ordering}{ Currently, \code{group_by()} internally orders the groups in ascending order. This results in ordered output from functions that aggregate groups, such as \code{\link[=summarise]{summarise()}}. When used as grouping columns, character vectors are ordered in the C locale for performance and reproducibility across R sessions. If the resulting ordering of your grouped operation matters and is dependent on the locale, you should follow up the grouped operation with an explicit call to \code{\link[=arrange]{arrange()}} and set the \code{.locale} argument. For example: \if{html}{\out{
}}\preformatted{data |> group_by(chr) |> summarise(avg = mean(x)) |> arrange(chr, .locale = "en") }\if{html}{\out{
}} This is often useful as a preliminary step before generating content intended for humans, such as an HTML table. \subsection{Legacy behavior}{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Prior to dplyr 1.1.0, character vector grouping columns were ordered in the system locale. Setting the global option \code{dplyr.legacy_locale} to \code{TRUE} retains this legacy behavior, but this has been deprecated. Update existing code to explicitly call \code{arrange(.locale = )} instead. Run \code{Sys.getlocale("LC_COLLATE")} to determine your system locale, and compare that against the list in \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to find an appropriate value for \code{.locale}, i.e. for American English, \code{"en_US"}. } } \examples{ by_cyl <- mtcars |> group_by(cyl) # grouping doesn't change how the data looks (apart from listing # how it's grouped): by_cyl # It changes how it acts with the other dplyr verbs: by_cyl |> summarise( disp = mean(disp), hp = mean(hp) ) by_cyl |> filter(disp == max(disp)) # Each call to summarise() removes a layer of grouping by_vs_am <- mtcars |> group_by(vs, am) by_vs <- by_vs_am |> summarise(n = n()) by_vs by_vs |> summarise(n = sum(n)) # To removing grouping, use ungroup by_vs |> ungroup() |> summarise(n = sum(n)) # By default, group_by() overrides existing grouping by_cyl |> group_by(vs, am) |> group_vars() # Use add = TRUE to instead append by_cyl |> group_by(vs, am, .add = TRUE) |> group_vars() # You can group by expressions: this is a short-hand # for a mutate() followed by a group_by() mtcars |> group_by(vsam = vs + am) # The implicit mutate() step is always performed on the # ungrouped data. Here we get 3 groups: mtcars |> group_by(vs) |> group_by(hp_cut = cut(hp, 3)) # If you want it to be performed by groups, # you have to use an explicit mutate() call. # Here we get 3 groups per value of vs mtcars |> group_by(vs) |> mutate(hp_cut = cut(hp, 3)) |> group_by(hp_cut) # when factors are involved and .drop = FALSE, groups can be empty tbl <- tibble( x = 1:10, y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) ) tbl |> group_by(y, .drop = FALSE) |> group_rows() } \seealso{ Other grouping functions: \code{\link{group_map}()}, \code{\link{group_nest}()}, \code{\link{group_split}()}, \code{\link{group_trim}()} } \concept{grouping functions} dplyr/man/join_by.Rd0000644000176200001440000002233614366556340014121 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join-by.R \name{join_by} \alias{join_by} \alias{closest} \alias{overlaps} \alias{within} \title{Join specifications} \usage{ join_by(...) } \arguments{ \item{...}{Expressions specifying the join. Each expression should consist of one of the following: \itemize{ \item Equality condition: \code{==} \item Inequality conditions: \code{>=}, \code{>}, \code{<=}, or \code{<} \item Rolling helper: \code{closest()} \item Overlap helpers: \code{between()}, \code{within()}, or \code{overlaps()} } Other expressions are not supported. If you need to perform a join on a computed variable, e.g. \code{join_by(sales_date - 40 >= promo_date)}, you'll need to precompute and store it in a separate column. Column names should be specified as quoted or unquoted names. By default, the name on the left-hand side of a join condition refers to the left-hand table, unless overridden by explicitly prefixing the column name with either \verb{x$} or \verb{y$}. If a single column name is provided without any join conditions, it is interpreted as if that column name was duplicated on each side of \code{==}, i.e. \code{x} is interpreted as \code{x == x}.} } \description{ \code{join_by()} constructs a specification that describes how to join two tables using a small domain specific language. The result can be supplied as the \code{by} argument to any of the join functions (such as \code{\link[=left_join]{left_join()}}). } \section{Join types}{ The following types of joins are supported by dplyr: \itemize{ \item Equality joins \item Inequality joins \item Rolling joins \item Overlap joins \item Cross joins } Equality, inequality, rolling, and overlap joins are discussed in more detail below. Cross joins are implemented through \code{\link[=cross_join]{cross_join()}}. \subsection{Equality joins}{ Equality joins require keys to be equal between one or more pairs of columns, and are the most common type of join. To construct an equality join using \code{join_by()}, supply two column names to join with separated by \code{==}. Alternatively, supplying a single name will be interpreted as an equality join between two columns of the same name. For example, \code{join_by(x)} is equivalent to \code{join_by(x == x)}. } \subsection{Inequality joins}{ Inequality joins match on an inequality, such as \code{>}, \code{>=}, \code{<}, or \code{<=}, and are common in time series analysis and genomics. To construct an inequality join using \code{join_by()}, supply two column names separated by one of the above mentioned inequalities. Note that inequality joins will match a single row in \code{x} to a potentially large number of rows in \code{y}. Be extra careful when constructing inequality join specifications! } \subsection{Rolling joins}{ Rolling joins are a variant of inequality joins that limit the results returned from an inequality join condition. They are useful for "rolling" the closest match forward/backwards when there isn't an exact match. To construct a rolling join, wrap an inequality with \code{closest()}. \itemize{ \item \code{closest(expr)} \code{expr} must be an inequality involving one of: \code{>}, \code{>=}, \code{<}, or \code{<=}. For example, \code{closest(x >= y)} is interpreted as: For each value in \code{x}, find the closest value in \code{y} that is less than or equal to that \code{x} value. } \code{closest()} will always use the left-hand table (\code{x}) as the primary table, and the right-hand table (\code{y}) as the one to find the closest match in, regardless of how the inequality is specified. For example, \code{closest(y$a >= x$b)} will always be interpreted as \code{closest(x$b <= y$a)}. } \subsection{Overlap joins}{ Overlap joins are a special case of inequality joins involving one or two columns from the left-hand table \emph{overlapping} a range defined by two columns from the right-hand table. There are three helpers that \code{join_by()} recognizes to assist with constructing overlap joins, all of which can be constructed from simpler inequalities. \itemize{ \item \code{between(x, y_lower, y_upper, ..., bounds = "[]")} For each value in \code{x}, this finds everywhere that value falls between \verb{[y_lower, y_upper]}. Equivalent to \verb{x >= y_lower, x <= y_upper} by default. \code{bounds} can be one of \code{"[]"}, \code{"[)"}, \code{"(]"}, or \code{"()"} to alter the inclusiveness of the lower and upper bounds. This changes whether \code{>=} or \code{>} and \code{<=} or \code{<} are used to build the inequalities shown above. Dots are for future extensions and must be empty. \item \code{within(x_lower, x_upper, y_lower, y_upper)} For each range in \verb{[x_lower, x_upper]}, this finds everywhere that range falls completely within \verb{[y_lower, y_upper]}. Equivalent to \verb{x_lower >= y_lower, x_upper <= y_upper}. The inequalities used to build \code{within()} are the same regardless of the inclusiveness of the supplied ranges. \item \code{overlaps(x_lower, x_upper, y_lower, y_upper, ..., bounds = "[]")} For each range in \verb{[x_lower, x_upper]}, this finds everywhere that range overlaps \verb{[y_lower, y_upper]} in any capacity. Equivalent to \verb{x_lower <= y_upper, x_upper >= y_lower} by default. \code{bounds} can be one of \code{"[]"}, \code{"[)"}, \code{"(]"}, or \code{"()"} to alter the inclusiveness of the lower and upper bounds. \code{"[]"} uses \code{<=} and \code{>=}, but the 3 other options use \code{<} and \code{>} and generate the exact same inequalities. Dots are for future extensions and must be empty. } These conditions assume that the ranges are well-formed and non-empty, i.e. \code{x_lower <= x_upper} when bounds are treated as \code{"[]"}, and \code{x_lower < x_upper} otherwise. } } \section{Column referencing}{ When specifying join conditions, \code{join_by()} assumes that column names on the left-hand side of the condition refer to the left-hand table (\code{x}), and names on the right-hand side of the condition refer to the right-hand table (\code{y}). Occasionally, it is clearer to be able to specify a right-hand table name on the left-hand side of the condition, and vice versa. To support this, column names can be prefixed by \verb{x$} or \verb{y$} to explicitly specify which table they come from. } \examples{ sales <- tibble( id = c(1L, 1L, 1L, 2L, 2L), sale_date = as.Date(c("2018-12-31", "2019-01-02", "2019-01-05", "2019-01-04", "2019-01-01")) ) sales promos <- tibble( id = c(1L, 1L, 2L), promo_date = as.Date(c("2019-01-01", "2019-01-05", "2019-01-02")) ) promos # Match `id` to `id`, and `sale_date` to `promo_date` by <- join_by(id, sale_date == promo_date) left_join(sales, promos, by) # For each `sale_date` within a particular `id`, # find all `promo_date`s that occurred before that particular sale by <- join_by(id, sale_date >= promo_date) left_join(sales, promos, by) # For each `sale_date` within a particular `id`, # find only the closest `promo_date` that occurred before that sale by <- join_by(id, closest(sale_date >= promo_date)) left_join(sales, promos, by) # If you want to disallow exact matching in rolling joins, use `>` rather # than `>=`. Note that the promo on `2019-01-05` is no longer considered the # closest match for the sale on the same date. by <- join_by(id, closest(sale_date > promo_date)) left_join(sales, promos, by) # Same as before, but also require that the promo had to occur at most 1 # day before the sale was made. We'll use a full join to see that id 2's # promo on `2019-01-02` is no longer matched to the sale on `2019-01-04`. sales <- mutate(sales, sale_date_lower = sale_date - 1) by <- join_by(id, closest(sale_date >= promo_date), sale_date_lower <= promo_date) full_join(sales, promos, by) # --------------------------------------------------------------------------- segments <- tibble( segment_id = 1:4, chromosome = c("chr1", "chr2", "chr2", "chr1"), start = c(140, 210, 380, 230), end = c(150, 240, 415, 280) ) segments reference <- tibble( reference_id = 1:4, chromosome = c("chr1", "chr1", "chr2", "chr2"), start = c(100, 200, 300, 415), end = c(150, 250, 399, 450) ) reference # Find every time a segment `start` falls between the reference # `[start, end]` range. by <- join_by(chromosome, between(start, start, end)) full_join(segments, reference, by) # If you wanted the reference columns first, supply `reference` as `x` # and `segments` as `y`, then explicitly refer to their columns using `x$` # and `y$`. by <- join_by(chromosome, between(y$start, x$start, x$end)) full_join(reference, segments, by) # Find every time a segment falls completely within a reference. # Sometimes using `x$` and `y$` makes your intentions clearer, even if they # match the default behavior. by <- join_by(chromosome, within(x$start, x$end, y$start, y$end)) inner_join(segments, reference, by) # Find every time a segment overlaps a reference in any way. by <- join_by(chromosome, overlaps(x$start, x$end, y$start, y$end)) full_join(segments, reference, by) # It is common to have right-open ranges with bounds like `[)`, which would # mean an end value of `415` would no longer overlap a start value of `415`. # Setting `bounds` allows you to compute overlaps with those kinds of ranges. by <- join_by(chromosome, overlaps(x$start, x$end, y$start, y$end, bounds = "[)")) full_join(segments, reference, by) } dplyr/man/group_split.Rd0000644000176200001440000000431515106134104015013 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-split.R \name{group_split} \alias{group_split} \title{Split data frame by groups} \usage{ group_split(.tbl, ..., .keep = TRUE) } \arguments{ \item{.tbl}{A tbl.} \item{...}{If \code{.tbl} is an ungrouped data frame, a grouping specification, forwarded to \code{\link[=group_by]{group_by()}}.} \item{.keep}{Should the grouping columns be kept?} } \value{ A list of tibbles. Each tibble contains the rows of \code{.tbl} for the associated group and all the columns, including the grouping variables. Note that this returns a \link[vctrs:list_of]{list_of} which is slightly stricter than a simple list but is useful for representing lists where every element has the same type. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \code{\link[=group_split]{group_split()}} works like \code{\link[base:split]{base::split()}} but: \itemize{ \item It uses the grouping structure from \code{\link[=group_by]{group_by()}} and therefore is subject to the data mask \item It does not name the elements of the list based on the grouping as this only works well for a single character grouping variable. Instead, use \code{\link[=group_keys]{group_keys()}} to access a data frame that defines the groups. } \code{group_split()} is primarily designed to work with grouped data frames. You can pass \code{...} to group and split an ungrouped data frame, but this is generally not very useful as you want have easy access to the group metadata. } \section{Lifecycle}{ \code{group_split()} is not stable because you can achieve very similar results by manipulating the nested column returned from \code{\link[tidyr:nest]{tidyr::nest(.by =)}}. That also retains the group keys all within a single data structure. \code{group_split()} may be deprecated in the future. } \examples{ ir <- iris |> group_by(Species) group_split(ir) group_keys(ir) } \seealso{ Other grouping functions: \code{\link{group_by}()}, \code{\link{group_map}()}, \code{\link{group_nest}()}, \code{\link{group_trim}()} } \concept{grouping functions} \keyword{internal} dplyr/man/tbl_ptype.Rd0000644000176200001440000000071513663216626014466 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise.R \name{tbl_ptype} \alias{tbl_ptype} \title{Return a prototype of a tbl} \usage{ tbl_ptype(.data) } \description{ Used in \verb{_if} functions to enable type-based selection even when the data is lazily generated. Should either return the complete tibble, or if that can not be computed quickly, a 0-row tibble where the columns are of the correct type. } \keyword{internal} dplyr/man/band_members.Rd0000644000176200001440000000166513663216626015107 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data-bands.R \docType{data} \name{band_members} \alias{band_members} \alias{band_instruments} \alias{band_instruments2} \title{Band membership} \format{ Each is a tibble with two variables and three observations } \usage{ band_members band_instruments band_instruments2 } \description{ These data sets describe band members of the Beatles and Rolling Stones. They are toy data sets that can be displayed in their entirety on a slide (e.g. to demonstrate a join). } \details{ \code{band_instruments} and \code{band_instruments2} contain the same data but use different column names for the first column of the data set. \code{band_instruments} uses \code{name}, which matches the name of the key column of \code{band_members}; \code{band_instruments2} uses \code{artist}, which does not. } \examples{ band_members band_instruments band_instruments2 } \keyword{datasets} dplyr/man/c_across.Rd0000644000176200001440000000216415106134104014240 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/across.R \name{c_across} \alias{c_across} \title{Combine values from multiple columns} \usage{ c_across(cols) } \arguments{ \item{cols}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Columns to transform. You can't select grouping columns because they are already automatically handled by the verb (i.e. \code{\link[=summarise]{summarise()}} or \code{\link[=mutate]{mutate()}}).} } \description{ \code{c_across()} is designed to work with \code{\link[=rowwise]{rowwise()}} to make it easy to perform row-wise aggregations. It has two differences from \code{c()}: \itemize{ \item It uses tidy select semantics so you can easily select multiple variables. See \code{vignette("rowwise")} for more details. \item It uses \code{\link[vctrs:vec_c]{vctrs::vec_c()}} in order to give safer outputs. } } \examples{ df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4)) df |> rowwise() |> mutate( sum = sum(c_across(w:z)), sd = sd(c_across(w:z)) ) } \seealso{ \code{\link[=across]{across()}} for a function that returns a tibble. } dplyr/man/do.Rd0000644000176200001440000000362715106134104013053 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprec-do.R \name{do} \alias{do} \title{Do anything} \usage{ do(.data, ...) } \arguments{ \item{.data}{a tbl} \item{...}{Expressions to apply to each group. If named, results will be stored in a new column. If unnamed, must return a data frame. You can use \code{.} to refer to the current group. You can not mix named and unnamed arguments.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{do()} is superseded as of dplyr 1.0.0, because its syntax never really felt like it belonged with the rest of dplyr. It's replaced by a combination of \code{\link[=reframe]{reframe()}} (which can produce multiple rows and multiple columns), \code{\link[=nest_by]{nest_by()}} (which creates a \link{rowwise} tibble of nested data), and \code{\link[=pick]{pick()}} (which allows you to access the data for the "current" group). } \examples{ # do() with unnamed arguments becomes reframe() or summarise() # . becomes pick() by_cyl <- mtcars |> group_by(cyl) by_cyl |> do(head(., 2)) # -> by_cyl |> reframe(head(pick(everything()), 2)) by_cyl |> slice_head(n = 2) # Can refer to variables directly by_cyl |> do(mean = mean(.$vs)) # -> by_cyl |> summarise(mean = mean(vs)) # do() with named arguments becomes nest_by() + mutate() & list() models <- by_cyl |> do(mod = lm(mpg ~ disp, data = .)) # -> models <- mtcars |> nest_by(cyl) |> mutate(mod = list(lm(mpg ~ disp, data = data))) models |> summarise(rsq = summary(mod)$r.squared) # use broom to turn models into data models |> do(data.frame( var = names(coef(.$mod)), coef(summary(.$mod))) ) \dontshow{if (requireNamespace("broom", quietly = TRUE)) withAutoprint(\{ # examplesIf} # -> models |> reframe(broom::tidy(mod)) \dontshow{\}) # examplesIf} } \keyword{internal} dplyr/man/n_distinct.Rd0000644000176200001440000000205614366556340014623 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/n-distinct.R \name{n_distinct} \alias{n_distinct} \title{Count unique combinations} \usage{ n_distinct(..., na.rm = FALSE) } \arguments{ \item{...}{Unnamed vectors. If multiple vectors are supplied, then they should have the same length.} \item{na.rm}{If \code{TRUE}, exclude missing observations from the count. If there are multiple vectors in \code{...}, an observation will be excluded if \emph{any} of the values are missing.} } \value{ A single number. } \description{ \code{n_distinct()} counts the number of unique/distinct combinations in a set of one or more vectors. It's a faster and more concise equivalent to \code{nrow(unique(data.frame(...)))}. } \examples{ x <- c(1, 1, 2, 2, 2) n_distinct(x) y <- c(3, 3, NA, 3, 3) n_distinct(y) n_distinct(y, na.rm = TRUE) # Pairs (1, 3), (2, 3), and (2, NA) are distinct n_distinct(x, y) # (2, NA) is dropped, leaving 2 distinct combinations n_distinct(x, y, na.rm = TRUE) # Also works with data frames n_distinct(data.frame(x, y)) } dplyr/man/coalesce.Rd0000644000176200001440000000445515106134104014227 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/coalesce.R \name{coalesce} \alias{coalesce} \title{Find the first non-missing element} \usage{ coalesce(..., .ptype = NULL, .size = NULL) } \arguments{ \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> One or more vectors. These will be \link[vctrs:theory-faq-recycling]{recycled} against each other, and will be cast to their common type.} \item{.ptype}{An optional prototype declaring the desired output type. If supplied, this overrides the common type of the vectors in \code{...}.} \item{.size}{An optional size declaring the desired output size. If supplied, this overrides the common size of the vectors in \code{...}.} } \value{ A vector with the same type and size as the common type and common size of the vectors in \code{...}. } \description{ Given a set of vectors, \code{coalesce()} finds the first non-missing value at each position. It's inspired by the SQL \code{COALESCE} function which does the same thing for SQL \code{NULL}s. } \examples{ # Replace missing values with a single value x <- sample(c(1:5, NA, NA, NA)) coalesce(x, 0L) # Or replace missing values with the corresponding non-missing value in # another vector x <- c(1, 2, NA, NA, 5, NA) y <- c(NA, NA, 3, 4, 5, NA) coalesce(x, y) # For cases like these where your replacement is a single value or a single # vector, `replace_values()` works just as well replace_values(x, NA ~ 0) coalesce(x, 0) replace_values(x, NA ~ y) coalesce(x, y) # `coalesce()` really shines when you have >2 vectors to coalesce with z <- c(NA, 2, 3, 4, 5, 6) coalesce(x, y, z) # If you're looking to replace values with `NA`, rather than replacing `NA` # with a value, then use `replace_values()` x <- c(0, -1, 5, -99, 8) replace_values(x, c(-1, -99) ~ NA) # The equivalent to a missing value in a list is `NULL` coalesce(list(1, 2, NULL, NA), list(0)) # Supply lists of vectors by splicing them into dots vecs <- list( c(1, 2, NA, NA, 5), c(NA, NA, 3, 4, 5) ) coalesce(!!!vecs) } \seealso{ \itemize{ \item \code{\link[=na_if]{na_if()}} to replace a specified value with \code{NA}. \item \code{\link[=replace_values]{replace_values()}} for making arbitrary replacements by value. \item \code{\link[=replace_when]{replace_when()}} for making arbitrary replacements using logical conditions. } } dplyr/man/case_match.Rd0000644000176200001440000001175315137161765014560 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/case-match.R \name{case_match} \alias{case_match} \title{A general vectorised \code{switch()}} \usage{ case_match(.x, ..., .default = NULL, .ptype = NULL) } \arguments{ \item{.x}{A vector to match against.} \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> A sequence of two-sided formulas: \code{old_values ~ new_value}. The right hand side (RHS) determines the output value for all values of \code{.x} that match the left hand side (LHS). The LHS must evaluate to the same type of vector as \code{.x}. It can be any length, allowing you to map multiple \code{.x} values to the same RHS value. If a value is repeated in the LHS, i.e. a value in \code{.x} matches to multiple cases, the first match is used. The RHS inputs will be coerced to their common type. Each RHS input will be \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{.x}.} \item{.default}{The value used when values in \code{.x} aren't matched by any of the LHS inputs. If \code{NULL}, the default, a missing value will be used. \code{.default} is \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{.x}.} \item{.ptype}{An optional prototype declaring the desired output type. If not supplied, the output type will be taken from the common type of all RHS inputs and \code{.default}.} } \value{ A vector with the same size as \code{.x} and the same type as the common type of the RHS inputs and \code{.default} (if not overridden by \code{.ptype}). } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} \code{case_match()} is deprecated. Please use \code{\link[=recode_values]{recode_values()}} and \code{\link[=replace_values]{replace_values()}} instead, which are more powerful, have more intuitive names, and have better safety. In addition to the familiar two-sided formula interface, these functions also have \code{from} and \code{to} arguments which allow you to incorporate a lookup table into the recoding process. This function allows you to vectorise multiple \code{\link[=switch]{switch()}} statements. Each case is evaluated sequentially and the first match for each element determines the corresponding value in the output vector. If no cases match, the \code{.default} is used. } \examples{ # `case_match()` is deprecated and has been replaced by `recode_values()` and # `replace_values()` x <- c("a", "b", "a", "d", "b", NA, "c", "e") # `recode_values()` is a 1:1 replacement for `case_match()` case_match( x, "a" ~ 1, "b" ~ 2, "c" ~ 3, "d" ~ 4 ) recode_values( x, "a" ~ 1, "b" ~ 2, "c" ~ 3, "d" ~ 4 ) # `recode_values()` has an additional `unmatched` argument to help you catch # missed mappings try(recode_values( x, "a" ~ 1, "b" ~ 2, "c" ~ 3, "d" ~ 4, unmatched = "error" )) # `recode_values()` also has additional `from` and `to` arguments, which are # useful when your lookup table is defined elsewhere (for example, it could # be read in from a CSV file). This is very difficult to do with # `case_match()`! lookup <- tribble( ~from, ~to, "a", 1, "b", 2, "c", 3, "d", 4 ) recode_values(x, from = lookup$from, to = lookup$to) # Both `case_match()` and `recode_values()` work with more than just # character inputs: y <- as.integer(c(1, 2, 1, 3, 1, NA, 2, 4)) case_match( y, c(1, 3) ~ "odd", c(2, 4) ~ "even", .default = "missing" ) recode_values( y, c(1, 3) ~ "odd", c(2, 4) ~ "even", default = "missing" ) # Or with a lookup table lookup <- tribble( ~from, ~to, c(1, 3), "odd", c(2, 4), "even" ) recode_values(y, from = lookup$from, to = lookup$to, default = "missing") # `replace_values()` is a convenient way to replace selected values, leaving # everything else as is. It's similar to `case_match(y, .default = y)`. replace_values(y, NA ~ 0) case_match(y, NA ~ 0, .default = y) # Notably, `replace_values()` is type stable, which means that `y` can't # change types out from under you, unlike with `case_match()`! typeof(y) typeof(replace_values(y, NA ~ 0)) typeof(case_match(y, NA ~ 0, .default = y)) # We believe that `replace_values()` better expresses intent when doing a # partial replacement. Compare these two `mutate()` calls, each with the # goals of: # - Replace missings in `hair_color` # - Replace some of the `species` starwars |> mutate( hair_color = case_match(hair_color, NA ~ "unknown", .default = hair_color), species = case_match( species, "Human" ~ "Humanoid", "Droid" ~ "Robot", c("Wookiee", "Ewok") ~ "Hairy", .default = species ), .keep = "used" ) updates <- tribble( ~from, ~to, "Human", "Humanoid", "Droid", "Robot", c("Wookiee", "Ewok"), "Hairy" ) starwars |> mutate( hair_color = replace_values(hair_color, NA ~ "unknown"), species = replace_values(species, from = updates$from, to = updates$to), .keep = "used" ) } \keyword{internal} dplyr/man/mutate_all.Rd0000644000176200001440000001543715106134104014602 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-mutate.R \name{mutate_all} \alias{mutate_all} \alias{mutate_if} \alias{mutate_at} \alias{transmute_all} \alias{transmute_if} \alias{transmute_at} \title{Mutate multiple columns} \usage{ mutate_all(.tbl, .funs, ...) mutate_if(.tbl, .predicate, .funs, ...) mutate_at(.tbl, .vars, .funs, ..., .cols = NULL) transmute_all(.tbl, .funs, ...) transmute_if(.tbl, .predicate, .funs, ...) transmute_at(.tbl, .vars, .funs, ..., .cols = NULL) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.cols}{This argument has been renamed to \code{.vars} to fit dplyr's terminology and is deprecated.} } \value{ A data frame. By default, the newly created columns have the shortest names needed to uniquely identify the output. To force inclusion of a name, even when not needed, name the input (see examples for details). } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. The \link{scoped} variants of \code{\link[=mutate]{mutate()}} and \code{\link[=transmute]{transmute()}} make it easy to apply the same transformation to multiple variables. There are three variants: \itemize{ \item _all affects every variable \item _at affects variables selected with a character vector or vars() \item _if affects variables selected with a predicate function: } } \section{Grouping variables}{ If applied on a grouped tibble, these operations are \emph{not} applied to the grouping variables. The behaviour depends on whether the selection is \strong{implicit} (\code{all} and \code{if} selections) or \strong{explicit} (\code{at} selections). \itemize{ \item Grouping variables covered by explicit selections in \code{mutate_at()} and \code{transmute_at()} are always an error. Add \code{-group_cols()} to the \code{\link[=vars]{vars()}} selection to avoid this: \if{html}{\out{
}}\preformatted{data |> mutate_at(vars(-group_cols(), ...), myoperation) }\if{html}{\out{
}} Or remove \code{group_vars()} from the character vector of column names: \if{html}{\out{
}}\preformatted{nms <- setdiff(nms, group_vars(data)) data |> mutate_at(vars, myoperation) }\if{html}{\out{
}} \item Grouping variables covered by implicit selections are ignored by \code{mutate_all()}, \code{transmute_all()}, \code{mutate_if()}, and \code{transmute_if()}. } } \section{Naming}{ The names of the new columns are derived from the names of the input variables and the names of the functions. \itemize{ \item if there is only one unnamed function (i.e. if \code{.funs} is an unnamed list of length one), the names of the input variables are used to name the new columns; \item for \verb{_at} functions, if there is only one unnamed variable (i.e., if \code{.vars} is of the form \code{vars(a_single_column)}) and \code{.funs} has length greater than one, the names of the functions are used to name the new columns; \item otherwise, the new names are created by concatenating the names of the input variables and the names of the functions, separated with an underscore \code{"_"}. } The \code{.funs} argument can be a named or unnamed list. If a function is unnamed and the name cannot be derived automatically, a name of the form "fn#" is used. Similarly, \code{\link[=vars]{vars()}} accepts named and unnamed arguments. If a variable in \code{.vars} is named, a new column by that name will be created. Name collisions in the new columns are disambiguated using a unique suffix. } \examples{ iris <- as_tibble(iris) # All variants can be passed functions and additional arguments, # purrr-style. The _at() variants directly support strings. Here # we'll scale the variables `height` and `mass`: scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm) starwars |> mutate_at(c("height", "mass"), scale2) # -> starwars |> mutate(across(c("height", "mass"), scale2)) # You can pass additional arguments to the function: starwars |> mutate_at(c("height", "mass"), scale2, na.rm = TRUE) starwars |> mutate_at(c("height", "mass"), ~scale2(., na.rm = TRUE)) # -> starwars |> mutate(across(c("height", "mass"), ~ scale2(.x, na.rm = TRUE))) # You can also supply selection helpers to _at() functions but you have # to quote them with vars(): iris |> mutate_at(vars(matches("Sepal")), log) iris |> mutate(across(matches("Sepal"), log)) # The _if() variants apply a predicate function (a function that # returns TRUE or FALSE) to determine the relevant subset of # columns. Here we divide all the numeric columns by 100: starwars |> mutate_if(is.numeric, scale2, na.rm = TRUE) starwars |> mutate(across(where(is.numeric), ~ scale2(.x, na.rm = TRUE))) # mutate_if() is particularly useful for transforming variables from # one type to another iris |> mutate_if(is.factor, as.character) iris |> mutate_if(is.double, as.integer) # -> iris |> mutate(across(where(is.factor), as.character)) iris |> mutate(across(where(is.double), as.integer)) # Multiple transformations ---------------------------------------- # If you want to apply multiple transformations, pass a list of # functions. When there are multiple functions, they create new # variables instead of modifying the variables in place: iris |> mutate_if(is.numeric, list(scale2, log)) iris |> mutate_if(is.numeric, list(~scale2(.), ~log(.))) iris |> mutate_if(is.numeric, list(scale = scale2, log = log)) # -> iris |> as_tibble() |> mutate(across(where(is.numeric), list(scale = scale2, log = log))) # When there's only one function in the list, it modifies existing # variables in place. Give it a name to instead create new variables: iris |> mutate_if(is.numeric, list(scale2)) iris |> mutate_if(is.numeric, list(scale = scale2)) } \seealso{ \link[=scoped]{The other scoped verbs}, \code{\link[=vars]{vars()}} } \keyword{internal} dplyr/man/relocate.Rd0000644000176200001440000000437215106134104014245 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/relocate.R \name{relocate} \alias{relocate} \title{Change column order} \usage{ relocate(.data, ..., .before = NULL, .after = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Columns to move.} \item{.before, .after}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Destination of columns selected by \code{...}. Supplying neither will move columns to the left-hand side; specifying both is an error.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Rows are not affected. \item The same columns appear in the output, but (usually) in a different place and possibly renamed. \item Data frame attributes are preserved. \item Groups are not affected. } } \description{ Use \code{relocate()} to change column positions, using the same syntax as \code{select()} to make it easy to move blocks of columns at once. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("relocate")}. } \examples{ df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") df |> relocate(f) df |> relocate(a, .after = c) df |> relocate(f, .before = b) df |> relocate(a, .after = last_col()) # relocated columns can change name df |> relocate(ff = f) # Can also select variables based on their type df |> relocate(where(is.character)) df |> relocate(where(is.numeric), .after = last_col()) # Or with any other select helper df |> relocate(any_of(c("a", "e", "i", "o", "u"))) # When .before or .after refers to multiple variables they will be # moved to be immediately before/after the selected variables. df2 <- tibble(a = 1, b = "a", c = 1, d = "a") df2 |> relocate(where(is.numeric), .after = where(is.character)) df2 |> relocate(where(is.numeric), .before = where(is.character)) } dplyr/man/na_if.Rd0000644000176200001440000000511115106134104013513 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/na-if.R \name{na_if} \alias{na_if} \title{Convert values to \code{NA}} \usage{ na_if(x, y) } \arguments{ \item{x}{Vector to modify} \item{y}{Value or vector to compare against. When \code{x} and \code{y} are equal, the value in \code{x} will be replaced with \code{NA}. \code{y} is \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x} before comparison. \code{y} is \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{x} before comparison. This means that \code{y} can be a vector with the same size as \code{x}, but most of the time this will be a single value.} } \value{ A modified version of \code{x} that replaces any values that are equal to \code{y} with \code{NA}. } \description{ This is a translation of the SQL command \code{NULLIF}. It is useful if you want to convert an annoying value to \code{NA}. } \examples{ # `na_if()` is useful for replacing a single problematic value with `NA` na_if(c(-99, 1, 4, 3, -99, 5), -99) na_if(c("abc", "def", "", "ghi"), "") # You can use it to standardize `NaN`s to `NA` na_if(c(1, NaN, NA, 2, NaN), NaN) # Because `na_if()` is an R translation of SQL's `NULLIF` command, # it compares `x` and `y` element by element. Where `x` and `y` are # equal, the value in `x` is replaced with an `NA`. na_if( x = c(1, 2, 5, 5, 6), y = c(0, 2, 3, 5, 4) ) # If you have multiple problematic values that you'd like to replace with # `NA`, then `replace_values()` is a better choice than `na_if()` x <- c(-99, 1, 4, 0, -99, 5, -1, 0, 5) replace_values(x, c(0, -1, -99) ~ NA) # You'd have to nest `na_if()`s to achieve this try(na_if(x, c(0, -1, -99))) na_if(na_if(na_if(x, 0), -1), -99) # If you'd like to replace values that match a logical condition with `NA`, # use `replace_when()` replace_when(x, x < 0 ~ NA) # If you'd like to replace `NA` with some other value, use `replace_values()` x <- c(NA, 5, 2, NA, 0, 3) replace_values(x, NA ~ 0) # `na_if()` is particularly useful inside `mutate()` starwars |> select(name, eye_color) |> mutate(eye_color = na_if(eye_color, "unknown")) # `na_if()` can also be used with `mutate()` and `across()` # to alter multiple columns starwars |> mutate(across(where(is.character), ~na_if(., "unknown"))) } \seealso{ \itemize{ \item \code{\link[=coalesce]{coalesce()}} to replace \code{NA}s with the first non-missing value. \item \code{\link[=replace_values]{replace_values()}} for making arbitrary replacements by value. \item \code{\link[=replace_when]{replace_when()}} for making arbitrary replacements using logical conditions. } } dplyr/man/distinct.Rd0000644000176200001440000000511515106134104014264 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/distinct.R \name{distinct} \alias{distinct} \title{Keep distinct/unique rows} \usage{ distinct(.data, ..., .keep_all = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Optional variables to use when determining uniqueness. If there are multiple rows for a given combination of inputs, only the first row will be preserved. If omitted, will use all variables in the data frame.} \item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}. If a combination of \code{...} is not distinct, this keeps the first row of values.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Rows are a subset of the input but appear in the same order. \item Columns are not modified if \code{...} is empty or \code{.keep_all} is \code{TRUE}. Otherwise, \code{distinct()} first calls \code{mutate()} to create new columns. \item Groups are not modified. \item Data frame attributes are preserved. } } \description{ Keep only unique/distinct rows from a data frame. This is similar to \code{\link[=unique.data.frame]{unique.data.frame()}} but considerably faster. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("distinct")}. } \examples{ df <- tibble( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) nrow(df) nrow(distinct(df)) nrow(distinct(df, x, y)) distinct(df, x) distinct(df, y) # You can choose to keep all other variables as well distinct(df, x, .keep_all = TRUE) distinct(df, y, .keep_all = TRUE) # You can also use distinct on computed variables distinct(df, diff = abs(x - y)) # Use `pick()` to select columns with tidy-select distinct(starwars, pick(contains("color"))) # Grouping ------------------------------------------------- df <- tibble( g = c(1, 1, 2, 2, 2), x = c(1, 1, 2, 1, 2), y = c(3, 2, 1, 3, 1) ) df <- df |> group_by(g) # With grouped data frames, distinctness is computed within each group df |> distinct(x) # When `...` are omitted, `distinct()` still computes distinctness using # all variables in the data frame df |> distinct() } dplyr/man/sample_n.Rd0000644000176200001440000000550414366556340014264 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sample.R \name{sample_n} \alias{sample_n} \alias{sample_frac} \title{Sample n rows from a table} \usage{ sample_n(tbl, size, replace = FALSE, weight = NULL, .env = NULL, ...) sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...) } \arguments{ \item{tbl}{A data.frame.} \item{size}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> For \code{sample_n()}, the number of rows to select. For \code{sample_frac()}, the fraction of rows to select. If \code{tbl} is grouped, \code{size} applies to each group.} \item{replace}{Sample with or without replacement?} \item{weight}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Sampling weights. This must evaluate to a vector of non-negative numbers the same length as the input. Weights are automatically standardised to sum to 1.} \item{.env}{DEPRECATED.} \item{...}{ignored} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{sample_n()} and \code{sample_frac()} have been superseded in favour of \code{\link[=slice_sample]{slice_sample()}}. While they will not be deprecated in the near future, retirement means that we will only perform critical bug fixes, so we recommend moving to the newer alternative. These functions were superseded because we realised it was more convenient to have two mutually exclusive arguments to one function, rather than two separate functions. This also made it to clean up a few other smaller design issues with \code{sample_n()}/\code{sample_frac}: \itemize{ \item The connection to \code{slice()} was not obvious. \item The name of the first argument, \code{tbl}, is inconsistent with other single table verbs which use \code{.data}. \item The \code{size} argument uses tidy evaluation, which is surprising and undocumented. \item It was easier to remove the deprecated \code{.env} argument. \item \code{...} was in a suboptimal position. } } \examples{ df <- tibble(x = 1:5, w = c(0.1, 0.1, 0.1, 2, 2)) # sample_n() -> slice_sample() ---------------------------------------------- # Was: sample_n(df, 3) sample_n(df, 10, replace = TRUE) sample_n(df, 3, weight = w) # Now: slice_sample(df, n = 3) slice_sample(df, n = 10, replace = TRUE) slice_sample(df, n = 3, weight_by = w) # Note that sample_n() would error if n was bigger than the group size # slice_sample() will just use the available rows for consistency with # the other slice helpers like slice_head() try(sample_n(df, 10)) slice_sample(df, n = 10) # sample_frac() -> slice_sample() ------------------------------------------- # Was: sample_frac(df, 0.25) sample_frac(df, 2, replace = TRUE) # Now: slice_sample(df, prop = 0.25) slice_sample(df, prop = 2, replace = TRUE) } \keyword{internal} dplyr/man/near.Rd0000644000176200001440000000101513663216626013403 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/near.R \name{near} \alias{near} \title{Compare two numeric vectors} \usage{ near(x, y, tol = .Machine$double.eps^0.5) } \arguments{ \item{x, y}{Numeric vectors to compare} \item{tol}{Tolerance of comparison.} } \description{ This is a safe way of comparing if two vectors of floating point numbers are (pairwise) equal. This is safer than using \code{==}, because it has a built in tolerance } \examples{ sqrt(2) ^ 2 == 2 near(sqrt(2) ^ 2, 2) } dplyr/man/defunct-each.Rd0000644000176200001440000000161315137161765015011 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/defunct-each.R \name{defunct-each} \alias{defunct-each} \alias{summarise_each} \alias{summarise_each_} \alias{mutate_each} \alias{mutate_each_} \alias{summarize_each} \alias{summarize_each_} \title{Defunct functions for working with multiple columns} \usage{ summarise_each(tbl, funs, ...) summarise_each_(tbl, funs, vars) mutate_each(tbl, funs, ...) mutate_each_(tbl, funs, vars) summarize_each(tbl, funs, ...) summarize_each_(tbl, funs, vars) } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#defunct}{\figure{lifecycle-defunct.svg}{options: alt='[Defunct]'}}}{\strong{[Defunct]}} \code{mutate_each()} and \code{summarise_each()} are deprecated in favour of the new \code{\link[=across]{across()}} function that works within \code{summarise()} and \code{mutate()}. } \keyword{internal} dplyr/man/rows.Rd0000644000176200001440000001542014366556340013456 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rows.R \name{rows} \alias{rows} \alias{rows_insert} \alias{rows_append} \alias{rows_update} \alias{rows_patch} \alias{rows_upsert} \alias{rows_delete} \title{Manipulate individual rows} \usage{ rows_insert( x, y, by = NULL, ..., conflict = c("error", "ignore"), copy = FALSE, in_place = FALSE ) rows_append(x, y, ..., copy = FALSE, in_place = FALSE) rows_update( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) rows_patch( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) rows_upsert(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) rows_delete( x, y, by = NULL, ..., unmatched = c("error", "ignore"), copy = FALSE, in_place = FALSE ) } \arguments{ \item{x, y}{A pair of data frames or data frame extensions (e.g. a tibble). \code{y} must have the same columns of \code{x} or a subset.} \item{by}{An unnamed character vector giving the key columns. The key columns must exist in both \code{x} and \code{y}. Keys typically uniquely identify each row, but this is only enforced for the key values of \code{y} when \code{rows_update()}, \code{rows_patch()}, or \code{rows_upsert()} are used. By default, we use the first column in \code{y}, since the first column is a reasonable place to put an identifier variable.} \item{...}{Other parameters passed onto methods.} \item{conflict}{For \code{rows_insert()}, how should keys in \code{y} that conflict with keys in \code{x} be handled? A conflict arises if there is a key in \code{y} that already exists in \code{x}. One of: \itemize{ \item \code{"error"}, the default, will error if there are any keys in \code{y} that conflict with keys in \code{x}. \item \code{"ignore"} will ignore rows in \code{y} with keys that conflict with keys in \code{x}. }} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{in_place}{Should \code{x} be modified in place? This argument is only relevant for mutable backends (e.g. databases, data.tables). When \code{TRUE}, a modified version of \code{x} is returned invisibly; when \code{FALSE}, a new object representing the resulting changes is returned.} \item{unmatched}{For \code{rows_update()}, \code{rows_patch()}, and \code{rows_delete()}, how should keys in \code{y} that are unmatched by the keys in \code{x} be handled? One of: \itemize{ \item \code{"error"}, the default, will error if there are any keys in \code{y} that are unmatched by the keys in \code{x}. \item \code{"ignore"} will ignore rows in \code{y} with keys that are unmatched by the keys in \code{x}. }} } \value{ An object of the same type as \code{x}. The order of the rows and columns of \code{x} is preserved as much as possible. The output has the following properties: \itemize{ \item \code{rows_update()} and \code{rows_patch()} preserve the number of rows; \code{rows_insert()}, \code{rows_append()}, and \code{rows_upsert()} return all existing rows and potentially new rows; \code{rows_delete()} returns a subset of the rows. \item Columns are not added, removed, or relocated, though the data may be updated. \item Groups are taken from \code{x}. \item Data frame attributes are taken from \code{x}. } If \code{in_place = TRUE}, the result will be returned invisibly. } \description{ These functions provide a framework for modifying rows in a table using a second table of data. The two tables are matched \code{by} a set of key variables whose values typically uniquely identify each row. The functions are inspired by SQL's \code{INSERT}, \code{UPDATE}, and \code{DELETE}, and can optionally modify \code{in_place} for selected backends. \itemize{ \item \code{rows_insert()} adds new rows (like \code{INSERT}). By default, key values in \code{y} must not exist in \code{x}. \item \code{rows_append()} works like \code{rows_insert()} but ignores keys. \item \code{rows_update()} modifies existing rows (like \code{UPDATE}). Key values in \code{y} must be unique, and, by default, key values in \code{y} must exist in \code{x}. \item \code{rows_patch()} works like \code{rows_update()} but only overwrites \code{NA} values. \item \code{rows_upsert()} inserts or updates depending on whether or not the key value in \code{y} already exists in \code{x}. Key values in \code{y} must be unique. \item \code{rows_delete()} deletes rows (like \code{DELETE}). By default, key values in \code{y} must exist in \code{x}. } } \section{Methods}{ These function are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{rows_insert()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_insert")}. \item \code{rows_append()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_append")}. \item \code{rows_update()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_update")}. \item \code{rows_patch()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_patch")}. \item \code{rows_upsert()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_upsert")}. \item \code{rows_delete()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rows_delete")}. } } \examples{ data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2) data # Insert rows_insert(data, tibble(a = 4, b = "z")) # By default, if a key in `y` matches a key in `x`, then it can't be inserted # and will throw an error. Alternatively, you can ignore rows in `y` # containing keys that conflict with keys in `x` with `conflict = "ignore"`, # or you can use `rows_append()` to ignore keys entirely. try(rows_insert(data, tibble(a = 3, b = "z"))) rows_insert(data, tibble(a = 3, b = "z"), conflict = "ignore") rows_append(data, tibble(a = 3, b = "z")) # Update rows_update(data, tibble(a = 2:3, b = "z")) rows_update(data, tibble(b = "z", a = 2:3), by = "a") # Variants: patch and upsert rows_patch(data, tibble(a = 2:3, b = "z")) rows_upsert(data, tibble(a = 2:4, b = "z")) # Delete and truncate rows_delete(data, tibble(a = 2:3)) rows_delete(data, tibble(a = 2:3, b = "b")) # By default, for update, patch, and delete it is an error if a key in `y` # doesn't exist in `x`. You can ignore rows in `y` that have unmatched keys # with `unmatched = "ignore"`. y <- tibble(a = 3:4, b = "z") try(rows_update(data, y, by = "a")) rows_update(data, y, by = "a", unmatched = "ignore") rows_patch(data, y, by = "a", unmatched = "ignore") rows_delete(data, y, by = "a", unmatched = "ignore") } dplyr/man/group_nest.Rd0000644000176200001440000000443015106134104014627 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-nest.R \name{group_nest} \alias{group_nest} \title{Nest a tibble using a grouping specification} \usage{ group_nest(.tbl, ..., .key = "data", keep = FALSE) } \arguments{ \item{.tbl}{A tbl} \item{...}{Grouping specification, forwarded to \code{\link[=group_by]{group_by()}}} \item{.key}{the name of the list column} \item{keep}{Should the grouping columns be kept in the list column.} } \value{ A tbl with one row per unique combination of the grouping variables. The first columns are the grouping variables, followed by a list column of tibbles with matching rows of the remaining columns. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Nest a tibble using a grouping specification } \section{Lifecycle}{ \code{group_nest()} is not stable because \code{\link[tidyr:nest]{tidyr::nest(.by =)}} provides very similar behavior. It may be deprecated in the future. } \section{Grouped data frames}{ The primary use case for \code{\link[=group_nest]{group_nest()}} is with already grouped data frames, typically a result of \code{\link[=group_by]{group_by()}}. In this case \code{\link[=group_nest]{group_nest()}} only uses the first argument, the grouped tibble, and warns when \code{...} is used. } \section{Ungrouped data frames}{ When used on ungrouped data frames, \code{\link[=group_nest]{group_nest()}} forwards the \code{...} to \code{\link[=group_by]{group_by()}} before nesting, therefore the \code{...} are subject to the data mask. } \examples{ #----- use case 1: a grouped data frame iris |> group_by(Species) |> group_nest() # this can be useful if the grouped data has been altered before nesting iris |> group_by(Species) |> filter(Sepal.Length > mean(Sepal.Length)) |> group_nest() #----- use case 2: using group_nest() on a ungrouped data frame with # a grouping specification that uses the data mask starwars |> group_nest(species, homeworld) } \seealso{ Other grouping functions: \code{\link{group_by}()}, \code{\link{group_map}()}, \code{\link{group_split}()}, \code{\link{group_trim}()} } \concept{grouping functions} \keyword{internal} dplyr/man/src_tbls.Rd0000644000176200001440000000076614366556340014306 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/src.R \name{src_tbls} \alias{src_tbls} \title{List all tbls provided by a source.} \usage{ src_tbls(x, ...) } \arguments{ \item{x}{a data src.} \item{...}{other arguments passed on to the individual methods.} } \description{ This is a generic method which individual src's will provide methods for. Most methods will not be documented because it's usually pretty obvious what possible results will be. } \keyword{internal} dplyr/man/storms.Rd0000644000176200001440000000363015137161765014013 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data-storms.R \docType{data} \name{storms} \alias{storms} \title{Storm tracks data} \format{ A tibble with 20,778 observations and 13 variables: \describe{ \item{name}{Storm Name} \item{year,month,day}{Date of report} \item{hour}{Hour of report (in UTC)} \item{lat,long}{Location of storm center} \item{status}{Storm classification (Tropical Depression, Tropical Storm, or Hurricane)} \item{category}{Saffir-Simpson hurricane category calculated from wind speed. \itemize{ \item \code{NA}: Not a hurricane \item 1: 64+ knots \item 2: 83+ knots \item 3: 96+ knots \item 4: 113+ knots \item 5: 137+ knots } } \item{wind}{storm's maximum sustained wind speed (in knots)} \item{pressure}{Air pressure at the storm's center (in millibars)} \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above). Only available starting in 2004.} \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above). Only available starting in 2004.} } } \usage{ storms } \description{ This dataset is the NOAA Atlantic hurricane database best track data, \url{https://www.nhc.noaa.gov/data/#hurdat}. The data includes the positions and attributes of storms from 1975-2024. Storms from 1979 onward are measured every six hours during the lifetime of the storm. Storms in earlier years have some missing data. } \examples{ storms # Show a few recent storm paths if (requireNamespace("ggplot2", quietly = TRUE)) { library(ggplot2) storms |> filter(year >= 2000) |> ggplot(aes(long, lat, color = paste(year, name))) + geom_path(show.legend = FALSE) + facet_wrap(~year) } storms } \seealso{ The script to create the storms data set: \url{https://github.com/tidyverse/dplyr/blob/main/data-raw/storms.R} } \keyword{datasets} dplyr/man/select.Rd0000644000176200001440000002140615106134104013723 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/select.R \name{select} \alias{select} \title{Keep or drop columns using their names and types} \usage{ select(.data, ...) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be used to select a range of variables.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Rows are not affected. \item Output columns are a subset of input columns, potentially with a different order. Columns will be renamed if \code{new_name = old_name} form is used. \item Data frame attributes are preserved. \item Groups are maintained; you can't select off grouping variables. } } \description{ Select (and optionally rename) variables in a data frame, using a concise mini-language that makes it easy to refer to variables based on their name (e.g. \code{a:f} selects all columns from \code{a} on the left to \code{f} on the right) or type (e.g. \code{where(is.numeric)} selects all numeric columns). \subsection{Overview of selection features}{ Tidyverse selections implement a dialect of R where operators make it easy to select variables: \itemize{ \item \code{:} for selecting a range of consecutive variables. \item \code{!} for taking the complement of a set of variables. \item \code{&} and \code{|} for selecting the intersection or the union of two sets of variables. \item \code{c()} for combining selections. } In addition, you can use \strong{selection helpers}. Some helpers select specific columns: \itemize{ \item \code{\link[tidyselect:everything]{everything()}}: Matches all variables. \item \code{\link[tidyselect:everything]{last_col()}}: Select last variable, possibly with an offset. \item \code{\link[=group_cols]{group_cols()}}: Select all grouping columns. } Other helpers select variables by matching patterns in their names: \itemize{ \item \code{\link[tidyselect:starts_with]{starts_with()}}: Starts with a prefix. \item \code{\link[tidyselect:starts_with]{ends_with()}}: Ends with a suffix. \item \code{\link[tidyselect:starts_with]{contains()}}: Contains a literal string. \item \code{\link[tidyselect:starts_with]{matches()}}: Matches a regular expression. \item \code{\link[tidyselect:starts_with]{num_range()}}: Matches a numerical range like x01, x02, x03. } Or from variables stored in a character vector: \itemize{ \item \code{\link[tidyselect:all_of]{all_of()}}: Matches variable names in a character vector. All names must be present, otherwise an out-of-bounds error is thrown. \item \code{\link[tidyselect:all_of]{any_of()}}: Same as \code{all_of()}, except that no error is thrown for names that don't exist. } Or using a predicate function: \itemize{ \item \code{\link[tidyselect:where]{where()}}: Applies a function to all variables and selects those for which the function returns \code{TRUE}. } } } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("select")}. } \section{Examples}{ Here we show the usage for the basic selection operators. See the specific help pages to learn about helpers like \code{\link[=starts_with]{starts_with()}}. The selection language can be used in functions like \code{dplyr::select()}. Let's first attach the tidyverse: \if{html}{\out{
}}\preformatted{library(tidyverse) # For better printing iris <- as_tibble(iris) }\if{html}{\out{
}} Select variables by name: \if{html}{\out{
}}\preformatted{starwars |> select(height) #> # A tibble: 87 x 1 #> height #> #> 1 172 #> 2 167 #> 3 96 #> 4 202 #> # i 83 more rows iris |> select(Sepal.Length) #> # A tibble: 150 x 1 #> Sepal.Length #> #> 1 5.1 #> 2 4.9 #> 3 4.7 #> 4 4.6 #> # i 146 more rows }\if{html}{\out{
}} Select multiple variables by separating them with commas. Note how the order of columns is determined by the order of inputs: \if{html}{\out{
}}\preformatted{starwars |> select(homeworld, height, mass) #> # A tibble: 87 x 3 #> homeworld height mass #> #> 1 Tatooine 172 77 #> 2 Tatooine 167 75 #> 3 Naboo 96 32 #> 4 Tatooine 202 136 #> # i 83 more rows iris |> select(Sepal.Length, Petal.Length) #> # A tibble: 150 x 2 #> Sepal.Length Petal.Length #> #> 1 5.1 1.4 #> 2 4.9 1.4 #> 3 4.7 1.3 #> 4 4.6 1.5 #> # i 146 more rows }\if{html}{\out{
}} If you use a named vector to select columns, the output will have its columns renamed: \if{html}{\out{
}}\preformatted{selection <- c( new_homeworld = "homeworld", new_height = "height", new_mass = "mass" ) starwars |> select(all_of(selection)) #> # A tibble: 87 x 3 #> new_homeworld new_height new_mass #> #> 1 Tatooine 172 77 #> 2 Tatooine 167 75 #> 3 Naboo 96 32 #> 4 Tatooine 202 136 #> # i 83 more rows }\if{html}{\out{
}} \subsection{Operators:}{ The \code{:} operator selects a range of consecutive variables: \if{html}{\out{
}}\preformatted{starwars |> select(name:mass) #> # A tibble: 87 x 3 #> name height mass #> #> 1 Luke Skywalker 172 77 #> 2 C-3PO 167 75 #> 3 R2-D2 96 32 #> 4 Darth Vader 202 136 #> # i 83 more rows }\if{html}{\out{
}} The \code{!} operator negates a selection: \if{html}{\out{
}}\preformatted{starwars |> select(!(name:mass)) #> # A tibble: 87 x 11 #> hair_color skin_color eye_color birth_year sex gender homeworld species #> #> 1 blond fair blue 19 male masculine Tatooine Human #> 2 gold yellow 112 none masculine Tatooine Droid #> 3 white, blue red 33 none masculine Naboo Droid #> 4 none white yellow 41.9 male masculine Tatooine Human #> # i 83 more rows #> # i 3 more variables: films , vehicles , starships iris |> select(!c(Sepal.Length, Petal.Length)) #> # A tibble: 150 x 3 #> Sepal.Width Petal.Width Species #> #> 1 3.5 0.2 setosa #> 2 3 0.2 setosa #> 3 3.2 0.2 setosa #> 4 3.1 0.2 setosa #> # i 146 more rows iris |> select(!ends_with("Width")) #> # A tibble: 150 x 3 #> Sepal.Length Petal.Length Species #> #> 1 5.1 1.4 setosa #> 2 4.9 1.4 setosa #> 3 4.7 1.3 setosa #> 4 4.6 1.5 setosa #> # i 146 more rows }\if{html}{\out{
}} \code{&} and \code{|} take the intersection or the union of two selections: \if{html}{\out{
}}\preformatted{iris |> select(starts_with("Petal") & ends_with("Width")) #> # A tibble: 150 x 1 #> Petal.Width #> #> 1 0.2 #> 2 0.2 #> 3 0.2 #> 4 0.2 #> # i 146 more rows iris |> select(starts_with("Petal") | ends_with("Width")) #> # A tibble: 150 x 3 #> Petal.Length Petal.Width Sepal.Width #> #> 1 1.4 0.2 3.5 #> 2 1.4 0.2 3 #> 3 1.3 0.2 3.2 #> 4 1.5 0.2 3.1 #> # i 146 more rows }\if{html}{\out{
}} To take the difference between two selections, combine the \code{&} and \code{!} operators: \if{html}{\out{
}}\preformatted{iris |> select(starts_with("Petal") & !ends_with("Width")) #> # A tibble: 150 x 1 #> Petal.Length #> #> 1 1.4 #> 2 1.4 #> 3 1.3 #> 4 1.5 #> # i 146 more rows }\if{html}{\out{
}} } } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/filter.Rd0000644000176200001440000002310415137161765013747 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/filter.R \name{filter} \alias{filter} \alias{filter_out} \title{Keep or drop rows that match a condition} \usage{ filter(.data, ..., .by = NULL, .preserve = FALSE) filter_out(.data, ..., .by = NULL, .preserve = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Expressions that return a logical vector, defined in terms of the variables in \code{.data}. If multiple expressions are included, they are combined with the \code{&} operator. To combine expressions using \code{|} instead, wrap them in \code{\link[=when_any]{when_any()}}. Only rows for which all expressions evaluate to \code{TRUE} are kept (for \code{filter()}) or dropped (for \code{filter_out()}).} \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Rows are a subset of the input, but appear in the same order. \item Columns are not modified. \item The number of groups may be reduced (if \code{.preserve} is not \code{TRUE}). \item Data frame attributes are preserved. } } \description{ These functions are used to subset a data frame, applying the expressions in \code{...} to determine which rows should be kept (for \code{filter()}) or dropped ( for \code{filter_out()}). Multiple conditions can be supplied separated by a comma. These will be combined with the \code{&} operator. To combine comma separated conditions using \code{|} instead, wrap them in \code{\link[=when_any]{when_any()}}. Both \code{filter()} and \code{filter_out()} treat \code{NA} like \code{FALSE}. This subtle behavior can impact how you write your conditions when missing values are involved. See the section on \verb{Missing values} for important details and examples. } \section{Missing values}{ Both \code{filter()} and \code{filter_out()} treat \code{NA} like \code{FALSE}. This results in the following behavior: \itemize{ \item \code{filter()} \emph{drops} both \code{NA} and \code{FALSE}. \item \code{filter_out()} \emph{keeps} both \code{NA} and \code{FALSE}. } This means that \verb{filter(data, ) + filter_out(data, )} captures every row within \code{data} exactly once. The \code{NA} handling of these functions has been designed to match your \emph{intent}. When your intent is to keep rows, use \code{filter()}. When your intent is to drop rows, use \code{filter_out()}. For example, if your goal with this \code{cars} data is to "drop rows where the \code{class} is suv", then you might write this in one of two ways: \if{html}{\out{
}}\preformatted{cars <- tibble(class = c("suv", NA, "coupe")) cars #> # A tibble: 3 x 1 #> class #> #> 1 suv #> 2 #> 3 coupe }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{cars |> filter(class != "suv") #> # A tibble: 1 x 1 #> class #> #> 1 coupe }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{cars |> filter_out(class == "suv") #> # A tibble: 2 x 1 #> class #> #> 1 #> 2 coupe }\if{html}{\out{
}} Note how \code{filter()} drops the \code{NA} rows even though our goal was only to drop \code{"suv"} rows, but \code{filter_out()} matches our intuition. To generate the correct result with \code{filter()}, you'd need to use: \if{html}{\out{
}}\preformatted{cars |> filter(class != "suv" | is.na(class)) #> # A tibble: 2 x 1 #> class #> #> 1 #> 2 coupe }\if{html}{\out{
}} This quickly gets unwieldy when multiple conditions are involved. In general, if you find yourself: \itemize{ \item Using "negative" operators like \code{!=} or \code{!} \item Adding in \code{NA} handling like \verb{| is.na(col)} or \verb{& !is.na(col)} } then you should consider if swapping to the other filtering variant would make your conditions simpler. \subsection{Comparison to base subsetting}{ Base subsetting with \code{[} doesn't treat \code{NA} like \code{TRUE} or \code{FALSE}. Instead, it generates a fully missing row, which is different from how both \code{filter()} and \code{filter_out()} work. \if{html}{\out{
}}\preformatted{cars <- tibble(class = c("suv", NA, "coupe"), mpg = c(10, 12, 14)) cars #> # A tibble: 3 x 2 #> class mpg #> #> 1 suv 10 #> 2 12 #> 3 coupe 14 }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{cars[cars$class == "suv",] #> # A tibble: 2 x 2 #> class mpg #> #> 1 suv 10 #> 2 NA cars |> filter(class == "suv") #> # A tibble: 1 x 2 #> class mpg #> #> 1 suv 10 }\if{html}{\out{
}} } } \section{Useful filter functions}{ There are many functions and operators that are useful when constructing the expressions used to filter the data: \itemize{ \item \code{\link{==}}, \code{\link{>}}, \code{\link{>=}} etc \item \code{\link{&}}, \code{\link{|}}, \code{\link{!}}, \code{\link[=xor]{xor()}} \item \code{\link[=is.na]{is.na()}} \item \code{\link[=between]{between()}}, \code{\link[=near]{near()}} \item \code{\link[=when_any]{when_any()}}, \code{\link[=when_all]{when_all()}} } } \section{Grouped tibbles}{ Because filtering expressions are computed within groups, they may yield different results on grouped tibbles. This will be the case as soon as an aggregating, lagging, or ranking function is involved. Compare this ungrouped filtering: \if{html}{\out{
}}\preformatted{starwars |> filter(mass > mean(mass, na.rm = TRUE)) }\if{html}{\out{
}} With the grouped equivalent: \if{html}{\out{
}}\preformatted{starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) }\if{html}{\out{
}} In the ungrouped version, \code{filter()} compares the value of \code{mass} in each row to the global average (taken over the whole data set), keeping only the rows with \code{mass} greater than this global average. In contrast, the grouped version calculates the average mass separately for each \code{gender} group, and keeps rows with \code{mass} greater than the relevant within-gender average. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. } \examples{ # Filtering for one criterion filter(starwars, species == "Human") # Filtering for multiple criteria within a single logical expression filter(starwars, hair_color == "none" & eye_color == "black") filter(starwars, hair_color == "none" | eye_color == "black") # Multiple comma separated expressions are combined using `&` starwars |> filter(hair_color == "none", eye_color == "black") # To combine comma separated expressions using `|` instead, use `when_any()` starwars |> filter(when_any(hair_color == "none", eye_color == "black")) # Filtering out to drop rows filter_out(starwars, hair_color == "none") # When filtering out, it can be useful to first interactively filter for the # rows you want to drop, just to double check that you've written the # conditions correctly. Then, just change `filter()` to `filter_out()`. filter(starwars, mass > 1000, eye_color == "orange") filter_out(starwars, mass > 1000, eye_color == "orange") # The filtering operation may yield different results on grouped # tibbles because the expressions are computed within groups. # # The following keeps rows where `mass` is greater than the # global average: starwars |> filter(mass > mean(mass, na.rm = TRUE)) # Whereas this keeps rows with `mass` greater than the per `gender` # average: starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) # If you find yourself trying to use a `filter()` to drop rows, then # you should consider if switching to `filter_out()` can simplify your # conditions. For example, to drop blond individuals, you might try: starwars |> filter(hair_color != "blond") # But this also drops rows with an `NA` hair color! To retain those: starwars |> filter(hair_color != "blond" | is.na(hair_color)) # But explicit `NA` handling like this can quickly get unwieldy, especially # with multiple conditions. Since your intent was to specify rows to drop # rather than rows to keep, use `filter_out()`. This also removes the need # for any explicit `NA` handling. starwars |> filter_out(hair_color == "blond") # To refer to column names that are stored as strings, use the `.data` # pronoun: vars <- c("mass", "height") cond <- c(80, 150) starwars |> filter( .data[[vars[[1]]]] > cond[[1]], .data[[vars[[2]]]] > cond[[2]] ) # Learn more in ?rlang::args_data_masking } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/dplyr_extending.Rd0000644000176200001440000001300015137161765015653 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/generics.R \name{dplyr_extending} \alias{dplyr_extending} \alias{dplyr_row_slice} \alias{dplyr_col_modify} \alias{dplyr_reconstruct} \title{Extending dplyr with new data frame subclasses} \usage{ dplyr_row_slice(data, i, ...) dplyr_col_modify(data, cols) dplyr_reconstruct(data, template) } \arguments{ \item{data}{A tibble. We use tibbles because they avoid some inconsistent subset-assignment use cases.} \item{i}{A numeric or logical vector that indexes the rows of \code{data}.} \item{cols}{A named list used to modify columns. A \code{NULL} value should remove an existing column.} \item{template}{Template data frame to use for restoring attributes.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} These three functions, along with \verb{names<-} and 1d numeric \code{[} (i.e. \code{x[loc]}) methods, provide a minimal interface for extending dplyr to work with new data frame subclasses. This means that for simple cases you should only need to provide a couple of methods, rather than a method for every dplyr verb. These functions are a stop-gap measure until we figure out how to solve the problem more generally, but it's likely that any code you write to implement them will find a home in what comes next. } \section{Basic advice}{ This section gives you basic advice if you want to extend dplyr to work with your custom data frame subclass, and you want the dplyr methods to behave in basically the same way. \itemize{ \item If you have data frame attributes that don't depend on the rows or columns (and should unconditionally be preserved), you don't need to do anything. The one exception to this is if your subclass extends a data.frame directly rather than extending a tibble. The \verb{[.data.frame} method does not preserve attributes, so you'll need to write a \code{[} method for your subclass that preserves attributes important for your class. \item If you have \strong{scalar} attributes that depend on \strong{rows}, implement a \code{dplyr_reconstruct()} method. Your method should recompute the attribute depending on rows now present. \item If you have \strong{scalar} attributes that depend on \strong{columns}, implement a \code{dplyr_reconstruct()} method and a 1d \code{[} method. For example, if your class requires that certain columns be present, your method should return a data.frame or tibble when those columns are removed. \item If your attributes are \strong{vectorised} over \strong{rows}, implement a \code{dplyr_row_slice()} method. This gives you access to \code{i} so you can modify the row attribute accordingly. You'll also need to think carefully about how to recompute the attribute in \code{dplyr_reconstruct()}, and you will need to carefully verify the behaviour of each verb, and provide additional methods as needed. \item If your attributes that are \strong{vectorised} over \strong{columns}, implement \code{dplyr_col_modify()}, 1d \code{[}, and \verb{names<-} methods. All of these methods know which columns are being modified, so you can update the column attribute according. You'll also need to think carefully about how to recompute the attribute in \code{dplyr_reconstruct()}, and you will need to carefully verify the behaviour of each verb, and provide additional methods as needed. } } \section{Current usage}{ \itemize{ \item \code{arrange()}, \code{filter()} (and \code{filter_out()}), \code{slice()} (and the rest of the \verb{slice_*()} family), \code{semi_join()}, and \code{anti_join()} work by generating a vector of row indices, and then subsetting with \code{dplyr_row_slice()}. \item \code{mutate()} generates a list of new column value (using \code{NULL} to indicate when columns should be deleted), then passes that to \code{dplyr_col_modify()}. It also uses 1d \code{[} to implement \code{.keep}, and will call \code{relocate()} if either \code{.before} or \code{.after} are supplied. \item \code{summarise()} and \code{reframe()} work similarly to \code{mutate()} but the data modified by \code{dplyr_col_modify()} comes from \code{group_data()} or is built from \code{.by}. Note that this means that the data frames returned by \code{summarise()} and \code{reframe()} are fundamentally new data frames, and will not retain any custom subclasses or attributes. \item \code{select()} uses 1d \code{[} to select columns, then \verb{names<-} to rename them. \code{rename()} just uses \verb{names<-}. \code{relocate()} just uses 1d \code{[}. \item \code{inner_join()}, \code{left_join()}, \code{right_join()}, and \code{full_join()} coerce \code{x} to a tibble, modify the rows, then use \code{dplyr_reconstruct()} to convert back to the same type as \code{x}. \item \code{nest_join()} converts both \code{x} and \code{y} to tibbles, modifies the rows, and uses \code{dplyr_col_modify()} to handle modified key variables and the list-column that \code{y} becomes. It also uses \code{dplyr_reconstruct()} to convert the outer result back to the type of \code{x}, and to convert the nested tibbles back to the type of \code{y}. \item \code{distinct()} does a \code{mutate()} if any expressions are present, then uses 1d \code{[} to select variables to keep, then \code{dplyr_row_slice()} to select distinct rows. } Note that \code{group_by()} and \code{ungroup()} don't use any of these generics and you'll need to provide methods for them directly, or rely on \code{.by} for per-operation grouping. } \keyword{internal} dplyr/man/tidyeval-compat.Rd0000644000176200001440000000225414406415372015561 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-tidy-eval.R \name{tidyeval-compat} \alias{tidyeval-compat} \alias{.data} \alias{expr} \alias{enquo} \alias{enquos} \alias{sym} \alias{syms} \alias{as_label} \alias{quo} \alias{quos} \alias{quo_name} \alias{ensym} \alias{ensyms} \alias{enexpr} \alias{enexprs} \title{Other tidy eval tools} \description{ These tidy eval functions are no longer for normal usage, but are still exported from dplyr for backward compatibility. See \code{\link[rlang:args_data_masking]{?rlang::args_data_masking}} and \code{vignette("programming")} for the latest recommendations. \itemize{ \item \link[rlang:expr]{expr()} \item \link[rlang:enquo]{enquo()} \item \link[rlang:enquo]{enquos()} \item \link[rlang:sym]{sym()} \item \link[rlang:sym]{syms()} \item \link[rlang:as_label]{as_label()} \item \link[rlang:defusing-advanced]{quo()} \item \link[rlang:defusing-advanced]{quos()} \item \link[rlang:quo_label]{quo_name()} \item \link[rlang:defusing-advanced]{ensym()} \item \link[rlang:defusing-advanced]{ensyms()} \item \link[rlang:defusing-advanced]{enexpr()} \item \link[rlang:defusing-advanced]{enexprs()} } } \keyword{internal} dplyr/man/tbl_vars.Rd0000644000176200001440000000116614366556340014302 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tbl.R \name{tbl_vars} \alias{tbl_vars} \alias{tbl_nongroup_vars} \title{List variables provided by a tbl.} \usage{ tbl_vars(x) tbl_nongroup_vars(x) } \arguments{ \item{x}{A tbl object} } \description{ \code{tbl_vars()} returns all variables while \code{tbl_nongroup_vars()} returns only non-grouping variables. The \code{groups} attribute of the object returned by \code{tbl_vars()} is a character vector of the grouping columns. } \seealso{ \code{\link[=group_vars]{group_vars()}} for a function that returns grouping variables. } \keyword{internal} dplyr/man/compute.Rd0000644000176200001440000000403615106134104014120 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compute-collect.R \name{compute} \alias{compute} \alias{collect} \alias{collapse} \title{Force computation of a database query} \usage{ compute(x, ...) collect(x, ...) collapse(x, ...) } \arguments{ \item{x}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{Arguments passed on to methods} } \description{ \code{compute()} stores results in a remote temporary table. \code{collect()} retrieves data into a local tibble. \code{collapse()} is slightly different: it doesn't force computation, but instead forces generation of the SQL query. This is sometimes needed to work around bugs in dplyr's SQL generation. All functions preserve grouping and ordering. } \section{Methods}{ These functions are \strong{generics}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{compute()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("compute")} \item \code{collect()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("collect")} \item \code{collapse()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("collapse")} } } \examples{ \dontshow{if (requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE)) withAutoprint(\{ # examplesIf} mtcars2 <- dbplyr::src_memdb() |> copy_to(mtcars, name = "mtcars2-cc", overwrite = TRUE) remote <- mtcars2 |> filter(cyl == 8) |> select(mpg:drat) # Compute query and save in remote table compute(remote) # Compute query bring back to this session collect(remote) # Creates a fresh query based on the generated SQL collapse(remote) \dontshow{\}) # examplesIf} } \seealso{ \code{\link[=copy_to]{copy_to()}}, the opposite of \code{collect()}: it takes a local data frame and uploads it to the remote source. } dplyr/man/group_data.Rd0000644000176200001440000000343715106134104014575 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-data.R \name{group_data} \alias{group_data} \alias{group_keys} \alias{group_rows} \alias{group_indices} \alias{group_vars} \alias{groups} \alias{group_size} \alias{n_groups} \title{Grouping metadata} \usage{ group_data(.data) group_keys(.tbl, ...) group_rows(.data) group_indices(.data, ...) group_vars(x) groups(x) group_size(x) n_groups(x) } \arguments{ \item{.data, .tbl, x}{A data frame or extension (like a tibble or grouped tibble).} \item{...}{Unused.} } \description{ This collection of functions accesses data about grouped data frames in various ways: \itemize{ \item \code{group_data()} returns a data frame that defines the grouping structure. The columns give the values of the grouping variables. The last column, always called \code{.rows}, is a list of integer vectors that gives the location of the rows in each group. \item \code{group_keys()} returns a data frame describing the groups. \item \code{group_rows()} returns a list of integer vectors giving the rows that each group contains. \item \code{group_indices()} returns an integer vector the same length as \code{.data} that gives the group that each row belongs to. \item \code{group_vars()} gives names of grouping variables as character vector. \item \code{groups()} gives the names of the grouping variables as a list of symbols. \item \code{group_size()} gives the size of each group. \item \code{n_groups()} gives the total number of groups. } See \link{context} for equivalent functions that return values for the \emph{current} group. } \examples{ df <- tibble(x = c(1,1,2,2)) group_vars(df) group_rows(df) group_data(df) group_indices(df) gf <- group_by(df, x) group_vars(gf) group_rows(gf) group_data(gf) group_indices(gf) } \keyword{internal} dplyr/man/nest_join.Rd0000644000176200001440000001353414366556340014460 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join.R \name{nest_join} \alias{nest_join} \alias{nest_join.data.frame} \title{Nest join} \usage{ nest_join(x, y, by = NULL, copy = FALSE, keep = NULL, name = NULL, ...) \method{nest_join}{data.frame}( x, y, by = NULL, copy = FALSE, keep = NULL, name = NULL, ..., na_matches = c("na", "never"), unmatched = "drop" ) } \arguments{ \item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{by}{A join specification created with \code{\link[=join_by]{join_by()}}, or a character vector of variables to join by. If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all variables in common across \code{x} and \code{y}. A message lists the variables so that you can check they're correct; suppress the message by supplying \code{by} explicitly. To join on different variables between \code{x} and \code{y}, use a \code{\link[=join_by]{join_by()}} specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. To join by multiple variables, use a \code{\link[=join_by]{join_by()}} specification with multiple expressions. For example, \code{join_by(a == b, c == d)} will match \code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between \code{x} and \code{y}, you can shorten this by listing only the variable names, like \code{join_by(a, c)}. \code{\link[=join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap joins. See the documentation at \link[=join_by]{?join_by} for details on these types of joins. For simple equality joins, you can alternatively specify a character vector of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. To perform a cross-join, generating all combinations of \code{x} and \code{y}, see \code{\link[=cross_join]{cross_join()}}.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{keep}{Should the new list-column contain join keys? The default will preserve the join keys for inequality joins.} \item{name}{The name of the list-column created by the join. If \code{NULL}, the default, the name of \code{y} is used.} \item{...}{Other parameters passed onto methods.} \item{na_matches}{Should two \code{NA} or two \code{NaN} values match? \itemize{ \item \code{"na"}, the default, treats two \code{NA} or two \code{NaN} values as equal, like \code{\%in\%}, \code{\link[=match]{match()}}, and \code{\link[=merge]{merge()}}. \item \code{"never"} treats two \code{NA} or two \code{NaN} values as different, and will never match them together or to any other values. This is similar to joins for database sources and to \code{base::merge(incomparables = NA)}. }} \item{unmatched}{How should unmatched keys that would result in dropped rows be handled? \itemize{ \item \code{"drop"} drops unmatched keys from the result. \item \code{"error"} throws an error if unmatched keys are detected. } \code{unmatched} is intended to protect you from accidentally dropping rows during a join. It only checks for unmatched keys in the input that could potentially drop rows. \itemize{ \item For left joins, it checks \code{y}. \item For right joins, it checks \code{x}. \item For inner joins, it checks both \code{x} and \code{y}. In this case, \code{unmatched} is also allowed to be a character vector of length 2 to specify the behavior for \code{x} and \code{y} independently. }} } \value{ The output: \itemize{ \item Is same type as \code{x} (including having the same groups). \item Has exactly the same number of rows as \code{x}. \item Contains all the columns of \code{x} in the same order with the same values. They are only modified (slightly) if \code{keep = FALSE}, when columns listed in \code{by} will be coerced to their common type across \code{x} and \code{y}. \item Gains one new column called \code{{name}} on the far right, a list column containing data frames the same type as \code{y}. } } \description{ A nest join leaves \code{x} almost unchanged, except that it adds a new list-column, where each element contains the rows from \code{y} that match the corresponding row in \code{x}. } \section{Relationship to other joins}{ You can recreate many other joins from the result of a nest join: \itemize{ \item \code{\link[=inner_join]{inner_join()}} is a \code{nest_join()} plus \code{\link[tidyr:unnest]{tidyr::unnest()}}. \item \code{\link[=left_join]{left_join()}} is a \code{nest_join()} plus \code{tidyr::unnest(keep_empty = TRUE)}. \item \code{\link[=semi_join]{semi_join()}} is a \code{nest_join()} plus a \code{filter()} where you check that every element of data has at least one row. \item \code{\link[=anti_join]{anti_join()}} is a \code{nest_join()} plus a \code{filter()} where you check that every element has zero rows. } } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("nest_join")}. } \examples{ df1 <- tibble(x = 1:3) df2 <- tibble(x = c(2, 3, 3), y = c("a", "b", "c")) out <- nest_join(df1, df2) out out$df2 } \seealso{ Other joins: \code{\link{cross_join}()}, \code{\link{filter-joins}}, \code{\link{mutate-joins}} } \concept{joins} dplyr/man/reexports.Rd0000644000176200001440000000316314366556340014520 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexport-magrittr.R, R/reexport-pillar.R, % R/reexport-tibble.R, R/select-helpers.R \docType{import} \name{reexports} \alias{reexports} \alias{\%>\%} \alias{type_sum} \alias{data_frame} \alias{as_data_frame} \alias{lst} \alias{add_row} \alias{tribble} \alias{tibble} \alias{as_tibble} \alias{view} \alias{contains} \alias{select_helpers} \alias{ends_with} \alias{everything} \alias{matches} \alias{num_range} \alias{one_of} \alias{starts_with} \alias{last_col} \alias{any_of} \alias{all_of} \alias{where} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} \item{pillar}{\code{\link[pillar]{type_sum}}} \item{tibble}{\code{\link[tibble]{add_row}}, \code{\link[tibble:deprecated]{as_data_frame}}, \code{\link[tibble]{as_tibble}}, \code{\link[tibble:deprecated]{data_frame}}, \code{\link[tibble]{lst}}, \code{\link[tibble]{tibble}}, \code{\link[tibble]{tribble}}, \code{\link[tibble]{view}}} \item{tidyselect}{\code{\link[tidyselect]{all_of}}, \code{\link[tidyselect:all_of]{any_of}}, \code{\link[tidyselect:starts_with]{contains}}, \code{\link[tidyselect:starts_with]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect:everything]{last_col}}, \code{\link[tidyselect:starts_with]{matches}}, \code{\link[tidyselect:starts_with]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}, \code{\link[tidyselect]{where}}} }} dplyr/man/explain.Rd0000644000176200001440000000301215106134104014075 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/explain.R \name{explain} \alias{explain} \alias{show_query} \title{Explain details of a tbl} \usage{ explain(x, ...) show_query(x, ...) } \arguments{ \item{x}{An object to explain} \item{...}{Other parameters possibly used by generic} } \value{ The first argument, invisibly. } \description{ This is a generic function which gives more details about an object than \code{\link[=print]{print()}}, and is more focused on human readable output than \code{\link[=str]{str()}}. } \section{Databases}{ Explaining a \code{tbl_sql} will run the SQL \code{EXPLAIN} command which will describe the query plan. This requires a little bit of knowledge about how \code{EXPLAIN} works for your database, but is very useful for diagnosing performance problems. } \examples{ \dontshow{if (requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE)) withAutoprint(\{ # examplesIf} \donttest{ lahman_s <- dbplyr::lahman_sqlite() batting <- tbl(lahman_s, "Batting") batting |> show_query() batting |> explain() # The batting database has indices on all ID variables: # SQLite automatically picks the most restrictive index batting |> filter(lgID == "NL" & yearID == 2000L) |> explain() # OR's will use multiple indexes batting |> filter(lgID == "NL" | yearID == 2000) |> explain() # Joins will use indexes in both tables teams <- tbl(lahman_s, "Teams") batting |> left_join(teams, c("yearID", "teamID")) |> explain() } \dontshow{\}) # examplesIf} } dplyr/man/pull.Rd0000644000176200001440000000400315106134104013412 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pull.R \name{pull} \alias{pull} \title{Extract a single column} \usage{ pull(.data, var = -1, name = NULL, ...) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{var}{A variable specified as: \itemize{ \item a literal variable name \item a positive integer, giving the position counting from the left \item a negative integer, giving the position counting from the right. } The default returns the last column (on the assumption that's the column you've created most recently). This argument is taken by expression and supports \link[rlang:topic-inject]{quasiquotation} (you can unquote column names and column locations).} \item{name}{An optional parameter that specifies the column to be used as names for a named vector. Specified in a similar manner as \code{var}.} \item{...}{For use by methods.} } \value{ A vector the same size as \code{.data}. } \description{ \code{pull()} is similar to \code{$}. It's mostly useful because it looks a little nicer in pipes, it also works with remote data frames, and it can optionally name the output. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("pull")}. } \examples{ mtcars |> pull(-1) mtcars |> pull(1) mtcars |> pull(cyl) \dontshow{if (requireNamespace("dbplyr", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE)) withAutoprint(\{ # examplesIf} # Also works for remote sources df <- dbplyr::memdb_frame(x = 1:10, y = 10:1, .name = "pull-ex") df |> mutate(z = x * y) |> pull() \dontshow{\}) # examplesIf} # Pull a named vector starwars |> pull(height, name) } dplyr/man/if_else.Rd0000644000176200001440000000510215106134104014045 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/if-else.R \name{if_else} \alias{if_else} \title{Vectorised if-else} \usage{ if_else( condition, true, false, missing = NULL, ..., ptype = NULL, size = deprecated() ) } \arguments{ \item{condition}{A logical vector} \item{true, false}{Vectors to use for \code{TRUE} and \code{FALSE} values of \code{condition}. Both \code{true} and \code{false} will be \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{condition}. \code{true}, \code{false}, and \code{missing} (if used) will be cast to their common type.} \item{missing}{If not \code{NULL}, will be used as the value for \code{NA} values of \code{condition}. Follows the same size and type rules as \code{true} and \code{false}.} \item{...}{These dots are for future extensions and must be empty.} \item{ptype}{An optional prototype declaring the desired output type. If supplied, this overrides the common type of \code{true}, \code{false}, and \code{missing}.} \item{size}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Output size is always taken from \code{condition}.} } \value{ A vector with the same size as \code{condition} and the same type as the common type of \code{true}, \code{false}, and \code{missing}. Where \code{condition} is \code{TRUE}, the matching values from \code{true}, where it is \code{FALSE}, the matching values from \code{false}, and where it is \code{NA}, the matching values from \code{missing}, if provided, otherwise a missing value will be used. } \description{ \code{if_else()} is a vectorized \link[=if]{if-else}. Compared to the base R equivalent, \code{\link[=ifelse]{ifelse()}}, this function allows you to handle missing values in the \code{condition} with \code{missing} and always takes \code{true}, \code{false}, and \code{missing} into account when determining what the output type should be. } \examples{ x <- c(-5:5, NA) if_else(x < 0, NA, x) # Explicitly handle `NA` values in the `condition` with `missing` if_else(x < 0, "negative", "positive", missing = "missing") # Unlike `ifelse()`, `if_else()` preserves types x <- factor(sample(letters[1:5], 10, replace = TRUE)) ifelse(x \%in\% c("a", "b", "c"), x, NA) if_else(x \%in\% c("a", "b", "c"), x, NA) # `if_else()` is often useful for creating new columns inside of `mutate()` starwars |> mutate(category = if_else(height < 100, "short", "tall"), .keep = "used") } \seealso{ \code{\link[vctrs:vec_if_else]{vctrs::vec_if_else()}} } dplyr/man/bind_cols.Rd0000644000176200001440000000243315016155021014400 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bind-cols.R \name{bind_cols} \alias{bind_cols} \title{Bind multiple data frames by column} \usage{ bind_cols( ..., .name_repair = c("unique", "universal", "check_unique", "minimal") ) } \arguments{ \item{...}{Data frames to combine. Each argument can either be a data frame, a list that could be a data frame, or a list of data frames. Inputs are \link[vctrs:theory-faq-recycling]{recycled} to the same length, then matched by position.} \item{.name_repair}{One of \code{"unique"}, \code{"universal"}, or \code{"check_unique"}. See \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}} for the meaning of these options.} } \value{ A data frame the same type as the first element of \code{...}. } \description{ Bind any number of data frames by column, making a wider result. This is similar to \code{do.call(cbind, dfs)}. Where possible prefer using a \link[=left_join]{join} to combine multiple data frames. \code{bind_cols()} binds the rows in order in which they appear so it is easy to create meaningless results without realising it. } \examples{ df1 <- tibble(x = 1:3) df2 <- tibble(y = 3:1) bind_cols(df1, df2) # Row sizes must be compatible when column-binding try(bind_cols(tibble(x = 1:3), tibble(y = 1:2))) } dplyr/man/setops.Rd0000644000176200001440000000416114366556340014001 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sets.R \name{setops} \alias{setops} \alias{intersect} \alias{union} \alias{union_all} \alias{setdiff} \alias{setequal} \alias{symdiff} \title{Set operations} \usage{ intersect(x, y, ...) union(x, y, ...) union_all(x, y, ...) setdiff(x, y, ...) setequal(x, y, ...) symdiff(x, y, ...) } \arguments{ \item{x, y}{Pair of compatible data frames. A pair of data frames is compatible if they have the same column names (possibly in different orders) and compatible types.} \item{...}{These dots are for future extensions and must be empty.} } \description{ Perform set operations using the rows of a data frame. \itemize{ \item \code{intersect(x, y)} finds all rows in both \code{x} and \code{y}. \item \code{union(x, y)} finds all rows in either \code{x} or \code{y}, excluding duplicates. \item \code{union_all(x, y)} finds all rows in either \code{x} or \code{y}, including duplicates. \item \code{setdiff(x, y)} finds all rows in \code{x} that aren't in \code{y}. \item \code{symdiff(x, y)} computes the symmetric difference, i.e. all rows in \code{x} that aren't in \code{y} and all rows in \code{y} that aren't in \code{x}. \item \code{setequal(x, y)} returns \code{TRUE} if \code{x} and \code{y} contain the same rows (ignoring order). } Note that \code{intersect()}, \code{union()}, \code{setdiff()}, and \code{symdiff()} remove duplicates in \code{x} and \code{y}. } \section{Base functions}{ \code{intersect()}, \code{union()}, \code{setdiff()}, and \code{setequal()} override the base functions of the same name in order to make them generic. The existing behaviour for vectors is preserved by providing default methods that call the base functions. } \examples{ df1 <- tibble(x = 1:3) df2 <- tibble(x = 3:5) intersect(df1, df2) union(df1, df2) union_all(df1, df2) setdiff(df1, df2) setdiff(df2, df1) symdiff(df1, df2) setequal(df1, df2) setequal(df1, df1[3:1, ]) # Note that the following functions remove pre-existing duplicates: df1 <- tibble(x = c(1:3, 3, 3)) df2 <- tibble(x = c(3:5, 5)) intersect(df1, df2) union(df1, df2) setdiff(df1, df2) symdiff(df1, df2) } dplyr/man/mutate.Rd0000644000176200001440000001636615106134104013754 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mutate.R \name{mutate} \alias{mutate} \alias{mutate.data.frame} \title{Create, modify, and delete columns} \usage{ mutate(.data, ...) \method{mutate}{data.frame}( .data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL ) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. The name gives the name of the column in the output. The value can be: \itemize{ \item A vector of length 1, which will be recycled to the correct length. \item A vector the same length as the current group (or the whole data frame if ungrouped). \item \code{NULL}, to remove the column. \item A data frame or tibble, to create multiple columns in the output. }} \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping columns and columns created by \code{...} are always kept. \itemize{ \item \code{"all"} retains all columns from \code{.data}. This is the default. \item \code{"used"} retains only the columns used in \code{...} to create new columns. This is useful for checking your work, as it displays inputs and outputs side-by-side. \item \code{"unused"} retains only the columns \emph{not} used in \code{...} to create new columns. This is useful if you generate new columns, but no longer need the columns used to generate them. \item \code{"none"} doesn't retain any extra columns from \code{.data}. Only the grouping variables and columns created by \code{...} are kept. }} \item{.before, .after}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, control where new columns should appear (the default is to add to the right hand side). See \code{\link[=relocate]{relocate()}} for more details.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item Columns from \code{.data} will be preserved according to the \code{.keep} argument. \item Existing columns that are modified by \code{...} will always be returned in their original location. \item New columns created through \code{...} will be placed according to the \code{.before} and \code{.after} arguments. \item The number of rows is not affected. \item Columns given the value \code{NULL} will be removed. \item Groups will be recomputed if a grouping variable is mutated. \item Data frame attributes are preserved. } } \description{ \code{mutate()} creates new columns that are functions of existing variables. It can also modify (if the name is the same as an existing column) and delete columns (by setting their value to \code{NULL}). } \section{Useful mutate functions}{ \itemize{ \item \code{\link{+}}, \code{\link{-}}, \code{\link[=log]{log()}}, etc., for their usual mathematical meanings \item \code{\link[=lead]{lead()}}, \code{\link[=lag]{lag()}} \item \code{\link[=dense_rank]{dense_rank()}}, \code{\link[=min_rank]{min_rank()}}, \code{\link[=percent_rank]{percent_rank()}}, \code{\link[=row_number]{row_number()}}, \code{\link[=cume_dist]{cume_dist()}}, \code{\link[=ntile]{ntile()}} \item \code{\link[=cumsum]{cumsum()}}, \code{\link[=cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[=cumany]{cumany()}}, \code{\link[=cumall]{cumall()}} \item \code{\link[=na_if]{na_if()}}, \code{\link[=coalesce]{coalesce()}} \item \code{\link[=if_else]{if_else()}}, \code{\link[=recode]{recode()}}, \code{\link[=case_when]{case_when()}} } } \section{Grouped tibbles}{ Because mutating expressions are computed within groups, they may yield different results on grouped tibbles. This will be the case as soon as an aggregating, lagging, or ranking function is involved. Compare this ungrouped mutate: \if{html}{\out{
}}\preformatted{starwars |> select(name, mass, species) |> mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) }\if{html}{\out{
}} With the grouped equivalent: \if{html}{\out{
}}\preformatted{starwars |> select(name, mass, species) |> group_by(species) |> mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) }\if{html}{\out{
}} The former normalises \code{mass} by the global average whereas the latter normalises by the averages within species levels. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. } \examples{ # Newly created variables are available immediately starwars |> select(name, mass) |> mutate( mass2 = mass * 2, mass2_squared = mass2 * mass2 ) # As well as adding new variables, you can use mutate() to # remove variables and modify existing variables. starwars |> select(name, height, mass, homeworld) |> mutate( mass = NULL, height = height * 0.0328084 # convert to feet ) # Use across() with mutate() to apply a transformation # to multiple columns in a tibble. starwars |> select(name, homeworld, species) |> mutate(across(!name, as.factor)) # see more in ?across # Window functions are useful for grouped mutates: starwars |> select(name, mass, homeworld) |> group_by(homeworld) |> mutate(rank = min_rank(desc(mass))) # see `vignette("window-functions")` for more details # By default, new columns are placed on the far right. df <- tibble(x = 1, y = 2) df |> mutate(z = x + y) df |> mutate(z = x + y, .before = 1) df |> mutate(z = x + y, .after = x) # By default, mutate() keeps all columns from the input data. df <- tibble(x = 1, y = 2, a = "a", b = "b") df |> mutate(z = x + y, .keep = "all") # the default df |> mutate(z = x + y, .keep = "used") df |> mutate(z = x + y, .keep = "unused") df |> mutate(z = x + y, .keep = "none") # Grouping ---------------------------------------- # The mutate operation may yield different results on grouped # tibbles because the expressions are computed within groups. # The following normalises `mass` by the global average: starwars |> select(name, mass, species) |> mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) # Whereas this normalises `mass` by the averages within species # levels: starwars |> select(name, mass, species) |> group_by(species) |> mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) # Indirection ---------------------------------------- # Refer to column names stored as strings with the `.data` pronoun: vars <- c("mass", "height") mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) # Learn more in ?rlang::args_data_masking } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/figures/0000755000176200001440000000000014406402754013631 5ustar liggesusersdplyr/man/figures/lifecycle-questioning.svg0000644000176200001440000000171413663216626020664 0ustar liggesuserslifecyclelifecyclequestioningquestioning dplyr/man/figures/lifecycle-stable.svg0000644000176200001440000000167413663216626017576 0ustar liggesuserslifecyclelifecyclestablestable dplyr/man/figures/lifecycle-experimental.svg0000644000176200001440000000171613663216626021016 0ustar liggesuserslifecyclelifecycleexperimentalexperimental dplyr/man/figures/lifecycle-deprecated.svg0000644000176200001440000000171213663216626020415 0ustar liggesuserslifecyclelifecycledeprecateddeprecated dplyr/man/figures/lifecycle-superseded.svg0000644000176200001440000000171313663216626020461 0ustar liggesusers lifecyclelifecyclesupersededsuperseded dplyr/man/figures/logo.png0000644000176200001440000012370514406402754015307 0ustar liggesusersPNG  IHDRޫhgAMA a cHRMz&u0`:pQ<bKGDtIME6 IDATxwx$uUhsI,fRT"mAeAao^|k[Aʖ-˶2E ")a89FTu?fHbtW׽Uo#&oy衇 V?wy@<#WTID\&/6t?Е(F^_ .6\_ǁ,5ߠbuϫIx3{ *"sy?u>N.98:::vgi/5o7|8{r2dttadUCWW^u,䭕kB Ͻpz};55 pxSί>Z[[+s3cik B-"!W+daafffu}SUQ' M_855o Q=vblll6t:줷߿8emq7A>A;X=vs9N366xKgg'nR5 7CRDϽ24?;4Mcff-P(D?ͨKHx?~kMki}4QOJI8f``i4M))B?uuua1h_^ru/SF[=v͕H$,?7N_T$nkjjV Q-N˓k )w#2 ŮR6$~>:::p\_ -~k˖kހނz}q,W\xijj,3|?hk^"ch]4 ?nT`ΝSOU55p8hmmmsk 5W Em@Q=6[\.I:ӟ4Z?o h/ymQ \RD{3@cכ7(r98}4[uEaǎ| !t:I$J;" Wn׮]?~(4p)A5@զ%mmm}x嗪r9>Od6<466{n*}[#Iהk.4H UDF ^o ۸;hll\R( vmQ+|)x .^ mI项ӨMRLLL0::J*~Z[[q8ƈ?<5ڢ%4>lE)(>|SOpĉMsMxt]gjjOFۡ(G\Z ߯ROQu?Lss3`D/G\.d߾}|K_"0;;K,###|3Ao!P(;::dE"LAfffC[#D p. 4?W܍D:Nkk+.kCl[[y{ؿ?\'|H$XoMKK .}tIFGGI$>f<LNNx['gz0 ~x4?!eɤU444_9x BTU!4l۶ ] B155){r蠳k=fggJoaa]}oF[ߴ.A(Fr) 3::! B xffD"aU5[199iEkjjhjj:UUjkkQ]YZZbxx꽺4Mcbby驄Xq}߆oRJx}a4# ~n6ettӧO355uY@(  >|Zb1^}U|>>((.]^׳sed2 .\`iiiR`}>ړU|FFHZ%h~*όV733C2(9+DY:;;RDMM XYnooˑhkk6!f2^]hSNYqEQ1?~Cho ;)Lk\}_ZZbppɪ?aY`v9zhiLNNr!QWWG,#hMVhiivot:iooϝgxx+aR277G8>ZxpiLf%h~?ޫH$ )%~0@n _}!TjY@ (Bcc#===YKKK\5rx<jjjp:4bQމf @~ s-1s'''Xs刮VNVJImm-Pp8yߏT*ECCê&Jz@֫j}=XlI0&1L2>>~U2k{zzYLD@oj?f|#p.݈d}kLsg劢viiia``%rMkk+.\({E"eֶYkD4Etƙ3g<7Kgg'x,O>dUljv+F׷"UN"JGSS$IfxS(m.۳ccceYaf+p8eāQrA/z衫xUx+h~b555(6"yٳ)%G#,Ky<\xhmmepp:i{糧{Y⢍eQߕe型;Y2큶RM=]]][̦T*e+-$wU[U }I(ͯX}c9)Ce;NxW&Nxp8YfCQv߿FqREբmYcmS9? -,,=̇<^w#wch㯐-^- .AFMKԄ]Av-h2o0޽{yW'Zڥ EQVQnNnӆfMd2x<h匿ˢ.6Ks9?8`ڊ JP(A!(YC=tUL[b.2Rկ>;opp f飚rVCgϞIM,cnned2,?n)f~inw>FT_^cm>dV3S㌏.M9N[=LڢYY!m],-f{O5i~,BPSSCss3+^= 7܀Ge>Z-]C255޽{2eZ^kSUU(G婧bjj]vdp\8Nك4?{TJY`e+OTVv[d{p|hU4qC[>A[4-N]) o)h~4Xcu||WMMM-Qږ -ooi $;(hԺ%?\n}>IFGGQVYE0m%a38ǭ\Ckśbm%mqS\gN+Kof%3??OkkUxH&kˊi<:ujYTN:s@Vcc㲇 =~;⋼+ ݶm;hRXƱ_R鳖+(͵g2+8WMnֶ6`dN0mqKh"6R\mSrN2}T3L&$N[9q=ZR̢ۙh4MMM+7yvQŚsNvif$J_*%iZ=Vg5Ý˸sss7y$1::j+H4is vǫWJ f4sssb1sx''377gfv V %I0XLtΝڵv73ܹsW~8SjZ3Pwww q\[KR?)5d'Ճ1|ShUp߻1V+777ǥK\Nv;\=uHlll\=l.cbb#69goݻɺ1xO۲*%h~eYaf~&' +tɑHǏ311]*hSXueϜW5fYt}quYeպuǎ l=%''-3{ 7X7=L&y7yj8v$###% Jjjj,n7gYSUz%L[Q-#Fj x+h~%s=9r_L ;^444{V9 AjEX)TUU+_ Xgs̙+b:oWg-~j%[-BBu}TUE4"DJ- 5MOOo)pp\^@Jouthhh`vvYP1NK-+wk-4a$"Jq=t:ygz˕jao6K&F}WZLêr5aW>p^{͚&Q__VYCſdŊ\d.wRu{q !Ƀ>}ݷ^Opv]w݅rٳgYZZ"SWW>p뭷 V-OOOSOsϭkVJBSOY]&lͯ=N6[y(%;Cʬ2r'''%%)cV W--fj55m*bzz>t]</OH DA<)%/ٳ͢uuu|h_dEhll\v}ƘHd2\m )L[h:^i~[F!8x .2ogggO|3e {fՏ=l'O'ҥK+n7"ϼuE+峚\LV0.w Ehj }0l_5pXMLЧ?iJLm߾J~+k ҥK<8qbXnVTWM淕bI9N[doaڢ4?s]躾 TBULp}vv! c /֚wweEQWzmˠ-"F59mqm+h~asVL󛘘`ppjnn5544zXL]paY !<+|R~ԛgJ6J )7¼>|žekƌ?F[tbǫxyŢ*w{r9{ymѫMF-׻!uPDQN< eĤ-mπz!́4a%J!ft.`.^>lUzzW4RbOTB3r>kf6*&'۪Pd[45Ĥ-3W {bdd"^D`L4 qsmY\{=5@ X={n5R3_-ETp5N0]d2TUEUJAe[788xU!#v6P;vpѱ"dsgffcMOO[3TUeϞ=>}ટW)h+E+jjR[[Kww)WK=Nb/Ir躾iݫ.jޜw5!+D?s?G[[۲f቙XaZR)jkk oi~o6]JUz٣Wr*Ii3-گW)) `!xǯd=5r@2ŋ<Ze+$/|wB!FGGygVvYhhhF+%MLLYl4+!v\Ij5Eb9qWt3-&!?L5 /~<ȩSͲfrUp?PLU2Jv38 ׸b*ñ~(\nݕDQX,Ƴ>{Eοj4R75^.G~n%bll~?e墲WJI/7:2 `][qH lɍ)|sqnR*rؽ{wp z5VROZ.f|l12~AWCg--DVCYff!J\)kOtlʐD%J+)I]J===e ŝ&LcorDW󩬛DJ \&FFc1m&AQ9qϣХ$N75uzY-D.X.D.ˬ#!7֥\1'f3(n?tBgL]b5Y< +JpJW\[!H$#q~ᣧ)D(B*uuw8ۿ@FJ"d.L$If,J(t<0#$It$߆vӋk <[e(7=Vkvd5-P&GW2MO,Ic4NHwQPeg,oSBШ:hR\^YǜKE5pYvJi #E ]7k-As.8w&F7JR+vR"iF$2O :HI]V/bg,Ig2?JDA6Mr :&cXuTc)9)-M A "1*)y9_" ;GKb-\)/H[X@:Yjب盋}C{zMJ"]j׌Mpq7w#˾goYbr>ntS\ӓH H@vS+%JIo^EK7)nl`>>w<R,WCZ5AyS>Pz# ïznf[Q P)d.йh#bd.vQ`^$t>;<A&'yynm Q߅3kv.O D[)`Yohw(5`-zŻAGg56N^晧RqV OESܲ/Zi"_1]|*# $I~v*x!Lri~|AQ ahߡ /G*k?KGL.4?UZ)QPd-oR Hi6n}aob,sYM,,D-naMFjZr*w$pQ]}_߫>&&8%ժIx t_f& b*Ϣ]p)Fd'wr'@E6_‹ 3e_[֧Sy|tvKk;}[RpkԤ$StoV`zz뭫h~Ĵ]OgrNa\>|<ȃRVx[h+|Y]ͧ2Tw-?eeXp,`H}O^%5BJTa.Bqs7 .R"v 0u0zѕiNziGAaIT\w 5Rbj[xqƁ5|* 鎾x΄L\JTMJ^I'8*hE~a7Z(< tiW3oMڵi9*9TS=V:n$Dև5xCf7(U #p/ln;/)_"*S̊\0YmK=@e.fbt'SKhҹ/(8 v8Xf2钯Hzvh~%vho"At{'6Rd5Hő&}={CoCpen\d|WJoυފpƼ><Ȯ-nj JY]rB"u}ӴUL%u9I2hW RK_-G?C%XJjv.5ɴA/uWg_57siWtSV$:Iעy\H셭h4xg@8OSTVFa͐Dt'KIA<9+JR ]:-ߵt@V"w/( tLV|{|\F+|=>|Ym4`ú@Q3: wE*.@w-ƹ1>U5k3G- ZnUBp)D*^J5q"bx"Wx = @ӐtlOh_{wm'@ԮbڵcTO*~gͯ Ý0-AԥD7cZxmI㘢h_<(Ld@U %u<F>|d̟zNH@8pq+.EkK M!H˦]ZH7ɕԮ2s,f_Sѻ^%roO hg'y=ǁK(sQd@G0%zϡ+`41ޥ@ G|%<,'"d*}7>cg"4+pC˞To]I5 ]ˉBNNÿ=<}Azȓu}Kɖx4JVp/4W*M8{HMt0{LXnVeS\F_-um=_!Qѹ^$벑i|6S~5[4q^e^IESP5@TT]EՌ|s9”`aͯ׷Т:6 E3Zܬ&~[&ڼݹg&3Q7]CoGO,Z@릇YjJџK'p~>H$ahZwހY%?^BU75>#(>C|ު(6ՠA+mq'\Iy\Bge ‘ cӜq?־HKg3ev,dF,8xW;iϸ^sLIBp'c(Z Xx ^gٶvL'$5}Cms%m|佈/S!;/) +_:/"?5 λxuK=}RE^ r4X,fKhc-Mvs;>-$#St~p4kU*B)aMui{gs3( 8_-]LJɟ<[U~'q~ aant~p/}gu<[.7*yƓ?ArmPX  2[DeS.զࠛ}m 526M /24NNE\Yq9Kw/stll&K >z;kk6zHc]E.PucdN7絳q׻iWH (fj#&ulY,lUf # x// fCqs,3%U ܴ˥4<6&o,5Xߗo^y )PkƖ:>4#dQ}pG݂;Z !GL6rr*yfjK*mpmEj":Vo,"TGyߡﯫ Vd4QL31mD\xanni^eyH-兓p(nClSH &9WmX$ϫ/V7աuYq#~CU֑OVtz=z$BURkd27^t3h~tQnZI\> dUnR5 sg1R)NU4"_Qp =Fe%hGǃd323$tRk\?KfMm:az,H ߚ$|L򩳕7iHF/à *ĨZB>teo퐑~ڞEǕ&}(RYEGFFp/Zܽrh~*j.Fu'_fumthu/ʒS-@@=)nD7ax5ɟ䪥|nJDo @{Occ5"@MW/]}e\[[˾}iq(t,zC+p 5,5mmm*T7[ #>; }uL.|r)ul;6f{3R-?|]puo؀/£]_Fؔa݁;'ދpR.kWWqxJUr#.GvttDXͯtЌPb\`7Y|H:dq^5 3~c4*-f$JUҼ~kͨ*K]A5Ff*"C{ y>x}0: !iG|S/ GԼƋڭP(Tr] &/N{nNp $hy 9| d"^hp=ЪdoI~%ɿT5ژ8ȴ ^+催 -Hpj })$&P3Vx=O;~_EGkD]. 񙢯 z($>3Y<ꋸgH4x~}vig%=IY:sҥ5i~M>*vv+ ZjXH #B}E%Z0㧵+\*IQUᥬ~9bL2텬{\-¡/ E!En)y;1 5w |"XKyA!/+ ޵Ƌ](eKb}4X&vv(ju%Nbߤ.e^FNJ6*4Ԩhgp2w=÷QTb s*, "^ "[kuQ8<'v!>v*1s $ qv MZEmmm%{W%e\J;h]b6Q@,UZy V隀hC6^.SJ-Eih^"zjuOKue#25\z]dkfrwl\ d܃hp kM^XzGJM^AW!70LcếwjϏ@QhB-]mmK_T6n{IXuɕv"qH] {\i: 5njJ{AG~sSvL籉r9iokۨu6οcJX6ZDҍAK`מDWarw=7![M^ HxQdɅ,0uuu< G3xpq^"D(sYxʙϦUU=xjMN'$>'3`niXz/Vz6nNY;RaE0/i qR֢ȯG&܈KȾiD}vp97_xqo6@uek5H\v͗hx˚WֻXs>avtth>&+SԕH藧~TID Z_2nStWdN1ӓ:ͬlgbE߇0oȡY s=c&B'&+kAs.=Q`ji~ @=d8i'bȼ&P$|ȯ?ؽC7-PpFWi φ|ټTbUDu<=lf"rC(BǥSn^Gmf<&ZҺ=x6?7jfH1*ISug@ Zkh!q7zu0{ͤ{Lrr 0@ba8Di h.?y5#wN )?b&'vY Z|9 ˚Wjra\Whz(%4qpc4U& %M.e+( ]<ͷ]lE %.h@0EguMq!W@:'9: dR~ [ZQAh m Ӵ?pk'9dB^V %X2AAw9pw]7!N6!@osvVKa{ d[Lc5)sssfaGG䤎΋Y+I2ZEHܪ/ ^ vaO`G-@ip2lkj]S#B["n-'n8-|qblڷ ~ߊ q0QwHv@X{+]@'oz˞ t؇ucCԚ<6$f5z8]_{ `Ӝ### d'9n1fBf&$YU'Ԉ;sD9S0GDdE2y;N4:D0qIhs0$ \d7-hں$ 5$ l#a#qW$A~KWya3i"h2~ejR1.O ΃m;\nuF|v,z)z1^byEqk H[dDbcLlv0+5H1MQ3h5ăuH;5bZѩ FGHe61&׳q>tZu Zh\'ӓ  c禍ԳӒLX>@h\ v߭ !ڷ-. 歆kuXv+L%zm-,ʖVQmHt=X,A6;M.M`{4S21,3 @ԝ%Hkg;1#H4̘jK!QK* J?7d!#0/J24x-E4(Qg~h9!]XBH2f3I@#P8o!@j "F*-ɗ/Zּ^Բ|ִfXvd:;R -\1Y~y- ɂL0'जs'(BAu"333+*\vg4*ׇyt<ǫ'Ԃd(ƛyw"?6 z Rؿ>U[Fxd}aԋT"-fQ^c~KC ,ٛ\=-$)tDQ\Jw2?,ȉ -*AS$aOsYIXv1¤X~ך=\F{pU*ۜnTgUX;VdD:W 3܋ AWpfW.)pY_ÜrdBKhǙ]A+`\N݀Ċ'}?ʹsд[`9 Iĝc>/$/rrvXv/SSSCww\&c=+<5(? *3I d#ڠNDny'q7P,<|bk,m9ocJo^8~qH#Y&ͯQufDhi݅~שF =Mۙpjmpn .Y (p:'{.s2G_|Bv;j,ñ0se0?wBqmx濊|` V݌ЁA;)ʅ5y8}&OGq~kA-Uu ̷p?i֗EfXUB2S\S3&If㋌ 0;3^O\Jv;־%#Vd"P{x#øzuk_ Z D4t0}#joP  ljG{POi3R7+ "RǗF_ ,bz)8rxu_4S}1k#02Bu(yEz  f9{Z5۲%M946E81h0AN(RD. !hll5LMMڃkwԆp a:*xtq'EZ5AvD,VP< Rb))uIXS[7'_8GOdxpn^1F58K`v/h~cc-Im>R"tO )Mՙν:}GVARp847#kalllYSH6s "\qr L[v lURu7[\.mUWb,}5y$* x0dLh] ,c2"ã9];:u[.5\q)I/#fO5DQNT@X,f->!PB`?߳Qc5]B=|uj_cǦMRnvpr<VͿ̓hi6u_sp~nl'W2+V9XafQӣlaƤ͉ FIt#LODHi'ft )u9D9AOOn"r-EUUNmmXH#H|>ÁJ/b*ɓ|Np a%Q9EL߷1dHG*E냴=" /5* DQ 2;}C :p5 Gᕣ0=cVlͯ~P+\s)߃N-"JE+j^#*Ql:B*t#"St 5V4EL8 d C* %ӧSG~nqE`'TN@0MaE1a9o,\YE{bOQgPҙb7-q=5,WJTtsMQ689QJ5+Qs8̝lG/[iU "SL,!kFHxc|5#$7@K? үc :ma`Men'b FQ@09Aƭ?λgngQcfypD#4?:xV73Svx<ZJ8sL`jjj,kX ͔H?RJk(V S~h#Wtu*n7 M1,@LG\ӕ3ƢӨpxYjqXz9'D˜\SI&v,l hw(rHIUCcU(h\<-&Zyj{oA֭\]1x( \@kmmEQM\p5M#XVd ુW6l16Wwkw re;o_w"y-)YJo>6]X8#-7Lh/N㈛*f?+= ZĘ}אKByʰ!x۟-չ$ڏaý24Jm HMV>kf|֧_{/.0ܲ4?(4_3 ȿl9o&_qX(e WӪf*044c޽%_[^۷[ph~4Jk(MMpt0WzSï")m_b_t/;~&dѤez&YHAj$@VPDsUgbcQ+Hl)+ڧ/~LJ71KӯqS5' 4r~n"gOA&8qDn=Qk[2߶K)ihfZJnXH,t*pr<йQs~^ƭ/o6_<UOPI)Km=)<23 QH "4~LCn G-@(#Ϳ%1G@ Fq:P@4 <0+i&[X$ٻ /V tKc/؛%===8uM\ ,ǝ_s!N3::zҺF`d2|$Od B4pONI"$'T51Ȁgq뫛;Mqb1O'i\b)rScLOT9PDo>9 8d$GI s] X+bv2AR2&p5McvzI 4?E?ei?w) 'c^;M,O\F&/%e<22ŋzlD^)WqdYZ011A,Ջoܮ/G噷{rm56J$ p85bZY1(Bdd Rufg6 j/n$ 0XDoKf)- y<:su˧M3i~w6b ˩$?^Y4|>b1N8al]%RU񻺺({e|54?UUѤ䥋nFg'- } :?d߼y3a*Wygm]=&)qHz-p"E))c8DUl?y#jKmԉ"ղ 2rh00C)QVSg ,T|5c#֓h~ 5`.ix+Ƀ_MdžHWRV*!D"dOl߹:Ss ^s~|0gBT>n şK@+8K*hP_"+Q͛Z,r{b5 0 r#(UN (v%~%%Kؿ<͎݇ P"VzXic/Å +i~AxC]qӓ/Y) Y,5*~\[7JIqLdW+^hqD"M/~m4|6O6 9"*giR^Q} Af As\qrKNElssusخM6hG\@:j0s;ĿQMcS'X7h~]k?F#p-%K}}=9sP# v)/!^znYzbrEThI:_dA6eWzy! 4}z?_"U) ~'#L+NjmctAr`Z%s(Bn g+OUf`yځ^7= !!/&;Gwg'mq+'7aVDZ`f:usDOkk+DӧOhj7KEW[O?25.͝A.xyLa5dY7i% Z+Vs; w!z/|atRS>ͷ Bh -4("_1dkKVY؊Lj^q+d^j"N PtMMM)/be<'cwW.}5r֣(8\.BZp*,W|fnj,PS6p'\]WUWk_G.~}U"u&|TiSA} 0wz mkvÚQȭ*GUTt:#Tbhh!|WܖV$#3;.Hoo/555[j79xݸ\.@Er 7Ԗpx~o?qes$M>=ԅ.meved\淮J/nDC{AQ}~җٝ7^!%g;| Dߥ^9Slt6a])Bne9P__󧙭m4>]Ф̝-\9GYw}&߂#F!E"OS@-9g{P~xy4KA^ ׼L[Ji~kJ} ;`1n _BbjJd.^jsG+'qeUm +[]ji4=9+AnTaDyaJh]{֦e3p";jTS>/NW>؆s rhT~]6m곱W YstMGϒ}{(fMK]zgXvmg/e(t/296 VxNhh~Yof6u!.ɑiw0A^cT(~DoO71Vt?o ^VCwDhxC_K#2U)cyhdQ H uM+TTNbS:TM f?̌~P5b3i"CP9K=zc r7i~Ĵ6[qTv.ޛm,^BnsfC(x O6Zl%U}mFwqnn܊IDFmK/C⁹P~ٵ.V<3˱64~)>#]ˠk?>YM(9=Iލꦃ׼T^9N;pXp29{VN0BoY^щ㥭>A8ٛ$DfꂯO0qV}m/лf-2 C+Pgh/t!R du4E &]q4ff6-ݦM/h 4T4nz8]G)p(Iҩ }1[> w7Ә7MN6i#KAFWHԬR=Pb}{IWײk9eE8g=lB\zϾύ|"z F^o^ۧH55iRҾD϶Y%lL&D禷J $..dl*v|&9J\NgrjIB%h~eZgh+M3\ڬo?nf_עOĻൡ_y׆Bs*B| (&, 7t'7]Et>t9GGGf1s+W^CzYvΡ4ͺk/W )!Bde ٟ!شh \]Yt G 4?r*__wdͦfsKmKanl Lxh7?B4?ի2L受ՆC$7/#Ȑ'Uw-Ƀw4׍ͬ/\m]Pc\N#ΒeO nN,˖7#!ѥ܇o5ΧDA ۜL*x>+gP:Pg ecb2I5@cڗ*y(q :]I4d^+q A e͕o"!ԠLӮ>F_*3d9&'&h~4?֠M~4JofH) o͎!PΗe k|-Hv9 oZ-x=Nv-񾛇e,^J?d_2Xϵ^*$͎,ĹAGM\Hy됲ݻ zj_GƟaWܨm,k\I$_3XJBfc,)>q#dŨI8QwYf:iLjU,WMQv>:˅ &bqiZV2)EOG}w&sg<|X~~n~{ǮU5vyrÙA񄟘Z빘⡇ZaJ)fE[K#|h>[nkmK#8_7΄i6˂TIR Ъjxh=Vv䤟ُ3櫶W|r z[N3<=OXM*4#?uwPޟ垽Vtz\inEߝ£?qH8ęd =4C7g5˴UU-+XV\.gE@]8z] w.6DGl5+p!#ƙ @ #hUIg&n p)Q{ *BCy͙s,;^I4:<1p}Z'L54 fxM]}?4>tL=u9x|gA{f <]Q7L5V e_݇xp/M5)DKo$ɠK*ݳ#rP7lnxr?`u#29CdO3Z܁/g6n8o٬1w _gg'mx]j/s=ț0wL6g&2$+pSMAHq<[UB^}Qv_x\5HvD4 1Z*T>|Z`p.Lt}ou2#"5A0,-mph~4\4?zyA?0Y {k|mEn,Ut—B ]Y^Kr }N>"ҿ(poƵ6-O L85yri~o9Ռ;7w wI~ 0< xvg_i`'ik0lYmh}J, 8'sȭpj-̦uÞ ? y^m t1{/nx dw%F^= +i~] b0mT{}0v Ot>X)oddl13=хew{<;f ~n9 CS^VP$1r MYWE4?s^j4.Z[۩ܴ}<̞0YQLaNDF)U[/܄l4wirfIvy{4kpLnK5o߆-!T0cHC6R}789^ 3307m5&3\}EPe7A|j/bA+%%/kEŗ5`Fц|`9jtU@ Rq yk3,L2z~ki~uA3QÛpo`0x@p[ d0FT_#/HW٤R_wwM-45gs;P`k ֺ@]9R=jM[mctnx5)x*3Lg?`\?;9Qmg֘ҷI3N¡1j"LFy}^ߪk|rb4 2! Eh0pΨ^xR4?B@lE0vӑ/RG{)#-._['un>{o N` 1G]/ܔ\osI$mjbFHW|#q|de÷\QzP<|#G.s#'"LCoHp 8 (cFܺi~fӾV;DJs >DXKk$Rhx ]pD`lb4Zx6G5BD*kiʆqZE$sMdgѥle4nA?u;gqk`gה<9aLC8 tw-p?Z((^!Y9\3]'W&XU\9??og Ү&c#\h7/΅SjaSm|8uLh/_@19_n[eWT3z&]!zRja3ȥ=G /W_)ɃGq YpÙ,8m֍鳎f<|K_.;`2x97kCfH;Ì; #7z bқ%$UyFiD mO %45 ִJp/䂯(۔ϱ0}u.GLo#4RU0CLj1{ޑ @I= 'NkM[X4Jkps3EoPTAkf[5 M\|uUi~]]4:{Q:S y!Uкᜃ#u< ӓ7*_DeD{,, I :y/QfNi~k{Gj<-K*1iW|m p:ͨ!T)\>W?7+j2{܉uȚYda[uT{m*(i70eHjl90s ^ytbim흄Nna-_AN`1߀[)Iּ@pv6Cnb7Q-:ZLGs؁=Zۂ ԩw;ʦj<-75n _`]DŽ'ĸ'ĘiwG 5IdiH9?9=yLv\$V^Uò:[NDߍ.-lY{80\X4n?48"Vy  * aVtUG'G"%8, ^3=C=iwhz6탸E Hnp~/69x{pW_)a:4Z4V;㴙bRT`ityۀ< t46w1٬\qΧL'D{\5OOO5v8h3_Y?~%",|s] ؈x<V<6,IN軙b^l*lf,6K!';, + 7B{$ ӂ*З34Mh~]n40:,;}R7!<λM^@kD^VF3n+Xe4Mhj66t7ؽx ٟe^2^\|9mCT]\&IDAT.gttm70NDENX![\4r/ao gD=J-3"75]EC㶌*82r|Ǧ&P_ޮ%޻SF܊9- 3kCNb㻮5FýsIaD'kx8\0oɷm5VA41$jbIs,w4j,]T:Fs2{̏\ͬb0i-i~>+YdIͬv+aR) iKXy:a{ȑ?g00.#{W;4yV׸g;.P輤≛5FhҠM~^GDoK so`0d=98 sI]Na3[Ye!x:|1\ڶV\5kz2Dil]iqOKoWvO7yDXAf[:kc5א JKj4I1yF^r]m1pdf9` 7,ϜH$*h s7`c,jF2}Nvq+#ÑI^y}NƉ 4v(~:=8tC=_* &:`a .4:Js+tɸ{H܅r4?g{ťU%etDe?s-=,j-£ȩpR,VP&h5YC\vb^6f@|s2E D~LZ2v o0kCHAJä@і2h>>%g/J>N9ͯƐ[vޛݱ"LV _.7JJc݆6FͶOA|^8p,SxO۵BR4R6~[\\5"Ϥ *qgN 5M0VO{4m\v]ϼ~+YxQNnz9I haEhi~-^nmZ\ߒJ밂3-4Y$PUܵSz@v5uGxMqngU=K sJ6\`\ѡ}@uAKFk]X3fmvZKo!kd)7ͯZ=FJ2@LeQ 1 z%@[8|$ 0bW#ۻͯZkciWWR>yQ&7EZ5 gةslf"sl{Asaqf6cM;r:1?׃hߋ&ؐ/0lNy(TLӺ֩^-d܆>t#?+` KH yڡj`SlBt`xfvH;p2S m)ufZonr -8{, l5ZVLMXˢ~ RZb3OB|;lO4e5Q}P2X,kRwc+7/OJDl9borVO8T5rIJLʹ}2R?ε0sI4exxxo#/WZWUUTU@[u߄~4 l9T4"-,jYoYDZ@ E~n1RC"{y4eeΔmX낊/47+Gkwߡ0_< ZP9ťlQMy7%U2*R沢Tܥ n܇!esҥ 7"_n)!+=$dK^DҲM: `S͔e@ԧ`".:;Y4355em]~nN0)=R75ineh;\7js\AB󘂚]...  $ǝB<e@˗/3::UUtvtPp Aۯ0mׂL 1kB"ƾ G@_fF]SSS/<~Fe9H$00hqC5- ]u np9-72n~Fpʸuy /zwlR+}L'G3.DFb Uss'5SbdpTImg˥%iyA!Fw-`j7i!a\zqMNvh3{+#wj(t鏚B>Ico,Z:jJɂtm J~"`~8SKh\cbb_J|c~sHp9 @LJJ#WnJwz BvhAL1I_,6e_1eX[@r W###n< h?QT޸V*J|/`Ɯ\Yf)iSRJK<hfg(vfv5Pʀ@?D*gB!悁nԂqM!^!2?PJEQZ`~/98w+o`:Bo39_@ȟ0ۓ=M)KC #p|sF߽mН=0dA{4ŎYjkܞd1Au[3^CAКTdO[s +`8e}rz8עrrG4ģ qLP,cap1sbfW܁^~%C? \KHF&8_LXƚݒ}qڸ۔Y\  , 3,Kf,~+LlT:܇𜁪>pM빺6L G$luW}ݾZ)'71 J78k~nfX7\(.)Mxl7߻ر SQP+m|eZV*Ihm:kn6QprLt:G*淦wjB*2+91O)Zׁ{ 5"Z={E156P}9 E1T,|}4̅]\oIf26nm5?׋0N!yS91el;D(DUC!7/#_ fH [_eBlt Mþ&LJpdTp  ers$9|{>ې(XaFDyfqTs{L<\8 oDdE^/SS&ws#cMDUmT?{ǖN#X#Hh6@I "u2s23l2MDU;%fLG] *t%MOz ϡF8"7i0  yF%ģq ט6 vt"$҄~2.}0q]=%c[b>drC/]0Kfj~D"Y<آ!k|<5b0v(T"NBpLU ›'埐O711]["CPWA97մ.*4(D_^G:9;T*iIlTM]Z=m@c#Q ?I'(c#0WX-,_! H:l;;XCD-a20hzÉc7_8N?fƭ >MimE8?1ոNݰҴ8e}~tzo1x5O[ J<{G)`9Β+i`,a\U0pYBB oA\4"T@&"t Oy8Tȕ7 P5;QLsK?8#63s`~Һ"`/q]`ou!~9|kqwE4>-Q>MPe슘6wW‹X-oƕЄ4&{냠w]z6"x7twulLV u70,-~aH#.5u 9J= ߐ1.+75%3MR+|\g1qQhNhK<5Z>%^󗃼Xg~)шþ)ு!C̯P6bϱ0BQ4WFGф,Yann.Y&{OTN1×!A$n,+P-V*[ip~xmܥj {f_I]Yt ޙ 8 h`~_N7q m(م-Ml1+76F0 Ͷ4wo\槫V$6(F|Tƭ&*2}k.n ng/Ld7t6`JMpl?~m15B\Qn߫_ 6q mHâ޸[.9ð9N:Lܠ!_c\* $]ɿ?cOÑY~;)xsDԔꦥ*F3bGko_c5˧1?ؘ l`C)F[V:W!w`+AhsG ؖXWpU9lE66FqR S5}AdHjn 3|j{څn [xŹ1kԮf6tof$_~YLM=n7yaP#GY [5Sҿ-DCa_m\רo% sFc6\qƹj |4d7Pw*6*=8{]2nF6-ނ-._ĚM.hEgyR>ȹWP)1!C=]HU(W7eglKF|W{1747N0B0l>dZآn{ZGGWN#ekb̦h2S &ݼtçCƤ<dS%Aެcuy.&#%{ia~Ue`X4n?^99=mCԖe?3Z*= z14>qnj#5PLs4x_͟a8T}צss01$v5BJ1$ׯ#ZRƤ r4]̡X3@&f4шmj.l&J*2n&蝮O?X1ΝBNN}'pUk`C)iEko\bӸq쭝ZXp $*j^Z8w5>;,͹*{i- wd3+z[1TĹuuh;lI0/@qЄB]^>sMjy({vsjkJ̯󛅫߸*RխbGR>{:}mWsK&]vM(9T;6H߳&1ab~yac{<`~Cb~'o%0z([V[4RiEU֘kFBQ>3@12U60W(m'n`bҟe UUQUuA>25! YdVCnz|''5oӘ_8V6/ ⧐1"iiddi^E<'Q5.rw XΓ'vrj&o1o0Aƺ+NQ x[|?V[B ( d™|x`KNW;97]{ca~Ri-BB,aLlUa>m-45|NX-{)1Bher#ݙoNm_ByptN`8\=*1oj̯P*8-->29?\V;}e_tDБ@W}M<6"֯ssH/J0Bh,d1[|iE8H$ ٜa+Q:BSy$r'ykw;Vm<`~/cb~*^4sYǦq+\ n2ۆ;nޙo-g"J̯P*xZ[|9ޚ+Qqol}[GWiVlO/F޼9ƹȡ\Ř_T4pJ1^$ײآUcxJ4Ž\[Ҹ`D uj;+̛')qhT4p-މ-V::R1Il:#7€"k6xl7s%Z&!1W( g`@&ܘɹ&h-ؾ\0){U%́^"=xZ12T2yb_bRBQŋڵ ds9@nS ᄋ -󔮴H,CJNQEvUpdc^g#k塃R^C-->%ĵb{<rSYd{9#i6~2ķu2*jsgxc~RP Nd||7fVevy{5dyEW¥|B'?hzȜC@}O)S5Ⱥǟ#lQP[m#sf%eYU[纸,s0#hBh+eX݅-nI>Q.wR m-D9. bb~^`ѼU%#0+kx{XB:8-Lz0{"*x-N3>v'G8(b~BEo 1]n[ ߄qhWH)Fށ̭9`r]5qT4T x+rX^V-^A_mX e1r9&x,E [&q7JV7#&Cl1 # _? *xh lѨ11M7x`)?DV9(b~TEoR`Db7dxq$}ߦVX)67OiNEn^ |(ćH/U4U$KNSe({5֎X:%tEXtdate:create2023-03-06T19:54:22+00:00r %tEXtdate:modify2023-03-06T19:54:22+00:00mtEXtSoftwareAdobe ImageReadyqe<IENDB`dplyr/man/figures/lifecycle-archived.svg0000644000176200001440000000170713663216626020106 0ustar liggesusers lifecyclelifecyclearchivedarchived dplyr/man/figures/lifecycle-defunct.svg0000644000176200001440000000170413663216626017746 0ustar liggesuserslifecyclelifecycledefunctdefunct dplyr/man/figures/lifecycle-soft-deprecated.svg0000644000176200001440000000172613663216626021373 0ustar liggesuserslifecyclelifecyclesoft-deprecatedsoft-deprecated dplyr/man/figures/lifecycle-maturing.svg0000644000176200001440000000170613663216626020146 0ustar liggesuserslifecyclelifecyclematuringmaturing dplyr/man/figures/lifecycle-retired.svg0000644000176200001440000000170513663216626017755 0ustar liggesusers lifecyclelifecycleretiredretired dplyr/man/dim_desc.Rd0000644000176200001440000000064514366556340014236 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-format.R \name{dim_desc} \alias{dim_desc} \title{Describing dimensions} \usage{ dim_desc(x) } \arguments{ \item{x}{Object to show dimensions for.} } \description{ Prints the dimensions of an array-like object in a user-friendly manner, substituting \code{NA} with ?? (for SQL queries). } \examples{ dim_desc(mtcars) } \keyword{internal} dplyr/man/recode.Rd0000644000176200001440000001545215106134104013711 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/recode.R \name{recode} \alias{recode} \alias{recode_factor} \title{Recode values} \usage{ recode(.x, ..., .default = NULL, .missing = NULL) recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE) } \arguments{ \item{.x}{A vector to modify} \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Replacements. For character and factor \code{.x}, these should be named and replacement is based only on their name. For numeric \code{.x}, these can be named or not. If not named, the replacement is done based on position i.e. \code{.x} represents positions to look for in replacements. See examples. When named, the argument names should be the current values to be replaced, and the argument values should be the new (replacement) values. All replacements must be the same type, and must have either length one or the same length as \code{.x}.} \item{.default}{If supplied, all values not otherwise matched will be given this value. If not supplied and if the replacements are the same type as the original values in \code{.x}, unmatched values are not changed. If not supplied and if the replacements are not compatible, unmatched values are replaced with \code{NA}. \code{.default} must be either length 1 or the same length as \code{.x}.} \item{.missing}{If supplied, any missing values in \code{.x} will be replaced by this value. Must be either length 1 or the same length as \code{.x}.} \item{.ordered}{If \code{TRUE}, \code{recode_factor()} creates an ordered factor.} } \value{ A vector the same length as \code{.x}, and the same type as the first of \code{...}, \code{.default}, or \code{.missing}. \code{recode_factor()} returns a factor whose levels are in the same order as in \code{...}. The levels in \code{.default} and \code{.missing} come last. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{recode()} is superseded in favor of \code{\link[=recode_values]{recode_values()}} and \code{\link[=replace_values]{replace_values()}}, which are more general and have a much better interface. \code{recode_factor()} is also superseded, however, its direct replacement is not currently available but will eventually live in \href{https://forcats.tidyverse.org/}{forcats}. For creating new variables based on logical vectors, use \code{\link[=if_else]{if_else()}}. For even more complicated criteria, use \code{\link[=case_when]{case_when()}}. \code{recode()} is a vectorised version of \code{\link[=switch]{switch()}}: you can replace numeric values based on their position or their name, and character or factor values only by their name. This is an S3 generic: dplyr provides methods for numeric, character, and factors. You can use \code{recode()} directly with factors; it will preserve the existing order of levels while changing the values. Alternatively, you can use \code{recode_factor()}, which will change the order of levels to match the order of replacements. } \examples{ set.seed(1234) x <- sample(c("a", "b", "c"), 10, replace = TRUE) # `recode()` is superseded by `recode_values()` and `replace_values()` # If you are fully recoding a vector use `recode_values()` recode(x, a = "Apple", b = "Banana", .default = NA_character_) recode_values(x, "a" ~ "Apple", "b" ~ "Banana") # With a default recode(x, a = "Apple", b = "Banana", .default = "unknown") recode_values(x, "a" ~ "Apple", "b" ~ "Banana", default = "unknown") # If you are partially updating a vector and want to retain the original # vector's values in locations you don't make a replacement, use # `replace_values()` recode(x, a = "Apple", b = "Banana") replace_values(x, "a" ~ "Apple", "b" ~ "Banana") # `replace_values()` is easier to use with numeric vectors, because you don't # need to turn the numeric values into names y <- c(1:4, NA) recode(y, `2` = 20L, `4` = 40L) replace_values(y, 2 ~ 20L, 4 ~ 40L) # `recode()` is particularly confusing because it tries to handle both # full recodings to new vector types and partial updating of an existing # vector. With the above example, using doubles (20) rather than integers # (20L) results in a warning from `recode()`, because it thinks you are # doing a full recode and missed a case. `replace_values()` is type stable # on `y` and will instead coerce the double values to integer. recode(y, `2` = 20, `4` = 40) replace_values(y, 2 ~ 20, 4 ~ 40) # This also makes `replace_values()` much safer. If you provide # incompatible types, it will error. recode(y, `2` = "20", `4` = "40") try(replace_values(y, 2 ~ "20", 4 ~ "40")) # If you were trying to fully recode the vector and want a different output # type, use `recode_values()` recode_values(y, 2 ~ "20", 4 ~ "40") # And if you want to ensure you don't miss a case, use `unmatched`, which # errors rather than warns try(recode_values(y, 2 ~ "20", 4 ~ "40", unmatched = "error")) # --------------------------------------------------------------------------- # Lookup tables # If you were splicing an external lookup vector into `recode()`, you can # instead use the `from` and `to` arguments of `recode_values()` x <- c("a", "b", "a", "c", "d", "c") lookup <- c( "a" = "A", "b" = "B", "c" = "C", "d" = "D" ) recode(x, !!!lookup) recode_values(x, from = names(lookup), to = unname(lookup)) # `recode_values()` is much more flexible here because the lookup table # isn't restricted to just character values. We recommend using `tribble()` # to build your lookup tables. lookup <- tribble( ~from, ~to, "a", 1, "b", 2, "c", 3, "d", 4 ) recode_values(x, from = lookup$from, to = lookup$to) # --------------------------------------------------------------------------- # Factors # The factor method of `recode()` can generally be replaced with # `forcats::fct_recode()` x <- factor(c("a", "b", "c")) recode(x, a = "Apple") # forcats::fct_recode(x, "Apple" = "a") # `recode_factor()` does not currently have a direct replacement, but we # plan to add one to forcats. In the meantime, use a lookup table that # recodes every case, and then convert the `to` column to a factor. If you # define your lookup table in your preferred level order, then the conversion # to factor is straightforward! y <- c(3, 4, 1, 2, 4, NA) recode_factor( y, `1` = "a", `2` = "b", `3` = "c", `4` = "d", .missing = "M" ) lookup <- tribble( ~from, ~to, 1, "a", 2, "b", 3, "c", 4, "d", NA, "M" ) # `factor()` generates levels by sorting the unique values of `to`, which we # don't want, so we supply `levels = to` directly. Alternatively, use # `forcats::fct(to)`, which generates levels in order of appearance. lookup <- mutate(lookup, to = factor(to, levels = to)) recode_values(y, from = lookup$from, to = lookup$to) } \seealso{ \code{\link[=recode_values]{recode_values()}} } dplyr/man/src.Rd0000644000176200001440000000115614366556340013254 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/src.R \name{src} \alias{src} \alias{is.src} \title{Create a "src" object} \usage{ src(subclass, ...) is.src(x) } \arguments{ \item{subclass}{name of subclass. "src" is an abstract base class, so you must supply this value. \code{src_} is automatically prepended to the class name} \item{...}{fields used by object. These dots are evaluated with \link[rlang:list2]{explicit splicing}.} \item{x}{object to test for "src"-ness.} } \description{ \code{src()} is the standard constructor for srcs and \code{is.src()} tests. } \keyword{internal} dplyr/man/between.Rd0000644000176200001440000000317415106134104014077 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/funs.R \name{between} \alias{between} \title{Detect where values fall in a specified range} \usage{ between(x, left, right, ..., ptype = NULL) } \arguments{ \item{x}{A vector} \item{left, right}{Boundary values. Both \code{left} and \code{right} are recycled to the size of \code{x}.} \item{...}{These dots are for future extensions and must be empty.} \item{ptype}{An optional prototype giving the desired output type. The default is to compute the common type of \code{x}, \code{left}, and \code{right} using \code{\link[vctrs:vec_cast]{vctrs::vec_cast_common()}}.} } \value{ A logical vector the same size as \code{x} with a type determined by \code{ptype}. } \description{ This is a shortcut for \code{x >= left & x <= right}, implemented for local vectors and translated to the appropriate SQL for remote tables. } \details{ \code{x}, \code{left}, and \code{right} are all cast to their common type before the comparison is made. Use the \code{ptype} argument to specify the type manually. } \examples{ between(1:12, 7, 9) x <- rnorm(1e2) x[between(x, -1, 1)] # On a tibble using `filter()` filter(starwars, between(height, 100, 150)) # Using the `ptype` argument with ordered factors, where otherwise everything # is cast to the common type of character before the comparison x <- ordered( c("low", "medium", "high", "medium"), levels = c("low", "medium", "high") ) between(x, "medium", "high") between(x, "medium", "high", ptype = x) } \seealso{ \code{\link[=join_by]{join_by()}} if you are looking for documentation for the \code{between()} overlap join helper. } dplyr/man/dplyr_data_masking.Rd0000644000176200001440000000045314406402754016312 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-tidy-eval.R \name{dplyr_data_masking} \alias{dplyr_data_masking} \title{Data-masking} \description{ This page is now located at \code{\link[rlang:args_data_masking]{?rlang::args_data_masking}}. } \keyword{internal} dplyr/man/scoped.Rd0000644000176200001440000001221614366556340013741 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise.R \name{scoped} \alias{scoped} \title{Operate on a selection of variables} \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. The variants suffixed with \verb{_if}, \verb{_at} or \verb{_all} apply an expression (sometimes several) to all variables within a specified subset. This subset can contain all variables (\verb{_all} variants), a \code{\link[=vars]{vars()}} selection (\verb{_at} variants), or variables selected with a predicate (\verb{_if} variants). The verbs with scoped variants are: \itemize{ \item \code{\link[=mutate]{mutate()}}, \code{\link[=transmute]{transmute()}} and \code{\link[=summarise]{summarise()}}. See \code{\link[=summarise_all]{summarise_all()}}. \item \code{\link[=filter]{filter()}}. See \code{\link[=filter_all]{filter_all()}}. \item \code{\link[=group_by]{group_by()}}. See \code{\link[=group_by_all]{group_by_all()}}. \item \code{\link[=rename]{rename()}} and \code{\link[=select]{select()}}. See \code{\link[=select_all]{select_all()}}. \item \code{\link[=arrange]{arrange()}}. See \code{\link[=arrange_all]{arrange_all()}} } There are three kinds of scoped variants. They differ in the scope of the variable selection on which operations are applied: \itemize{ \item Verbs suffixed with \verb{_all()} apply an operation on all variables. \item Verbs suffixed with \verb{_at()} apply an operation on a subset of variables specified with the quoting function \code{\link[=vars]{vars()}}. This quoting function accepts \code{\link[tidyselect:vars_select]{tidyselect::vars_select()}} helpers like \code{\link[=starts_with]{starts_with()}}. Instead of a \code{\link[=vars]{vars()}} selection, you can also supply an \link[rlang:is_integerish]{integerish} vector of column positions or a character vector of column names. \item Verbs suffixed with \verb{_if()} apply an operation on the subset of variables for which a predicate function returns \code{TRUE}. Instead of a predicate function, you can also supply a logical vector. } } \section{Grouping variables}{ Most of these operations also apply on the grouping variables when they are part of the selection. This includes: \itemize{ \item \code{\link[=arrange_all]{arrange_all()}}, \code{\link[=arrange_at]{arrange_at()}}, and \code{\link[=arrange_if]{arrange_if()}} \item \code{\link[=distinct_all]{distinct_all()}}, \code{\link[=distinct_at]{distinct_at()}}, and \code{\link[=distinct_if]{distinct_if()}} \item \code{\link[=filter_all]{filter_all()}}, \code{\link[=filter_at]{filter_at()}}, and \code{\link[=filter_if]{filter_if()}} \item \code{\link[=group_by_all]{group_by_all()}}, \code{\link[=group_by_at]{group_by_at()}}, and \code{\link[=group_by_if]{group_by_if()}} \item \code{\link[=select_all]{select_all()}}, \code{\link[=select_at]{select_at()}}, and \code{\link[=select_if]{select_if()}} } This is not the case for summarising and mutating variants where operations are \emph{not} applied on grouping variables. The behaviour depends on whether the selection is \strong{implicit} (\code{all} and \code{if} selections) or \strong{explicit} (\code{at} selections). Grouping variables covered by explicit selections (with \code{\link[=summarise_at]{summarise_at()}}, \code{\link[=mutate_at]{mutate_at()}}, and \code{\link[=transmute_at]{transmute_at()}}) are always an error. For implicit selections, the grouping variables are always ignored. In this case, the level of verbosity depends on the kind of operation: \itemize{ \item Summarising operations (\code{\link[=summarise_all]{summarise_all()}} and \code{\link[=summarise_if]{summarise_if()}}) ignore grouping variables silently because it is obvious that operations are not applied on grouping variables. \item On the other hand it isn't as obvious in the case of mutating operations (\code{\link[=mutate_all]{mutate_all()}}, \code{\link[=mutate_if]{mutate_if()}}, \code{\link[=transmute_all]{transmute_all()}}, and \code{\link[=transmute_if]{transmute_if()}}). For this reason, they issue a message indicating which grouping variables are ignored. } } dplyr/man/bind_rows.Rd0000644000176200001440000000251314366556340014451 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bind-rows.R \name{bind_rows} \alias{bind_rows} \alias{bind} \title{Bind multiple data frames by row} \usage{ bind_rows(..., .id = NULL) } \arguments{ \item{...}{Data frames to combine. Each argument can either be a data frame, a list that could be a data frame, or a list of data frames. Columns are matched by name, and any missing columns will be filled with \code{NA}.} \item{.id}{The name of an optional identifier column. Provide a string to create an output column that identifies each input. The column will use names if available, otherwise it will use positions.} } \value{ A data frame the same type as the first element of \code{...}. } \description{ Bind any number of data frames by row, making a longer result. This is similar to \code{do.call(rbind, dfs)}, but the output will contain all columns that appear in any of the inputs. } \examples{ df1 <- tibble(x = 1:2, y = letters[1:2]) df2 <- tibble(x = 4:5, z = 1:2) # You can supply individual data frames as arguments: bind_rows(df1, df2) # Or a list of data frames: bind_rows(list(df1, df2)) # When you supply a column name with the `.id` argument, a new # column is created to link each row to its original data frame bind_rows(list(df1, df2), .id = "id") bind_rows(list(a = df1, b = df2), .id = "id") } dplyr/man/glimpse.Rd0000644000176200001440000000175515106134104014111 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexport-pillar.R \name{glimpse} \alias{glimpse} \title{Get a glimpse of your data} \value{ x original x is (invisibly) returned, allowing \code{glimpse()} to be used within a data pipeline. } \description{ \code{glimpse()} is like a transposed version of \code{print()}: columns run down the page, and data runs across. This makes it possible to see every column in a data frame. It's a little like \code{\link[=str]{str()}} applied to a data frame but it tries to show you as much data as possible. (And it always shows the underlying data, even when applied to a remote data source.) \code{glimpse()} is provided by the pillar package, and re-exported by dplyr. See \code{\link[pillar:glimpse]{pillar::glimpse()}} for more details. } \examples{ glimpse(mtcars) # Note that original x is (invisibly) returned, allowing `glimpse()` to be # used within a pipeline. mtcars |> glimpse() |> select(1:3) glimpse(starwars) } dplyr/man/cross_join.Rd0000644000176200001440000000505014366556340014632 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join-cross.R \name{cross_join} \alias{cross_join} \title{Cross join} \usage{ cross_join(x, y, ..., copy = FALSE, suffix = c(".x", ".y")) } \arguments{ \item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{Other parameters passed onto methods.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{suffix}{If there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} } \value{ An object of the same type as \code{x} (including the same groups). The output has the following properties: \itemize{ \item There are \code{nrow(x) * nrow(y)} rows returned. \item Output columns include all columns from both \code{x} and \code{y}. Column name collisions are resolved using \code{suffix}. \item The order of the rows and columns of \code{x} is preserved as much as possible. } } \description{ Cross joins match each row in \code{x} to every row in \code{y}, resulting in a data frame with \code{nrow(x) * nrow(y)} rows. Since cross joins result in all possible matches between \code{x} and \code{y}, they technically serve as the basis for all \link[=mutate-joins]{mutating joins}, which can generally be thought of as cross joins followed by a filter. In practice, a more specialized procedure is used for better performance. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("cross_join")}. } \examples{ # Cross joins match each row in `x` to every row in `y`. # Data within the columns is not used in the matching process. cross_join(band_instruments, band_members) # Control the suffix added to variables duplicated in # `x` and `y` with `suffix`. cross_join(band_instruments, band_members, suffix = c("", "_y")) } \seealso{ Other joins: \code{\link{filter-joins}}, \code{\link{mutate-joins}}, \code{\link{nest_join}()} } \concept{joins} dplyr/man/filter_all.Rd0000644000176200001440000000632114366556340014601 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-filter.R \name{filter_all} \alias{filter_all} \alias{filter_if} \alias{filter_at} \title{Filter within a selection of variables} \usage{ filter_all(.tbl, .vars_predicate, .preserve = FALSE) filter_if(.tbl, .predicate, .vars_predicate, .preserve = FALSE) filter_at(.tbl, .vars, .vars_predicate, .preserve = FALSE) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.vars_predicate}{A quoted predicate expression as returned by \code{\link[=all_vars]{all_vars()}} or \code{\link[=any_vars]{any_vars()}}. Can also be a function or purrr-like formula. In this case, the intersection of the results is taken by default and there's currently no way to request the union.} \item{.preserve}{when \code{FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise it is kept as is.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=if_all]{if_all()}} or \code{\link[=if_any]{if_any()}} in an existing verb. See \code{vignette("colwise")} for details. These \link{scoped} filtering verbs apply a predicate expression to a selection of variables. The predicate expression should be quoted with \code{\link[=all_vars]{all_vars()}} or \code{\link[=any_vars]{any_vars()}} and should mention the pronoun \code{.} to refer to variables. } \section{Grouping variables}{ The grouping variables that are part of the selection are taken into account to determine filtered rows. } \examples{ # While filter() accepts expressions with specific variables, the # scoped filter verbs take an expression with the pronoun `.` and # replicate it over all variables. This expression should be quoted # with all_vars() or any_vars(): all_vars(is.na(.)) any_vars(is.na(.)) # You can take the intersection of the replicated expressions: filter_all(mtcars, all_vars(. > 150)) # -> filter(mtcars, if_all(everything(), ~ .x > 150)) # Or the union: filter_all(mtcars, any_vars(. > 150)) # -> filter(mtcars, if_any(everything(), ~ . > 150)) # You can vary the selection of columns on which to apply the # predicate. filter_at() takes a vars() specification: filter_at(mtcars, vars(starts_with("d")), any_vars((. \%\% 2) == 0)) # -> filter(mtcars, if_any(starts_with("d"), ~ (.x \%\% 2) == 0)) # And filter_if() selects variables with a predicate function: filter_if(mtcars, ~ all(floor(.) == .), all_vars(. != 0)) # -> is_int <- function(x) all(floor(x) == x) filter(mtcars, if_all(where(is_int), ~ .x != 0)) } \keyword{internal} dplyr/man/tbl.Rd0000644000176200001440000000063414366556340013246 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tbl.R \name{tbl} \alias{tbl} \alias{is.tbl} \title{Create a table from a data source} \usage{ tbl(src, ...) is.tbl(x) } \arguments{ \item{src}{A data source} \item{...}{Other arguments passed on to the individual methods} \item{x}{Any object} } \description{ This is a generic method that dispatches based on the first argument. } dplyr/man/dplyr-locale.Rd0000644000176200001440000000630215106134104015031 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/locale.R \name{dplyr-locale} \alias{dplyr-locale} \title{Locale used by \code{arrange()}} \description{ This page documents details about the locale used by \code{\link[=arrange]{arrange()}} when ordering character vectors. \subsection{Default locale}{ The default locale used by \code{arrange()} is the C locale. This is used when \code{.locale = NULL} unless the deprecated \code{dplyr.legacy_locale} global option is set to \code{TRUE}. You can also force the C locale to be used unconditionally with \code{.locale = "C"}. The C locale is not exactly the same as English locales, such as \code{"en"}. The main difference is that the C locale groups the English alphabet by \emph{case}, while most English locales group the alphabet by \emph{letter}. For example, \code{c("a", "b", "C", "B", "c")} will sort as \code{c("B", "C", "a", "b", "c")} in the C locale, with all uppercase letters coming before lowercase letters, but will sort as \code{c("a", "b", "B", "c", "C")} in an English locale. This often makes little practical difference during data analysis, because both return identical results when case is consistent between observations. } \subsection{Reproducibility}{ The C locale has the benefit of being completely reproducible across all supported R versions and operating systems with no extra effort. If you set \code{.locale} to an option from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}}, then stringi must be installed by anyone who wants to run your code. If you utilize this in a package, then stringi should be placed in \code{Imports}. } \subsection{Legacy behavior}{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Prior to dplyr 1.1.0, character columns were ordered in the system locale. Setting the global option \code{dplyr.legacy_locale} to \code{TRUE} retains this legacy behavior, but this has been deprecated. Update existing code to explicitly call \code{arrange(.locale = )} instead. Run \code{Sys.getlocale("LC_COLLATE")} to determine your system locale, and compare that against the list in \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to find an appropriate value for \code{.locale}, i.e. for American English, \code{"en_US"}. Setting \code{.locale} directly will override any usage of \code{dplyr.legacy_locale}. } } \examples{ \dontshow{if (dplyr:::has_minimum_stringi()) withAutoprint(\{ # examplesIf} df <- tibble(x = c("a", "b", "C", "B", "c")) df # Default locale is C, which groups the English alphabet by case, placing # uppercase letters before lowercase letters. arrange(df, x) # The American English locale groups the alphabet by letter. # Explicitly override `.locale` with `"en"` for this ordering. arrange(df, x, .locale = "en") # This Danish letter is expected to sort after `z` df <- tibble(x = c("o", "p", "\u00F8", "z")) df # The American English locale sorts it right after `o` arrange(df, x, .locale = "en") # Using `"da"` for Danish ordering gives the expected result arrange(df, x, .locale = "da") \dontshow{\}) # examplesIf} } \keyword{internal} dplyr/man/funs.Rd0000644000176200001440000000324114406402754013427 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprec-funs.R \name{funs} \alias{funs} \title{Create a list of function calls} \usage{ funs(..., .args = list()) } \arguments{ \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> A list of functions specified by: \itemize{ \item Their name, \code{"mean"} \item The function itself, \code{mean} \item A call to the function with \code{.} as a dummy argument, \code{mean(., na.rm = TRUE)} } The following notations are \strong{not} supported, see examples: \itemize{ \item An anonymous function, \code{function(x) mean(x, na.rm = TRUE)} \item An anonymous function in \pkg{purrr} notation, \code{~mean(., na.rm = TRUE)} }} \item{.args, args}{A named list of additional arguments to be added to all function calls. As \code{funs()} is being deprecated, use other methods to supply arguments: \code{...} argument in \link[=summarise_at]{scoped verbs} or make own functions with \code{\link[purrr:partial]{purrr::partial()}}.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} \code{funs()} is deprecated; please use \code{list()} instead. We deprecated this function because it provided a unique way of specifying anonymous functions, rather than adopting the conventions used by purrr and other packages in the tidyverse. } \examples{ funs("mean", mean(., na.rm = TRUE)) # -> list(mean = mean, mean = ~ mean(.x, na.rm = TRUE)) funs(m1 = mean, m2 = "mean", m3 = mean(., na.rm = TRUE)) # -> list(m1 = mean, m2 = "mean", m3 = ~ mean(.x, na.rm = TRUE)) } \keyword{internal} dplyr/man/group_map.Rd0000644000176200001440000001030515106134104014431 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-map.R \name{group_map} \alias{group_map} \alias{group_modify} \alias{group_walk} \title{Apply a function to each group} \usage{ group_map(.data, .f, ..., .keep = FALSE) group_modify(.data, .f, ..., .keep = FALSE) group_walk(.data, .f, ..., .keep = FALSE) } \arguments{ \item{.data}{A grouped tibble} \item{.f}{A function or formula to apply to each group. If a \strong{function}, it is used as is. It should have at least 2 formal arguments. If a \strong{formula}, e.g. \code{~ head(.x)}, it is converted to a function. In the formula, you can use \itemize{ \item \code{.} or \code{.x} to refer to the subset of rows of \code{.tbl} for the given group \item \code{.y} to refer to the key, a one row tibble with one column per grouping variable that identifies the group }} \item{...}{Additional arguments passed on to \code{.f}} \item{.keep}{are the grouping variables kept in \code{.x}} } \value{ \itemize{ \item \code{group_modify()} returns a grouped tibble. In that case \code{.f} must return a data frame. \item \code{group_map()} returns a list of results from calling \code{.f} on each group. \item \code{group_walk()} calls \code{.f} for side effects and returns the input \code{.tbl}, invisibly. } } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \code{group_map()}, \code{group_modify()} and \code{group_walk()} are purrr-style functions that can be used to iterate on grouped tibbles. } \details{ Use \code{group_modify()} when \code{summarize()} is too limited, in terms of what you need to do and return for each group. \code{group_modify()} is good for "data frame in, data frame out". If that is too limited, you need to use a \link[=group_nest]{nested} or \link[=group_split]{split} workflow. \code{group_modify()} is an evolution of \code{\link[=do]{do()}}, if you have used that before. Each conceptual group of the data frame is exposed to the function \code{.f} with two pieces of information: \itemize{ \item The subset of the data for the group, exposed as \code{.x}. \item The key, a tibble with exactly one row and columns for each grouping variable, exposed as \code{.y}. } For completeness, \code{group_modify()}, \code{group_map} and \code{group_walk()} also work on ungrouped data frames, in that case the function is applied to the entire data frame (exposed as \code{.x}), and \code{.y} is a one row tibble with no column, consistently with \code{\link[=group_keys]{group_keys()}}. } \examples{ # return a list mtcars |> group_by(cyl) |> group_map(~ head(.x, 2L)) # return a tibble grouped by `cyl` with 2 rows per group # the grouping data is recalculated mtcars |> group_by(cyl) |> group_modify(~ head(.x, 2L)) \dontshow{if (requireNamespace("broom", quietly = TRUE)) withAutoprint(\{ # examplesIf} # a list of tibbles iris |> group_by(Species) |> group_map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))) # a restructured grouped tibble iris |> group_by(Species) |> group_modify(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))) \dontshow{\}) # examplesIf} # a list of vectors iris |> group_by(Species) |> group_map(~ quantile(.x$Petal.Length, probs = c(0.25, 0.5, 0.75))) # to use group_modify() the lambda must return a data frame iris |> group_by(Species) |> group_modify(~ { quantile(.x$Petal.Length, probs = c(0.25, 0.5, 0.75)) |> tibble::enframe(name = "prob", value = "quantile") }) iris |> group_by(Species) |> group_modify(~ { .x |> purrr::map_dfc(fivenum) |> mutate(nms = c("min", "Q1", "median", "Q3", "max")) }) # group_walk() is for side effects dir.create(temp <- tempfile()) iris |> group_by(Species) |> group_walk(~ write.csv(.x, file = file.path(temp, paste0(.y$Species, ".csv")))) list.files(temp, pattern = "csv$") unlink(temp, recursive = TRUE) # group_modify() and ungrouped data frames mtcars |> group_modify(~ head(.x, 2L)) } \seealso{ Other grouping functions: \code{\link{group_by}()}, \code{\link{group_nest}()}, \code{\link{group_split}()}, \code{\link{group_trim}()} } \concept{grouping functions} dplyr/man/order_by.Rd0000644000176200001440000000210113663216626014260 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/order-by.R \name{order_by} \alias{order_by} \title{A helper function for ordering window function output} \usage{ order_by(order_by, call) } \arguments{ \item{order_by}{a vector to order_by} \item{call}{a function call to a window function, where the first argument is the vector being operated on} } \description{ This function makes it possible to control the ordering of window functions in R that don't have a specific ordering parameter. When translated to SQL it will modify the order clause of the OVER function. } \details{ This function works by changing the \code{call} to instead call \code{\link[=with_order]{with_order()}} with the appropriate arguments. } \examples{ order_by(10:1, cumsum(1:10)) x <- 10:1 y <- 1:10 order_by(x, cumsum(y)) df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) scrambled <- df[sample(nrow(df)), ] wrong <- mutate(scrambled, running = cumsum(value)) arrange(wrong, year) right <- mutate(scrambled, running = order_by(year, cumsum(value))) arrange(right, year) } dplyr/man/common_by.Rd0000644000176200001440000000042214277513434014440 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join-common-by.R \name{common_by} \alias{common_by} \title{Extract out common by variables} \usage{ common_by(by = NULL, x, y) } \description{ Extract out common by variables } \keyword{internal} dplyr/man/all_vars.Rd0000644000176200001440000000240614406402754014261 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise.R \name{all_vars} \alias{all_vars} \alias{any_vars} \title{Apply predicate to all variables} \usage{ all_vars(expr) any_vars(expr) } \arguments{ \item{expr}{<\code{\link[rlang:args_data_masking]{data-masking}}> An expression that returns a logical vector, using \code{.} to refer to the "current" variable.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{all_vars()} and \code{any_vars()} were only needed for the scoped verbs, which have been superseded by the use of \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. These quoting functions signal to scoped filtering verbs (e.g. \code{\link[=filter_if]{filter_if()}} or \code{\link[=filter_all]{filter_all()}}) that a predicate expression should be applied to all relevant variables. The \code{all_vars()} variant takes the intersection of the predicate expressions with \code{&} while the \code{any_vars()} variant takes the union with \code{|}. } \seealso{ \code{\link[=vars]{vars()}} for other quoting functions that you can use with scoped verbs. } dplyr/man/copy_to.Rd0000644000176200001440000000262515106134104014122 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/copy-to.R \name{copy_to} \alias{copy_to} \title{Copy a local data frame to a remote src} \usage{ copy_to(dest, df, name = deparse(substitute(df)), overwrite = FALSE, ...) } \arguments{ \item{dest}{remote data source} \item{df}{local data frame} \item{name}{name for new remote table.} \item{overwrite}{If \code{TRUE}, will overwrite an existing table with name \code{name}. If \code{FALSE}, will throw an error if \code{name} already exists.} \item{...}{other parameters passed to methods.} } \value{ a \code{tbl} object in the remote source } \description{ This function uploads a local data frame into a remote data source, creating the table definition as needed. Wherever possible, the new object will be temporary, limited to the current connection to the source. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("copy_to")}. } \examples{ \dontrun{ iris2 <- dbplyr::src_memdb() |> copy_to(iris, overwrite = TRUE) iris2 } } \seealso{ \code{\link[=collect]{collect()}} for the opposite action; downloading remote data into a local dbl. } dplyr/man/reframe.Rd0000644000176200001440000001035715106134104014070 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reframe.R \name{reframe} \alias{reframe} \title{Transform each group to an arbitrary number of rows} \usage{ reframe(.data, ..., .by = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs of functions. The name will be the name of the variable in the result. The value can be a vector of any length. Unnamed data frame values add multiple columns from a single expression.} \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} } \value{ If \code{.data} is a tibble, a tibble. Otherwise, a data.frame. \itemize{ \item The rows originate from the underlying grouping keys. \item The columns are a combination of the grouping keys and the expressions that you provide. \item The output is always ungrouped. \item Data frame attributes are \strong{not} preserved, because \code{reframe()} fundamentally creates a new data frame. } } \description{ While \code{\link[=summarise]{summarise()}} requires that each argument returns a single value, and \code{\link[=mutate]{mutate()}} requires that each argument returns the same number of rows as the input, \code{reframe()} is a more general workhorse with no requirements on the number of rows returned per group. \code{reframe()} creates a new data frame by applying functions to columns of an existing data frame. It is most similar to \code{summarise()}, with two big differences: \itemize{ \item \code{reframe()} can return an arbitrary number of rows per group, while \code{summarise()} reduces each group down to a single row. \item \code{reframe()} always returns an ungrouped data frame, while \code{summarise()} might return a grouped or rowwise data frame, depending on the scenario. } We expect that you'll use \code{summarise()} much more often than \code{reframe()}, but \code{reframe()} can be particularly helpful when you need to apply a complex function that doesn't return a single summary value. } \section{Connection to tibble}{ \code{reframe()} is theoretically connected to two functions in tibble, \code{\link[tibble:enframe]{tibble::enframe()}} and \code{\link[tibble:enframe]{tibble::deframe()}}: \itemize{ \item \code{enframe()}: vector -> data frame \item \code{deframe()}: data frame -> vector \item \code{reframe()}: data frame -> data frame } } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("reframe")}. } \examples{ table <- c("a", "b", "d", "f") df <- tibble( g = c(1, 1, 1, 2, 2, 2, 2), x = c("e", "a", "b", "c", "f", "d", "a") ) # `reframe()` allows you to apply functions that return # an arbitrary number of rows df |> reframe(x = intersect(x, table)) # Functions are applied per group, and each group can return a # different number of rows. df |> reframe(x = intersect(x, table), .by = g) # The output is always ungrouped, even when using `group_by()` df |> group_by(g) |> reframe(x = intersect(x, table)) # You can add multiple columns at once using a single expression by returning # a data frame. quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) { tibble( val = quantile(x, probs, na.rm = TRUE), quant = probs ) } x <- c(10, 15, 18, 12) quantile_df(x) starwars |> reframe(quantile_df(height)) starwars |> reframe(quantile_df(height), .by = homeworld) starwars |> reframe( across(c(height, mass), quantile_df, .unpack = TRUE), .by = homeworld ) } \seealso{ Other single table verbs: \code{\link{arrange}()}, \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/recode-and-replace-values.Rd0000644000176200001440000002267715137161765017407 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/recode-values.R \name{recode-and-replace-values} \alias{recode-and-replace-values} \alias{recode_values} \alias{replace_values} \title{Recode and replace values} \usage{ recode_values( x, ..., from = NULL, to = NULL, default = NULL, unmatched = "default", ptype = NULL ) replace_values(x, ..., from = NULL, to = NULL) } \arguments{ \item{x}{A vector.} \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> A sequence of two-sided formulas. The left hand side (LHS) determines which values match this case. The right hand side (RHS) provides the replacement value. \itemize{ \item The LHS inputs can be any size, but will be \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x}. \item The RHS inputs will be \link[vctrs:theory-faq-recycling]{recycled} to the same size as \code{x}. For \code{recode_values()} they will be \link[vctrs:theory-faq-coercion]{cast} to their common type, and for \code{replace_values()} they will be \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x}. } \code{NULL} inputs are ignored. Mutually exclusive with \code{from} and \code{to}.} \item{from}{Values to look up in \code{x} and map to values in \code{to}. Typically this is a single vector of any size that is \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x}. For more advanced usage, this can be a list of vectors of any size each of which are \link[vctrs:theory-faq-coercion]{cast} to the type of \code{x}. Mutually exclusive with \code{...}.} \item{to}{Values that \code{from} map to. Typically this is a single vector that is \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{from}. For more advanced usage, this can be a list of vectors each of which are \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{x}. Mutually exclusive with \code{...}.} \item{default}{Default value to use when there is a value present in \code{x} that is unmatched by a value in \code{from}. By default, a missing value is used as the default value. If supplied, will be \link[vctrs:theory-faq-recycling]{recycled} to the size of \code{x}. Can only be set when \code{unmatched = "default"}.} \item{unmatched}{Handling of unmatched locations. One of: \itemize{ \item \code{"default"} to use \code{default} in unmatched locations. \item \code{"error"} to error when there are unmatched locations. }} \item{ptype}{An optional override for the output type, which is usually computed as the common type of \code{to} and \code{default}.} } \value{ A vector the same size as \code{x}. \itemize{ \item For \code{recode_values()}, the type of the output is computed as the common type of \code{to} and \code{default}, unless overridden by \code{ptype}. The names of the output come from the names of \code{to} and \code{default}. \item For \code{replace_values()}, the type of the output will have the same type as \code{x}. The names of the output will be the same as the names of \code{x}. } } \description{ \code{recode_values()} and \code{replace_values()} provide two ways to map old values to new values. They work by matching values against \code{x} and using the first match to determine the corresponding value in the output vector. You can also think of these functions as a way to use a lookup table to recode a vector. \itemize{ \item Use \code{recode_values()} when creating an entirely new vector. \item Use \code{replace_values()} when partially updating an existing vector. } If you are just replacing a few values within an existing vector, then \code{replace_values()} is always a better choice because it is type stable and better expresses intent. A major difference between the two functions is what happens when no cases match: \itemize{ \item \code{recode_values()} falls through to a \code{default}. \item \code{replace_values()} retains the original values from \code{x}. } These functions have two mutually exclusive ways to use them: \itemize{ \item A formula-based approach, i.e. \code{recode_values(x, from1 ~ to1, from2 ~ to2)}, similar to \code{\link[=case_when]{case_when()}}, which is useful when you have a small number of cases. \item A vector-based approach, i.e. \code{recode_values(x, from = from, to = to)}, which is useful when you have a pre-built lookup table (which may come from an external source, like a CSV file). } See \code{vignette("recoding-replacing")} for more examples. } \examples{ x <- c("NC", "NYC", "CA", NA, "NYC", "Unknown") # `recode_values()` is useful for fully recoding from one set of values to # another, creating an entirely new vector in the process. Note that any # unmatched values result in `NA`, or a `default` value. recode_values( x, "NC" ~ "North Carolina", "NYC" ~ "New York", "CA" ~ "California" ) recode_values( x, "NC" ~ "North Carolina", "NYC" ~ "New York", "CA" ~ "California", default = "" ) # `replace_values()` is useful for updating an existing vector, tweaking a # few values along the way replace_values(x, "NYC" ~ "NY") # `replace_values()` is particularly nice for replacing `NA`s with values... replace_values(x, NA ~ "Unknown (NA)") # ...or values with `NA`s replace_values(x, "Unknown" ~ NA) # Multiple values can be grouped within a single left-hand side to normalize # all problematic values at once replace_values(x, c(NA, "Unknown") ~ "") # --------------------------------------------------------------------------- # Lookup tables # `recode_values()` works with more than just character vectors. Imagine you # have this series of Likert Scale scores, which is a scoring system that is # ordered from 1-5. data <- tibble( score = c(1, 2, 3, 4, 5, 2, 3, 1, 4) ) # To recode each `score` to its corresponding Likert Score label, you may # initially be inclined to reach for `case_when()` data |> mutate( score = case_when( score == 1 ~ "Strongly disagree", score == 2 ~ "Disagree", score == 3 ~ "Neutral", score == 4 ~ "Agree", score == 5 ~ "Strongly agree" ) ) # While this works, it can be written more efficiently using # `recode_values()` data |> mutate( score = score |> recode_values( 1 ~ "Strongly disagree", 2 ~ "Disagree", 3 ~ "Neutral", 4 ~ "Agree", 5 ~ "Strongly agree" ) ) # `recode_values()` actually has two mutually exclusive APIs. The formula API # used above, which is like `case_when()`, and a lookup style API that uses # `from` and `to` arguments. The lookup API is even better suited for this # problem, because we can move the mapping outside of the `mutate()` call # into a standalone lookup table. You could even imagine reading this # `likert` lookup table in from a separate CSV file. likert <- tribble( ~from, ~to, 1, "Strongly disagree", 2, "Disagree", 3, "Neutral", 4, "Agree", 5, "Strongly agree" ) data |> mutate(score = recode_values(score, from = likert$from, to = likert$to)) # You can utilize the same lookup table across multiple columns by using # `across()` data_months <- tibble( score_january = c(1, 2, 3, 4, 5, 2, 3, 1, 4), score_february = c(4, 2, 1, 2, 1, 5, 2, 4, 4) ) data_months |> mutate(across( starts_with("score"), ~ recode_values(.x, from = likert$from, to = likert$to) )) # The `unmatched` argument allows you to assert that you believe that you've # recoded all of the cases and will error if you've missed one, adding an # extra layer of safety data_with_zero <- add_row(data, score = 0) try({ recode_values( data_with_zero$score, from = likert$from, to = likert$to, unmatched = "error" ) }) # Note that missing values are considered unmatched. If you expect missing # values, you'll need to handle them explicitly in your lookup table. data_with_missing <- add_row(data, score = NA) try({ recode_values( data_with_missing$score, from = likert$from, to = likert$to, unmatched = "error" ) }) likert <- add_row(likert, from = NA, to = NA) recode_values( data_with_missing$score, from = likert$from, to = likert$to, unmatched = "error" ) # ------------------------------------------------------------------------------ # Lists of vectors # In some cases, your mapping may collapse multiple groups together into a # single value. For example, here we'd like to standardize the school names. schools <- c( "UNC", "Chapel Hill", NA, "Duke", "Duke University", "UNC", "NC State", "ECU", "East Carolina" ) # This `tribble()` is more complex than it may appear, it actually # creates a list column! standardized <- tribble( ~from, ~to, c("UNC", "Chapel Hill"), "UNC", c("Duke", "Duke University"), "Duke", c("NC State"), "NC State", c("ECU", "East Carolina"), "ECU", NA, NA ) standardized standardized$from # `recode_values()` treats a list `from` value as a list of vectors, where # any match within one of the vectors is mapped to its corresponding `to` # value recode_values( schools, from = standardized$from, to = standardized$to, unmatched = "error" ) # This formula based approach is equivalent, but the lookup based approach is # nicer because the lookup table can be defined separately recode_values( schools, c("UNC", "Chapel Hill") ~ "UNC", c("Duke", "Duke University") ~ "Duke", c("NC State") ~ "NC State", c("ECU", "East Carolina") ~ "ECU", NA ~ NA, unmatched = "error" ) } \seealso{ \code{\link[=case_when]{case_when()}}, \code{\link[vctrs:vec-recode-and-replace]{vctrs::vec_recode_values()}} } dplyr/man/auto_copy.Rd0000644000176200001440000000125314366556340014465 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/copy-to.R \name{auto_copy} \alias{auto_copy} \title{Copy tables to same source, if necessary} \usage{ auto_copy(x, y, copy = FALSE, ...) } \arguments{ \item{x, y}{\code{y} will be copied to \code{x}, if necessary.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{...}{Other arguments passed on to methods.} } \description{ Copy tables to same source, if necessary } dplyr/man/top_n.Rd0000644000176200001440000000356415106134104013570 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/top-n.R \name{top_n} \alias{top_n} \alias{top_frac} \title{Select top (or bottom) n rows (by value)} \usage{ top_n(x, n, wt) top_frac(x, n, wt) } \arguments{ \item{x}{A data frame.} \item{n}{Number of rows to return for \code{top_n()}, fraction of rows to return for \code{top_frac()}. If \code{n} is positive, selects the top rows. If negative, selects the bottom rows. If \code{x} is grouped, this is the number (or fraction) of rows per group. Will include more rows if there are ties.} \item{wt}{(Optional). The variable to use for ordering. If not specified, defaults to the last variable in the tbl.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{top_n()} has been superseded in favour of \code{\link[=slice_min]{slice_min()}}/\code{\link[=slice_max]{slice_max()}}. While it will not be deprecated in the near future, retirement means that we will only perform critical bug fixes, so we recommend moving to the newer alternatives. \code{top_n()} was superseded because the name was fundamentally confusing as it returned what you might reasonably consider to be the \emph{bottom} rows. Additionally, the \code{wt} variable had a confusing name, and strange default (the last column in the data frame). Unfortunately we could not see an easy way to fix the existing \code{top_n()} function without breaking existing code, so we created a new alternative. } \examples{ df <- data.frame(x = c(6, 4, 1, 10, 3, 1, 1)) df |> top_n(2) # highest values df |> top_n(-2) # lowest values # now use df |> slice_max(x, n = 2) df |> slice_min(x, n = 2) # top_frac() -> prop argument of slice_min()/slice_max() df |> top_frac(.5) # -> df |> slice_max(x, prop = 0.5) } \keyword{internal} dplyr/man/row_number.Rd0000644000176200001440000000461615106134104014627 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rank.R \name{row_number} \alias{row_number} \alias{min_rank} \alias{dense_rank} \title{Integer ranking functions} \usage{ row_number(x) min_rank(x) dense_rank(x) } \arguments{ \item{x}{A vector to rank By default, the smallest values will get the smallest ranks. Use \code{\link[=desc]{desc()}} to reverse the direction so the largest values get the smallest ranks. Missing values will be given rank \code{NA}. Use \code{coalesce(x, Inf)} or \code{coalesce(x, -Inf)} if you want to treat them as the largest or smallest values respectively. To rank by multiple columns at once, supply a data frame.} } \value{ An integer vector. } \description{ Three ranking functions inspired by SQL2003. They differ primarily in how they handle ties: \itemize{ \item \code{row_number()} gives every input a unique rank, so that \code{c(10, 20, 20, 30)} would get ranks \code{c(1, 2, 3, 4)}. It's equivalent to \code{rank(ties.method = "first")}. \item \code{min_rank()} gives every tie the same (smallest) value so that \code{c(10, 20, 20, 30)} gets ranks \code{c(1, 2, 2, 4)}. It's the way that ranks are usually computed in sports and is equivalent to \code{rank(ties.method = "min")}. \item \code{dense_rank()} works like \code{min_rank()}, but doesn't leave any gaps, so that \code{c(10, 20, 20, 30)} gets ranks \code{c(1, 2, 2, 3)}. } } \examples{ x <- c(5, 1, 3, 2, 2, NA) row_number(x) min_rank(x) dense_rank(x) # Ranking functions can be used in `filter()` to select top/bottom rows df <- data.frame( grp = c(1, 1, 1, 2, 2, 2, 3, 3, 3), x = c(3, 2, 1, 1, 2, 2, 1, 1, 1), y = c(1, 3, 2, 3, 2, 2, 4, 1, 2), id = 1:9 ) # Always gives exactly 1 row per group df |> group_by(grp) |> filter(row_number(x) == 1) # May give more than 1 row if ties df |> group_by(grp) |> filter(min_rank(x) == 1) # Rank by multiple columns (to break ties) by selecting them with `pick()` df |> group_by(grp) |> filter(min_rank(pick(x, y)) == 1) # See slice_min() and slice_max() for another way to tackle the same problem # You can use row_number() without an argument to refer to the "current" # row number. df |> group_by(grp) |> filter(row_number() == 1) # It's easiest to see what this does with mutate(): df |> group_by(grp) |> mutate(grp_id = row_number()) } \seealso{ Other ranking functions: \code{\link{ntile}()}, \code{\link{percent_rank}()} } \concept{ranking functions} dplyr/man/group_by_all.Rd0000644000176200001440000000610315106134104015117 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-group-by.R \name{group_by_all} \alias{group_by_all} \alias{group_by_at} \alias{group_by_if} \title{Group by a selection of variables} \usage{ group_by_all( .tbl, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) group_by_at( .tbl, .vars, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) group_by_if( .tbl, .predicate, .funs = list(), ..., .add = FALSE, .drop = group_by_drop_default(.tbl) ) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.add}{See \code{\link[=group_by]{group_by()}}} \item{.drop}{Drop groups formed by factor levels that don't appear in the data? The default is \code{TRUE} except when \code{.data} has been previously grouped with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. These \link{scoped} variants of \code{\link[=group_by]{group_by()}} group a data frame by a selection of variables. Like \code{\link[=group_by]{group_by()}}, they have optional \link{mutate} semantics. } \section{Grouping variables}{ Existing grouping variables are maintained, even if not included in the selection. } \examples{ # Group a data frame by all variables: group_by_all(mtcars) # -> mtcars |> group_by(pick(everything())) # Group by variables selected with a predicate: group_by_if(iris, is.factor) # -> iris |> group_by(pick(where(is.factor))) # Group by variables selected by name: group_by_at(mtcars, vars(vs, am)) # -> mtcars |> group_by(pick(vs, am)) # Like group_by(), the scoped variants have optional mutate # semantics. This provide a shortcut for group_by() + mutate(): d <- tibble(x=c(1,1,2,2), y=c(1,2,1,2)) group_by_all(d, as.factor) # -> d |> group_by(across(everything(), as.factor)) group_by_if(iris, is.factor, as.character) # -> iris |> group_by(across(where(is.factor), as.character)) } \keyword{internal} dplyr/man/arrange.Rd0000644000176200001440000000750115106134104014063 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/arrange.R \name{arrange} \alias{arrange} \alias{arrange.data.frame} \title{Order rows using column values} \usage{ arrange(.data, ..., .by_group = FALSE) \method{arrange}{data.frame}(.data, ..., .by_group = FALSE, .locale = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variables, or functions of variables. Use \code{\link[=desc]{desc()}} to sort a variable in descending order.} \item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to grouped data frames only.} \item{.locale}{The locale to sort character vectors in. \itemize{ \item If \code{NULL}, the default, uses the \code{"C"} locale unless the deprecated \code{dplyr.legacy_locale} global option escape hatch is active. See the \link{dplyr-locale} help page for more details. \item If a single string from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} is supplied, then this will be used as the locale to sort with. For example, \code{"en"} will sort with the American English locale. This requires the stringi package. \item If \code{"C"} is supplied, then character vectors will always be sorted in the C locale. This does not require stringi and is often much faster than supplying a locale identifier. } The C locale is not the same as English locales, such as \code{"en"}, particularly when it comes to data containing a mix of upper and lower case letters. This is explained in more detail on the \link[=dplyr-locale]{locale} help page under the \verb{Default locale} section.} } \value{ An object of the same type as \code{.data}. The output has the following properties: \itemize{ \item All rows appear in the output, but (usually) in a different place. \item Columns are not modified. \item Groups are not modified. \item Data frame attributes are preserved. } } \description{ \code{arrange()} orders the rows of a data frame by the values of selected columns. Unlike other dplyr verbs, \code{arrange()} largely ignores grouping; you need to explicitly mention grouping variables (or use \code{.by_group = TRUE}) in order to group by them, and functions of variables are evaluated once per data frame, not once per group. } \details{ \subsection{Missing values}{ Unlike base sorting with \code{sort()}, \code{NA} are: \itemize{ \item always sorted to the end for local data, even when wrapped with \code{desc()}. \item treated differently for remote data, depending on the backend. } } } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("arrange")}. } \examples{ arrange(mtcars, cyl, disp) arrange(mtcars, desc(disp)) # grouped arrange ignores groups by_cyl <- mtcars |> group_by(cyl) by_cyl |> arrange(desc(wt)) # Unless you specifically ask: by_cyl |> arrange(desc(wt), .by_group = TRUE) # use embracing when wrapping in a function; # see ?rlang::args_data_masking for more details tidy_eval_arrange <- function(.data, var) { .data |> arrange({{ var }}) } tidy_eval_arrange(mtcars, mpg) # Use `across()` or `pick()` to select columns with tidy-select iris |> arrange(pick(starts_with("Sepal"))) iris |> arrange(across(starts_with("Sepal"), desc)) } \seealso{ Other single table verbs: \code{\link{filter}()}, \code{\link{mutate}()}, \code{\link{reframe}()}, \code{\link{rename}()}, \code{\link{select}()}, \code{\link{slice}()}, \code{\link{summarise}()} } \concept{single table verbs} dplyr/man/last_dplyr_warnings.Rd0000644000176200001440000000122514366556340016547 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/conditions.R \name{last_dplyr_warnings} \alias{last_dplyr_warnings} \title{Show warnings from the last command} \usage{ last_dplyr_warnings(n = 5) } \arguments{ \item{n}{Passed to \code{\link[=head]{head()}} so that only the first \code{n} warnings are displayed.} } \description{ Warnings that occur inside a dplyr verb like \code{mutate()} are caught and stashed away instead of being emitted to the console. This prevents rowwise and grouped data frames from flooding the console with warnings. To see the original warnings, use \code{last_dplyr_warnings()}. } \keyword{internal} dplyr/man/group_by_drop_default.Rd0000644000176200001440000000122415106134104017016 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/group-by.R \name{group_by_drop_default} \alias{group_by_drop_default} \title{Default value for .drop argument of group_by} \usage{ group_by_drop_default(.tbl) } \arguments{ \item{.tbl}{A data frame} } \value{ \code{TRUE} unless \code{.tbl} is a grouped data frame that was previously obtained by \code{group_by(.drop = FALSE)} } \description{ Default value for .drop argument of group_by } \examples{ group_by_drop_default(iris) iris |> group_by(Species) |> group_by_drop_default() iris |> group_by(Species, .drop = FALSE) |> group_by_drop_default() } \keyword{internal} dplyr/man/context.Rd0000644000176200001440000000350015106134104014123 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/context.R \name{context} \alias{context} \alias{n} \alias{cur_group} \alias{cur_group_id} \alias{cur_group_rows} \alias{cur_column} \title{Information about the "current" group or variable} \usage{ n() cur_group() cur_group_id() cur_group_rows() cur_column() } \description{ These functions return information about the "current" group or "current" variable, so only work inside specific contexts like \code{\link[=summarise]{summarise()}} and \code{\link[=mutate]{mutate()}}. \itemize{ \item \code{n()} gives the current group size. \item \code{cur_group()} gives the group keys, a tibble with one row and one column for each grouping variable. \item \code{cur_group_id()} gives a unique numeric identifier for the current group. \item \code{cur_group_rows()} gives the row indices for the current group. \item \code{cur_column()} gives the name of the current column (in \code{\link[=across]{across()}} only). } See \code{\link[=group_data]{group_data()}} for equivalent functions that return values for all groups. See \code{\link[=pick]{pick()}} for a way to select a subset of columns using tidyselect syntax while inside \code{summarise()} or \code{mutate()}. } \section{data.table}{ If you're familiar with data.table: \itemize{ \item \code{cur_group_id()} <-> \code{.GRP} \item \code{cur_group()} <-> \code{.BY} \item \code{cur_group_rows()} <-> \code{.I} } See \code{\link[=pick]{pick()}} for an equivalent to \code{.SD}. } \examples{ df <- tibble( g = sample(rep(letters[1:3], 1:3)), x = runif(6), y = runif(6) ) gf <- df |> group_by(g) gf |> summarise(n = n()) gf |> mutate(id = cur_group_id()) gf |> reframe(row = cur_group_rows()) gf |> summarise(data = list(cur_group())) gf |> mutate(across(everything(), ~ paste(cur_column(), round(.x, 2)))) } dplyr/man/dplyr_tidy_select.Rd0000644000176200001440000000574114527137164016211 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/doc-params.R \name{dplyr_tidy_select} \alias{dplyr_tidy_select} \title{Argument type: tidy-select} \description{ This page describes the \verb{} argument modifier which indicates the argument supports \strong{tidy selections}. Tidy selection provides a concise dialect of R for selecting variables based on their names or properties. Tidy selection is a variant of tidy evaluation. This means that inside functions, tidy-select arguments require special attention, as described in the \emph{Indirection} section below. If you've never heard of tidy evaluation before, start with \code{vignette("programming")}. } \section{Overview of selection features}{ Tidyverse selections implement a dialect of R where operators make it easy to select variables: \itemize{ \item \code{:} for selecting a range of consecutive variables. \item \code{!} for taking the complement of a set of variables. \item \code{&} and \code{|} for selecting the intersection or the union of two sets of variables. \item \code{c()} for combining selections. } In addition, you can use \strong{selection helpers}. Some helpers select specific columns: \itemize{ \item \code{\link[tidyselect:everything]{everything()}}: Matches all variables. \item \code{\link[tidyselect:everything]{last_col()}}: Select last variable, possibly with an offset. \item \code{\link[=group_cols]{group_cols()}}: Select all grouping columns. } Other helpers select variables by matching patterns in their names: \itemize{ \item \code{\link[tidyselect:starts_with]{starts_with()}}: Starts with a prefix. \item \code{\link[tidyselect:starts_with]{ends_with()}}: Ends with a suffix. \item \code{\link[tidyselect:starts_with]{contains()}}: Contains a literal string. \item \code{\link[tidyselect:starts_with]{matches()}}: Matches a regular expression. \item \code{\link[tidyselect:starts_with]{num_range()}}: Matches a numerical range like x01, x02, x03. } Or from variables stored in a character vector: \itemize{ \item \code{\link[tidyselect:all_of]{all_of()}}: Matches variable names in a character vector. All names must be present, otherwise an out-of-bounds error is thrown. \item \code{\link[tidyselect:all_of]{any_of()}}: Same as \code{all_of()}, except that no error is thrown for names that don't exist. } Or using a predicate function: \itemize{ \item \code{\link[tidyselect:where]{where()}}: Applies a function to all variables and selects those for which the function returns \code{TRUE}. } } \section{Indirection}{ There are two main cases: \itemize{ \item If you have a character vector of column names, use \code{all_of()} or \code{any_of()}, depending on whether or not you want unknown variable names to cause an error, e.g. \code{select(df, all_of(vars))}, \code{select(df, !any_of(vars))}. \item If you want the user to be able to supply a tidyselect specification in a function argument, embrace the function argument, e.g. \code{select(df, {{ vars }})}. } } \keyword{internal} dplyr/man/backend_dbplyr.Rd0000644000176200001440000000742113663216626015430 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbplyr.R \name{backend_dbplyr} \alias{backend_dbplyr} \alias{db_desc} \alias{sql_translate_env} \alias{db_list_tables} \alias{db_has_table} \alias{db_data_type} \alias{db_save_query} \alias{db_begin} \alias{db_commit} \alias{db_rollback} \alias{db_write_table} \alias{db_create_table} \alias{db_insert_into} \alias{db_create_indexes} \alias{db_create_index} \alias{db_drop_table} \alias{db_analyze} \alias{db_explain} \alias{db_query_fields} \alias{db_query_rows} \alias{sql_select} \alias{sql_subquery} \alias{sql_join} \alias{sql_semi_join} \alias{sql_set_op} \alias{sql_escape_string} \alias{sql_escape_ident} \title{Database and SQL generics.} \usage{ db_desc(x) sql_translate_env(con) db_list_tables(con) db_has_table(con, table) db_data_type(con, fields) db_save_query(con, sql, name, temporary = TRUE, ...) db_begin(con, ...) db_commit(con, ...) db_rollback(con, ...) db_write_table(con, table, types, values, temporary = FALSE, ...) db_create_table(con, table, types, temporary = FALSE, ...) db_insert_into(con, table, values, ...) db_create_indexes(con, table, indexes = NULL, unique = FALSE, ...) db_create_index(con, table, columns, name = NULL, unique = FALSE, ...) db_drop_table(con, table, force = FALSE, ...) db_analyze(con, table, ...) db_explain(con, sql, ...) db_query_fields(con, sql, ...) db_query_rows(con, sql, ...) sql_select( con, select, from, where = NULL, group_by = NULL, having = NULL, order_by = NULL, limit = NULL, distinct = FALSE, ... ) sql_subquery(con, from, name = random_table_name(), ...) sql_join(con, x, y, vars, type = "inner", by = NULL, ...) sql_semi_join(con, x, y, anti = FALSE, by = NULL, ...) sql_set_op(con, x, y, method) sql_escape_string(con, x) sql_escape_ident(con, x) } \arguments{ \item{con}{A database connection.} \item{table}{A string, the table name.} \item{fields}{A list of fields, as in a data frame.} } \value{ Usually a logical value indicating success. Most failures should generate an error. However, \code{db_has_table()} should return \code{NA} if temporary tables cannot be listed with \code{\link[DBI:dbListTables]{DBI::dbListTables()}} (due to backend API limitations for example). As a result, you methods will rely on the backend to throw an error if a table exists when it shouldn't. } \description{ The \code{sql_} generics are used to build the different types of SQL queries. The default implementations in dbplyr generates ANSI 92 compliant SQL. The \code{db_} generics execute actions on the database. The default implementations in dbplyr typically just call the standard DBI S4 method. } \details{ A few backend methods do not call the standard DBI S4 methods including \itemize{ \item \code{db_data_type()}: Calls \code{\link[DBI:dbDataType]{DBI::dbDataType()}} for every field (e.g. data frame column) and returns a vector of corresponding SQL data types \item \code{db_save_query()}: Builds and executes a \verb{CREATE [TEMPORARY] TABLE ...} SQL command. \item \code{db_create_index()}: Builds and executes a \verb{CREATE INDEX ON
} SQL command. \item \code{db_drop_table()}: Builds and executes a \verb{DROP TABLE [IF EXISTS]
} SQL command. \item \code{db_analyze()}: Builds and executes an \verb{ANALYZE
} SQL command. } Currently, \code{\link[=copy_to]{copy_to()}} is the only user of \code{db_begin()}, \code{db_commit()}, \code{db_rollback()}, \code{db_write_table()}, \code{db_create_indexes()}, \code{db_drop_table()} and \code{db_analyze()}. If you find yourself overriding many of these functions it may suggest that you should just override \code{copy_to()} instead. \code{db_create_table()} and \code{db_insert_into()} have been deprecated in favour of \code{db_write_table()}. } \keyword{internal} dplyr/man/filter-joins.Rd0000644000176200001440000001114115106134104015044 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/join.R \name{filter-joins} \alias{filter-joins} \alias{semi_join} \alias{semi_join.data.frame} \alias{anti_join} \alias{anti_join.data.frame} \title{Filtering joins} \usage{ semi_join(x, y, by = NULL, copy = FALSE, ...) \method{semi_join}{data.frame}(x, y, by = NULL, copy = FALSE, ..., na_matches = c("na", "never")) anti_join(x, y, by = NULL, copy = FALSE, ...) \method{anti_join}{data.frame}(x, y, by = NULL, copy = FALSE, ..., na_matches = c("na", "never")) } \arguments{ \item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{by}{A join specification created with \code{\link[=join_by]{join_by()}}, or a character vector of variables to join by. If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all variables in common across \code{x} and \code{y}. A message lists the variables so that you can check they're correct; suppress the message by supplying \code{by} explicitly. To join on different variables between \code{x} and \code{y}, use a \code{\link[=join_by]{join_by()}} specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. To join by multiple variables, use a \code{\link[=join_by]{join_by()}} specification with multiple expressions. For example, \code{join_by(a == b, c == d)} will match \code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between \code{x} and \code{y}, you can shorten this by listing only the variable names, like \code{join_by(a, c)}. \code{\link[=join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap joins. See the documentation at \link[=join_by]{?join_by} for details on these types of joins. For simple equality joins, you can alternatively specify a character vector of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. To perform a cross-join, generating all combinations of \code{x} and \code{y}, see \code{\link[=cross_join]{cross_join()}}.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{...}{Other parameters passed onto methods.} \item{na_matches}{Should two \code{NA} or two \code{NaN} values match? \itemize{ \item \code{"na"}, the default, treats two \code{NA} or two \code{NaN} values as equal, like \code{\%in\%}, \code{\link[=match]{match()}}, and \code{\link[=merge]{merge()}}. \item \code{"never"} treats two \code{NA} or two \code{NaN} values as different, and will never match them together or to any other values. This is similar to joins for database sources and to \code{base::merge(incomparables = NA)}. }} } \value{ An object of the same type as \code{x}. The output has the following properties: \itemize{ \item Rows are a subset of the input, but appear in the same order. \item Columns are not modified. \item Data frame attributes are preserved. \item Groups are taken from \code{x}. The number of groups may be reduced. } } \description{ Filtering joins filter rows from \code{x} based on the presence or absence of matches in \code{y}: \itemize{ \item \code{semi_join()} returns all rows from \code{x} with a match in \code{y}. \item \code{anti_join()} returns all rows from \code{x} with\strong{out} a match in \code{y}. } } \section{Methods}{ These function are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. Methods available in currently loaded packages: \itemize{ \item \code{semi_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("semi_join")}. \item \code{anti_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("anti_join")}. } } \examples{ # "Filtering" joins keep cases from the LHS band_members |> semi_join(band_instruments) band_members |> anti_join(band_instruments) # To suppress the message about joining variables, supply `by` band_members |> semi_join(band_instruments, by = join_by(name)) # This is good practice in production code } \seealso{ Other joins: \code{\link{cross_join}()}, \code{\link{mutate-joins}}, \code{\link{nest_join}()} } \concept{joins} dplyr/man/progress_estimated.Rd0000644000176200001440000000312614266276767016402 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/progress.R \name{progress_estimated} \alias{progress_estimated} \title{Progress bar with estimated time.} \usage{ progress_estimated(n, min_time = 0) } \arguments{ \item{n}{Total number of items} \item{min_time}{Progress bar will wait until at least \code{min_time} seconds have elapsed before displaying any results.} } \value{ A ref class with methods \code{tick()}, \code{print()}, \code{pause()}, and \code{stop()}. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This progress bar has been deprecated since providing progress bars is not the responsibility of dplyr. Instead, you might try the more powerful \href{https://github.com/r-lib/progress}{progress} package. This reference class represents a text progress bar displayed estimated time remaining. When finished, it displays the total duration. The automatic progress bar can be disabled by setting option \code{dplyr.show_progress} to \code{FALSE}. } \examples{ p <- progress_estimated(3) p$tick() p$tick() p$tick() p <- progress_estimated(3) for (i in 1:3) p$pause(0.1)$tick()$print() p <- progress_estimated(3) p$tick()$print()$ pause(1)$stop() # If min_time is set, progress bar not shown until that many # seconds have elapsed p <- progress_estimated(3, min_time = 3) for (i in 1:3) p$pause(0.1)$tick()$print() \dontrun{ p <- progress_estimated(10, min_time = 3) for (i in 1:10) p$pause(0.5)$tick()$print() } } \keyword{internal} dplyr/man/nest_by.Rd0000644000176200001440000000742215106134104014111 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nest-by.R \name{nest_by} \alias{nest_by} \title{Nest by one or more variables} \usage{ nest_by(.data, ..., .key = "data", .keep = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more details.} \item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> In \code{group_by()}, variables or computations to group by. Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate \code{mutate()} step before the \code{group_by()}. Computations are not allowed in \code{nest_by()}. In \code{ungroup()}, variables to remove from the grouping.} \item{.key}{Name of the list column} \item{.keep}{Should the grouping columns be kept in the list column.} } \value{ A \link{rowwise} data frame. The output has the following properties: \itemize{ \item The rows come from the underlying \code{\link[=group_keys]{group_keys()}}. \item The columns are the grouping keys plus one list-column of data frames. \item Data frame attributes are \strong{not} preserved, because \code{nest_by()} fundamentally creates a new data frame. } A tbl with one row per unique combination of the grouping variables. The first columns are the grouping variables, followed by a list column of tibbles with matching rows of the remaining columns. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \code{nest_by()} is closely related to \code{\link[=group_by]{group_by()}}. However, instead of storing the group structure in the metadata, it is made explicit in the data, giving each group key a single row along with a list-column of data frames that contain all the other data. \code{nest_by()} returns a \link{rowwise} data frame, which makes operations on the grouped data particularly elegant. See \code{vignette("rowwise")} for more details. } \details{ Note that \code{df |> nest_by(x, y)} is roughly equivalent to \if{html}{\out{
}}\preformatted{df |> group_by(x, y) |> summarise(data = list(pick(everything()))) |> rowwise() }\if{html}{\out{
}} If you want to unnest a nested data frame, you can either use \code{tidyr::unnest()} or take advantage of \code{reframe()}s multi-row behaviour: \if{html}{\out{
}}\preformatted{nested |> reframe(data) }\if{html}{\out{
}} } \section{Lifecycle}{ \code{nest_by()} is not stable because \code{\link[tidyr:nest]{tidyr::nest(.by =)}} provides very similar behavior. It may be deprecated in the future. } \section{Methods}{ This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. The following methods are currently available in loaded packages: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("nest_by")}. } \examples{ # After nesting, you get one row per group iris |> nest_by(Species) starwars |> nest_by(species) # The output is grouped by row, which makes modelling particularly easy models <- mtcars |> nest_by(cyl) |> mutate(model = list(lm(mpg ~ wt, data = data))) models models |> summarise(rsq = summary(model)$r.squared) \dontshow{if (requireNamespace("broom", quietly = TRUE)) withAutoprint(\{ # examplesIf} # This is particularly elegant with the broom functions models |> summarise(broom::glance(model)) models |> reframe(broom::tidy(model)) \dontshow{\}) # examplesIf} # Note that you can also `reframe()` to unnest the data models |> reframe(data) } \keyword{internal} dplyr/man/new_grouped_df.Rd0000644000176200001440000000315214366556340015452 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/grouped-df.R, R/rowwise.R \name{new_grouped_df} \alias{new_grouped_df} \alias{validate_grouped_df} \alias{new_rowwise_df} \alias{validate_rowwise_df} \title{Low-level construction and validation for the grouped_df and rowwise_df classes} \usage{ new_grouped_df(x, groups, ..., class = character()) validate_grouped_df(x, check_bounds = FALSE) new_rowwise_df(data, group_data = NULL, ..., class = character()) validate_rowwise_df(x) } \arguments{ \item{x}{A data frame} \item{groups}{The grouped structure, \code{groups} should be a data frame. Its last column should be called \code{.rows} and be a list of 1 based integer vectors that all are between 1 and the number of rows of \code{.data}.} \item{...}{additional attributes} \item{class}{additional class, will be prepended to canonical classes.} \item{check_bounds}{whether to check all indices for out of bounds problems in \code{grouped_df} objects} } \description{ \code{new_grouped_df()} and \code{new_rowwise_df()} are constructors designed to be high-performance so only check types, not values. This means it is the caller's responsibility to create valid values, and hence this is for expert use only. \code{validate_grouped_df()} and \code{validate_rowwise_df()} validate the attributes of a \code{grouped_df} or a \code{rowwise_df}. } \examples{ # 5 bootstrap samples tbl <- new_grouped_df( tibble(x = rnorm(10)), groups = tibble(".rows" := replicate(5, sample(1:10, replace = TRUE), simplify = FALSE)) ) # mean of each bootstrap sample summarise(tbl, x = mean(x)) } \keyword{internal} dplyr/man/ident.Rd0000644000176200001440000000162215106134104013545 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compat-dbplyr.R \name{ident} \alias{ident} \title{Flag a character vector as SQL identifiers} \usage{ ident(...) } \arguments{ \item{...}{A character vector, or name-value pairs.} } \description{ \code{ident()} takes strings and turns them as database identifiers (e.g. table or column names) quoting them using the identifer rules for your database. \code{ident_q()} does the same, but assumes the names have already been quoted, preventing them from being quoted again. These are generally for internal use only; if you need to supply an table name that is qualified with schema or catalog, or has already been quoted for some other reason, use \code{I()}. } \examples{ # Identifiers are escaped with " \dontshow{if (requireNamespace("dbplyr", quietly = TRUE)) withAutoprint(\{ # examplesIf} ident("x") \dontshow{\}) # examplesIf} } dplyr/man/lead-lag.Rd0000644000176200001440000000335714366556340014140 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lead-lag.R \name{lead-lag} \alias{lead-lag} \alias{lag} \alias{lead} \title{Compute lagged or leading values} \usage{ lag(x, n = 1L, default = NULL, order_by = NULL, ...) lead(x, n = 1L, default = NULL, order_by = NULL, ...) } \arguments{ \item{x}{A vector} \item{n}{Positive integer of length 1, giving the number of positions to lag or lead by} \item{default}{The value used to pad \code{x} back to its original size after the lag or lead has been applied. The default, \code{NULL}, pads with a missing value. If supplied, this must be a vector with size 1, which will be cast to the type of \code{x}.} \item{order_by}{An optional secondary vector that defines the ordering to use when applying the lag or lead to \code{x}. If supplied, this must be the same size as \code{x}.} \item{...}{Not used.} } \value{ A vector with the same type and size as \code{x}. } \description{ Find the "previous" (\code{lag()}) or "next" (\code{lead()}) values in a vector. Useful for comparing values behind of or ahead of the current values. } \examples{ lag(1:5) lead(1:5) x <- 1:5 tibble(behind = lag(x), x, ahead = lead(x)) # If you want to look more rows behind or ahead, use `n` lag(1:5, n = 1) lag(1:5, n = 2) lead(1:5, n = 1) lead(1:5, n = 2) # If you want to define a value to pad with, use `default` lag(1:5) lag(1:5, default = 0) lead(1:5) lead(1:5, default = 6) # If the data are not already ordered, use `order_by` scrambled <- slice_sample( tibble(year = 2000:2005, value = (0:5) ^ 2), prop = 1 ) wrong <- mutate(scrambled, previous_year_value = lag(value)) arrange(wrong, year) right <- mutate(scrambled, previous_year_value = lag(value, order_by = year)) arrange(right, year) } dplyr/man/deprec-context.Rd0000644000176200001440000000142514366556340015410 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprec-context.R \name{deprec-context} \alias{deprec-context} \alias{cur_data} \alias{cur_data_all} \title{Information about the "current" group or variable} \usage{ cur_data() cur_data_all() } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} These functions were deprecated in dplyr 1.1.0. \itemize{ \item \code{cur_data()} is deprecated in favor of \code{\link[=pick]{pick()}}. \item \code{cur_data_all()} is deprecated but does not have a direct replacement as selecting the grouping variables is not well-defined and is unlikely to ever be useful. } } \keyword{internal} dplyr/man/all_equal.Rd0000644000176200001440000000316314366556340014424 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/all-equal.R \name{all_equal} \alias{all_equal} \title{Flexible equality comparison for data frames} \usage{ all_equal( target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, ... ) } \arguments{ \item{target, current}{Two data frames to compare.} \item{ignore_col_order}{Should order of columns be ignored?} \item{ignore_row_order}{Should order of rows be ignored?} \item{convert}{Should similar classes be converted? Currently this will convert factor to character and integer to double.} \item{...}{Ignored. Needed for compatibility with \code{all.equal()}.} } \value{ \code{TRUE} if equal, otherwise a character vector describing the reasons why they're not equal. Use \code{\link[=isTRUE]{isTRUE()}} if using the result in an \code{if} expression. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} \code{all_equal()} allows you to compare data frames, optionally ignoring row and column names. It is deprecated as of dplyr 1.1.0, because it makes it too easy to ignore important differences. } \examples{ scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))] # `all_equal()` ignored row and column ordering by default, # but we now feel that that makes it too easy to make mistakes mtcars2 <- scramble(mtcars) all_equal(mtcars, mtcars2) # Instead, be explicit about the row and column ordering all.equal( mtcars, mtcars2[rownames(mtcars), names(mtcars)] ) } \keyword{internal} dplyr/man/summarise_all.Rd0000644000176200001440000001330515106134104015300 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/colwise-mutate.R \name{summarise_all} \alias{summarise_all} \alias{summarise_if} \alias{summarise_at} \alias{summarize_all} \alias{summarize_if} \alias{summarize_at} \title{Summarise multiple columns} \usage{ summarise_all(.tbl, .funs, ...) summarise_if(.tbl, .predicate, .funs, ...) summarise_at(.tbl, .vars, .funs, ..., .cols = NULL) summarize_all(.tbl, .funs, ...) summarize_if(.tbl, .predicate, .funs, ...) summarize_at(.tbl, .vars, .funs, ..., .cols = NULL) } \arguments{ \item{.tbl}{A \code{tbl} object.} \item{.funs}{A function \code{fun}, a quosure style lambda \code{~ fun(.)} or a list of either form.} \item{...}{Additional arguments for the function calls in \code{.funs}. These are evaluated only once, with \link[rlang:dyn-dots]{tidy dots} support.} \item{.predicate}{A predicate function to be applied to the columns or a logical vector. The variables for which \code{.predicate} is or returns \code{TRUE} are selected. This argument is passed to \code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda functions and strings representing function names.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} \item{.cols}{This argument has been renamed to \code{.vars} to fit dplyr's terminology and is deprecated.} } \value{ A data frame. By default, the newly created columns have the shortest names needed to uniquely identify the output. To force inclusion of a name, even when not needed, name the input (see examples for details). } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} Scoped verbs (\verb{_if}, \verb{_at}, \verb{_all}) have been superseded by the use of \code{\link[=pick]{pick()}} or \code{\link[=across]{across()}} in an existing verb. See \code{vignette("colwise")} for details. The \link{scoped} variants of \code{\link[=summarise]{summarise()}} make it easy to apply the same transformation to multiple variables. There are three variants. \itemize{ \item \code{summarise_all()} affects every variable \item \code{summarise_at()} affects variables selected with a character vector or vars() \item \code{summarise_if()} affects variables selected with a predicate function } } \section{Grouping variables}{ If applied on a grouped tibble, these operations are \emph{not} applied to the grouping variables. The behaviour depends on whether the selection is \strong{implicit} (\code{all} and \code{if} selections) or \strong{explicit} (\code{at} selections). \itemize{ \item Grouping variables covered by explicit selections in \code{summarise_at()} are always an error. Add \code{-group_cols()} to the \code{\link[=vars]{vars()}} selection to avoid this: \if{html}{\out{
}}\preformatted{data |> summarise_at(vars(-group_cols(), ...), myoperation) }\if{html}{\out{
}} Or remove \code{group_vars()} from the character vector of column names: \if{html}{\out{
}}\preformatted{nms <- setdiff(nms, group_vars(data)) data |> summarise_at(nms, myoperation) }\if{html}{\out{
}} \item Grouping variables covered by implicit selections are silently ignored by \code{summarise_all()} and \code{summarise_if()}. } } \section{Naming}{ The names of the new columns are derived from the names of the input variables and the names of the functions. \itemize{ \item if there is only one unnamed function (i.e. if \code{.funs} is an unnamed list of length one), the names of the input variables are used to name the new columns; \item for \verb{_at} functions, if there is only one unnamed variable (i.e., if \code{.vars} is of the form \code{vars(a_single_column)}) and \code{.funs} has length greater than one, the names of the functions are used to name the new columns; \item otherwise, the new names are created by concatenating the names of the input variables and the names of the functions, separated with an underscore \code{"_"}. } The \code{.funs} argument can be a named or unnamed list. If a function is unnamed and the name cannot be derived automatically, a name of the form "fn#" is used. Similarly, \code{\link[=vars]{vars()}} accepts named and unnamed arguments. If a variable in \code{.vars} is named, a new column by that name will be created. Name collisions in the new columns are disambiguated using a unique suffix. } \examples{ # The _at() variants directly support strings: starwars |> summarise_at(c("height", "mass"), mean, na.rm = TRUE) # -> starwars |> summarise(across(c("height", "mass"), ~ mean(.x, na.rm = TRUE))) # You can also supply selection helpers to _at() functions but you have # to quote them with vars(): starwars |> summarise_at(vars(height:mass), mean, na.rm = TRUE) # -> starwars |> summarise(across(height:mass, ~ mean(.x, na.rm = TRUE))) # The _if() variants apply a predicate function (a function that # returns TRUE or FALSE) to determine the relevant subset of # columns. Here we apply mean() to the numeric columns: starwars |> summarise_if(is.numeric, mean, na.rm = TRUE) starwars |> summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) by_species <- iris |> group_by(Species) # If you want to apply multiple transformations, pass a list of # functions. When there are multiple functions, they create new # variables instead of modifying the variables in place: by_species |> summarise_all(list(min, max)) # -> by_species |> summarise(across(everything(), list(min = min, max = max))) } \seealso{ \link[=scoped]{The other scoped verbs}, \code{\link[=vars]{vars()}} } \keyword{internal} dplyr/man/nth.Rd0000644000176200001440000000537715106134104013246 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nth-value.R \name{nth} \alias{nth} \alias{first} \alias{last} \title{Extract the first, last, or nth value from a vector} \usage{ nth(x, n, order_by = NULL, default = NULL, na_rm = FALSE) first(x, order_by = NULL, default = NULL, na_rm = FALSE) last(x, order_by = NULL, default = NULL, na_rm = FALSE) } \arguments{ \item{x}{A vector} \item{n}{For \code{nth()}, a single integer specifying the position. Negative integers index from the end (i.e. \code{-1L} will return the last value in the vector).} \item{order_by}{An optional vector the same size as \code{x} used to determine the order.} \item{default}{A default value to use if the position does not exist in \code{x}. If \code{NULL}, the default, a missing value is used. If supplied, this must be a single value, which will be cast to the type of \code{x}. When \code{x} is a list , \code{default} is allowed to be any value. There are no type or size restrictions in this case.} \item{na_rm}{Should missing values in \code{x} be removed before extracting the value?} } \value{ If \code{x} is a list, a single element from that list. Otherwise, a vector the same type as \code{x} with size 1. } \description{ These are useful helpers for extracting a single value from a vector. They are guaranteed to return a meaningful value, even when the input is shorter than expected. You can also provide an optional secondary vector that defines the ordering. } \details{ For most vector types, \code{first(x)}, \code{last(x)}, and \code{nth(x, n)} work like \code{x[[1]]}, \verb{x[[length(x)]}, and \code{x[[n]]}, respectively. The primary exception is data frames, where they instead retrieve rows, i.e. \code{x[1, ]}, \code{x[nrow(x), ]}, and \code{x[n, ]}. This is consistent with the tidyverse/vctrs principle which treats data frames as a vector of rows, rather than a vector of columns. } \examples{ x <- 1:10 y <- 10:1 first(x) last(y) nth(x, 1) nth(x, 5) nth(x, -2) # `first()` and `last()` are often useful in `summarise()` df <- tibble(x = x, y = y) df |> summarise( across(x:y, first, .names = "{col}_first"), y_last = last(y) ) # Selecting a position that is out of bounds returns a default value nth(x, 11) nth(x, 0) # This out of bounds behavior also applies to empty vectors first(integer()) # You can customize the default value with `default` nth(x, 11, default = -1L) first(integer(), default = 0L) # `order_by` provides optional ordering last(x) last(x, order_by = y) # `na_rm` removes missing values before extracting the value z <- c(NA, NA, 1, 3, NA, 5, NA) first(z) first(z, na_rm = TRUE) last(z, na_rm = TRUE) nth(z, 3, na_rm = TRUE) # For data frames, these select entire rows df <- tibble(a = 1:5, b = 6:10) first(df) nth(df, 4) } dplyr/DESCRIPTION0000644000176200001440000000407615140333347013124 0ustar liggesusersType: Package Package: dplyr Title: A Grammar of Data Manipulation Version: 1.2.0 Authors@R: c( person("Hadley", "Wickham", , "hadley@posit.co", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4757-117X")), person("Romain", "François", role = "aut", comment = c(ORCID = "0000-0002-2444-4226")), person("Lionel", "Henry", role = "aut"), person("Kirill", "Müller", role = "aut", comment = c(ORCID = "0000-0002-1416-3412")), person("Davis", "Vaughan", , "davis@posit.co", role = "aut", comment = c(ORCID = "0000-0003-4777-038X")), person("Posit Software, PBC", role = c("cph", "fnd")) ) Description: A fast, consistent tool for working with data frame like objects, both in memory and out of memory. License: MIT + file LICENSE URL: https://dplyr.tidyverse.org, https://github.com/tidyverse/dplyr BugReports: https://github.com/tidyverse/dplyr/issues Depends: R (>= 4.1.0) Imports: cli (>= 3.6.2), generics, glue (>= 1.3.2), lifecycle (>= 1.0.5), magrittr (>= 1.5), methods, pillar (>= 1.9.0), R6, rlang (>= 1.1.7), tibble (>= 3.2.0), tidyselect (>= 1.2.0), utils, vctrs (>= 0.7.1) Suggests: broom, covr, DBI, dbplyr (>= 2.2.1), ggplot2, knitr, Lahman, lobstr, nycflights13, purrr, rmarkdown, RSQLite, stringi (>= 1.7.6), testthat (>= 3.1.5), tidyr (>= 1.3.0), withr VignetteBuilder: knitr Config/build/compilation-database: true Config/Needs/website: tidyverse/tidytemplate Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.3 NeedsCompilation: yes Packaged: 2026-01-30 23:05:30 UTC; hadleywickham Author: Hadley Wickham [aut, cre] (ORCID: ), Romain François [aut] (ORCID: ), Lionel Henry [aut], Kirill Müller [aut] (ORCID: ), Davis Vaughan [aut] (ORCID: ), Posit Software, PBC [cph, fnd] Maintainer: Hadley Wickham Repository: CRAN Date/Publication: 2026-02-03 08:50:47 UTC