dtplyr/0000755000176200001440000000000014406577055011605 5ustar liggesusersdtplyr/NAMESPACE0000644000176200001440000001004414372711230013006 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(add_count,dtplyr_step) S3method(anti_join,dtplyr_step) S3method(arrange,dtplyr_step) S3method(as.data.frame,dtplyr_step) S3method(as.data.frame,foo) S3method(as.data.table,dtplyr_step) S3method(as_tibble,dtplyr_step) S3method(auto_copy,dtplyr_step) S3method(collect,dtplyr_step) S3method(compute,dtplyr_step) S3method(count,dtplyr_step) S3method(dim,dtplyr_step) S3method(dim,dtplyr_step_first) S3method(distinct,dtplyr_step) S3method(do,dtplyr_step) S3method(dt_call,dtplyr_step) S3method(dt_call,dtplyr_step_assign) S3method(dt_call,dtplyr_step_first) S3method(dt_call,dtplyr_step_join) S3method(dt_call,dtplyr_step_modify) S3method(dt_call,dtplyr_step_set) S3method(dt_call,dtplyr_step_subset) S3method(dt_has_computation,dtplyr_step) S3method(dt_has_computation,dtplyr_step_first) S3method(dt_has_computation,dtplyr_step_group) S3method(dt_sources,dtplyr_step) S3method(dt_sources,dtplyr_step_first) S3method(dt_sources,dtplyr_step_join) S3method(dt_sources,dtplyr_step_set) S3method(dt_sources,dtplyr_step_subset) S3method(full_join,dtplyr_step) S3method(glimpse,dtplyr_step) S3method(group_by,dtplyr_step) S3method(group_map,dtplyr_step) S3method(group_modify,dtplyr_step) S3method(group_size,dtplyr_step) S3method(group_vars,dtplyr_step) S3method(groups,dtplyr_step) S3method(head,dtplyr_step) S3method(inner_join,dtplyr_step) S3method(left_join,dtplyr_step) S3method(mutate,dtplyr_step) S3method(n_groups,dtplyr_step) S3method(print,dtplyr_step) S3method(pull,dtplyr_step) S3method(relocate,dtplyr_step) S3method(rename,dtplyr_step) S3method(rename_with,dtplyr_step) S3method(right_join,dtplyr_step) S3method(same_src,dtplyr_step) S3method(sample_frac,dtplyr_step) S3method(sample_n,dtplyr_step) S3method(select,dtplyr_step) S3method(semi_join,dtplyr_step) S3method(show_query,dtplyr_step) S3method(slice,dtplyr_step) S3method(slice_head,dtplyr_step) S3method(slice_max,dtplyr_step) S3method(slice_min,dtplyr_step) S3method(slice_sample,dtplyr_step) S3method(slice_tail,dtplyr_step) S3method(summarise,dtplyr_step) S3method(tail,dtplyr_step) S3method(tally,dtplyr_step) S3method(tbl_vars,dtplyr_step) S3method(tbl_vars,foo) S3method(tidyselect_data_has_predicates,dtplyr_step) S3method(tidyselect_data_proxy,dtplyr_step) S3method(transmute,dtplyr_step) S3method(ungroup,dtplyr_step) S3method(union_all,dtplyr_step) S3method(unique,dtplyr_step) export(.datatable.aware) export(lazy_dt) import(rlang) importFrom(data.table,as.data.table) importFrom(data.table,data.table) importFrom(data.table,is.data.table) importFrom(dplyr,add_count) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) importFrom(dplyr,auto_copy) importFrom(dplyr,collect) importFrom(dplyr,compute) importFrom(dplyr,count) importFrom(dplyr,distinct) importFrom(dplyr,do) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,glimpse) importFrom(dplyr,group_by) importFrom(dplyr,group_map) importFrom(dplyr,group_modify) importFrom(dplyr,group_size) importFrom(dplyr,group_vars) importFrom(dplyr,groups) importFrom(dplyr,inner_join) importFrom(dplyr,intersect) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n_groups) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) importFrom(dplyr,rename_with) importFrom(dplyr,right_join) importFrom(dplyr,same_src) importFrom(dplyr,sample_frac) 
importFrom(dplyr,sample_n) importFrom(dplyr,select) importFrom(dplyr,semi_join) importFrom(dplyr,setdiff) importFrom(dplyr,show_query) importFrom(dplyr,slice) importFrom(dplyr,slice_head) importFrom(dplyr,slice_max) importFrom(dplyr,slice_min) importFrom(dplyr,slice_sample) importFrom(dplyr,slice_tail) importFrom(dplyr,summarise) importFrom(dplyr,tally) importFrom(dplyr,tbl_vars) importFrom(dplyr,transmute) importFrom(dplyr,ungroup) importFrom(dplyr,union) importFrom(dplyr,union_all) importFrom(glue,glue) importFrom(lifecycle,deprecated) importFrom(tibble,as_tibble) importFrom(tidyselect,everything) importFrom(tidyselect,tidyselect_data_has_predicates) importFrom(tidyselect,tidyselect_data_proxy) importFrom(utils,head) importFrom(utils,tail) dtplyr/LICENSE0000644000176200001440000000005414004642135012573 0ustar liggesusersYEAR: 2020 COPYRIGHT HOLDER: dtplyr authors dtplyr/README.md0000644000176200001440000000732414406336073013063 0ustar liggesusers # dtplyr [![CRAN status](https://www.r-pkg.org/badges/version/dtplyr)](https://cran.r-project.org/package=dtplyr) [![R-CMD-check](https://github.com/tidyverse/dtplyr/workflows/R-CMD-check/badge.svg)](https://github.com/tidyverse/dtplyr/actions) [![Codecov test coverage](https://codecov.io/gh/tidyverse/dtplyr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidyverse/dtplyr?branch=main) ## Overview dtplyr provides a [data.table](http://r-datatable.com/) backend for dplyr. The goal of dtplyr is to allow you to write dplyr code that is automatically translated to the equivalent, but usually much faster, data.table code. See `vignette("translation")` for details of the current translations, and [table.express](https://github.com/asardaes/table.express) and [rqdatatable](https://github.com/WinVector/rqdatatable/) for related work. ## Installation You can install from CRAN with: ``` r install.packages("dtplyr") ``` Or try the development version from GitHub with: ``` r # install.packages("devtools") devtools::install_github("tidyverse/dtplyr") ``` ## Usage To use dtplyr, you must at least load dtplyr and dplyr. You may also want to load [data.table](http://r-datatable.com/) so you can access the other goodies that it provides: ``` r library(data.table) library(dtplyr) library(dplyr, warn.conflicts = FALSE) ``` Then use `lazy_dt()` to create a “lazy” data table that tracks the operations performed on it. ``` r mtcars2 <- lazy_dt(mtcars) ``` You can preview the transformation (including the generated data.table code) by printing the result: ``` r mtcars2 %>% filter(wt < 5) %>% mutate(l100k = 235.21 / mpg) %>% # liters / 100 km group_by(cyl) %>% summarise(l100k = mean(l100k)) #> Source: local data table [3 x 2] #> Call: `_DT1`[wt < 5][, `:=`(l100k = 235.21/mpg)][, .(l100k = mean(l100k)), #> keyby = .(cyl)] #> #> cyl l100k #> #> 1 4 9.05 #> 2 6 12.0 #> 3 8 14.9 #> #> # Use as.data.table()/as.data.frame()/as_tibble() to access results ``` But generally you should reserve this only for debugging, and use `as.data.table()`, `as.data.frame()`, or `as_tibble()` to indicate that you’re done with the transformation and want to access the results: ``` r mtcars2 %>% filter(wt < 5) %>% mutate(l100k = 235.21 / mpg) %>% # liters / 100 km group_by(cyl) %>% summarise(l100k = mean(l100k)) %>% as_tibble() #> # A tibble: 3 × 2 #> cyl l100k #> #> 1 4 9.05 #> 2 6 12.0 #> 3 8 14.9 ``` ## Why is dtplyr slower than data.table? 
There are two primary reasons that dtplyr will always be somewhat slower than data.table: - Each dplyr verb must do some work to convert dplyr syntax to data.table syntax. This takes time proportional to the complexity of the input code, not the input *data*, so should be a negligible overhead for large datasets. [Initial benchmarks](https://dtplyr.tidyverse.org/articles/translation.html#performance) suggest that the overhead should be under 1ms per dplyr call. - To match dplyr semantics, `mutate()` does not modify in place by default. This means that most expressions involving `mutate()` must make a copy that would not be necessary if you were using data.table directly. (You can opt out of this behaviour in `lazy_dt()` with `immutable = FALSE`). ## Code of Conduct Please note that the dtplyr project is released with a [Contributor Code of Conduct](https://dtplyr.tidyverse.org/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. dtplyr/man/0000755000176200001440000000000014406335651012352 5ustar liggesusersdtplyr/man/mutate.dtplyr_step.Rd0000644000176200001440000000605214372711230016504 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-mutate.R \name{mutate.dtplyr_step} \alias{mutate.dtplyr_step} \title{Create and modify columns} \usage{ \method{mutate}{dtplyr_step}( .data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL ) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Name-value pairs. The name gives the name of the column in the output. The value can be: \itemize{ \item A vector of length 1, which will be recycled to the correct length. \item A vector the same length as the current group (or the whole data frame if ungrouped). \item \code{NULL}, to remove the column. \item A data frame or tibble, to create multiple columns in the output. }} \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[dplyr:group_by]{group_by()}}. For details and examples, see \link[dplyr:dplyr_by]{?dplyr_by}.} \item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping columns and columns created by \code{...} are always kept. \itemize{ \item \code{"all"} retains all columns from \code{.data}. This is the default. \item \code{"used"} retains only the columns used in \code{...} to create new columns. This is useful for checking your work, as it displays inputs and outputs side-by-side. \item \code{"unused"} retains only the columns \emph{not} used in \code{...} to create new columns. This is useful if you generate new columns, but no longer need the columns used to generate them. \item \code{"none"} doesn't retain any extra columns from \code{.data}. Only the grouping variables and columns created by \code{...} are kept. } Note: With dtplyr \code{.keep} will only work with column names passed as symbols, and won't work with other workflows (e.g. \code{eval(parse(text = "x + 1"))})} \item{.before, .after}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, control where new columns should appear (the default is to add to the right hand side). 
See \code{\link[dplyr:relocate]{relocate()}} for more details.} } \description{ This is a method for the dplyr \code{\link[=mutate]{mutate()}} generic. It is translated to the \code{j} argument of \verb{[.data.table}, using \verb{:=} to modify "in place". If \code{.before} or \code{.after} is provided, the new columns are relocated with a call to \code{\link[data.table:setcolorder]{data.table::setcolorder()}}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:5, y = 5:1)) dt \%>\% mutate(a = (x + y) / 2, b = sqrt(x^2 + y^2)) # It uses a more sophisticated translation when newly created variables # are used in the same expression dt \%>\% mutate(x1 = x + 1, x2 = x1 + 1) } dtplyr/man/expand.dtplyr_step.Rd0000644000176200001440000000615114150760302016462 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-expand.R \name{expand.dtplyr_step} \alias{expand.dtplyr_step} \title{Expand data frame to include all possible combinations of values.} \usage{ \method{expand}{dtplyr_step}(data, ..., .name_repair = "check_unique") } \arguments{ \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{Specification of columns to expand. Columns can be atomic vectors or lists. \itemize{ \item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not present in the data, supply each variable as a separate argument: \code{expand(df, x, y, z)}. \item To find only the combinations that occur in the data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}. \item You can combine the two forms. For example, \code{expand(df, nesting(school_id, student_id), date)} would produce a row for each present school-student combination for all possible dates. } Unlike the data.frame method, this method does not use the full set of levels, just those that appear in the data. When used with continuous variables, you may need to fill in values that do not appear in the data: to do so use expressions like \code{year = 2010:2020} or \code{year = full_seq(year,1)}.} \item{.name_repair}{Treatment of problematic column names: \itemize{ \item \code{"minimal"}: No name repair or checks, beyond basic existence, \item \code{"unique"}: Make sure names are unique and not empty, \item \code{"check_unique"}: (default value), no name repair, but check they are \code{unique}, \item \code{"universal"}: Make the names \code{unique} and syntactic \item a function: apply custom name repair (e.g., \code{.name_repair = make.names} for names in the style of base R). \item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}} } This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. See there for more details on these terms and the strategies used to enforce them.} } \description{ This is a method for the tidyr \code{expand()} generic. It is translated to \code{\link[data.table:J]{data.table::CJ()}}. } \examples{ library(tidyr) fruits <- lazy_dt(tibble( type = c("apple", "orange", "apple", "orange", "orange", "orange"), year = c(2010, 2010, 2012, 2010, 2010, 2012), size = factor( c("XS", "S", "M", "S", "S", "M"), levels = c("XS", "S", "M", "L") ), weights = rnorm(6, as.numeric(size) + 2) )) # All possible combinations --------------------------------------- # Note that only present levels of the factor variable `size` are retained. 
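# expand() on a lazy_dt is translated to data.table::CJ(); you can call
# show_query() on any of the results below to inspect the generated
# data.table code.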
fruits \%>\% expand(type) fruits \%>\% expand(type, size) # This is different from the data frame behaviour: fruits \%>\% dplyr::collect() \%>\% expand(type, size) # Other uses ------------------------------------------------------- fruits \%>\% expand(type, size, 2010:2012) # Use `anti_join()` to determine which observations are missing all <- fruits \%>\% expand(type, size, year) all all \%>\% dplyr::anti_join(fruits) # Use with `right_join()` to fill in missing rows fruits \%>\% dplyr::right_join(all) } dtplyr/man/dot-datatable.aware.Rd0000644000176200001440000000052514375676067016464 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dtplyr-package.R \docType{data} \name{.datatable.aware} \alias{.datatable.aware} \title{dtplyr is data.table aware} \format{ An object of class \code{logical} of length 1. } \usage{ .datatable.aware } \description{ dtplyr is data.table aware } \keyword{internal} dtplyr/man/rename.dtplyr_step.Rd0000644000176200001440000000235214006775461016465 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-call.R \name{rename.dtplyr_step} \alias{rename.dtplyr_step} \alias{rename_with.dtplyr_step} \title{Rename columns using their names} \usage{ \method{rename}{dtplyr_step}(.data, ...) \method{rename_with}{dtplyr_step}(.data, .fn, .cols = everything(), ...) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{...}{For \code{rename()}: <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Use \code{new_name = old_name} to rename selected variables. For \code{rename_with()}: additional arguments passed onto \code{.fn}.} \item{.fn}{A function used to transform the selected \code{.cols}. Should return a character vector the same length as the input.} \item{.cols}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Columns to rename; defaults to all columns.} } \description{ These are methods for the dplyr generics \code{\link[=rename]{rename()}} and \code{\link[=rename_with]{rename_with()}}. They are both translated to \code{\link[data.table:setattr]{data.table::setnames()}}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt \%>\% rename(new_x = x, new_y = y) dt \%>\% rename_with(toupper) } dtplyr/man/filter.dtplyr_step.Rd0000644000176200001440000000271014372711230016467 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-filter.R \name{filter.dtplyr_step} \alias{filter.dtplyr_step} \title{Subset rows using column values} \usage{ \method{filter}{dtplyr_step}(.data, ..., .by = NULL, .preserve = FALSE) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Expressions that return a logical value, and are defined in terms of the variables in \code{.data}. If multiple expressions are included, they are combined with the \code{&} operator. Only rows for which all conditions evaluate to \code{TRUE} are kept.} \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[dplyr:group_by]{group_by()}}. 
For details and examples, see \link[dplyr:dplyr_by]{?dplyr_by}.}
\item{.preserve}{Ignored}
}
\description{
This is a method for the dplyr \code{\link[=filter]{filter()}} generic. It is translated to the \code{i} argument of \verb{[.data.table}.
}
\examples{
library(dplyr, warn.conflicts = FALSE)
dt <- lazy_dt(mtcars)
dt \%>\% filter(cyl == 4)
dt \%>\% filter(vs, am)
dt \%>\% group_by(cyl) \%>\% filter(mpg > mean(mpg))
}
dtplyr/man/drop_na.dtplyr_step.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step-call.R
\name{drop_na.dtplyr_step}
\alias{drop_na.dtplyr_step}
\title{Drop rows containing missing values}
\usage{
\method{drop_na}{dtplyr_step}(data, ...)
}
\arguments{
\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.}
\item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to inspect for missing values. If empty, all columns are used.}
}
\description{
This is a method for the tidyr \code{drop_na()} generic. It is translated to \code{data.table::na.omit()}.
}
\examples{
library(dplyr)
library(tidyr)
dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b")))
dt \%>\% drop_na()
dt \%>\% drop_na(x)
vars <- "y"
dt \%>\% drop_na(x, any_of(vars))
}
dtplyr/man/summarise.dtplyr_step.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step-subset-summarise.R
\name{summarise.dtplyr_step}
\alias{summarise.dtplyr_step}
\title{Summarise each group to one row}
\usage{
\method{summarise}{dtplyr_step}(.data, ..., .by = NULL, .groups = NULL)
}
\arguments{
\item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.}
\item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Name-value pairs of summary functions. The name will be the name of the variable in the result. The value can be:
\itemize{
\item A vector of length 1, e.g. \code{min(x)}, \code{n()}, or \code{sum(is.na(y))}.
\item A data frame, to add multiple columns from a single expression.
}
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Returning values with size 0 or >1 was deprecated as of 1.1.0. Please use \code{\link[dplyr:reframe]{reframe()}} for this instead.}
\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[dplyr:group_by]{group_by()}}. For details and examples, see \link[dplyr:dplyr_by]{?dplyr_by}.}
\item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result.
\itemize{
\item "drop_last": dropping the last level of grouping. This was the only supported option before version 1.0.0.
\item "drop": All levels of grouping are dropped.
\item "keep": Same grouping structure as \code{.data}.
\item "rowwise": Each row is its own group.
}
When \code{.groups} is not specified, it is chosen based on the number of rows of the results:
\itemize{
\item If all the results have 1 row, you get "drop_last".
\item If the number of rows varies, you get "keep" (note that returning a variable number of rows was deprecated in favor of \code{\link[dplyr:reframe]{reframe()}}, which also unconditionally drops all levels of grouping).
}
In addition, a message informs you of that choice, unless the result is ungrouped, the option "dplyr.summarise.inform" is set to \code{FALSE}, or when \code{summarise()} is called from a function in a package.}
}
\description{
This is a method for the dplyr \code{\link[=summarise]{summarise()}} generic. It is translated to the \code{j} argument of \verb{[.data.table}.
}
\examples{
library(dplyr, warn.conflicts = FALSE)
dt <- lazy_dt(mtcars)
dt \%>\% group_by(cyl) \%>\% summarise(vs = mean(vs))
dt \%>\% group_by(cyl) \%>\% summarise(across(disp:wt, mean))
}
dtplyr/man/lazy_dt.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step-first.R
\name{lazy_dt}
\alias{lazy_dt}
\alias{tbl_dt}
\alias{grouped_dt}
\title{Create a "lazy" data.table for use with dplyr verbs}
\usage{
lazy_dt(x, name = NULL, immutable = TRUE, key_by = NULL)
}
\arguments{
\item{x}{A data table (or something that can be coerced to a data table).}
\item{name}{Optionally, supply a name to be used in generated expressions. For expert use only.}
\item{immutable}{If \code{TRUE}, \code{x} is treated as immutable and will never be modified by any code generated by dtplyr. Alternatively, you can set \code{immutable = FALSE} to allow dtplyr to modify the input object.}
\item{key_by}{Set keys for the data frame, using \code{\link[=select]{select()}} semantics (e.g. \code{key_by = c(key1, key2)}). This uses \code{\link[data.table:setkey]{data.table::setkey()}} to sort the table and build an index. This will considerably improve performance for subsets, summaries, and joins that use the keys. See \code{vignette("datatable-keys-fast-subset")} for more details.}
}
\description{
A lazy data.table captures the intent of dplyr verbs, only actually performing computation when requested (with \code{\link[=collect]{collect()}}, \code{\link[=pull]{pull()}}, \code{\link[=as.data.frame]{as.data.frame()}}, \code{\link[data.table:as.data.table]{data.table::as.data.table()}}, or \code{\link[tibble:as_tibble]{tibble::as_tibble()}}). This allows dtplyr to convert dplyr verbs into as few data.table expressions as possible, which leads to a high performance translation. See \code{vignette("translation")} for the details of the translation.
}
\examples{
library(dplyr, warn.conflicts = FALSE)
# If you have a data.table, using it with any dplyr generic will
# automatically convert it to a lazy_dt object
dt <- data.table::data.table(x = 1:10, y = 10:1)
dt \%>\% filter(x == y)
dt \%>\% mutate(z = x + y)
# Note that dtplyr will avoid mutating the input data.table, so the # previous translation includes an automatic copy().
You can avoid this # with a manual call to lazy_dt() dt \%>\% lazy_dt(immutable = FALSE) \%>\% mutate(z = x + y) # If you have a data frame, you can use lazy_dt() to convert it to # a data.table: mtcars2 <- lazy_dt(mtcars) mtcars2 mtcars2 \%>\% select(mpg:cyl) mtcars2 \%>\% select(x = mpg, y = cyl) mtcars2 \%>\% filter(cyl == 4) \%>\% select(mpg) mtcars2 \%>\% select(mpg, cyl) \%>\% filter(cyl == 4) mtcars2 \%>\% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2) mtcars2 \%>\% transmute(cyl2 = cyl * 2, vs2 = vs * 2) mtcars2 \%>\% filter(cyl == 8) \%>\% mutate(cyl2 = cyl * 2) # Learn more about translation in vignette("translation") by_cyl <- mtcars2 \%>\% group_by(cyl) by_cyl \%>\% summarise(mpg = mean(mpg)) by_cyl \%>\% mutate(mpg = mean(mpg)) by_cyl \%>\% filter(mpg < mean(mpg)) \%>\% summarise(hp = mean(hp)) } dtplyr/man/intersect.dtplyr_step.Rd0000644000176200001440000000206714006775461017221 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-set.R \name{intersect.dtplyr_step} \alias{intersect.dtplyr_step} \alias{union.dtplyr_step} \alias{union_all.dtplyr_step} \alias{setdiff.dtplyr_step} \title{Set operations} \usage{ \method{intersect}{dtplyr_step}(x, y, ...) \method{union}{dtplyr_step}(x, y, ...) \method{union_all}{dtplyr_step}(x, y, ...) \method{setdiff}{dtplyr_step}(x, y, ...) } \arguments{ \item{x, y}{A pair of \code{\link[=lazy_dt]{lazy_dt()}}s.} \item{...}{Ignored} } \description{ These are methods for the dplyr generics \code{\link[=intersect]{intersect()}}, \code{\link[=union]{union()}}, \code{\link[=union_all]{union_all()}}, and \code{\link[=setdiff]{setdiff()}}. They are translated to \code{\link[data.table:setops]{data.table::fintersect()}}, \code{\link[data.table:setops]{data.table::funion()}}, and \code{\link[data.table:setops]{data.table::fsetdiff()}}. } \examples{ dt1 <- lazy_dt(data.frame(x = 1:4)) dt2 <- lazy_dt(data.frame(x = c(2, 4, 6))) intersect(dt1, dt2) union(dt1, dt2) setdiff(dt1, dt2) } dtplyr/man/group_modify.dtplyr_step.Rd0000644000176200001440000000255514406335651017723 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-modify.R \name{group_modify.dtplyr_step} \alias{group_modify.dtplyr_step} \alias{group_map.dtplyr_step} \title{Apply a function to each group} \usage{ \method{group_modify}{dtplyr_step}(.data, .f, ..., keep = FALSE) \method{group_map}{dtplyr_step}(.data, .f, ..., keep = FALSE) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{.f}{The name of a two argument function. The first argument is passed \code{.SD},the data.table representing the current group; the second argument is passed \code{.BY}, a list giving the current values of the grouping variables. The function should return a list or data.table.} \item{...}{Additional arguments passed to \code{.f}} \item{keep}{Not supported for \link{lazy_dt}.} } \value{ \code{group_map()} applies \code{.f} to each group, returning a list. \code{group_modify()} replaces each group with the results of \code{.f}, returning a modified \code{\link[=lazy_dt]{lazy_dt()}}. } \description{ These are methods for the dplyr \code{\link[=group_map]{group_map()}} and \code{\link[=group_modify]{group_modify()}} generics. They are both translated to \verb{[.data.table}. 
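As a rough sketch of the idea (not necessarily the exact code dtplyr generates), \code{dt \%>\% group_by(cyl) \%>\% group_modify(f)} corresponds to a data.table call of the form \code{DT[, f(.SD, .BY), by = .(cyl)]}, where \code{.SD} holds the rows of the current group and \code{.BY} the current grouping values.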
} \examples{ library(dplyr) dt <- lazy_dt(mtcars) dt \%>\% group_by(cyl) \%>\% group_modify(head, n = 2L) dt \%>\% group_by(cyl) \%>\% group_map(head, n = 2L) } dtplyr/man/relocate.dtplyr_step.Rd0000644000176200001440000000174514372711230017007 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-colorder-relocate.R \name{relocate.dtplyr_step} \alias{relocate.dtplyr_step} \title{Relocate variables using their names} \usage{ \method{relocate}{dtplyr_step}(.data, ..., .before = NULL, .after = NULL) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Columns to move.} \item{.before, .after}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Destination of columns selected by \code{...}. Supplying neither will move columns to the left-hand side; specifying both is an error.} } \description{ This is a method for the dplyr \code{\link[=relocate]{relocate()}} generic. It is translated to the \code{j} argument of \verb{[.data.table}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt \%>\% relocate(z) dt \%>\% relocate(y, .before = x) dt \%>\% relocate(y, .after = y) } dtplyr/man/distinct.dtplyr_step.Rd0000644000176200001440000000223014372711230017020 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-call.R \name{distinct.dtplyr_step} \alias{distinct.dtplyr_step} \title{Subset distinct/unique rows} \usage{ \method{distinct}{dtplyr_step}(.data, ..., .keep_all = FALSE) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Optional variables to use when determining uniqueness. If there are multiple rows for a given combination of inputs, only the first row will be preserved. If omitted, will use all variables in the data frame.} \item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}. If a combination of \code{...} is not distinct, this keeps the first row of values.} } \description{ This is a method for the dplyr \code{\link[=distinct]{distinct()}} generic. It is translated to \code{\link[data.table:duplicated]{data.table::unique.data.table()}}. } \examples{ library(dplyr, warn.conflicts = FALSE) df <- lazy_dt(data.frame( x = sample(10, 100, replace = TRUE), y = sample(10, 100, replace = TRUE) )) df \%>\% distinct(x) df \%>\% distinct(x, y) df \%>\% distinct(x, .keep_all = TRUE) } dtplyr/man/left_join.dtplyr_step.Rd0000644000176200001440000000712014372711230017153 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-join.R \name{left_join.dtplyr_step} \alias{left_join.dtplyr_step} \title{Join data tables} \usage{ \method{left_join}{dtplyr_step}(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) } \arguments{ \item{x, y}{A pair of \code{\link[=lazy_dt]{lazy_dt()}}s.} \item{...}{Other parameters passed onto methods.} \item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character vector of variables to join by. If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all variables in common across \code{x} and \code{y}. A message lists the variables so that you can check they're correct; suppress the message by supplying \code{by} explicitly. To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} specification. 
For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with multiple expressions. For example, \code{join_by(a == b, c == d)} will match \code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between \code{x} and \code{y}, you can shorten this by listing only the variable names, like \code{join_by(a, c)}. \code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on these types of joins. For simple equality joins, you can alternatively specify a character vector of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. To perform a cross-join, generating all combinations of \code{x} and \code{y}, see \code{\link[dplyr:cross_join]{cross_join()}}.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into the same src as \code{x}. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.} \item{suffix}{If there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} } \description{ These are methods for the dplyr generics \code{\link[=left_join]{left_join()}}, \code{\link[=right_join]{right_join()}}, \code{\link[=inner_join]{inner_join()}}, \code{\link[=full_join]{full_join()}}, \code{\link[=anti_join]{anti_join()}}, and \code{\link[=semi_join]{semi_join()}}. Left, right, inner, and anti join are translated to the \verb{[.data.table} equivalent, full joins to \code{\link[data.table:merge]{data.table::merge.data.table()}}. Left, right, and full joins are in some cases followed by calls to \code{\link[data.table:setcolorder]{data.table::setcolorder()}} and \code{\link[data.table:setattr]{data.table::setnames()}} to ensure that column order and names match dplyr conventions. Semi-joins don't have a direct data.table equivalent. } \examples{ library(dplyr, warn.conflicts = FALSE) band_dt <- lazy_dt(dplyr::band_members) instrument_dt <- lazy_dt(dplyr::band_instruments) band_dt \%>\% left_join(instrument_dt) band_dt \%>\% right_join(instrument_dt) band_dt \%>\% inner_join(instrument_dt) band_dt \%>\% full_join(instrument_dt) band_dt \%>\% semi_join(instrument_dt) band_dt \%>\% anti_join(instrument_dt) } dtplyr/man/figures/0000755000176200001440000000000014004642135014006 5ustar liggesusersdtplyr/man/figures/logo.png0000644000176200001440000013661714004642135015472 0ustar liggesusersPNG  IHDRޫhgAMA a cHRMz&u0`:pQ<bKGD pHYs.#.#x?vtIME-BIDATxw$yߋ~ߪι{ټ؄HE A vI-YH]_k>9dɦ#D@ $"abyN3swٝMY_6CRZ.R)7RTBO_S:A\sQ.QUuw_+/V򠪪(+}Hl0==i($Rʚ !N !TmME??җ Q:tMPUծiR?RZ$ B?+B\+V_ 5#" WJQ$~x<.7kF`.% qw"BBӴQEQ~p/y @'aWBǤ>.3!V}C@~ǎ>}J_V | (VMK)x<_V5M+nW76|ںGJǀrr3 ###Moo/.kk9F knn6/|RdSJkoV#B$ɚy^f!lo !ZJ9!f}Jߪ TA4I)$risss 177=`B粒`\\ Bi)eJ_% FUk4)7H)x<0JbA__~V9(bXu[5MewbI*GGGV,犢MfEX,^[ _FGEQ= |]JyyEӴu=7EQhii]!?ր7E#)ev# eZVX,~RzDJV $Ν#P,/ZV06m.9X9 C!ZoEmȗxGBa@[#r.cll2e=wEoo/8Fio]Ӵ Y/^|>APoW#r4CCCD+z-`~PUuɹ.s 'l B$/#6Ψkl~ՐRF2}WE~`? 
ͰمumQ55/dr pi) ̔66eH¾bmlP`Gof3v+u$~$(`˖-l۶j~S[[ˮ]x衇Po|Ή'8~xbSSc;'!ă+ q%qw `MYl_L~<]8PoSiǬN( }x'SByFi6?AkvZ3q4?K3iO )J޴]g;ϕH!`tn.v:hzš (FA¼Y^7}nv444p-plق{躞tsNzv\Y5kl~ 7F_6w5˳ʁU7ܡifR'!maaqO6QLcd~FfUh]z\q3cʁ'sa+ɍI@A2TQKO}{!%E}38qS?5Nm8J~np1|>$@Ix8r۷ogpꨨ  .i{R.>5+tu]VXR)plc+y7TKKol`dS:88CsoNl߳x7Rk]ӈO5-Xh4,.mdAq6˸ɶ&|)ͯt,DC_?EkU6W 6K% 6m Wqׯ*9k:8WY[^ |>G.uLc.|WDL؂U2m7śd[qJEd_8B}a6!@ͯD+aw_n3͗?Fm1#Qzbϋݜflv3ﳧA'D7C_jJqqB)!H)/rSSZ ]i)l̵,1(E8uEL(YMl/F `6""SQmld˶I!32fM`0HMEsW.7Q^^Nkk+} !~$xT㳄 Ki֘?ebQ.?Nfs6J]48?j٩X@ P`󻕍9`G~wُ#0aƉ:BQIaEO-b L_{5/%>`vӶxlT*544TPLWӷq6I[~s|N~ֲv\bݲQ:n L-"(QF2I2?"2l~6r3O` yL)˼q>b>0aWE-lnƧe3 GHĬ6ݦͯql~V!+ڞCLzKC{&B4퉢{ eL`˰f4>7iůM JgY ib;UJ^a77a n`AL,gHu1|. l"fSu="3tq/ m~aauHd ˩pi~+mr MwZ?FQFd#o7el67ili>ĵ&iymR <7ՔKf)t BcTZN4e 9m~6 ϳ芆j3A&<@*tnYi~u=oZX%;EQ,w?2mѬ&hJYpa53@T GrS) TqJݒ-m~LEbZZZ]GEQu}iq 4V < ֶS:fmȳ-/Y"W2l~빞. ^2m~K-NF1m~l%zTUρn{ͰlE Ux<ٳg.g&[t-ޘ}v\I咅(^r~l~pHSSbm~B]0Sj*Ȧmqmi N6Bg01()ޥeK1m~zzK*_R%mNYK@ m|L]*fr r0UTeee-GoyJebZ5Roc/VKBKni~![t#M9XϏaI6]4X%%U|"0m_f^T~\UYߎ# xEO[a{\Cw<pal杶'ˬ itP;hCէҼO/G8M'iShY6Mϛra+xIfzEj4tP6sap ǒZ[[Y4BB<4ﳆU mR;-Bs=0@?a!?e WUT+U_Xatۢ \4? &Y ,gi;LF_ /V |'tK)$sXRJFFFbxx8PRPQQ[EyF*/3\,EQt]c bBT!tr%cU!3=WJ++?d2hAJטǶx<ނM9p\3cBL挦iĽX%F4 M˳-:2'&&b`` 7bX/ɔw6JpwZi[>xxx (rHOB**"X& U' bBT*炙;73؂M9XͯsZ]Juuu[MiXj{Dw'WW)V |#=϶x`>ۢk^1Xߋy6{ucH)-9-Α^wœ]Ws?X%{/t"rm3G۶8m~B?ug5ta2XaHdgmѻ"='isb4V )Vu3?QJYi~Wh6M˟^WV4tttgxFJ١(o1^!*ۼ(I)?Bd*++կ~u?*3ii ԘO jN;(ʪ3U'Tu]'_YUWlV叟%г%tEXtdate:create2020-07-31T14:58:15+00:00o%tEXtdate:modify2020-07-31T14:58:15+00:00kIENDB`dtplyr/man/dtplyr-package.Rd0000644000176200001440000000174314406335651015555 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dtplyr-package.R \docType{package} \name{dtplyr-package} \alias{dtplyr} \alias{dtplyr-package} \title{dtplyr: Data Table Back-End for 'dplyr'} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} Provides a data.table backend for 'dplyr'. The goal of 'dtplyr' is to allow you to write 'dplyr' code that is automatically translated to the equivalent, but usually much faster, data.table code. } \seealso{ Useful links: \itemize{ \item \url{https://dtplyr.tidyverse.org} \item \url{https://github.com/tidyverse/dtplyr} \item Report bugs at \url{https://github.com/tidyverse/dtplyr/issues} } } \author{ \strong{Maintainer}: Hadley Wickham \email{hadley@posit.co} Authors: \itemize{ \item Maximilian Girlich \item Mark Fairbanks \item Ryan Dickerson } Other contributors: \itemize{ \item Posit Software, PBC [copyright holder, funder] } } \keyword{internal} dtplyr/man/pivot_longer.dtplyr_step.Rd0000644000176200001440000001100014372711230017701 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-call-pivot_longer.R \name{pivot_longer.dtplyr_step} \alias{pivot_longer.dtplyr_step} \title{Pivot data from wide to long} \usage{ \method{pivot_longer}{dtplyr_step}( data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL, names_repair = "check_unique", values_to = "value", values_drop_na = FALSE, values_ptypes = NULL, values_transform = NULL, ... ) } \arguments{ \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to pivot into longer format.} \item{names_to}{A character vector specifying the new column or columns to create from the information stored in the column names of \code{data} specified by \code{cols}. 
\itemize{ \item If length 0, or if \code{NULL} is supplied, no columns will be created. \item If length 1, a single column will be created which will contain the column names specified by \code{cols}. \item If length >1, multiple columns will be created. In this case, one of \code{names_sep} or \code{names_pattern} must be supplied to specify how the column names should be split. There are also two additional character values you can take advantage of: \itemize{ \item \code{NA} will discard the corresponding component of the column name. \item \code{".value"} indicates that the corresponding component of the column name defines the name of the output column containing the cell values, overriding \code{values_to} entirely. } }} \item{names_prefix}{A regular expression used to remove matching text from the start of each variable name.} \item{names_sep, names_pattern}{If \code{names_to} contains multiple values, these arguments control how the column name is broken up. \code{names_sep} takes the same specification as \code{\link[tidyr:separate]{separate()}}, and can either be a numeric vector (specifying positions to break on), or a single string (specifying a regular expression to split on). \code{names_pattern} takes the same specification as \code{\link[tidyr:extract]{extract()}}, a regular expression containing matching groups (\verb{()}). If these arguments do not give you enough control, use \code{pivot_longer_spec()} to create a spec object and process manually as needed.} \item{names_ptypes, names_transform, values_ptypes, values_transform}{Not currently supported by dtplyr.} \item{names_repair}{What happens if the output has invalid column names? The default, \code{"check_unique"} is to error if the columns are duplicated. Use \code{"minimal"} to allow duplicates in the output, or \code{"unique"} to de-duplicated by adding numeric suffixes. See \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}} for more options.} \item{values_to}{A string specifying the name of the column to create from the data stored in cell values. If \code{names_to} is a character containing the special \code{.value} sentinel, this value will be ignored, and the name of the value column will be derived from part of the existing column names.} \item{values_drop_na}{If \code{TRUE}, will drop rows that contain only \code{NA}s in the \code{value_to} column. This effectively converts explicit missing values to implicit missing values, and should generally be used only when missing values in \code{data} were created by its structure.} \item{...}{Additional arguments passed on to methods.} } \description{ This is a method for the tidyr \code{pivot_longer()} generic. It is translated to \code{\link[data.table:melt.data.table]{data.table::melt()}} } \examples{ library(tidyr) # Simplest case where column names are character data relig_income_dt <- lazy_dt(relig_income) relig_income_dt \%>\% pivot_longer(!religion, names_to = "income", values_to = "count") # Slightly more complex case where columns have common prefix, # and missing missings are structural so should be dropped. 
billboard_dt <- lazy_dt(billboard)
billboard_dt \%>\%
 pivot_longer(
 cols = starts_with("wk"),
 names_to = "week",
 names_prefix = "wk",
 values_to = "rank",
 values_drop_na = TRUE
 )
# Multiple variables stored in column names
lazy_dt(who) \%>\%
 pivot_longer(
 cols = new_sp_m014:newrel_f65,
 names_to = c("diagnosis", "gender", "age"),
 names_pattern = "new_?(.*)_(.)(.*)",
 values_to = "count"
 )
# Multiple observations per row
anscombe_dt <- lazy_dt(anscombe)
anscombe_dt \%>\%
 pivot_longer(
 everything(),
 names_to = c(".value", "set"),
 names_pattern = "(.)(.)"
 )
}
dtplyr/man/collect.dtplyr_step.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step.R
\name{collect.dtplyr_step}
\alias{collect.dtplyr_step}
\alias{compute.dtplyr_step}
\alias{as.data.table.dtplyr_step}
\alias{as.data.frame.dtplyr_step}
\alias{as_tibble.dtplyr_step}
\title{Force computation of a lazy data.table}
\usage{
\method{collect}{dtplyr_step}(x, ...)
\method{compute}{dtplyr_step}(x, name = unique_name(), ...)
\method{as.data.table}{dtplyr_step}(x, keep.rownames = FALSE, ...)
\method{as.data.frame}{dtplyr_step}(x, ...)
\method{as_tibble}{dtplyr_step}(x, ..., .name_repair = "check_unique")
}
\arguments{
\item{x}{A \link{lazy_dt}}
\item{...}{Arguments used by other methods.}
\item{name}{Name of intermediate data.table.}
\item{keep.rownames}{Ignored as dplyr never preserves rownames.}
\item{.name_repair}{Treatment of problematic column names}
}
\description{
\itemize{
\item \code{collect()} returns a tibble, grouped if needed.
\item \code{compute()} generates an intermediate assignment in the translation.
\item \code{as.data.table()} returns a data.table.
\item \code{as.data.frame()} returns a data frame.
\item \code{as_tibble()} returns a tibble.
}
}
\examples{
library(dplyr, warn.conflicts = FALSE)
dt <- lazy_dt(mtcars)
# Generate translation
avg_mpg <- dt \%>\%
 filter(am == 1) \%>\%
 group_by(cyl) \%>\%
 summarise(mpg = mean(mpg))
# Show translation and temporarily compute result
avg_mpg
# compute and return tibble
avg_mpg_tb <- as_tibble(avg_mpg)
avg_mpg_tb
# compute and return data.table
avg_mpg_dt <- data.table::as.data.table(avg_mpg)
avg_mpg_dt
# modify translation to use intermediate assignment
compute(avg_mpg)
}
dtplyr/man/replace_na.dtplyr_step.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/replace_na.R
\name{replace_na.dtplyr_step}
\alias{replace_na.dtplyr_step}
\title{Replace NAs with specified values}
\usage{
\method{replace_na}{dtplyr_step}(data, replace = list())
}
\arguments{
\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.}
\item{replace}{If \code{data} is a data frame, \code{replace} takes a named list of values, with one value for each column that has missing values to be replaced. Each value in \code{replace} will be cast to the type of the column in \code{data} that it is being used as a replacement in. If \code{data} is a vector, \code{replace} takes a single value. This single value replaces all of the missing values in the vector. \code{replace} will be cast to the type of \code{data}.}
}
\description{
This is a method for the tidyr \code{replace_na()} generic. It is translated to \code{\link[data.table:coalesce]{data.table::fcoalesce()}}. Note that unlike \code{tidyr::replace_na()}, \code{data.table::fcoalesce()} cannot replace \code{NULL} values in lists.
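As an illustrative sketch (not necessarily the exact call dtplyr generates), \code{dt \%>\% replace_na(list(x = 0))} amounts to replacing column \code{x} with \code{fcoalesce(x, 0)} in the generated data.table code.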
} \examples{ library(tidyr) # Replace NAs in a data frame dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) dt \%>\% replace_na(list(x = 0, y = "unknown")) # Replace NAs using `dplyr::mutate()` dt \%>\% dplyr::mutate(x = replace_na(x, 0)) } dtplyr/man/pivot_wider.dtplyr_step.Rd0000644000176200001440000001011014372711230017526 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-call-pivot_wider.R \name{pivot_wider.dtplyr_step} \alias{pivot_wider.dtplyr_step} \title{Pivot data from long to wide} \usage{ \method{pivot_wider}{dtplyr_step}( data, id_cols = NULL, names_from = name, names_prefix = "", names_sep = "_", names_glue = NULL, names_sort = FALSE, names_repair = "check_unique", values_from = value, values_fill = NULL, values_fn = NULL, ... ) } \arguments{ \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{id_cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> A set of columns that uniquely identify each observation. Typically used when you have redundant variables, i.e. variables whose values are perfectly correlated with existing variables. Defaults to all columns in \code{data} except for the columns specified through \code{names_from} and \code{values_from}. If a tidyselect expression is supplied, it will be evaluated on \code{data} after removing the columns specified through \code{names_from} and \code{values_from}.} \item{names_from, values_from}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> A pair of arguments describing which column (or columns) to get the name of the output column (\code{names_from}), and which column (or columns) to get the cell values from (\code{values_from}). If \code{values_from} contains multiple values, the value will be added to the front of the output column.} \item{names_prefix}{String added to the start of every variable name. This is particularly useful if \code{names_from} is a numeric vector and you want to create syntactic variable names.} \item{names_sep}{If \code{names_from} or \code{values_from} contains multiple variables, this will be used to join their values together into a single string to use as a column name.} \item{names_glue}{Instead of \code{names_sep} and \code{names_prefix}, you can supply a glue specification that uses the \code{names_from} columns (and special \code{.value}) to create custom column names.} \item{names_sort}{Should the column names be sorted? If \code{FALSE}, the default, column names are ordered by first appearance.} \item{names_repair}{What happens if the output has invalid column names? The default, \code{"check_unique"} is to error if the columns are duplicated. Use \code{"minimal"} to allow duplicates in the output, or \code{"unique"} to de-duplicated by adding numeric suffixes. See \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}} for more options.} \item{values_fill}{Optionally, a (scalar) value that specifies what each \code{value} should be filled in with when missing. This can be a named list if you want to apply different fill values to different value columns.} \item{values_fn}{A function, the default is \code{length()}. Note this is different behavior than \code{tidyr::pivot_wider()}, which returns a list column by default.} \item{...}{Additional arguments passed on to methods.} } \description{ This is a method for the tidyr \code{pivot_wider()} generic. 
It is translated to \code{\link[data.table:dcast.data.table]{data.table::dcast()}} } \examples{ library(tidyr) fish_encounters_dt <- lazy_dt(fish_encounters) fish_encounters_dt fish_encounters_dt \%>\% pivot_wider(names_from = station, values_from = seen) # Fill in missing values fish_encounters_dt \%>\% pivot_wider(names_from = station, values_from = seen, values_fill = 0) # Generate column names from multiple variables us_rent_income_dt <- lazy_dt(us_rent_income) us_rent_income_dt us_rent_income_dt \%>\% pivot_wider(names_from = variable, values_from = c(estimate, moe)) # When there are multiple `names_from` or `values_from`, you can use # use `names_sep` or `names_glue` to control the output variable names us_rent_income_dt \%>\% pivot_wider( names_from = variable, names_sep = ".", values_from = c(estimate, moe) ) # Can perform aggregation with values_fn warpbreaks_dt <- lazy_dt(as_tibble(warpbreaks[c("wool", "tension", "breaks")])) warpbreaks_dt warpbreaks_dt \%>\% pivot_wider( names_from = wool, values_from = breaks, values_fn = mean ) } dtplyr/man/select.dtplyr_step.Rd0000644000176200001440000000165614006775461016503 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-select.R \name{select.dtplyr_step} \alias{select.dtplyr_step} \title{Subset columns using their names} \usage{ \method{select}{dtplyr_step}(.data, ...) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be used to select a range of variables.} } \description{ This is a method for the dplyr \code{\link[=select]{select()}} generic. It is translated to the \code{j} argument of \verb{[.data.table}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x1 = 1, x2 = 2, y1 = 3, y2 = 4)) dt \%>\% select(starts_with("x")) dt \%>\% select(ends_with("2")) dt \%>\% select(z1 = x1, z2 = x2) } dtplyr/man/nest.dtplyr_step.Rd0000644000176200001440000000237514372711230016162 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-nest.R \name{nest.dtplyr_step} \alias{nest.dtplyr_step} \title{Nest} \usage{ \method{nest}{dtplyr_step}(.data, ..., .names_sep = NULL, .key = deprecated()) } \arguments{ \item{.data}{A data frame.} \item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to nest, specified using name-variable pairs of the form \code{new_col = c(col1, col2, col3)}. The right hand side can be any valid tidy select expression.} \item{.names_sep}{If \code{NULL}, the default, the inner names will come from the former outer names. If a string, the new inner names will use the outer names with \code{names_sep} automatically stripped. This makes \code{names_sep} roughly symmetric between nesting and unnesting.} \item{.key}{Not supported.} \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} } \description{ This is a method for the tidyr \code{\link[tidyr:nest]{tidyr::nest()}} generic. It is translated using the non-nested variables in the \code{by} argument and \code{.SD} in the \code{j} argument. 
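As a rough sketch (illustrative data; the generated call depends on the grouping), you can see the \code{by}/\code{.SD} translation with \code{show_query()}:
\preformatted{library(dplyr, warn.conflicts = FALSE)
library(tidyr)
dt <- lazy_dt(data.frame(x = c(1, 1, 2), y = 1:3), "DT")
dt \%>\% group_by(x) \%>\% nest() \%>\% show_query()
}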
} \examples{ if (require("tidyr", quietly = TRUE)) { dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) dt \%>\% nest(data = y) dt \%>\% dplyr::group_by(x) \%>\% nest() } } dtplyr/man/unite.dtplyr_step.Rd0000644000176200001440000000266614372711230016340 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/unite.R \name{unite.dtplyr_step} \alias{unite.dtplyr_step} \title{Unite multiple columns into one by pasting strings together.} \usage{ \method{unite}{dtplyr_step}(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE) } \arguments{ \item{data}{A data frame.} \item{col}{The name of the new column, as a string or symbol. This argument is passed by expression and supports \link[rlang:topic-inject]{quasiquotation} (you can unquote strings and symbols). The name is captured from the expression with \code{\link[rlang:defusing-advanced]{rlang::ensym()}} (note that this kind of interface where symbols do not represent actual objects is now discouraged in the tidyverse; we support it here for backward compatibility).} \item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to unite} \item{sep}{Separator to use between values.} \item{remove}{If \code{TRUE}, remove input columns from output data frame.} \item{na.rm}{If \code{TRUE}, missing values will be removed prior to uniting each value.} } \description{ This is a method for the tidyr \code{unite()} generic. } \examples{ library(tidyr) df <- lazy_dt(expand_grid(x = c("a", NA), y = c("b", NA))) df df \%>\% unite("z", x:y, remove = FALSE) # Separate is almost the complement of unite df \%>\% unite("xy", x:y) \%>\% separate(xy, c("x", "y")) # (but note `x` and `y` now contain "NA" not NA) } dtplyr/man/group_by.dtplyr_step.Rd0000644000176200001440000000405114406335651017037 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-group.R \name{group_by.dtplyr_step} \alias{group_by.dtplyr_step} \alias{ungroup.dtplyr_step} \title{Group and ungroup} \usage{ \method{group_by}{dtplyr_step}(.data, ..., .add = FALSE, arrange = TRUE) \method{ungroup}{dtplyr_step}(x, ...) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{...}{In \code{group_by()}, variables or computations to group by. Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate \code{mutate()} step before the \code{group_by()}. Computations are not allowed in \code{nest_by()}. In \code{ungroup()}, variables to remove from the grouping.} \item{.add, add}{When \code{FALSE}, the default, \code{group_by()} will override existing groups. To add to the existing groups, use \code{.add = TRUE}. This argument was previously called \code{add}, but that prevented creating a new grouping variable called \code{add}, and conflicts with our naming conventions.} \item{arrange}{If \code{TRUE}, will automatically arrange the output of subsequent grouped operations by group. If \code{FALSE}, output order will be left unchanged. In the generated data.table code this switches between using the \code{keyby} (\code{TRUE}) and \code{by} (\code{FALSE}) arguments.} \item{x}{A \code{\link[dplyr:tbl]{tbl()}}} } \description{ These are methods for dplyr's \code{\link[=group_by]{group_by()}} and \code{\link[=ungroup]{ungroup()}} generics. Grouping is translated to either the \code{keyby} or \code{by} argument of \verb{[.data.table}, depending on the value of the \code{arrange} argument.
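You can see which argument is used by printing the query (a minimal sketch; \code{DT} and the columns are illustrative):
\preformatted{library(dplyr, warn.conflicts = FALSE)
dt <- lazy_dt(data.frame(g = c(1, 2, 1), x = 1:3), "DT")
dt \%>\% group_by(g) \%>\% summarise(x = mean(x)) \%>\% show_query()                  # keyby
dt \%>\% group_by(g, arrange = FALSE) \%>\% summarise(x = mean(x)) \%>\% show_query()  # by
}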
} \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) # group_by() is usually translated to `keyby` so that the groups # are ordered in the output dt \%>\% group_by(cyl) \%>\% summarise(mpg = mean(mpg)) # use `arrange = FALSE` to instead use `by` so the original order # of groups is preserved dt \%>\% group_by(cyl, arrange = FALSE) \%>\% summarise(mpg = mean(mpg)) } dtplyr/man/head.dtplyr_step.Rd0000644000176200001440000000155014006775461016116 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-call.R \name{head.dtplyr_step} \alias{head.dtplyr_step} \alias{tail.dtplyr_step} \title{Subset first or last rows} \usage{ \method{head}{dtplyr_step}(x, n = 6L, ...) \method{tail}{dtplyr_step}(x, n = 6L, ...) } \arguments{ \item{x}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{n}{Number of rows to select. Can use a negative number to instead drop rows from the other end.} \item{...}{Passed on to \code{\link[=head]{head()}}/\code{\link[=tail]{tail()}}.} } \description{ These are methods for the base generics \code{\link[=head]{head()}} and \code{\link[=tail]{tail()}}. They are not translated. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:10)) # first three rows head(dt, 3) # last three rows tail(dt, 3) # drop first three rows tail(dt, -3) } dtplyr/man/separate.dtplyr_step.Rd0000644000176200001440000000410314126601265017011 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-separate.R \name{separate.dtplyr_step} \alias{separate.dtplyr_step} \title{Separate a character column into multiple columns with a regular expression or numeric locations} \usage{ \method{separate}{dtplyr_step}( data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, ... ) } \arguments{ \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{col}{Column name or position. This argument is passed by expression and supports quasiquotation (you can unquote column names or column positions).} \item{into}{Names of new variables to create as character vector. Use \code{NA} to omit the variable in the output.} \item{sep}{Separator between columns. The default value is a regular expression that matches any sequence of non-alphanumeric values.} \item{remove}{If TRUE, remove the input column from the output data frame.} \item{convert}{If TRUE, will run type.convert() with as.is = TRUE on new columns. This is useful if the component columns are integer, numeric or logical. NB: this will cause string "NA"s to be converted to NAs.} \item{...}{Arguments passed on to methods} } \description{ This is a method for the \code{\link[tidyr:separate]{tidyr::separate()}} generic. It is translated to \code{\link[data.table:tstrsplit]{data.table::tstrsplit()}} in the \code{j} argument of \verb{[.data.table}.
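For instance (a minimal sketch mirroring the example below), the query shows the \code{tstrsplit()} call:
\preformatted{library(dplyr, warn.conflicts = FALSE)
library(tidyr)
df <- lazy_dt(data.frame(x = c(NA, "x.y")), "DT")
df \%>\% separate(x, c("A", "B")) \%>\% show_query()
# roughly: copy(DT)[, `:=`(c("A", "B"), tstrsplit(x, split = "[^[:alnum:]]+"))]
}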
} \examples{ library(tidyr) # If you want to split by any non-alphanumeric value (the default): df <- lazy_dt(data.frame(x = c(NA, "x.y", "x.z", "y.z")), "DT") df \%>\% separate(x, c("A", "B")) # If you just want the second variable: df \%>\% separate(x, c(NA, "B")) # Use regular expressions to separate on multiple characters: df <- lazy_dt(data.frame(x = c(NA, "x?y", "x.z", "y:z")), "DT") df \%>\% separate(x, c("A","B"), sep = "([.?:])") # convert = TRUE detects column classes: df <- lazy_dt(data.frame(x = c("x:1", "x:2", "y:4", "z", NA)), "DT") df \%>\% separate(x, c("key","value"), ":") \%>\% str df \%>\% separate(x, c("key","value"), ":", convert = TRUE) \%>\% str } dtplyr/man/slice.dtplyr_step.Rd0000644000176200001440000000747314372711230016314 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-slice.R \name{slice.dtplyr_step} \alias{slice.dtplyr_step} \alias{slice_head.dtplyr_step} \alias{slice_tail.dtplyr_step} \alias{slice_min.dtplyr_step} \alias{slice_max.dtplyr_step} \title{Subset rows using their positions} \usage{ \method{slice}{dtplyr_step}(.data, ..., .by = NULL) \method{slice_head}{dtplyr_step}(.data, ..., n, prop, by = NULL) \method{slice_tail}{dtplyr_step}(.data, ..., n, prop, by = NULL) \method{slice_min}{dtplyr_step}(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) \method{slice_max}{dtplyr_step}(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{For \code{slice()}: <\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Integer row values. Provide either positive values to keep, or negative values to drop. The values provided must be either all positive or all negative. Indices beyond the number of rows in the input are silently ignored. For \verb{slice_*()}, these arguments are passed on to methods.} \item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[dplyr:group_by]{group_by()}}. For details and examples, see \link[dplyr:dplyr_by]{?dplyr_by}.} \item{n, prop}{Provide either \code{n}, the number of rows, or \code{prop}, the proportion of rows to select. If neither are supplied, \code{n = 1} will be used. If \code{n} is greater than the number of rows in the group (or \code{prop > 1}), the result will be silently truncated to the group size. \code{prop} will be rounded towards zero to generate an integer number of rows. A negative value of \code{n} or \code{prop} will be subtracted from the group size. For example, \code{n = -2} with a group of 5 rows will select 5 - 2 = 3 rows; \code{prop = -0.25} with 8 rows will select 8 * (1 - 0.25) = 6 rows.} \item{order_by}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Variable or function of variables to order by. To order by multiple variables, wrap them in a data frame or tibble.} \item{with_ties}{Should ties be kept together? The default, \code{TRUE}, may return more rows than you request. Use \code{FALSE} to ignore ties, and return the first \code{n} rows.} } \description{ These are methods for the dplyr \code{\link[=slice]{slice()}}, \code{slice_head()}, \code{slice_tail()}, \code{slice_min()}, \code{slice_max()} and \code{slice_sample()} generics. 
They are translated to the \code{i} argument of \verb{[.data.table}. Unlike dplyr, \code{slice()} (and \code{slice()} alone) returns the same number of rows per group, regardless of whether or not the indices appear in each group. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt \%>\% slice(1, 5, 10) dt \%>\% slice(-(1:4)) # First and last rows based on existing order dt \%>\% slice_head(n = 5) dt \%>\% slice_tail(n = 5) # Rows with minimum and maximum values of a variable dt \%>\% slice_min(mpg, n = 5) dt \%>\% slice_max(mpg, n = 5) # slice_min() and slice_max() may return more rows than requested # in the presence of ties. Use with_ties = FALSE to suppress dt \%>\% slice_min(cyl, n = 1) dt \%>\% slice_min(cyl, n = 1, with_ties = FALSE) # slice_sample() allows you to randomly select with or without replacement dt \%>\% slice_sample(n = 5) dt \%>\% slice_sample(n = 5, replace = TRUE) # you can optionally weight by a variable - this code weights by the # physical weight of the cars, so heavy cars are more likely to get # selected dt \%>\% slice_sample(weight_by = wt, n = 5) } dtplyr/man/arrange.dtplyr_step.Rd0000644000176200001440000000172714007004012016624 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-arrange.R \name{arrange.dtplyr_step} \alias{arrange.dtplyr_step} \title{Arrange rows by column values} \usage{ \method{arrange}{dtplyr_step}(.data, ..., .by_group = FALSE) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Variables, or functions of variables. Use \code{\link[dplyr:desc]{desc()}} to sort a variable in descending order.} \item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to grouped data frames only.} } \description{ This is a method for the dplyr generic \code{\link[=arrange]{arrange()}}. It is translated to an \code{\link[=order]{order()}} call in the \code{i} argument of \verb{[.data.table}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt \%>\% arrange(vs, cyl) dt \%>\% arrange(desc(vs), cyl) dt \%>\% arrange(across(mpg:disp)) } dtplyr/man/fill.dtplyr_step.Rd0000644000176200001440000000460614021424751016136 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/fill.R \name{fill.dtplyr_step} \alias{fill.dtplyr_step} \title{Fill in missing values with previous or next value} \usage{ \method{fill}{dtplyr_step}(data, ..., .direction = c("down", "up", "downup", "updown")) } \arguments{ \item{data}{A data frame.} \item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to fill.} \item{.direction}{Direction in which to fill missing values. Currently either "down" (the default), "up", "downup" (i.e. first down and then up) or "updown" (first up and then down).} } \description{ This is a method for the tidyr \code{fill()} generic. It is translated to \code{\link[data.table:nafill]{data.table::nafill()}}. Note that \code{data.table::nafill()} currently only works for integer and double columns.
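A minimal sketch (illustrative data) of the generated \code{nafill()} calls:
\preformatted{library(dplyr, warn.conflicts = FALSE)
library(tidyr)
dt <- lazy_dt(data.frame(x = c(NA, 1, NA, 2)), "DT")
dt \%>\% fill(x) \%>\% show_query()
# roughly: copy(DT)[, `:=`(x = nafill(x, "locf"))]
dt \%>\% fill(x, .direction = "up") \%>\% show_query()
# roughly: copy(DT)[, `:=`(x = nafill(x, "nocb"))]
}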
} \examples{ library(tidyr) # Value (year) is recorded only when it changes sales <- lazy_dt(tibble::tribble( ~quarter, ~year, ~sales, "Q1", 2000, 66013, "Q2", NA, 69182, "Q3", NA, 53175, "Q4", NA, 21001, "Q1", 2001, 46036, "Q2", NA, 58842, "Q3", NA, 44568, "Q4", NA, 50197, "Q1", 2002, 39113, "Q2", NA, 41668, "Q3", NA, 30144, "Q4", NA, 52897, "Q1", 2004, 32129, "Q2", NA, 67686, "Q3", NA, 31768, "Q4", NA, 49094 )) # `fill()` defaults to replacing missing data from top to bottom sales \%>\% fill(year) # Value (n_squirrels) is missing above and below within a group squirrels <- lazy_dt(tibble::tribble( ~group, ~name, ~role, ~n_squirrels, 1, "Sam", "Observer", NA, 1, "Mara", "Scorekeeper", 8, 1, "Jesse", "Observer", NA, 1, "Tom", "Observer", NA, 2, "Mike", "Observer", NA, 2, "Rachael", "Observer", NA, 2, "Sydekea", "Scorekeeper", 14, 2, "Gabriela", "Observer", NA, 3, "Derrick", "Observer", NA, 3, "Kara", "Scorekeeper", 9, 3, "Emily", "Observer", NA, 3, "Danielle", "Observer", NA )) # The values are inconsistently missing by position within the group # Use .direction = "downup" to fill missing values in both directions squirrels \%>\% dplyr::group_by(group) \%>\% fill(n_squirrels, .direction = "downup") \%>\% dplyr::ungroup() # Using `.direction = "updown"` accomplishes the same goal in this example } dtplyr/man/count.dtplyr_step.Rd0000644000176200001440000000263114406335651016343 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/count.R \name{count.dtplyr_step} \alias{count.dtplyr_step} \title{Count observations by group} \usage{ \method{count}{dtplyr_step}(x, ..., wt = NULL, sort = FALSE, name = NULL) } \arguments{ \item{x}{A \code{\link[=lazy_dt]{lazy_dt()}}} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Variables to group by.} \item{wt}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Frequency weights. Can be \code{NULL} or a variable: \itemize{ \item If \code{NULL} (the default), counts the number of rows in each group. \item If a variable, computes \code{sum(wt)} for each group. }} \item{sort}{If \code{TRUE}, will show the largest groups at the top.} \item{name}{The name of the new column in the output. If omitted, it will default to \code{n}. If there's already a column called \code{n}, it will use \code{nn}. If there's a column called \code{n} and \code{nn}, it'll use \code{nnn}, and so on, adding \code{n}s until it gets a new name.} } \description{ This is a method for the dplyr \code{\link[=count]{count()}} generic. It is translated using \code{.N} in the \code{j} argument, and supplying groups to \code{keyby} as appropriate. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt \%>\% count(species) dt \%>\% count(species, sort = TRUE) dt \%>\% count(species, wt = mass, sort = TRUE) } dtplyr/man/transmute.dtplyr_step.Rd0000644000176200001440000000206514372711230017227 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-subset-transmute.R \name{transmute.dtplyr_step} \alias{transmute.dtplyr_step} \title{Create new columns, dropping old} \usage{ \method{transmute}{dtplyr_step}(.data, ...) } \arguments{ \item{.data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[dplyr:dplyr_data_masking]{data-masking}}> Name-value pairs. The name gives the name of the column in the output. The value can be: \itemize{ \item A vector of length 1, which will be recycled to the correct length. 
\item A vector the same length as the current group (or the whole data frame if ungrouped). \item \code{NULL}, to remove the column. \item A data frame or tibble, to create multiple columns in the output. }} } \description{ This is a method for the dplyr \code{\link[=transmute]{transmute()}} generic. It is translated to the \code{j} argument of \verb{[.data.table}. } \examples{ library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt \%>\% transmute(name, sh = paste0(species, "/", homeworld)) } dtplyr/man/complete.dtplyr_step.Rd0000644000176200001440000000376014372711230017020 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/complete.R \name{complete.dtplyr_step} \alias{complete.dtplyr_step} \title{Complete a data frame with missing combinations of data} \usage{ \method{complete}{dtplyr_step}(data, ..., fill = list()) } \arguments{ \item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} \item{...}{<\code{\link[tidyr:tidyr_data_masking]{data-masking}}> Specification of columns to expand or complete. Columns can be atomic vectors or lists. \itemize{ \item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not present in the data, supply each variable as a separate argument: \code{expand(df, x, y, z)} or \code{complete(df, x, y, z)}. \item To find only the combinations that occur in the data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}. \item You can combine the two forms. For example, \code{expand(df, nesting(school_id, student_id), date)} would produce a row for each present school-student combination for all possible dates. } When used with factors, \code{\link[tidyr:expand]{expand()}} and \code{\link[tidyr:complete]{complete()}} use the full set of levels, not just those that appear in the data. If you want to use only the values seen in the data, use \code{forcats::fct_drop()}. When used with continuous variables, you may need to fill in values that do not appear in the data: to do so use expressions like \code{year = 2010:2020} or \code{year = full_seq(year,1)}.} \item{fill}{A named list that for each variable supplies a single value to use instead of \code{NA} for missing combinations.} } \description{ This is a method for the tidyr \code{complete()} generic. This is a wrapper around \code{dtplyr} translations for \code{expand()}, \code{full_join()}, and \code{replace_na()} that's useful for completing missing combinations of data. } \examples{ library(tidyr) tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) dt <- lazy_dt(tbl) dt \%>\% complete(x, y) dt \%>\% complete(x, y, fill = list(z = 10L)) } dtplyr/DESCRIPTION0000644000176200001440000000271314406577055013316 0ustar liggesusersPackage: dtplyr Title: Data Table Back-End for 'dplyr' Version: 1.3.1 Authors@R: c( person("Hadley", "Wickham", , "hadley@posit.co", role = c("cre", "aut")), person("Maximilian", "Girlich", role = "aut"), person("Mark", "Fairbanks", role = "aut"), person("Ryan", "Dickerson", role = "aut"), person("Posit Software, PBC", role = c("cph", "fnd")) ) Description: Provides a data.table backend for 'dplyr'. The goal of 'dtplyr' is to allow you to write 'dplyr' code that is automatically translated to the equivalent, but usually much faster, data.table code. 
License: MIT + file LICENSE URL: https://dtplyr.tidyverse.org, https://github.com/tidyverse/dtplyr BugReports: https://github.com/tidyverse/dtplyr/issues Depends: R (>= 3.3) Imports: cli (>= 3.4.0), data.table (>= 1.13.0), dplyr (>= 1.1.0), glue, lifecycle, rlang (>= 1.0.4), tibble, tidyselect (>= 1.2.0), vctrs (>= 0.4.1) Suggests: bench, covr, knitr, rmarkdown, testthat (>= 3.1.2), tidyr (>= 1.1.0), waldo (>= 0.3.1) VignetteBuilder: knitr Config/Needs/website: tidyverse/tidytemplate Config/testthat/edition: 3 Encoding: UTF-8 RoxygenNote: 7.2.3 NeedsCompilation: no Packaged: 2023-03-21 14:31:40 UTC; hadleywickham Author: Hadley Wickham [cre, aut], Maximilian Girlich [aut], Mark Fairbanks [aut], Ryan Dickerson [aut], Posit Software, PBC [cph, fnd] Maintainer: Hadley Wickham Repository: CRAN Date/Publication: 2023-03-22 13:10:05 UTC dtplyr/build/0000755000176200001440000000000014406337714012700 5ustar liggesusersdtplyr/build/vignette.rds0000644000176200001440000000030714406337714015237 0ustar liggesusersdtplyr/tests/0000755000176200001440000000000014406337707012745 5ustar liggesusersdtplyr/tests/testthat/0000755000176200001440000000000014406577055014607 5ustar liggesusersdtplyr/tests/testthat/test-step-subset-separate.R0000644000176200001440000000535014300165007021751 0ustar liggesuserstest_that("missing values in input are missing in output", { dt <- lazy_dt(tibble(x = c(NA, "a b")), "DT") step <- separate(dt, x, c("x", "y")) out <- collect(step) expect_equal( show_query(step), expr(copy(DT)[, `:=`(!!c("x", "y"), tstrsplit(x, split = "[^[:alnum:]]+"))]) ) expect_equal(out$x, c(NA, "a")) expect_equal(out$y, c(NA, "b")) }) test_that("convert produces integers etc", { dt <- lazy_dt(tibble(x = "1-1.5-FALSE"), "DT") step <- separate(dt, x, c("x", "y", "z"), "-", convert = TRUE) out <- collect(step) expect_equal( show_query(step), expr(copy(DT)[, `:=`(!!c("x", "y", "z"), tstrsplit(x, split = "-", type.convert = TRUE))]) ) expect_equal(out$x, 1L) expect_equal(out$y, 1.5) expect_equal(out$z, FALSE) }) test_that("overwrites existing columns", { dt <- lazy_dt(tibble(x = "a:b"), "DT") step <- dt %>% separate(x, c("x", "y")) out <- collect(step) expect_equal( show_query(step), expr(copy(DT)[, `:=`(!!c("x", "y"), tstrsplit(x, split = "[^[:alnum:]]+"))]) ) expect_equal(step$vars, c("x", "y")) expect_equal(out$x, "a") }) test_that("drops NA columns", { dt <- lazy_dt(tibble(x = c(NA, "a-b", "c-d")), "DT") step <- separate(dt, x, c(NA, "y"), "-") out <- collect(step) expect_equal(step$vars, "y") expect_equal(out$y, c(NA, "b", "d")) }) test_that("checks type of `into` and `sep`", { dt <- lazy_dt(tibble(x = "a:b"), "DT") expect_snapshot( separate(dt, x, "x", FALSE), error = TRUE ) expect_snapshot( separate(dt, x, FALSE), error = TRUE ) }) test_that("only copies when necessary", { dt <- tibble(x = paste(letters[1:3], letters[1:3], sep = "-"), y = 1:3) %>% lazy_dt("DT") step <- dt %>% filter(y < 4) %>% separate(x, into = c("left", "right"), sep = "-") expect_equal( show_query(step), expr(DT[y < 4][, `:=`(!!c("left", "right"), tstrsplit(x, split = "-"))][, `:=`("x", NULL)]) ) }) test_that("can pass quosure to `col` arg, #359", { dt <- lazy_dt(tibble(combined = c("a_b", "a_b")), "DT") separate2 <- function(df, col, into) { collect(separate(df, {{ col }}, into)) } out <- separate2(dt, combined, into = c("a", "b")) expect_named(out, c("a", "b")) expect_equal(out$a, c("a", "a")) expect_equal(out$b, c("b", "b")) }) test_that("can use numeric `col` 
arg", { dt <- lazy_dt(tibble(combined = c("a_b", "a_b")), "DT") out <- collect(separate(dt, 1, into = c("a", "b"))) expect_named(out, c("a", "b")) expect_equal(out$a, c("a", "a")) expect_equal(out$b, c("b", "b")) }) test_that("errors on multiple columns in `col`", { dt <- lazy_dt(tibble(x = c("a_b", "a_b"), y = x), "DT") expect_error(separate(dt, c(x, y), into = c("left", "right")), "must select exactly one column") }) dtplyr/tests/testthat/test-step-first.R0000644000176200001440000000335214021443044017771 0ustar liggesuserstest_that("constructor has sensible defaults", { dt <- data.table(x = 1:2, y = 1:2) step <- step_first(dt) expect_s3_class(step, "dtplyr_step_first") expect_equal(step$parent, dt) expect_equal(step$vars, c("x", "y")) expect_equal(step$groups, character()) expect_match(as.character(step$name), "_DT") }) # mutability -------------------------------------------------------------- test_that("doesn't need copy", { dt <- lazy_dt(mtcars) expect_false(dt$needs_copy) }) test_that("mutable object must be a data table", { expect_error(lazy_dt(mtcars, immutable = FALSE), "not already a data table") }) test_that("mutable object never needs copy", { dt <- lazy_dt(as.data.table(mtcars), immutable = FALSE) expect_false(dt$needs_copy) expect_false(dt %>% mutate(x = 1) %>% .$needs_copy) }) test_that("dt_call() copies if requested", { dt <- lazy_dt(mtcars, name = "DT") expect_equal(dt_call(dt, FALSE), quote(DT)) expect_equal(dt_call(dt, TRUE), quote(copy(DT))) }) test_that("lazy_dt doesn't copy input", { dt <- data.table(x = 1) lz <- lazy_dt(dt) expect_equal(data.table::address(dt), data.table::address(lz$parent)) }) # keys -------------------------------------------------------------------- test_that("can set keys", { dt <- lazy_dt(mtcars, key_by = cyl) expect_equal(data.table::key(dt$parent), "cyl") }) test_that("setting doesn't modify data.table", { dt1 <- data.table(x = c(5, 1, 2)) dt2 <- lazy_dt(dt1, key_by = x) expect_equal(data.table::key(dt1$parent), NULL) expect_equal(data.table::key(dt2$parent), "x") }) # groups ------------------------------------------------------------------ test_that("keeps groups", { dt <- lazy_dt(group_by(mtcars, cyl)) expect_equal(group_vars(dt), "cyl") }) dtplyr/tests/testthat/helpers-library.R0000644000176200001440000000011614013221715020013 0ustar liggesuserslibrary(dplyr, warn.conflicts = FALSE) library(tidyr, warn.conflicts = FALSE) dtplyr/tests/testthat/test-complete.R0000644000176200001440000000125714031070705017504 0ustar liggesuserstest_that("complete with no variables returns data as is", { mtcars_dt <- lazy_dt(mtcars, "DT") expect_equal(complete(mtcars_dt), mtcars_dt) }) test_that("basic invocation works", { tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) dt <- lazy_dt(tbl, "DT") out <- dt %>% complete(x, y) %>% collect() expect_equal(nrow(out), 4) expect_equal(out$z, c(3, NA, NA, 4)) }) test_that("empty expansion returns original", { tbl <- tibble(x = character()) dt <- lazy_dt(tbl, "DT") out <- dt %>% complete(y = NULL) %>% collect() expect_equal(out, tbl) tbl <- tibble(x = 1:4) dt <- lazy_dt(tbl, "DT") out <- dt %>% complete(y = NULL) %>% collect() expect_equal(out, tbl) }) dtplyr/tests/testthat/test-step.R0000644000176200001440000000425714126601265016660 0ustar liggesuserstest_that("tbl metadata as expected", { dt <- lazy_dt(data.table(x = c(1, 1, 1, 2, 2, 3)), "DT") expect_equal(dim(dt), c(6, 1)) expect_equal(as.character(tbl_vars(dt)), "x") expect_equal(show_query(dt), expr(DT)) }) test_that("group metadata as expected", { dt <- 
lazy_dt(data.table(x = c(1, 1, 1, 2, 2, 3))) expect_equal(group_vars(dt), character()) expect_equal(groups(dt), list()) expect_equal(group_size(dt), 6) expect_equal(n_groups(dt), 1) gt <- group_by(dt, x) expect_equal(group_vars(gt), c("x")) expect_equal(groups(gt), syms("x")) expect_equal(group_size(gt), c(3, 2, 1)) expect_equal(n_groups(gt), 3) }) test_that("has useful display methods", { expect_snapshot({ dt <- lazy_dt(mtcars, "DT") dt dt %>% group_by(vs, am) dt %>% mutate(y = 10) %>% compute("DT2") }) }) test_that("can evaluate to any data frame type", { dt <- lazy_dt(mtcars, "DT") expect_identical(class(as.data.frame(dt)), "data.frame") expect_s3_class(as.data.table(dt), "data.table") expect_s3_class(as_tibble(dt), "tbl_df") expect_s3_class(collect(dt), "tbl_df") }) test_that("compute returns lazy_dt", { dt <- lazy_dt(mtcars, "DT") dt <- summarise(dt, n = n()) dt2 <- compute(dt) expect_s3_class(dt2, "dtplyr_step") expect_equal(as.character(tbl_vars(dt2)), "n") }) test_that("collect and compute return grouped data", { dt <- group_by(lazy_dt(data.table(x = 1, y = 1), "DT"), x) expect_equal(dt %>% compute() %>% group_vars(), "x") expect_equal(dt %>% collect() %>% group_vars(), "x") }) # pull() ------------------------------------------------------------------ test_that("pull default extracts last var from data frame", { df <- lazy_dt(tibble(x = 1:10, y = 1:10), "DT") expect_equal(pull(df), 1:10) }) test_that("can extract by name, or positive/negative position", { x <- 1:10 df <- lazy_dt(tibble(x = x, y = runif(10)), "DT") expect_equal(pull(df, x), x) expect_equal(pull(df, 1), x) expect_equal(pull(df, -2L), x) }) test_that("can extract named vectors", { x <- 1:10 y <- letters[x] df <- lazy_dt(tibble(x = x, y = y), "DT") xn <- set_names(x, y) expect_equal(pull(df, x, y), xn) expect_equal(pull(df, 1, 2), xn) }) dtplyr/tests/testthat/test-step-subset-transmute.R0000644000176200001440000001233614300165007022171 0ustar liggesuserstest_that("simple calls generate expected translations", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% transmute(x) %>% show_query(), expr(DT[, .(x = x)]) ) }) test_that("transmute generates compound expression if needed", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% transmute(x2 = x * 2, x4 = x2 * 2) %>% show_query(), expr(DT[, { x2 <- x * 2 x4 <- x2 * 2 .(x2, x4) }]) ) }) test_that("allows multiple assignment to the same variable", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") # when nested expect_equal( dt %>% transmute(x = x * 2, x = x * 2) %>% show_query(), expr(DT[, { x <- x * 2 x <- x * 2 .(x) }]) ) # when not nested expect_equal( dt %>% transmute(z = 2, y = 3) %>% show_query(), expr(DT[, .(z = 2, y = 3)]) ) }) test_that("groups are respected", { dt <- lazy_dt(data.table(x = 1), "DT") %>% group_by(x) %>% transmute(y = 2) expect_equal(dt$vars, c("x", "y")) expect_equal( dt %>% show_query(), expr(DT[, .(y = 2), keyby = .(x)]) ) }) test_that("grouping vars can be transmuted", { dt <- lazy_dt(data.table(x = 1), "DT") %>% group_by(x) %>% transmute(x = x + 1, y = 2) expect_equal(dt$vars, c("x", "y")) expect_equal(dt$groups, "x") expect_equal( dt %>% show_query(), expr(copy(DT)[, `:=`(x = x + 1)][, .(y = 2), keyby = .(x)]) ) skip("transmuting grouping vars with nesting is not supported") dt <- lazy_dt(data.table(x = 1), "DT") %>% group_by(x) %>% transmute(x = x + 1, y = x + 1, x = y + 1) expect_equal(dt$vars, c("x", "y")) expect_equal( dt %>% collect(), tibble(x = 4, y = 3) %>% group_by(x) ) }) test_that("empty 
transmute works", { dt <- lazy_dt(data.frame(x = 1), "DT") expect_equal(transmute(dt) %>% show_query(), expr(DT[, 0L])) expect_equal(transmute(dt)$vars, character()) expect_equal(transmute(dt, !!!list()) %>% show_query(), expr(DT[, 0L])) dt_grouped <- lazy_dt(data.frame(x = 1), "DT") %>% group_by(x) expect_equal(transmute(dt_grouped)$vars, "x") }) test_that("only transmuting groups works", { dt <- lazy_dt(data.frame(x = 1)) %>% group_by(x) expect_equal(transmute(dt, x) %>% collect(), dt %>% collect()) expect_equal(transmute(dt, x)$vars, "x") }) test_that("across() can access previously created variables", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- transmute(dt, y = 2, across(y, sqrt)) expect_equal( collect(step), tibble(y = sqrt(2)) ) expect_equal( show_query(step), expr(DT[, { y <- 2 y <- sqrt(y) .(y) }]) ) }) test_that("new columns take precedence over global variables", { dt <- lazy_dt(data.frame(x = 1), "DT") y <- 'global var' step <- transmute(dt, y = 2, z = y + 1) expect_equal( collect(step), tibble(y = 2, z = 3) ) expect_equal( show_query(step), expr(DT[, { y <- 2 z <- y + 1 .(y, z) }]) ) }) # var = NULL ------------------------------------------------------------- test_that("var = NULL when var is in original data", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- dt %>% transmute(x = 2, z = x*2, x = NULL) expect_equal( collect(step), tibble(z = 4) ) expect_equal( step$vars, "z" ) expect_equal( show_query(step), expr(DT[, { x <- 2 z <- x * 2 .(x, z) }][, `:=`("x", NULL)]) ) }) test_that("var = NULL when var is in final output", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- transmute(dt, y = NULL, y = 3) expect_equal( collect(step), tibble(y = 3) ) expect_equal( show_query(step), expr(DT[, { y <- NULL y <- 3 .(y) }]) ) }) test_that("temp var with nested arguments", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- transmute(dt, y = 2, z = y*2, y = NULL) expect_equal( collect(step), tibble(z = 4) ) expect_equal( step$vars, "z" ) expect_equal( show_query(step), expr(DT[, { y <- 2 z <- y * 2 .(y, z) }][, `:=`("y", NULL)]) ) }) test_that("temp var with no new vars added", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- transmute(dt, y = 2, y = NULL) expect_equal( collect(step), tibble() ) expect_equal( step$vars, character() ) expect_equal( show_query(step), expr(DT[, { y <- 2 .(y) }][, `:=`("y", NULL)]) ) }) test_that("var = NULL works when data is grouped", { dt <- lazy_dt(data.frame(x = 1, g = 1), "DT") %>% group_by(g) # when var is in original data step <- dt %>% transmute(x = 2, z = x*2, x = NULL) expect_equal( collect(step), tibble(g = 1, z = 4) %>% group_by(g) ) expect_equal( step$vars, c("g", "z") ) expect_equal( show_query(step), expr(DT[, { x <- 2 z <- x * 2 .(x, z) }, keyby = .(g)][, `:=`("x", NULL)]) ) # when var is not in original data step <- transmute(dt, y = 2, z = y*2, y = NULL) expect_equal( collect(step), tibble(g = 1, z = 4) %>% group_by(g) ) expect_equal( step$vars, c("g", "z") ) expect_equal( show_query(step), expr(DT[, { y <- 2 z <- y * 2 .(y, z) }, keyby = .(g)][, `:=`("y", NULL)]) ) }) dtplyr/tests/testthat/test-step-subset-expand.R0000644000176200001440000000564514126601265021442 0ustar liggesuserstest_that("expand completes all values", { tbl <- tibble(x = 1:2, y = 1:2) dt <- lazy_dt(tbl, "DT") step <- expand(dt, x, y) out <- collect(step) expect_equal( show_query(step), expr(DT[, CJ(x = x, y = y, unique = TRUE)]) ) expect_equal(step$vars, c("x", "y")) expect_equal(nrow(out), 4) }) test_that("multiple variables in one arg doesn't expand", { tbl <- 
tibble(x = 1:2, y = 1:2) dt <- lazy_dt(tbl, "DT") step <- expand(dt, c(x, y)) out <- collect(step) expect_equal(nrow(out), 2) }) test_that("works with unnamed vectors", { tbl <- tibble(x = 1:2, y = 1:2) dt <- lazy_dt(tbl, "DT") step <- expand(dt, x, 1:2) out <- collect(step) expect_equal( show_query(step), expr(DT[, CJ(x = x, V2 = 1:2, unique = TRUE)]) ) expect_equal(step$vars, c("x", "V2")) expect_equal(nrow(out), 4) }) test_that("works with named vectors", { tbl <- tibble(x = 1:2, y = 1:2) dt <- lazy_dt(tbl, "DT") step <- expand(dt, x, val = 1:2) out <- collect(step) expect_equal( show_query(step), expr(DT[, CJ(x = x, val = 1:2, unique = TRUE)]) ) expect_equal(step$vars, c("x", "val")) expect_equal(nrow(out), 4) }) test_that("expand respects groups", { tbl <- tibble( a = c(1L, 1L, 2L), b = c(1L, 2L, 1L), c = c(2L, 1L, 1L) ) dt <- lazy_dt(tbl, "DT") step <- dt %>% group_by(c) %>% expand(a, b) out <- collect(step) expect_equal( show_query(step), expr(DT[, CJ(a = a, b = b, unique = TRUE), keyby = .(c)]) ) expect_equal(step$vars, c("c", "a", "b")) expect_equal(out$a, c(1, 1, 2, 2, 1)) expect_equal(out$b, c(1, 2, 1, 2, 1)) }) test_that("expand handles group variables as arguments", { dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3), "DT") # single group var, not redefined res <- dt %>% group_by(x) %>% expand(x, y) expect_equal( show_query(res), expr(DT[, CJ(x = x, y = y, unique = TRUE), keyby = .(x)][, `:=`("x", NULL)]) ) expect_equal( res$groups, "x" ) # multiple group vars, not redefined res <- dt %>% group_by(x, y) %>% expand(x, y, z) expect_equal( show_query(res), expr(DT[, CJ(x = x, y = y, z = z, unique = TRUE), keyby = .(x, y) ][, !!expr(!!c("x", "y") := NULL)]) ) expect_equal( res$groups, c("x", "y") ) # redefined group var res <- dt %>% group_by(x) %>% expand(x = 5, y) expect_equal( show_query(res), expr(DT[, CJ(x = 5, y = y, unique = TRUE), keyby = .(x)][, `:=`("x", NULL)]) ) expect_equal( res$groups, c("x") ) expect_equal( as_tibble(res), tibble(x = 5, y = 2) ) }) test_that("NULL inputs", { tbl <- tibble(x = 1:5) dt <- lazy_dt(tbl, "DT") step <- expand(dt, x, y = NULL) out <- collect(step) expect_equal(out, tbl) }) test_that("expand respects .name_repair", { dt <- lazy_dt(tibble(x = 1:2), "DT") suppressMessages( expect_named(dt %>% expand(x, x, .name_repair = "unique") %>% collect(), c("x...1", "x...2")) ) }) dtplyr/tests/testthat/test-tidyeval-across.R0000644000176200001440000001744514406327117021023 0ustar liggesuserstest_that("across() translates NULL", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_across(dt, across(a:b)), list(a = expr(a), b = expr(b)) ) }) test_that("across() drops groups", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_across(group_by(dt, a), across(everything())), list(b = expr(b)) ) expect_equal( capture_across(group_by(dt, b), across(everything())), list(a = expr(a)) ) }) test_that("across() translates functions", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_across(dt, across(a:b, log)), exprs(a = log(a), b = log(b)) ) expect_equal( capture_across(dt, across(a:b, log, base = 2)), exprs(a = log(a, base = 2), b = log(b, base = 2)) ) expect_equal( capture_across(dt, across(a, list(log, exp))), exprs(a_1 = log(a), a_2 = exp(a)) ) }) test_that("across() captures anonymous functions", { dt <- lazy_dt(data.frame(a = 1)) expect_equal( capture_across(dt, across(a, function(x) log(x))), list(a = call2(function(x) log(x), quote(a))) ) }) test_that("dots are translated too", { fun <- function() { dt <- 
lazy_dt(data.frame(a = 1, b = 2)) z <- TRUE capture_across(dt, across(a, mean, na.rm = z)) } expect_equal(fun(), exprs(a = mean(a, na.rm = TRUE))) }) test_that("across() translates formulas", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_across(dt, across(a:b, ~ log(.x))), exprs(a = log(a), b = log(b)) ) # and recursively translates expect_equal( capture_across(dt, across(a, ~ .x / n())), exprs(a = a / .N) ) expect_equal( capture_across(dt, across(a:b, ~2)), exprs(a = 2, b = 2) ) expect_equal( capture_across(dt, across(a:b, list(~log(.x)))), exprs(a_1 = log(a), b_1 = log(b)) ) }) test_that("across() does not support formulas with dots", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_snapshot({ (expect_error(capture_across(dt, across(a:b, ~log(.x, base = .y), base = 2)))) (expect_error(capture_across(dt, across(a:b, list(~log(.x, base = .y)), base = 2)))) }) }) test_that("across() gives informative errors", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_snapshot(error = TRUE, { capture_across(dt, across(a, 1)) capture_across(dt, across(a, list(1))) }) }) test_that("across() can use named selections", { dt <- lazy_dt(data.frame(x = 1, y = 2)) # no fns expect_equal( capture_across(dt, across(c(a = x, b = y))), list(a = quote(x), b = quote(y)) ) expect_equal( capture_across(dt, across(all_of(c(a = "x", b = "y")))), list(a = quote(x), b = quote(y)) ) # one fn expect_equal( capture_across(dt, across(c(a = x, b = y), mean)), list(a = quote(mean(x)), b = quote(mean(y))) ) expect_equal( capture_across(dt, across(all_of(c(a = "x", b = "y")), mean)), list(a = quote(mean(x)), b = quote(mean(y))) ) # multiple fns expect_equal( capture_across(dt, across(c(a = x, b = y), list(mean, nm = sum))), list( a_1 = quote(mean(x)), a_nm = quote(sum(x)), b_1 = quote(mean(y)), b_nm = quote(sum(y)) ) ) expect_equal( capture_across(dt, across(all_of(c(a = "x", b = "y")), list(mean, nm = sum))), list( a_1 = quote(mean(x)), a_nm = quote(sum(x)), b_1 = quote(mean(y)), b_nm = quote(sum(y)) ) ) }) test_that("across() can handle empty selection", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% mutate(across(character(), c)) %>% show_query(), expr(DT) ) }) test_that("across() .cols is evaluated in across()'s calling environment", { dt <- lazy_dt(data.frame(y = 1)) fun <- function(x) capture_across(dt, across(all_of(x))) expect_equal( fun("y"), list(y = expr(y)) ) }) test_that("across() output can be used as a data frame", { df <- lazy_dt(tibble(x = 1:3, y = 1:3, z = c("a", "a", "b"))) res <- df %>% mutate(across_df = rowSums(across(c(x, y), ~ .x + 1))) %>% collect() expect_named(res, c("x", "y", "z", "across_df")) expect_equal(res$across_df, c(4, 6, 8)) expr <- dt_squash(expr(across(c(x, y), ~ .x + 1)), df$env, df, is_top = FALSE) expect_equal(expr, expr(data.table(x = x + 1, y = y + 1))) }) test_that("pick() works", { df <- lazy_dt(tibble(x = 1:3, y = 1:3, z = c("a", "a", "b"))) res <- df %>% mutate(row_sum = rowSums(pick(x, y))) %>% collect() expect_named(res, c("x", "y", "z", "row_sum")) expect_equal(res$row_sum, c(2, 4, 6)) expr <- dt_squash(expr(pick(x, y)), df$env, df, is_top = FALSE) expect_equal(expr, expr(data.table(x = x, y = y))) # Top level pick works expect_equal(group_by(df, pick(x, y))$groups, c("x", "y")) }) test_that("`across()` ignores variables in `.by`, #412", { dt <- lazy_dt(data.table(x = 1:3, y = c("a", "a", "b"))) step <- dt %>% mutate(across(everything(), ~ .x + 1), .by = y) expect_equal(as_tibble(step), tibble(x = 2:4, y = c("a", "a", "b"))) 
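  # `.by` supplies per-operation grouping, so the result should come back ungrouped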
expect_true(length(step$groups) == 0) step <- dt %>% summarize(across(everything(), sum), .by = y) expect_equal(as_tibble(step), tibble(y = c("a", "b"), x = c(3, 3))) }) # if_all ------------------------------------------------------------------ test_that("if_all collapses multiple expressions", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal(capture_if_all(dt, if_all(everything(), is.na)), expr(is.na(a) & is.na(b))) }) test_that("if_all works without `.fns` argument", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal(capture_if_all(dt, if_all(c(a:b))), expr(a & b)) }) test_that("if_all() drops groups", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_if_all(group_by(dt, a), if_all(everything())), sym("b") ) expect_equal( capture_if_all(group_by(dt, b), if_all(everything())), sym("a") ) }) test_that("if_all() translates functions", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_if_all(dt, if_all(a:b, log)), expr(log(a) & log(b)) ) expect_equal( capture_if_all(dt, if_all(a:b, log, base = 2)), expr(log(a, base = 2) & log(b, base = 2)) ) expect_equal( capture_if_all(dt, if_all(a, list(log, exp))), expr(log(a) & exp(a)) ) }) test_that("if_all() captures anonymous functions", { dt <- lazy_dt(data.frame(a = 1)) expect_equal( capture_if_all(dt, if_all(a, function(x) log(x))), call2(function(x) log(x), quote(a)) ) }) test_that("if_all() translates dots", { fun <- function() { dt <- lazy_dt(data.frame(a = 1, b = 2)) z <- TRUE capture_if_all(dt, if_all(a, mean, na.rm = z)) } expect_equal(fun(), expr(mean(a, na.rm = TRUE))) }) test_that("if_all() translates formulas", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_equal( capture_if_all(dt, if_all(a:b, ~ log(.x))), expr(log(a) & log(b)) ) expect_equal( capture_if_all(dt, if_all(a:b, ~2)), expr(2 & 2) ) expect_equal( capture_if_all(dt, if_all(a:b, list(~log(.x)))), expr(log(a) & log(b)) ) }) test_that("if_all() gives informative errors", { dt <- lazy_dt(data.frame(a = 1, b = 2)) expect_snapshot(error = TRUE, { capture_if_all(dt, if_all(a, 1)) capture_if_all(dt, if_all(a, list(1))) }) }) test_that("if_all() cannot rename variables", { dt <- lazy_dt(data.frame(x = 1, y = 2)) # no fns expect_snapshot( (expect_error(capture_if_all(dt, if_all(c(a = x, b = y))))) ) }) test_that("if_all() can handle empty selection", { skip("tidyselect issue #221") dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% mutate(if_all(character(), c)) %>% show_query(), expr(DT) ) }) test_that("if_all() .cols is evaluated in across()'s calling environment", { dt <- lazy_dt(data.frame(y = 1)) fun <- function(x) capture_if_all(dt, if_all(all_of(x))) expect_equal( fun("y"), expr(y) ) }) dtplyr/tests/testthat/test-fill.R0000644000176200001440000000204214015730574016624 0ustar liggesuserstest_that("missings are filled correctly & translations are correct", { tbl <- tibble(x = c(NA, 1, NA, 2, NA, NA)) dt <- lazy_dt(tbl, "DT") step <- fill(dt, x) expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = nafill(x, "locf"))])) expect_equal(collect(step)$x, c(NA, 1, 1, 2, 2, 2)) step <- fill(dt, x, .direction = "up") expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = nafill(x, "nocb"))])) expect_equal(collect(step)$x, c(1, 1, 2, 2, NA, NA)) step <- fill(dt, x, .direction = 'downup') expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = nafill(nafill(x, "locf"), "nocb"))])) expect_equal(collect(step)$x, c(1, 1, 1, 2, 2, 2)) step <- fill(dt, x, .direction = 'updown') expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = 
nafill(nafill(x, "nocb"), "locf"))])) expect_equal(collect(step)$x, c(1, 1, 2, 2, 2, 2)) }) test_that("auto-conversion to lazy_dt works as intended", { dt <- data.table(x = c(NA, 1, NA, 2, NA, NA)) out <- collect(fill(dt, x)) expect_equal(out$x, c(NA, 1, 1, 2, 2, 2)) }) dtplyr/tests/testthat/test-step-set.R0000644000176200001440000000203514300165007017432 0ustar liggesuserstest_that("basic ops generate expected translation", { dt1 <- lazy_dt(data.frame(x = 1:3), "dt1") dt2 <- lazy_dt(data.frame(x = 2L), "dt2") expect_equal( dt1 %>% intersect(dt2) %>% show_query(), expr(fintersect(dt1, dt2)) ) expect_equal( dt1 %>% union(dt2) %>% show_query(), expr(funion(dt1, dt2)) ) expect_equal( dt1 %>% union_all(dt2) %>% show_query(), expr(funion(dt1, dt2, all = TRUE)) ) expect_equal( dt1 %>% setdiff(dt2) %>% show_query(), expr(fsetdiff(dt1, dt2)) ) }) test_that("joins captures locals from both parents", { dt1 <- lazy_dt(data.frame(x = 1)) %>% mutate(y = 1) %>% compute("D1") dt2 <- lazy_dt(data.frame(x = 1)) %>% mutate(z = 1) %>% compute("D2") expect_named(intersect(dt1, dt2)$locals, c("D1", "D2")) }) test_that("vars set correctly", { # data.table functions require the inputs to have same columns dt1 <- lazy_dt(data.frame(x = 1, y = 2), "dt1") dt2 <- lazy_dt(data.frame(x = 2, y = 2), "dt2") expect_equal(dt1 %>% union(dt2) %>% .$vars, c("x", "y")) }) dtplyr/tests/testthat/test-step-subset.R0000644000176200001440000000144414006775461020165 0ustar liggesuserstest_that("construtor has sensible defaults", { first <- step_first(data.table(x = 1), "DT") step <- step_subset(first) expect_s3_class(step, "dtplyr_step_subset") expect_equal(step$parent, first) expect_equal(step$vars, "x") expect_equal(step$groups, character()) expect_equal(step$i, NULL) expect_equal(step$j, NULL) }) test_that("generates expected calls", { first <- lazy_dt(data.table(x = 1), "DT") ungrouped <- step_subset(first, i = quote(i), j = quote(j)) expect_equal(dt_call(ungrouped), expr(DT[i, j])) with_i <- step_subset(first, i = quote(i), j = quote(j), groups = "x") expect_equal(dt_call(with_i), expr(DT[i, j, keyby = .(x)])) without_i <- step_subset(first, j = quote(j), groups = "x") expect_equal(dt_call(without_i), expr(DT[, j, keyby = .(x)])) }) dtplyr/tests/testthat/test-count.R0000644000176200001440000000607214300165007017023 0ustar liggesusers test_that("can be used grouped or ungrouped", { dt <- lazy_dt(data.table(x = c(1, 1, 1, 2)), "DT") expect_equal( dt %>% count(x) %>% collect(), tibble(x = c(1, 2), n = c(3, 1)) ) expect_equal( dt %>% group_by(x) %>% count() %>% collect(), tibble(x = c(1, 2), n = c(3, 1)) %>% group_by(x) ) }) test_that("can control name", { dt <- lazy_dt(data.table(x = c(1, 1, 1, 2)), "DT") expect_equal( dt %>% count(x, name = "y") %>% collect(), tibble(x = c(1, 2), y = c(3, 1)) ) expect_snapshot( dt %>% count(name = 10) %>% collect(), error = TRUE ) }) test_that("name can match existing group var", { dt <- lazy_dt(data.table(a = 2)) expect_equal( dt %>% group_by(a) %>% tally(name = 'a') %>% collect(), tibble(a = 1) ) expect_equal( dt %>% count(a, name = 'a') %>% collect(), tibble(a = 1) ) }) test_that("can weight", { dt <- lazy_dt(data.table(x = c(1, 1, 2), y = c(1, 2, 10)), "DT") expect_equal( dt %>% count(x, wt = y) %>% collect(), tibble(x = c(1, 2), n = c(3, 10)) ) expect_equal( dt %>% add_count(x, wt = y) %>% collect(), dt %>% mutate(n = c(3, 3, 10)) %>% collect() ) }) test_that("can sort", { dt <- lazy_dt(data.table(x = c(1, 1, 2), y = c(1, 2, 10)), "DT") expect_equal( dt %>% count(x, wt = y, sort = TRUE) %>% 
collect(), tibble(x = c(2, 1), n = c(10, 3)) ) expect_equal( dt %>% add_count(x, wt = y, sort = TRUE) %>% collect(), tibble(x = c(2, 1, 1), y = c(10, 1, 2), n = c(10, 3, 3)) ) }) test_that("tally works", { dt <- lazy_dt(data.table(x = c(1, 1, 1, 2)), "DT") expect_equal( dt %>% group_by(x) %>% tally() %>% collect(), tibble(x = c(1, 2), n = c(3, 1)) ) }) test_that("informs if n column already present, unless overridden", { dt <- lazy_dt(data.frame(n = c(1, 1, 2, 2, 2))) expect_message(out <- count(dt, n), "already present") expect_named(as_tibble(out), c("n", "nn")) # not a good idea, but supported expect_message(out <- count(dt, n, name = "n"), NA) expect_named(as_tibble(out), "n") expect_message(out <- count(dt, n, name = "nn"), NA) expect_named(as_tibble(out), c("n", "nn")) dt <- lazy_dt(data.frame(n = c(1, 1, 2, 2, 2), nn = 1:5)) expect_message(out <- count(dt, n), "already present") expect_named(as_tibble(out), c("n", "nn")) expect_message(out <- count(dt, n, nn), "already present") expect_named(as_tibble(out), c("n", "nn", "nnn")) }) test_that("name must be string", { dt <- lazy_dt(data.frame(x = c(1, 2))) expect_error(count(dt, x, name = 1), "string") expect_error(count(dt, x, name = letters), "string") }) # add_count --------------------------------------------------------------- test_that("add_count() gives expected calls and groups", { dt <- lazy_dt(data.frame(g = c(1, 2, 2, 2)), "DT") res <- dt %>% add_count(g) expect_equal(show_query(res), expr(copy(DT)[, `:=`(n = .N), by = .(g)])) expect_equal(res$groups, character()) res <- dt %>% group_by(g) %>% add_count() expect_equal(show_query(res), expr(copy(DT)[, `:=`(n = .N), by = .(g)])) expect_equal(res$groups, "g") }) dtplyr/tests/testthat/test-step-join.R0000644000176200001440000002375514300165007017612 0ustar liggesuserstest_that("dt_sources captures all tables", { dt1 <- lazy_dt(data.frame(x = 1), "dt1") dt2 <- lazy_dt(data.frame(x = 2), "dt2") dt3 <- lazy_dt(data.frame(x = 3), "dt3") out <- dt1 %>% left_join(dt2, by = "x") %>% left_join(dt3, by = "x") expect_equal( dt_sources(out)[c("dt1", "dt2", "dt3")], list(dt1 = dt1$parent, dt2 = dt2$parent, dt3 = dt3$parent) ) }) test_that("joins captures locals from both parents", { dt1 <- lazy_dt(data.frame(x = 1)) %>% mutate(y = 1) %>% compute("D1") dt2 <- lazy_dt(data.frame(x = 1)) %>% mutate(z = 1) %>% compute("D2") expect_named(left_join(dt1, dt2, by = "x")$locals, c("D1", "D2")) expect_named(inner_join(dt1, dt2, by = "x")$locals, c("D1", "D2")) }) # dplyr verbs ------------------------------------------------------------- test_that("simple usage generates expected translation", { dt1 <- lazy_dt(tibble(x = 1, y = 2, a = 3), "dt1") dt2 <- lazy_dt(tibble(x = 1, y = 2, b = 4), "dt2") expect_equal( dt1 %>% left_join(dt2, by = "x") %>% show_query(), expr( setnames( setcolorder( dt2[dt1, on = .(x), allow.cartesian = TRUE], !!c(1L, 4L, 5L, 2L, 3L) ), !!c("i.y", "y"), !!c("y.x", "y.y") ) ) ) expect_equal( dt1 %>% right_join(dt2, by = "x") %>% show_query(), expr( setnames( dt1[dt2, on = .(x), allow.cartesian = TRUE], !!c("y", "i.y"), !!c("y.x", "y.y") ) ) ) expect_equal( dt1 %>% inner_join(dt2, by = "x") %>% show_query(), expr( setnames( dt1[dt2, on = .(x), nomatch = NULL, allow.cartesian = TRUE], !!c("y", "i.y"), !!c("y.x", "y.y") ) ) ) expect_equal( dt1 %>% full_join(dt2, by = "x") %>% show_query(), expr(merge(dt1, dt2, all = TRUE, by.x = "x", by.y = "x", allow.cartesian = TRUE)) ) expect_equal( dt1 %>% anti_join(dt2, by = "x") %>% show_query(), expr(dt1[!dt2, on = .(x)]) ) expect_equal( dt1 
%>% semi_join(dt2, by = "x") %>% show_query(), expr(dt1[unique(dt1[dt2, which = TRUE, nomatch = NULL, on = .(x)])]) ) }) test_that("full_join produces correct names with default suffix", { ### names are set correctly for basic join df1 <- tibble(a = "a", b = "b.x") df2 <- tibble(a = "a", b = "b.y") dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") expect_equal( full_join(dt1, dt2, by = "a") %>% collect, full_join(df1, df2, by = "a") ) ### names are set correctly for join which requires `setnames()` # data.table: use merge which simply appends the corresponding suffix # producing duplicates # dplyr: appends suffix until name is unique df1 <- tibble(a = "a", b = "b.x", b.x = "b.x.x.x") df2 <- tibble(a = "a", b = "b.y", b.x.x = "b.x.x") dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") joined_dt <- full_join(dt1, dt2, by = "a") expected <- full_join(df1, df2, by = "a") expect_equal( joined_dt %>% .$vars, colnames(expected) ) # suppress warning created by `data.table::merge()` expect_equal( suppressWarnings(joined_dt %>% collect()), expected ) }) test_that("full_join produces correct names with user-supplied suffix", { ### names are set correctly for basic join df1 <- tibble(a = "a", b = "b.x") df2 <- tibble(a = "a", b = "b.y") dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") expect_equal( full_join(dt1, dt2, by = "a", suffix = c(".one", ".two")) %>% collect, full_join(df1, df2, by = "a", suffix = c(".one", ".two")) ) ### names are set correctly for join which requires `setnames()` # data.table: use merge which simply appends the corresponding suffix # producing duplicates # dplyr: appends suffix until name is unique df1 <- tibble(a = "a", b = "b.one", b.one = "b.one.one.one") df2 <- tibble(a = "a", b = "b.two", b.one.one = "b.one.one") dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") joined_dt <- full_join(dt1, dt2, by = "a", suffix = c(".one", ".two")) expected <- full_join(df1, df2, by = "a", suffix = c(".one", ".two")) expect_equal( joined_dt %>% .$vars, colnames(expected) ) # suppress warning created by `data.table::merge()` expect_equal( suppressWarnings(joined_dt %>% collect()), expected ) }) test_that("join can handle `by` where order doesn't match input", { dt1 <- lazy_dt(tibble(a = "a", b = "b", c = "c"), name = "dt1") dt2 <- lazy_dt(tibble(a = "a", b = "b", c = "c", d = "d"), name = "dt2") dt3 <- left_join(dt1, dt2, by = c("c", "b", "a")) expect_equal(dt3$vars, letters[1:4]) expect_equal(collect(dt3), collect(dt2)) dt4 <- full_join(dt1, dt2, by = c("c", "b", "a")) expect_equal(dt4$vars, letters[1:4]) expect_equal(collect(dt4), collect(dt2)) dt5 <- left_join(dt1, dt2, by = c("c", "b")) expect_equal( collect(dt5), tibble(a.x = "a", b = "b", c = "c", a.y = "a", d = "d") ) }) test_that("left_join produces correct names", { # data.table: uses y[x] which prefixes `x` vars with "i." and if name is not # unique it appends "." 
with the smallest number without a collision # dplyr: appends suffix until name is unique df1 <- tibble(a = "a", b = "b.x", i.b = "i.b") df2 <- tibble(a = "a", b = "b.y") dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") joined_dt <- left_join(dt1, dt2, by = "a") expected <- left_join(df1, df2, by = "a") %>% colnames() expect_equal( joined_dt %>% .$vars, expected ) expect_equal( joined_dt %>% collect() %>% colnames(), expected ) }) test_that("named by converted to by.x and by.y", { dt1 <- lazy_dt(data.frame(a1 = 1:3, z = 1), "dt1") dt2 <- lazy_dt(data.frame(a2 = 1:3, z = 2), "dt2") out_inner <- inner_join(dt1, dt2, by = c('a1' = 'a2')) expect_equal( out_inner %>% show_query(), expr( setnames( dt1[dt2, on = .(a1 = a2), nomatch = NULL, allow.cartesian = TRUE], !!c("z", "i.z"), !!c("z.x", "z.y") ) ) ) expect_setequal(tbl_vars(out_inner), c("a1", "z.x", "z.y")) out_left <- left_join(dt1, dt2, by = c('a1' = 'a2')) expect_equal( out_left %>% show_query(), expr( setnames( setcolorder( dt2[dt1, on = .(a2 = a1), allow.cartesian = TRUE], !!c(1L, 3L, 2L) ), !!c("a2", "i.z", "z"), !!c("a1", "z.x", "z.y") ) ) ) expect_setequal(tbl_vars(out_left), c("a1", "z.x", "z.y")) }) test_that("named by can handle edge cases", { test_equal <- function(f_join) { joined_dt <- f_join(dt1, dt2, by = c("x", z = "y")) expected <- f_join(df1, df2, by = c("x", z = "y")) expect_equal( joined_dt %>% collect(), expected ) expect_equal( joined_dt$vars, colnames(expected) ) } df1 <- tibble(x = 1, y = 1, z = 2) df2 <- tibble(x = 1, y = 2) dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") test_equal(left_join) test_equal(right_join) test_equal(full_join) test_equal(semi_join) test_equal(anti_join) }) test_that("setnames only used when necessary", { dt1 <- lazy_dt(data.frame(x = 1:2, a = 3), "dt1") dt2 <- lazy_dt(data.frame(x = 2:3, b = 4), "dt2") expect_equal( dt1 %>% left_join(dt2, by = "x") %>% show_query(), expr(setcolorder(dt2[dt1, on = .(x), allow.cartesian = TRUE], !!c(1L, 3L, 2L))) ) expect_equal( dt1 %>% left_join(dt2, by = "x") %>% pull(x), dt1 %>% pull(x) ) }) test_that("correctly determines vars", { dt1 <- lazy_dt(data.frame(x = 1, y = 2, a = 3), "dt1") dt2 <- lazy_dt(data.frame(x = 1, y = 2, b = 4), "dt2") expect_setequal( dt1 %>% left_join(dt2, by = c("x", "y")) %>% .$vars, c("x", "y", "a", "b") ) expect_setequal( dt1 %>% left_join(dt2, by = "x") %>% .$vars, c("x", "y.x", "y.y", "a", "b") ) expect_setequal( dt1 %>% semi_join(dt2, by = "x") %>% .$vars, c("x", "y", "a") ) }) test_that("can override suffixes", { dt1 <- lazy_dt(data.frame(x = 1, y = 2, a = 3), "dt1") dt2 <- lazy_dt(data.frame(x = 1, y = 22, b = 4), "dt2") expect_equal( dt1 %>% left_join(dt2, by = "x", suffix = c("X", "Y")) %>% show_query(), expr( setnames( setcolorder( dt2[dt1, on = .(x), allow.cartesian = TRUE], !!c(1L, 4L, 5L, 2L, 3L) ), !!c("i.y", "y"), !!c("yX", "yY") ) ) ) }) test_that("automatically data.frame converts to lazy_dt", { dt1 <- lazy_dt(data.frame(x = 1, y = 2, a = 3), "dt1") df2 <- data.frame(x = 1, y = 2, a = 3) out <- left_join(dt1, df2, by = "x") expect_s3_class(out, "dtplyr_step") }) test_that("converts other types if requested", { dt1 <- lazy_dt(data.frame(x = 1, y = 2, a = 3), "dt1") x <- structure(10, class = "foo") expect_error(left_join(dt1, x, by = "x"), "copy") expect_s3_class(left_join(dt1, x, by = "x", copy = TRUE), "dtplyr_step_join") }) test_that("mutates inside joins are copied as needed", { dt <- data.table(x = 1) lhs <- lazy_dt(dt, "dt1") %>% mutate(y = x + 1) rhs <- lazy_dt(dt, "dt2") %>% mutate(z = x + 1) 
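  # Collecting the join below must not modify the original `dt` in place:
  # the mutate() calls on `lhs`/`rhs` should have forced copies first,
  # which is what the expect_named() check on `dt` verifies.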
collect(inner_join(lhs, rhs, by = "x")) expect_named(dt, "x") }) test_that("performs cartesian joins as needed", { x <- lazy_dt(data.frame(x = c(2, 2, 2), y = 1:3)) y <- lazy_dt(data.frame(x = c(2, 2, 2), z = 1:3)) out <- collect(left_join(x, y, by = "x")) expect_equal(nrow(out), 9) }) test_that("performs cross join", { df1 <- data.frame(x = 1:2, y = "a", stringsAsFactors = FALSE) df2 <- data.frame(x = 3:4) dt1 <- lazy_dt(df1, "dt1") dt2 <- lazy_dt(df2, "dt2") expected <- left_join(df1, df2, by = character()) %>% as_tibble() expect_snapshot(left_join(dt1, dt2, by = character())) expect_equal(left_join(dt1, dt2, by = character()) %>% collect(), expected) expect_snapshot(right_join(dt1, dt2, by = character())) expect_equal(right_join(dt1, dt2, by = character()) %>% collect(), expected) expect_snapshot(full_join(dt1, dt2, by = character())) expect_equal(full_join(dt1, dt2, by = character()) %>% collect(), expected) expect_snapshot(inner_join(dt1, dt2, by = character())) expect_equal(inner_join(dt1, dt2, by = character()) %>% collect(), expected) }) dtplyr/tests/testthat/test-step-colorder-relocate.R0000644000176200001440000000527414372711230022260 0ustar liggesuserstest_that(".before and .after relocate individual cols", { dt <- lazy_dt(data.table(x = 1, y = 1), "DT") expect_equal( dt %>% relocate(x, .after = y) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "x"))) ) expect_equal( dt %>% relocate(y, .before = x) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "x"))) ) }) test_that("can move blocks of variables", { dt <- lazy_dt(data.table(x = 1, a = 1, y = 1, b = 1), "DT") expect_equal( dt %>% relocate(y, b, .before = a) %>% show_query(), expr(setcolorder(copy(DT), !!c("x", "y", "b", "a"))) ) expect_equal( dt %>% relocate(any_of(c("y", "b")), .before = a) %>% show_query(), expr(setcolorder(copy(DT), !!c("x", "y", "b", "a"))) ) }) test_that("All columns move before (after) columns in .before (.after)", { dt <- lazy_dt(data.table(x = 1, a = 1, y = 1, b = 1), "DT") expect_equal( dt %>% relocate(y, b, .before = c(x, a)) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "b", "x", "a"))) ) expect_equal( dt %>% relocate(x, a, .after = c(y, b)) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "b", "x", "a"))) ) }) test_that("extra variables in .before/.after unaffected", { dt <- lazy_dt(data.table(a = 1, b = 1, c = 1, d = 1, e = 1), "DT") expect_equal( dt %>% relocate(b, .after = c(a, c, e)) %>% show_query(), expr(setcolorder(copy(DT), !!c("a", "c", "d", "e", "b"))) ) expect_equal( dt %>% relocate(e, .before = c(b, d)) %>% show_query(), expr(setcolorder(copy(DT), !!c("a", "e", "b", "c", "d"))) ) }) test_that("no .before/.after moves to front", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% relocate(y) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "x"))) ) }) test_that("can only supply one of .before and .after", { dt <- lazy_dt(data.table(x = 1, y = 1), "DT") expect_snapshot(relocate(dt, y, .before = x, .after = x), error = TRUE) }) test_that("relocate() respects order specified by ...", { dt <- lazy_dt(data.table(a = 1, x = 1, b = 1, z = 1, y = 1), "DT") expect_equal( dt %>% relocate(x, y, z, .before = x) %>% show_query(), expr(setcolorder(copy(DT), !!c("a", "x", "y", "z", "b"))) ) expect_equal( dt %>% relocate(x, y, z, .after = last_col()) %>% show_query(), expr(setcolorder(copy(DT), !!c("a", "b", "x", "y", "z"))) ) expect_equal( dt %>% relocate(x, a, z) %>% show_query(), expr(setcolorder(copy(DT), !!c("x", "a", "z", "b", "y"))) ) }) 
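# A minimal usage sketch of the translation exercised above (assuming dtplyr,
# dplyr, and data.table are attached); kept as a comment so it does not run
# with the test suite:
#   lazy_dt(data.table(x = 1, a = 1, y = 1, b = 1), "DT") %>%
#     relocate(y, b, .before = a) %>%
#     show_query()
#   #> roughly: setcolorder(copy(DT), c("x", "y", "b", "a"))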
test_that("relocate() only not alter grouping", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% group_by(x, y) %>% relocate(y, .before = x) %>% .$groups, c("x", "y") ) }) dtplyr/tests/testthat/test-step-mutate.R0000644000176200001440000002320714406327117020153 0ustar liggesuserstest_that("constructor has sensible defaults", { first <- step_first(data.table(x = 1), "DT") step <- step_mutate(first) expect_s3_class(step, "dtplyr_step_mutate") expect_equal(step$parent, first) expect_equal(step$vars, "x") expect_equal(step$groups, character()) expect_equal(step$new_vars, list()) }) # copies ------------------------------------------------------------------ test_that("need to copy when there's a mutate", { dt <- lazy_dt(data.table(x = 1)) expect_false(dt %>% .$needs_copy) expect_false(dt %>% filter(x == 1) %>% .$needs_copy) expect_false(dt %>% head() %>% .$needs_copy) expect_true(dt %>% mutate(y = 1) %>% .$needs_copy) expect_true(dt %>% mutate(y = 1) %>% filter(x == 1) %>% .$needs_copy) expect_true(dt %>% mutate(y = 1) %>% head() %>% .$needs_copy) }) test_that("unless there's already an implicit copy", { dt <- lazy_dt(data.table(x = 1)) expect_true(dt %>% filter(x == 1) %>% .$implicit_copy) expect_false(dt %>% filter(x == 1) %>% mutate(y = 1) %>% .$needs_copy) expect_true(dt %>% head() %>% .$implicit_copy) expect_false(dt %>% head() %>% mutate(y = 1) %>% .$needs_copy) }) test_that("properly copies with chained operations, #210", { dt <- lazy_dt(data.table(x = 1)) query <- dt %>% mutate(z1 = 1) %>% summarize(z2 = 2) %>% mutate(z3 = 4) expect_true(query$implicit_copy) expect_true(query$needs_copy) }) # dplyr verbs ------------------------------------------------------------- test_that("generates single calls as expect", { dt <- lazy_dt(data.table(x = 1), "DT") expect_equal( dt %>% mutate(x2 = x * 2) %>% show_query(), expr(copy(DT)[, `:=`(x2 = x * 2)]) ) expect_equal( dt %>% group_by(x) %>% mutate(x2 = x * 2) %>% show_query(), expr(copy(DT)[, `:=`(x2 = x * 2), by = .(x)]) ) expect_equal( dt %>% transmute(x2 = x * 2) %>% show_query(), expr(DT[, .(x2 = x * 2)]) ) }) test_that("mutate generates compound expression if needed", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% mutate(x2 = x * 2, x4 = x2 * 2) %>% show_query(), expr(copy(DT)[, c("x2", "x4") := { x2 <- x * 2 x4 <- x2 * 2 .(x2, x4) }]) ) }) test_that("allows multiple assignment to the same variable", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") # when nested expect_equal( dt %>% mutate(x = x * 2, x = x * 2) %>% show_query(), expr(copy(DT)[, c("x") := { x <- x * 2 x <- x * 2 .(x) }]) ) # when not nested expect_equal( dt %>% mutate(z = 2, z = 3) %>% show_query(), expr(copy(DT)[, `:=`(c("z"), { z <- 2 z <- 3 .(z) })]) ) }) test_that("can use across", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% mutate(across(everything(), ~ . + 1)) %>% show_query(), expr(copy(DT)[, `:=`(x = x + 1, y = y + 1)]) ) expect_equal( dt %>% mutate(across(.fns = ~ . 
+ 1)) %>% show_query(), expr(copy(DT)[, `:=`(x = x + 1, y = y + 1)]) ) }) test_that("across() can access previously created variables", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- mutate(dt, y = 2, across(y, sqrt)) expect_equal( collect(step), tibble(x = 1, y = sqrt(2)) ) expect_equal( step$vars, c("x", "y") ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y"), { y <- 2 y <- sqrt(y) .(y) })]) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% mutate(z = 1) %>% .$vars, c("x", "y", "z")) expect_equal(dt %>% mutate(x = NULL, z = 1) %>% .$vars, c("y", "z")) }) test_that("empty mutate returns input", { dt <- lazy_dt(data.frame(x = 1)) expect_equal(mutate(dt), dt) expect_equal(mutate(dt, !!!list()), dt) }) test_that("unnamed arguments matching column names are ignored", { dt <- lazy_dt(data.frame(x = 1), "DT") expect_identical(mutate(dt, x), dt) expect_snapshot(mutate(dt, y), error = TRUE) }) test_that("new columns take precedence over global variables", { dt <- lazy_dt(data.frame(x = 1), "DT") y <- 'global var' step <- mutate(dt, y = 2, z = y + 1) expect_equal( collect(step), tibble(x = 1, y = 2, z = 3) ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y", "z"), { y <- 2 z <- y + 1 .(y, z) })]) ) }) test_that("works with `.by`", { dt <- lazy_dt(data.table(x = 1:3, y = c("a", "a", "b"))) step <- dt %>% mutate(row_num = row_number(), .by = y) expect_equal(as_tibble(step), tibble(x = 1:3, y = c("a", "a", "b"), row_num = c(1, 2, 1))) expect_true(length(step$groups) == 0) }) # var = NULL ------------------------------------------------------------- test_that("var = NULL works when var is in original data", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- dt %>% mutate(x = 2, z = x*2, x = NULL) expect_equal( collect(step), tibble(z = 4) ) expect_equal( step$vars, "z" ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("x", "z"), { x <- 2 z <- x * 2 .(x, z) })][, `:=`("x", NULL)] ) ) }) test_that("var = NULL when var is in final output", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- mutate(dt, y = NULL, y = 3) expect_equal( collect(step), tibble(x = 1, y = 3) ) expect_equal( step$vars, c("x", "y") ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y"), { y <- NULL y <- 3 .(y) })]) ) }) test_that("temp var with nested arguments", { dt <- lazy_dt(data.frame(x = 1), "DT") step <- mutate(dt, y = 2, z = y*2, y = NULL) expect_equal( collect(step), tibble(x = 1, z = 4) ) expect_equal( step$vars, c("x", "z") ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y", "z"), { y <- 2 z <- y * 2 .(y, z) })][, `:=`("y", NULL)]) ) }) test_that("temp var with no new vars added", { dt <- lazy_dt(data.frame(x = 1), "DT") # when no other vars are added step <- mutate(dt, y = 2, y = NULL) expect_equal( collect(step), tibble(x = 1) ) expect_equal( step$vars, "x" ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y"), { y <- 2 .(y) })][, `:=`("y", NULL)]) ) }) test_that("var = NULL works when data is grouped", { dt <- lazy_dt(data.frame(x = 1, g = 1), "DT") %>% group_by(g) # when var is not in original data step <- mutate(dt, y = 2, z = y*2, y = NULL) expect_equal( collect(step), tibble(x = 1, g = 1, z = 4) %>% group_by(g) ) expect_equal( step$vars, c("x", "g", "z") ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("y", "z"), { y <- 2 z <- y * 2 .(y, z) }), by = .(g)][, `:=`("y", NULL)]) ) # when var is in original data step <- dt %>% mutate(x = 2, z = x*2, x = NULL) expect_equal( collect(step), tibble(g = 1, z = 4) 
%>% group_by(g) ) expect_equal( step$vars, c("g", "z") ) expect_equal( show_query(step), expr(copy(DT)[, `:=`(c("x", "z"), { x <- 2 z <- x * 2 .(x, z) }), by = .(g)][, `:=`("x", NULL)]) ) }) # .before and .after ----------------------------------------------------------- test_that("can use .before and .after to control column position", { dt <- lazy_dt(data.frame(x = 1, y = 2)) expect_named( mutate(dt, z = 1) %>% as_tibble(), c("x", "y", "z") ) expect_named( mutate(dt, z = 1, .before = x) %>% as_tibble(), c("z", "x", "y") ) expect_named( mutate(dt, z = 1, .after = x) %>% as_tibble(), c("x", "z", "y") ) # but doesn't affect order of existing columns expect_named( mutate(dt, x = 1, .after = y) %>% as_tibble(), c("x", "y") ) }) # .before and .after ----------------------------------------------------------- test_that(".keep = 'unused' keeps variables explicitly mentioned", { df <- data.table(x = 1, y = 2) out <- df %>% mutate(x1 = x + 1, y = y, .keep = "unused") %>% as.data.table() expect_named(out, c("y", "x1")) }) test_that(".keep = 'used' not affected by across()", { df <- data.table(x = 1, y = 2, z = 3, a = "a", b = "b", c = "c") # This must evaluate every column in order to figure out if should # be included in the set or not, but that shouldn't be counted for # the purposes of "used" variables out <- df %>% mutate(across(c(x, y, z), identity), .keep = "unused") %>% as.data.table() expect_named(out, names(df)) }) test_that(".keep = 'used' keeps variables used in expressions", { df <- data.table(a = 1, b = 2, c = 3, x = 1, y = 2) out <- df %>% mutate(xy = x + y, .keep = "used") %>% as.data.table() expect_named(out, c("x", "y", "xy")) }) test_that(".keep = 'none' only keeps grouping variables", { df <- data.table(x = 1, y = 2) gf <- df %>% group_by(x) out1 <- df %>% mutate(z = 1, .keep = "none") %>% as.data.table() expect_named(out1, "z") out2 <- gf %>% mutate(z = 1, .keep = "none") %>% as.data.table() expect_named(out2, c("x", "z")) }) test_that(".keep = 'none' retains original ordering", { df <- data.table(x = 1, y = 2) out1 <- df %>% mutate(y = 1, x = 2, .keep = "none") %>% as.data.table() expect_named(out1, c("x", "y")) # even when grouped out2 <- df %>% group_by(x) %>% mutate(y = 1, x = 2, .keep = "none") %>% as.data.table() expect_named(out2, c("x", "y")) }) test_that("works with empty dots", { df <- data.table(x = 1, y = 2) out <- df %>% mutate(.keep = "used") %>% as.data.table() expect_equal(ncol(out), 0) }) test_that("works with trivial dots", { out <- lazy_dt(mtcars) %>% mutate(mpg, .keep = 'used') %>% as.data.table() expect_named(out, "mpg") }) dtplyr/tests/testthat/test-step-modify.R0000644000176200001440000000113614006775461020145 0ustar liggesuserstest_that("group_modify creates modified data frame", { dt <- lazy_dt(data.table(g = c(1, 1, 2), x = 1:3)) foo <- function(rows, g) { list(nc = ncol(rows), nr = nrow(rows)) } out <- dt %>% group_by(g) %>% group_modify(foo) %>% collect() expect_equal(out$nc, c(1, 1)) expect_equal(out$nr, c(2, 1)) }) test_that("group_map works", { dt <- lazy_dt(data.table(g = c(1, 1, 2), x = 1:3)) out <- dt %>% group_by(g) %>% group_map(~ nrow(.)) expect_equal(out, list(2, 1)) # don't include group data out <- dt %>% group_by(g) %>% group_map(~ ncol(.)) expect_equal(out, list(1, 1)) }) dtplyr/tests/testthat/test-step-call-pivot_wider.R0000644000176200001440000001751414372711230022117 0ustar liggesuserstest_that("can pivot all cols to wide", { df <- lazy_dt(tibble(key = c("x", "y", "z"), val = 1:3), "DT") step <- pivot_wider(df, names_from = key, 
values_from = val) pv <- collect(step) expect_equal(step$vars, c("x", "y", "z")) expect_equal(nrow(pv), 1) expect_equal( show_query(step), expr(dcast(DT, formula = "..." ~ key, value.var = "val")[, `:=`(".", NULL)]) ) }) test_that("non-pivoted cols are preserved", { df <- lazy_dt(tibble(a = 1, key = c("x", "y"), val = 1:2), "DT") step <- pivot_wider(df, names_from = key, values_from = val) pv <- collect(step) expect_equal(step$vars, c("a", "x", "y")) expect_equal(nrow(pv), 1) expect_equal( show_query(step), expr(dcast(DT, formula = a ~ key, value.var = "val")) ) }) test_that("implicit missings turn into explicit missings", { df <- lazy_dt(tibble(a = 1:2, key = c("x", "y"), val = 1:2)) pv <- collect(pivot_wider(df, names_from = key, values_from = val)) expect_equal(pv$a, c(1, 2)) expect_equal(pv$x, c(1, NA)) expect_equal(pv$y, c(NA, 2)) }) test_that("error when overwriting existing column", { df <- tibble( a = c(1, 1), key = c("a", "b"), val = c(1, 2) ) df <- lazy_dt(df) expect_error( pivot_wider(df, names_from = key, values_from = val), "Names must be unique" ) }) test_that("grouping is preserved", { df <- lazy_dt(tibble(g = 1, k = "x", v = 2)) out <- df %>% group_by(g) %>% pivot_wider(names_from = k, values_from = v) expect_equal(out$groups, "g") }) # https://github.com/tidyverse/tidyr/issues/804 test_that("column with `...j` name can be used as `names_from`", { df <- lazy_dt(tibble(...8 = c("x", "y", "z"), val = 1:3)) step <- pivot_wider(df, names_from = ...8, values_from = val) pv <- collect(step) expect_equal(step$vars, c("x", "y", "z")) expect_equal(nrow(pv), 1) }) test_that("correctly handles columns named NA, #394", { df <- lazy_dt(tibble(x = c("a", "a"), y = c("a", NA), z = 1:2)) res <- df %>% pivot_wider(names_from = y, values_from = z, names_glue = "{y}_new", names_repair = "minimal") %>% collect() expect_named(res, c("x", "NA_new", "a_new")) res <- df %>% pivot_wider(names_from = y, values_from = z, names_glue = "{y}", names_repair = "minimal") %>% collect() expect_named(res, c("x", "NA", "a")) df <- lazy_dt(tibble(x = c('a', NA), y = 1:2)) res <- df %>% pivot_wider(names_from = 'x', values_from = 'y') %>% collect() expect_named(res, c("NA", "a")) }) # column names ------------------------------------------------------------- test_that("names_glue affects output names", { df <- lazy_dt( data.frame( x = c("X", "Y"), y = 1:2, a = 1:2, b = 1:2 ), "DT" ) step <- pivot_wider(df, names_from = x:y, values_from = a:b, names_glue = "{x}{y}_{.value}") expect_snapshot(show_query(step)) expect_equal(step$vars, c("X1_a", "Y2_a", "X1_b", "Y2_b")) }) test_that("can use names_glue without .value", { df <- lazy_dt(tibble(label = c("x", "y", "z"), val = 1:3)) step <- pivot_wider( df, names_from = label, values_from = val, names_glue = "test_{label}" ) pv <- collect(step) expect_equal(step$vars, c("test_x", "test_y", "test_z")) expect_equal(nrow(pv), 1) }) test_that("can add name prefix", { df <- lazy_dt(tibble(label = c("x", "y", "z"), val = 1:3), "DT") step <- pivot_wider( df, names_from = label, values_from = val, names_prefix = "test_" ) expect_named(collect(step), c("test_x", "test_y", "test_z")) }) test_that("can sort column names", { df <- tibble( int = c(1, 3, 2), chr = c("Wed", "Tue", "Mon"), ) df <- lazy_dt(df, "DT") step <- pivot_wider(df, names_from = chr, values_from = int, names_sort = TRUE) expect_snapshot(show_query(step)) expect_equal(step$vars, c("Mon", "Tue", "Wed")) }) test_that("can sort column names with id", { df <- tibble( id = 1:3, int = c(1, 3, 2), chr = c("Wed", "Tue", 
"Mon"), ) df <- lazy_dt(df, "DT") step <- pivot_wider(df, names_from = chr, values_from = int, names_sort = TRUE) expect_snapshot(show_query(step)) expect_equal(step$vars, c("id", "Mon", "Tue", "Wed")) }) test_that("can repair names if requested", { df <- lazy_dt(tibble(x = 1, lab = "x", val = 2), "DT") expect_snapshot(error = TRUE, { pivot_wider(df, names_from = lab, values_from = val) pivot_wider(df, names_from = lab, values_from = val, names_repair = "unique") }) }) test_that("can handle numeric column in names_from", { df <- lazy_dt(tibble(x = 1, name = 1, value = 2), "DT") expect_named(pivot_wider(df, names_prefix = "nm") %>% collect(), c("x", "nm1")) }) # keys --------------------------------------------------------- test_that("can override default keys", { df <- tribble( ~row, ~name, ~var, ~value, 1, "Sam", "age", 10, 2, "Sam", "height", 1.5, 3, "Bob", "age", 20, ) df <- lazy_dt(df, "DT") step <- pivot_wider(df, id_cols = name, names_from = var, values_from = value) pv <- collect(step) expect_equal(nrow(pv), 2) expect_equal( show_query(step), expr(dcast(DT, formula = name ~ var, value.var = "value")) ) }) # non-unique keys --------------------------------------------------------- test_that("warning suppressed by supplying values_fn", { df <- lazy_dt(tibble(a = c(1, 1, 2), key = c("x", "x", "x"), val = 1:3)) pv <- df %>% pivot_wider(names_from = key, values_from = val, values_fn = list(val = list)) %>% collect() expect_equal(pv$a, c(1, 2)) expect_equal(as.list(pv$x), list(c(1L, 2L), 3L)) }) test_that("values_fn can be a single function", { df <- lazy_dt(tibble(a = c(1, 1, 2), key = c("x", "x", "x"), val = c(1, 10, 100)), "DT") step <- pivot_wider(df, names_from = key, values_from = val, values_fn = sum) pv <- collect(step) expect_equal(step$vars, c("a", "x")) expect_equal(pv$x, c(11, 100)) }) test_that("values_summarize applied even when no-duplicates", { df <- lazy_dt(tibble(a = c(1, 2), key = c("x", "x"), val = 1:2)) pv <- df %>% pivot_wider(names_from = key, values_from = val, values_fn = list(val = list)) %>% collect() expect_equal(pv$a, c(1, 2)) expect_equal(as.list(pv$x), list(1L, 2L)) }) # can fill missing cells -------------------------------------------------- test_that("can fill in missing cells", { df <- lazy_dt(tibble(g = c(1, 2), var = c("x", "y"), val = c(1, 2))) widen <- function(...) { df %>% pivot_wider(names_from = var, values_from = val, ...) 
%>% collect() } expect_equal(widen()$x, c(1, NA)) expect_equal(widen(values_fill = 0)$x, c(1, 0)) expect_equal(widen(values_fill = list(val = 0))$x, c(1, 0)) }) test_that("values_fill only affects missing cells", { df <- lazy_dt(tibble(g = c(1, 2), names = c("x", "y"), value = c(1, NA)), "DT") step <- pivot_wider(df, names_from = names, values_from = value, values_fill = 0) out <- collect(step) expect_equal(out$y, c(0, NA)) expect_equal( show_query(step), expr(dcast(DT, formula = g ~ names, value.var = "value", fill = 0)) ) }) # multiple values ---------------------------------------------------------- test_that("can pivot from multiple measure cols", { df <- lazy_dt(tibble(row = 1, var = c("x", "y"), a = 1:2, b = 3:4)) step <- pivot_wider(df, names_from = var, values_from = c(a, b)) pv <- collect(step) expect_equal(step$vars, c("row", "a_x", "a_y", "b_x", "b_y")) expect_equal(pv$a_x, 1) expect_equal(pv$b_y, 4) }) test_that("can pivot from multiple measure cols using all keys", { df <- lazy_dt(tibble(var = c("x", "y"), a = 1:2, b = 3:4)) step <- pivot_wider(df, names_from = var, values_from = c(a, b)) pv <- collect(step) expect_equal(step$vars, c("a_x", "a_y", "b_x", "b_y")) expect_equal(pv$a_x, 1) expect_equal(pv$b_y, 4) }) dtplyr/tests/testthat/test-step-subset-summarise.R0000644000176200001440000001020514372711230022151 0ustar liggesuserstest_that("simple calls generate expected translations", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% summarise(x = mean(x)) %>% show_query(), expr(DT[, .(x = mean(x))]) ) expect_equal( dt %>% transmute(x) %>% show_query(), expr(DT[, .(x = x)]) ) }) test_that("can use with across", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% summarise(across(x:y, mean)) %>% show_query(), expr(DT[, .(x = mean(x), y = mean(y))]) ) }) test_that("can merge iff j-generating call comes after i", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% filter(x > 1) %>% summarise(y = mean(x)) %>% show_query(), expr(DT[x > 1, .(y = mean(x))]) ) expect_equal( dt %>% summarise(y = mean(x)) %>% filter(y > 1) %>% show_query(), expr(DT[, .(y = mean(x))][y > 1]) ) }) test_that("summarise peels off layer of grouping", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1)) gt <- group_by(dt, x, y) suppressMessages({ expect_equal(summarise(gt)$groups, "x") expect_equal(summarise(summarise(gt))$groups, character()) }) }) test_that("works with `.by`", { dt <- lazy_dt(data.table(x = 1:3, y = c("a", "a", "b"), z = c("a", "a", "b"))) step <- dt %>% summarize(first_x = first(x), .by = c(y, z)) expect_equal(as_tibble(step), tibble(y = c("a", "b"), z = c("a", "b"), first_x = c(1, 3))) expect_true(length(step$groups) == 0) }) test_that("works with `.by` and no dots", { dt <- lazy_dt(data.table(x = 1:3, y = c("a", "a", "b"), z = c("a", "a", "b"))) step <- dt %>% summarize(.by = c(y, z)) expect_equal(as_tibble(step), tibble(y = c("a", "b"), z = c("a", "b"))) expect_true(length(step$groups) == 0) }) test_that("summarise sorts groups", { dt <- lazy_dt(data.table(x = 2:1)) expect_equal( dt %>% group_by(x) %>% summarise(n = n()) %>% pull(x), 1:2 ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% summarise(z = mean(x)) %>% .$vars, "z") expect_equal(dt %>% group_by(y) %>% summarise(z = mean(x)) %>% .$vars, c("y", "z")) }) test_that("empty summarise returns unique groups", { dt <- lazy_dt(data.table(x = c(1, 1, 2), y = 1, z = 1), "DT") expect_equal( dt %>% group_by(x) %>% 
summarise() %>% show_query(), expr(unique(DT[, .(x)])) ) # If no groups, return null data.table expect_equal( dt %>% summarise() %>% show_query(), expr(DT[, 0L]) ) }) test_that("if for unsupported resummarise", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_error(dt %>% summarise(x = mean(x), x2 = sd(x)), "mutate") }) test_that("summarise(.groups=)", { # the `dplyr::` prefix is needed for `check()` # should produce a message when called directly by user expect_message(eval_bare( expr(lazy_dt(data.frame(x = 1, y = 2), "DT") %>% group_by(x, y) %>% dplyr::summarise() %>% show_query()), env(global_env()) )) expect_snapshot(eval_bare( expr(lazy_dt(data.frame(x = 1, y = 2), "DT") %>% group_by(x, y) %>% dplyr::summarise() %>% show_query()), env(global_env()) )) # should be silent when called in another package expect_silent(eval_bare( expr(lazy_dt(data.frame(x = 1, y = 2), "DT") %>% group_by(x, y) %>% dplyr::summarise() %>% show_query()), asNamespace("testthat") )) df <- lazy_dt(data.table(x = 1, y = 2), "DT") %>% group_by(x, y) suppressMessages(expect_equal(df %>% summarise() %>% group_vars(), "x")) expect_equal(df %>% summarise(.groups = "drop_last") %>% group_vars(), "x") expect_equal(df %>% summarise(.groups = "drop") %>% group_vars(), character()) expect_equal(df %>% summarise(.groups = "keep") %>% group_vars(), c("x", "y")) expect_snapshot_error(df %>% summarise(.groups = "rowwise")) }) test_that("can change group vars", { dt <- lazy_dt(data.frame(a = 1), "DT") %>% group_by(a) res <- dt %>% summarise(a = 2) expect_equal( show_query(res), expr(DT[, .(a = 2), keyby = .(a)][, `:=`("a", NULL)]) ) expect_equal( as_tibble(res), tibble(a = 2) ) # but not with across expect_error( dt %>% summarise(across(a, ~ 2)), "Column `a` doesn't exist" ) }) dtplyr/tests/testthat/test-unite.R0000644000176200001440000000440414300165007017014 0ustar liggesuserstest_that("unite pastes columns together & removes old col", { df <- lazy_dt(data.table(x = "a", y = "b"), "DT") step <- unite(df, "z", x:y) out <- as.data.table(step) expect_equal(names(out), "z") expect_equal(out$z, "a_b") expect_equal( show_query(step), expr(copy(DT)[, `:=`(z = paste(x, y, sep = "_"))][, `:=`(!!c("x", "y"), NULL)]) ) }) test_that("unite does not remove new col in case of name clash", { df <- lazy_dt(data.table(x = "a", y = "b"), "DT") step <- unite(df, x, x:y) out <- as.data.table(step) expect_equal(names(out), "x") expect_equal(out$x, "a_b") }) test_that("correct column order when remove = FALSE", { df <- lazy_dt(data.table(x = "a", y = "b"), "DT") step <- unite(df, "united", y, x, remove = FALSE) out <- as.data.table(step) expect_equal(names(out), c("united", "x", "y")) expect_equal(out$united, "b_a") }) test_that("unite preserves grouping", { df <- lazy_dt(data.table(g = 1, x = "a"), "DT") %>% group_by(g) step <- df %>% unite(x, x) expect_equal(dplyr::group_vars(df), dplyr::group_vars(step)) }) test_that("doesn't use `by` for unite step", { df <- lazy_dt(data.table(x = "a", y = "b", z = "c"), "DT") %>% group_by(z) step <- unite(df, "z", x:y) out <- as.data.table(step) expect_equal(names(out), "z") expect_equal(out$z, "a_b") expect_equal(step$groups, "z") expect_equal( show_query(step), expr(copy(DT)[, `:=`(z = paste(x, y, sep = "_"))][, `:=`(!!c("x", "y"), NULL)]) ) }) test_that("drops grouping when needed", { df <- lazy_dt(data.table(g = 1, x = "a"), "DT") %>% group_by(g) step <- df %>% unite(gx, g, x) rs <- as.data.table(step) expect_equal(rs$gx, "1_a") expect_equal(dplyr::group_vars(rs), character()) }) test_that("keeps 
groups when needed", { df <- lazy_dt(data.table(x = "x", y = "y"), "DT") %>% group_by(x, y) step <- df %>% unite("z", x) rs <- as.data.table(step) expect_equal(rs$z, "x") expect_equal(dplyr::group_vars(step), "y") }) test_that("empty var spec uses all vars", { df <- lazy_dt(data.table(x = "a", y = "b"), "DT") expect_equal(collect(unite(df, "z")), tibble(z = "a_b")) }) test_that("errors on na.rm", { df <- lazy_dt(data.table(x = c("a", NA), y = c("b", NA)), "DT") expect_snapshot_error(unite(df, "z", x:y, na.rm = TRUE)) }) dtplyr/tests/testthat/test-step-call.R0000644000176200001440000001157014300165007017556 0ustar liggesusers # head and tail ------------------------------------------------------------- test_that("simple calls generate expected results", { dt <- lazy_dt(data.table(x = 1), "DT") expect_equal( dt %>% head() %>% show_query(), expr(head(DT, n = 6L)) ) expect_equal( dt %>% tail() %>% show_query(), expr(tail(DT, n = 6L)) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% head() %>% .$vars, c("x", "y")) }) # rename ------------------------------------------------------------------ test_that("simple calls generate expected translations", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% rename(b = y) %>% show_query(), expr(setnames(copy(DT), "y", "b")) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% rename(a = x) %>% .$vars, c("a", "y")) }) test_that("empty rename returns original", { dt <- data.table(x = 1, y = 1, z = 1) lz <- lazy_dt(dt, "DT") expect_equal(lz %>% rename() %>% show_query(), expr(DT)) }) test_that("renames grouping vars", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1)) gt <- group_by(dt, x) expect_equal(rename(gt, a = x)$groups, "a") }) test_that("can rename with a function or formula", { dt <- lazy_dt(data.table(x = 1, y = 1)) expect_equal(dt %>% rename_with(toupper) %>% .$vars, c("X", "Y")) expect_equal(dt %>% rename_with(toupper, 1) %>% .$vars, c("X", "y")) expect_equal(dt %>% rename_with("toupper") %>% .$vars, c("X", "Y")) expect_equal(dt %>% rename_with(~ toupper(.x)) %>% .$vars, c("X", "Y")) }) test_that("but not with anything else", { dt <- lazy_dt(data.table(x = 1, y = 1)) expect_snapshot(error = TRUE, { dt %>% rename_with(1) }) }) test_that("rename_with generates minimal spec", { dt <- lazy_dt(matrix(ncol = 26, dimnames = list(NULL, letters)), "DT") expect_snapshot({ dt %>% rename_with(toupper) %>% show_query() dt %>% rename_with(toupper, 1:3) %>% show_query() }) }) # distinct ---------------------------------------------------------------- test_that("no input uses all variables", { dt <- lazy_dt(data.table(x = c(1, 1), y = c(1, 2)), "dt") expect_equal( dt %>% distinct() %>% show_query(), expr(unique(dt)) ) expect_equal(dt %>% distinct() %>% .$vars, c("x", "y")) }) test_that("uses supplied variables", { dt <- lazy_dt(data.table(x = c(1, 1), y = c(1, 2)), "dt") expect_equal( dt %>% distinct(y) %>% show_query(), expr(unique(dt[, .(y)])) ) expect_equal(dt %>% distinct(y) %>% .$vars, "y") expect_equal( dt %>% group_by(x) %>% distinct(x, y) %>% show_query(), expr(unique(dt)) ) }) test_that("doesn't duplicate variables", { dt <- lazy_dt(data.table(x = c(1, 1), y = c(1, 2)), "dt") expect_equal( dt %>% distinct(x, x) %>% show_query(), expr(unique(dt[, .(x)])) ) expect_equal(dt %>% distinct(x, x) %>% .$vars, "x") expect_equal( dt %>% group_by(x) %>% distinct(x) %>% show_query(), expr(unique(dt[, .(x)])) ) }) test_that("keeps all 
variables if requested", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "dt") expect_equal( dt %>% distinct(y, .keep_all = TRUE) %>% show_query(), expr(unique(dt, by = "y")) ) expect_equal(dt %>% distinct(y, .keep_all = TRUE) %>% .$vars, c("x", "y", "z")) expect_equal( dt %>% group_by(x) %>% distinct(y, .keep_all = TRUE) %>% show_query(), expr(unique(dt, by = !!c("x", "y"))) ) }) test_that("can compute distinct computed variables", { dt <- lazy_dt(data.table(x = c(1, 1), y = c(1, 2)), "dt") expect_equal( dt %>% distinct(z = x + y) %>% show_query(), expr(unique(dt[, .(z = x + y)])) ) expect_equal( dt %>% distinct(z = x + y, .keep_all = TRUE) %>% show_query(), expr(unique(copy(dt)[, `:=`(z = x + y)], by = "z")) ) }) # unique ------------------------------------------------------------------ test_that("unique is an alias for distinct", { dt <- lazy_dt(data.table(x = c(1, 1))) expect_equal(unique(dt), distinct(dt)) }) # drop_na ------------------------------------------------------------------ test_that("empty call drops every row", { tb <- tibble(x = c(1, 2, NA), y = c("a", NA, "b")) step <- drop_na(lazy_dt(tb, "DT")) expect_equal(show_query(step), expr(na.omit(DT))) expect_equal(as_tibble(step), tb[1, ]) }) test_that("uses specified variables", { df <- tibble(x = c(1, 2, NA), y = c("a", NA, "b")) dt <- lazy_dt(df, "DT") step <- drop_na(dt, x) expect_equal(show_query(step), expr(na.omit(DT, cols = "x"))) expect_equal(collect(step), df[1:2, ]) step <- drop_na(dt, x:y) expect_equal(show_query(step), expr(na.omit(DT, cols = !!c("x", "y")))) expect_equal(collect(step), df[1, ]) }) test_that("errors are raised", { tb <- tibble(x = c(1, 2, NA), y = c("a", NA, "b")) dt <- lazy_dt(tb, "DT") expect_snapshot(collect(drop_na(dt, "z")), error = TRUE) }) dtplyr/tests/testthat/test-step-subset-select.R0000644000176200001440000000666414300165007021435 0ustar liggesuserstest_that("can select variables", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% select(-z) %>% show_query(), expr(DT[, .(x, y)]) ) expect_equal( dt %>% select(a = x, y) %>% show_query(), expr(DT[, .(a = x, y)]) ) }) test_that("can merge iff j-generating call comes after i", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% filter(x > 1) %>% select(y) %>% show_query(), expr(DT[x > 1, .(y)]) ) expect_equal( dt %>% select(x = y) %>% filter(x > 1) %>% show_query(), expr(DT[, .(x = y)][x > 1]) ) }) test_that("renames grouping vars", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1)) gt <- group_by(dt, x) expect_equal(select(gt, y = x)$groups, "y") }) test_that("empty select returns no columns", { dt <- data.table(x = 1, y = 1, z = 1) lz <- lazy_dt(dt, "DT") expect_equal( lz %>% select() %>% collect(), tibble() ) # unless it's grouped skip_if(utils::packageVersion("rlang") < "0.5.0") expect_snapshot(out <- lz %>% group_by(x) %>% select()) expect_equal( out %>% collect(), group_by(tibble(x = 1), x) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% select(a = x, y) %>% .$vars, c("a", "y")) }) test_that("only add step if necessary", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") expect_equal(dt %>% select(everything()), dt) expect_equal(dt %>% select(x, y), dt) }) ### When data is copied (either implicitly or explicitly) test_that("copied data: can select variables", { dt <- lazy_dt(data.table(x = 1, y = 2, z = 3), "DT") dt$needs_copy <- TRUE expect_equal( dt %>% select(-z) %>% show_query(), expr(copy(DT)[, `:=`(!!"z", NULL)]) ) 
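  # Dropping and reordering on already-copied data chains a `:=`("z", NULL)
  # drop with setcolorder(), as the next expectation checks.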
expect_equal( dt %>% select(y, x) %>% show_query(), expr(setcolorder(copy(DT)[, `:=`("z", NULL)], !!c("y", "x"))) ) expect_equal( dt %>% select(a = x, y) %>% show_query(), expr(copy(DT)[, .(a = x, y)]) ) }) test_that("copied data: renaming uses regular selection", { dt <- lazy_dt(data.table(x = 1, y = 2, z = 3), "DT") dt$needs_copy <- TRUE step <- dt %>% select(a = x, y) expect_equal( show_query(step), expr(copy(DT)[, .(a = x, y)]) ) expect_named(collect(step), c("a", "y")) }) test_that("copied data: can merge iff j-generating call comes after i", { dt <- lazy_dt(data.table(x = 1, y = 2, z = 3), "DT") dt$needs_copy <- TRUE expect_equal( dt %>% filter(x > 1) %>% select(y) %>% show_query(), expr(copy(DT)[x > 1, .(y)]) ) expect_equal( dt %>% select(x = y) %>% filter(x > 1) %>% show_query(), expr(copy(DT)[, .(x = y)][x > 1]) ) }) test_that("copied data: renames grouping vars", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1)) gt <- group_by(dt, x) gt$needs_copy <- TRUE expect_equal(select(gt, y = x)$groups, "y") }) test_that("copied data: empty select returns no columns", { dt <- data.table(x = 1, y = 2, z = 3) lz <- lazy_dt(dt, "DT") lz$needs_copy <- TRUE expect_equal( lz %>% select() %>% collect(), tibble() ) # unless it's grouped expect_snapshot(out <- lz %>% group_by(x) %>% select()) expect_equal( out %>% collect(), group_by(tibble(x = 1), x) ) }) test_that("copied data: only add step if necessary", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") dt$needs_copy <- TRUE expect_equal(dt %>% select(everything()), dt) expect_equal(dt %>% select(x, y), dt) }) dtplyr/tests/testthat/test-step-call-pivot_longer.R0000644000176200001440000001057414372711230022272 0ustar liggesuserstest_that("can pivot all cols to long", { tbl <- tibble(x = 1:2, y = 3:4) dt <- lazy_dt(tbl, "DT") step <- pivot_longer(dt, x:y) out <- collect(step) expect_equal( show_query(step), expr(melt(DT, measure.vars = !!c("x", "y"), variable.name = "name", variable.factor = FALSE)) ) expect_equal(step$vars, c("name", "value")) expect_equal(out$name, c("x", "x", "y", "y")) expect_equal(out$value, c(1, 2, 3, 4)) }) test_that("preserves original keys", { tbl <- tibble(x = 1:2, y = 2L, z = 1:2) dt <- lazy_dt(tbl, "DT") step <- pivot_longer(dt, y:z) out <- collect(step) expect_equal( show_query(step), expr(melt(DT, measure.vars = !!c("y", "z"), variable.name = "name", variable.factor = FALSE)) ) expect_equal(step$vars, c("x", "name", "value")) expect_equal(out$x, rep(tbl$x, 2)) }) test_that("can drop missing values", { tbl <- tibble(x = c(1, NA), y = c(NA, 2)) dt <- lazy_dt(tbl, "DT") step <- pivot_longer(dt, x:y, values_drop_na = TRUE) out <- collect(step) expect_equal( show_query(step), expr(melt(DT, measure.vars = !!c("x", "y"), variable.name = "name", na.rm = TRUE, variable.factor = FALSE)) ) expect_equal(out$name, c("x", "y")) expect_equal(out$value, c(1, 2)) }) test_that("can pivot to multiple measure cols", { dt <- lazy_dt(head(anscombe, 2), "DT") step <- pivot_longer( dt, everything(), names_to = c(".value", "set"), names_pattern = "(.)(.)" ) out <- collect(step) expect_snapshot(show_query(step)) expect_equal(step$vars, c("set", "x", "y")) }) test_that(".value can be at any position in `names_to`", { samp1 <- tibble( i = 1:4, y_t1 = rnorm(4), y_t2 = rnorm(4), z_t1 = rep(3, 4), z_t2 = rep(-2, 4), ) dt1 <- lazy_dt(samp1, "DT1") value_first <- dt1 %>% pivot_longer(-i, names_to = c(".value", "time"), names_sep = "_") %>% collect() samp2 <- dplyr::rename(samp1, t1_y = y_t1, t2_y = y_t2, t1_z = z_t1, t2_z = z_t2) dt2 <- 
lazy_dt(samp2, "DT2") value_second <- dt2 %>% pivot_longer(-i, names_to = c("time", ".value"), names_sep = "_") %>% collect() expect_identical(value_first, value_second) }) test_that("errors on unbalanced datasets", { tbl <- tibble(x_1 = 1, x_2 = 1, y_3 = 1, y_4 = 1) dt <- lazy_dt(tbl, "DT") expect_snapshot(error = TRUE, pivot_longer(dt, everything(), names_to = c(".value", "id"), names_sep = "_") ) }) test_that("can use names_prefix", { tbl <- tibble(x_x = 1:2, x_y = 3:4) dt <- lazy_dt(tbl, "DT") out <- dt %>% pivot_longer(everything(), names_prefix = "x_") %>% arrange(name, value) %>% collect() expect_equal(out$name, c("x","x","y","y")) expect_equal(out$value, c(1,2,3,4)) }) test_that("can use names_pattern w/out .value in names_to", { dt <- data.table(a1_1 = 1, b2_2 = 2) out <- dt %>% pivot_longer( cols = everything(), names_to = c("a", "b"), names_pattern = "([[:alnum:]]+)_([[:alnum:]]+)" ) %>% collect() expect_named(out, c("a", "b", "value")) expect_equal(out$a, c("a1", "b2")) expect_equal(out$b, c("1", "2")) expect_equal(out$value, c(1, 2)) }) test_that("can use names_sep w/out .value in names_to", { dt <- data.table(a1_1 = 1, b2_2 = 2) out <- dt %>% pivot_longer( cols = everything(), names_to = c("a", "b"), names_sep = "_" ) %>% collect() expect_named(out, c("a", "b", "value")) expect_equal(out$a, c("a1", "b2")) expect_equal(out$b, c("1", "2")) expect_equal(out$value, c(1, 2)) }) test_that("informative errors on unsupported features", { dt <- lazy_dt(data.table(a1_1 = 1, b2_2 = 2)) expect_snapshot(error = TRUE, { dt %>% pivot_longer(names_ptypes = list()) dt %>% pivot_longer(names_transform = list()) dt %>% pivot_longer(values_ptypes = list()) dt %>% pivot_longer(values_transform = list()) }) }) test_that("can pivot all cols to long", { tbl <- tibble(x = 1:2, y = 3:4) dt <- lazy_dt(tbl, "DT") step <- pivot_longer(dt, x:y) out <- collect(step) expect_equal( show_query(step), expr(melt(DT, measure.vars = !!c("x", "y"), variable.name = "name", variable.factor = FALSE)) ) expect_equal(step$vars, c("name", "value")) expect_equal(out$name, c("x", "x", "y", "y")) expect_equal(out$value, c(1, 2, 3, 4)) }) dtplyr/tests/testthat/_snaps/0000755000176200001440000000000014372711230016055 5ustar liggesusersdtplyr/tests/testthat/_snaps/step-subset-filter.md0000644000176200001440000000102214300165007022127 0ustar liggesusers# errors for named input Code filter(dt, x = 1) Condition Error in `filter()`: ! Problem with `filter()` input `..1`. x Input `..1` is named. i This usually means that you've used `=` instead of `==`. i Did you mean `x == 1`? --- Code filter(dt, y > 1, x = 1) Condition Error in `filter()`: ! Problem with `filter()` input `..2`. x Input `..2` is named. i This usually means that you've used `=` instead of `==`. i Did you mean `x == 1`? dtplyr/tests/testthat/_snaps/step-subset-summarise.md0000644000176200001440000000066714300165007022665 0ustar liggesusers# summarise(.groups=) Code eval_bare(expr(lazy_dt(data.frame(x = 1, y = 2), "DT") %>% group_by(x, y) %>% dplyr::summarise() %>% show_query()), env(global_env())) Message `summarise()` has grouped output by 'x'. You can override using the `.groups` argument. 
Output unique(DT) --- `.groups` can't be "rowwise" in dtplyr i Possible values are NULL (default), "drop_last", "drop", and "keep" dtplyr/tests/testthat/_snaps/tidyeval-across.md0000644000176200001440000000342014372711230021507 0ustar liggesusers# across() does not support formulas with dots Code (expect_error(capture_across(dt, across(a:b, ~ log(.x, base = .y), base = 2)))) Output Error in `across_fun()`: ! `dtplyr::across()` does not support `...` when a purrr-style lambda is used in `.fns`. i Use a lambda instead. i Or inline them via a purrr-style lambda. Code (expect_error(capture_across(dt, across(a:b, list(~ log(.x, base = .y)), base = 2))) ) Output Error in `FUN()`: ! `dtplyr::across()` does not support `...` when a purrr-style lambda is used in `.fns`. i Use a lambda instead. i Or inline them via a purrr-style lambda. # across() gives informative errors Code capture_across(dt, across(a, 1)) Condition Error in `across_funs()`: ! `.fns` argument to dtplyr::across() must be a NULL, a function, formula, or list Code capture_across(dt, across(a, list(1))) Condition Error in `FUN()`: ! .fns argument to dtplyr::across() must contain a function or a formula x Problem with 1 # if_all() gives informative errors Code capture_if_all(dt, if_all(a, 1)) Condition Error in `across_funs()`: ! `.fns` argument to dtplyr::across() must be a NULL, a function, formula, or list Code capture_if_all(dt, if_all(a, list(1))) Condition Error in `FUN()`: ! .fns argument to dtplyr::across() must contain a function or a formula x Problem with 1 # if_all() cannot rename variables Code (expect_error(capture_if_all(dt, if_all(c(a = x, b = y))))) Output Error in `if_all()`: ! Can't rename variables in this context. dtplyr/tests/testthat/_snaps/step-join.md0000644000176200001440000000465014300152652020312 0ustar liggesusers# performs cross join Code left_join(dt1, dt2, by = character()) Output Source: local data table [4 x 3] Call: setnames(setcolorder(copy(dt2)[, `:=`(.cross_join_col = 1)][copy(dt1)[, `:=`(.cross_join_col = 1)], on = .(.cross_join_col), allow.cartesian = TRUE], c(3L, 4L, 2L, 1L)), c("i.x", "x"), c("x.x", "x.y"))[, !".cross_join_col"] x.x y x.y 1 1 a 3 2 1 a 4 3 2 a 3 4 2 a 4 # Use as.data.table()/as.data.frame()/as_tibble() to access results --- Code right_join(dt1, dt2, by = character()) Output Source: local data table [4 x 3] Call: setnames(setcolorder(copy(dt2)[, `:=`(.cross_join_col = 1)][copy(dt1)[, `:=`(.cross_join_col = 1)], on = .(.cross_join_col), allow.cartesian = TRUE], c(3L, 4L, 2L, 1L)), c("i.x", "x"), c("x.x", "x.y"))[, !".cross_join_col"] x.x y x.y 1 1 a 3 2 1 a 4 3 2 a 3 4 2 a 4 # Use as.data.table()/as.data.frame()/as_tibble() to access results --- Code full_join(dt1, dt2, by = character()) Output Source: local data table [4 x 3] Call: setnames(setcolorder(copy(dt2)[, `:=`(.cross_join_col = 1)][copy(dt1)[, `:=`(.cross_join_col = 1)], on = .(.cross_join_col), allow.cartesian = TRUE], c(3L, 4L, 2L, 1L)), c("i.x", "x"), c("x.x", "x.y"))[, !".cross_join_col"] x.x y x.y 1 1 a 3 2 1 a 4 3 2 a 3 4 2 a 4 # Use as.data.table()/as.data.frame()/as_tibble() to access results --- Code inner_join(dt1, dt2, by = character()) Output Source: local data table [4 x 3] Call: setnames(setcolorder(copy(dt2)[, `:=`(.cross_join_col = 1)][copy(dt1)[, `:=`(.cross_join_col = 1)], on = .(.cross_join_col), allow.cartesian = TRUE], c(3L, 4L, 2L, 1L)), c("i.x", "x"), c("x.x", "x.y"))[, !".cross_join_col"] x.x y x.y 1 1 a 3 2 1 a 4 3 2 a 3 4 2 a 4 # Use as.data.table()/as.data.frame()/as_tibble() to access 
results dtplyr/tests/testthat/_snaps/count.md0000644000176200001440000000022714300165007017524 0ustar liggesusers# can control name Code dt %>% count(name = 10) %>% collect() Condition Error in `check_name()`: ! `name` must be a string dtplyr/tests/testthat/_snaps/step-call.md0000644000176200001440000000115514300165007020261 0ustar liggesusers# but not with anything else Code dt %>% rename_with(1) Condition Error in `rename_with()`: ! `.fn` must be a function name or formula # rename_with generates minimal spec Code dt %>% rename_with(toupper) %>% show_query() Output setnames(copy(DT), toupper) Code dt %>% rename_with(toupper, 1:3) %>% show_query() Output setnames(copy(DT), c("a", "b", "c"), toupper) # errors are raised Code collect(drop_na(dt, "z")) Condition Error in `drop_na()`: ! Can't subset columns that don't exist. x Column `z` doesn't exist. dtplyr/tests/testthat/_snaps/step-group.md0000644000176200001440000000037214372711230020506 0ustar liggesusers# can add groups if requested Code . <- dt %>% group_by(x) %>% group_by(y, add = TRUE) Condition Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0. i Please use the `.add` argument instead. dtplyr/tests/testthat/_snaps/step.md0000644000176200001440000000510714372711230017355 0ustar liggesusers# has useful display methods Code dt <- lazy_dt(mtcars, "DT") dt Output Source: local data table [32 x 11] Call: DT mpg cyl disp hp drat wt qsec vs am gear carb 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 # ... with 26 more rows # Use as.data.table()/as.data.frame()/as_tibble() to access results Code dt %>% group_by(vs, am) Output Source: local data table [32 x 11] Groups: vs, am Call: DT mpg cyl disp hp drat wt qsec vs am gear carb 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 # ... with 26 more rows # Use as.data.table()/as.data.frame()/as_tibble() to access results Code dt %>% mutate(y = 10) %>% compute("DT2") Output Source: local data table [32 x 12] Call: DT2 <- copy(DT)[, `:=`(y = 10)] DT2 mpg cyl disp hp drat wt qsec vs am gear carb y 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 10 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 10 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 10 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 10 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 10 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 10 # ... with 26 more rows # Use as.data.table()/as.data.frame()/as_tibble() to access results dtplyr/tests/testthat/_snaps/step-subset-separate.md0000644000176200001440000000044614300165007022457 0ustar liggesusers# checks type of `into` and `sep` Code separate(dt, x, "x", FALSE) Condition Error in `separate()`: ! `sep` must be a character vector. --- Code separate(dt, x, FALSE) Condition Error in `separate()`: ! `into` must be a character vector. dtplyr/tests/testthat/_snaps/step-call-pivot_wider.md0000644000176200001440000000253514300165007022615 0ustar liggesusers# names_glue affects output names Code show_query(step) Output setnames(dcast(DT, formula = "..." 
~ x + y, value.var = c("a", "b"))[, `:=`(".", NULL)], c("a_X_1", "a_Y_2", "b_X_1", "b_Y_2" ), c("X1_a", "Y2_a", "X1_b", "Y2_b")) # can sort column names Code show_query(step) Output setcolorder(dcast(DT, formula = "..." ~ chr, value.var = "int")[, `:=`(".", NULL)], c("Mon", "Tue", "Wed")) # can sort column names with id Code show_query(step) Output setcolorder(dcast(DT, formula = id ~ chr, value.var = "int"), c("id", "Mon", "Tue", "Wed")) # can repair names if requested Code pivot_wider(df, names_from = lab, values_from = val) Condition Error in `step_repair()`: ! Names must be unique. x These names are duplicated: * "x" at locations 1 and 2. Code pivot_wider(df, names_from = lab, values_from = val, names_repair = "unique") Message New names: * `x` -> `x...1` * `x` -> `x...2` Output Source: local data table [1 x 2] Call: setnames(dcast(copy(DT), formula = x ~ lab, value.var = "val"), 1:2, c("x...1", "x...2")) x...1 x...2 1 1 2 # Use as.data.table()/as.data.frame()/as_tibble() to access results dtplyr/tests/testthat/_snaps/step-colorder-relocate.md0000644000176200001440000000030314372711230022751 0ustar liggesusers# can only supply one of .before and .after Code relocate(dt, y, .before = x, .after = x) Condition Error in `relocate()`: ! Can't supply both `.before` and `.after`. dtplyr/tests/testthat/_snaps/step-subset-select.md0000644000176200001440000000046314300165007022131 0ustar liggesusers# empty select returns no columns Code out <- lz %>% group_by(x) %>% select() Message Adding missing grouping variables: `x` # copied data: empty select returns no columns Code out <- lz %>% group_by(x) %>% select() Message Adding missing grouping variables: `x` dtplyr/tests/testthat/_snaps/unite.md0000644000176200001440000000007514300165007017521 0ustar liggesusers# errors on na.rm `na.rm` is not implemented in dtplyr dtplyr/tests/testthat/_snaps/tidyeval.md0000644000176200001440000000067114372711230020224 0ustar liggesusers# translates lag()/lead() The `order_by` argument of `lag()` is not supported by dtplyr # errors when `where()` is used, #271/#368 This tidyselect interface doesn't support predicates. --- This tidyselect interface doesn't support predicates. # desc() checks the number of arguments Code capture_dot(df, desc(a, b)) Condition Error in `check_one_arg()`: ! `desc()` expects exactly one argument. dtplyr/tests/testthat/_snaps/step-colorder.md0000644000176200001440000000033614300152651021160 0ustar liggesusers# can handle duplicate column names The column(s) x do not uniquely match a column in `x`. # checks col_order Every element of `col_order` must be unique. --- Every element of `col_order` must be unique. dtplyr/tests/testthat/_snaps/step-subset-slice.md0000644000176200001440000000335214372711230021755 0ustar liggesusers# slice_*() checks for empty ... Code slice_head(dt, 5) Condition Error in `slice_head()`: ! `n` must be explicitly named. i Did you mean `slice_head(n = 5)`? Code slice_tail(dt, 5) Condition Error in `slice_tail()`: ! `n` must be explicitly named. i Did you mean `slice_tail(n = 5)`? Code slice_min(dt, x, 5) Condition Error in `slice_min()`: ! `n` must be explicitly named. i Did you mean `slice_min(n = 5)`? Code slice_max(dt, x, 5) Condition Error in `slice_max()`: ! `n` must be explicitly named. i Did you mean `slice_max(n = 5)`? Code slice_sample(dt, 5) Condition Error in `slice_sample()`: ! `n` must be explicitly named. i Did you mean `slice_sample(n = 5)`? --- Code slice_min(dt) Condition Error in `slice_min()`: ! `order_by` is absent but must be supplied. 
Code slice_max(dt) Condition Error in `slice_max()`: ! `order_by` is absent but must be supplied. # check_slice_catches common errors Code slice_head(dt, n = 1, prop = 1) Condition Error in `slice_head()`: ! Must supply exactly one of `n` and `prop` arguments. Code slice_head(dt, n = "a") Condition Error in `slice_head()`: ! `n` must be a single number. Code slice_head(dt, prop = "a") Condition Error in `slice_head()`: ! `prop` must be a single number. Code slice_head(dt, n = NA) Condition Error in `slice_head()`: ! `n` must be a single number. Code slice_head(dt, prop = NA) Condition Error in `slice_head()`: ! `prop` must be a single number. dtplyr/tests/testthat/_snaps/step-call-pivot_longer.md0000644000176200001440000000236414300165007022771 0ustar liggesusers# can pivot to multiple measure cols Code show_query(step) Output melt(DT, measure.vars = list(c("x1", "x2", "x3", "x4"), c("y1", "y2", "y3", "y4")), variable.name = "set", value.name = c("x", "y"), variable.factor = FALSE)[, `:=`(set = c("1", "1", "2", "2", "3", "3", "4", "4"))] # errors on unbalanced datasets Code pivot_longer(dt, everything(), names_to = c(".value", "id"), names_sep = "_") Condition Error in `pivot_longer()`: ! `data.table::melt()` doesn't currently support melting of unbalanced datasets. # informative errors on unsupported features Code dt %>% pivot_longer(names_ptypes = list()) Condition Error in `pivot_longer()`: ! `names_ptypes` is not supported by dtplyr Code dt %>% pivot_longer(names_transform = list()) Condition Error in `pivot_longer()`: ! `names_transform` is not supported by dtplyr Code dt %>% pivot_longer(values_ptypes = list()) Condition Error in `pivot_longer()`: ! `values_ptypes` is not supported by dtplyr Code dt %>% pivot_longer(values_transform = list()) Condition Error in `pivot_longer()`: ! `values_transform` is not supported by dtplyr dtplyr/tests/testthat/_snaps/step-mutate.md0000644000176200001440000000021514300165007020641 0ustar liggesusers# unnamed arguments matching column names are ignored Code mutate(dt, y) Condition Error: ! 
object 'y' not found dtplyr/tests/testthat/test-step-nest.R0000644000176200001440000000517514126601265017627 0ustar liggesuserstest_that("nest turns grouped values into one list-df", { ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") out <- nest(ldt, data = y) outc <- collect(out) expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) expect_equal(group_vars(out), character()) expect_equal(out$vars, c("x", "data")) expect_equal(outc$x, 1) expect_equal(length(outc$data), 1L) expect_equal(outc$data[[1L]], data.table(y = 1:3)) }) test_that("nest uses grouping vars if present", { ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") out <- nest(dplyr::group_by(ldt, x)) expect_equal(group_vars(out), "x") expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) }) test_that("provided grouping vars override grouped defaults", { ldt <- tibble(x = 1, y = 2, z = 3) %>% group_by(x) %>% lazy_dt("DT") out <- nest(ldt, data = y) expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x, z)])) expect_equal(group_vars(out), "x") expect_equal(out$vars, c("x", "z", "data")) }) test_that("puts data into the correct row", { ldt <- tibble(x = 1:3, y = c("B", "A", "A")) %>% lazy_dt() out <- nest(ldt, data = x) %>% collect() %>% dplyr::filter(y == "B") expect_equal(out$data[[1]]$x, 1) }) test_that("nesting everything yields a simple data frame", { dt <- data.table(x = 1:3, y = c("B", "A", "A")) ldt <- lazy_dt(dt, "DT") out <- nest(ldt, data = c(x, y)) expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) expect_equal(out$vars, "data") expect_equal(collect(out)$data, list(dt)) }) test_that("nest preserves order of data", { ldt <- lazy_dt(tibble(x = c(1, 3, 2, 3, 2), y = 1:5), "DT") out <- nest(ldt, data = y) expect_equal(collect(out)$x, c(1, 3, 2)) }) test_that("can strip names", { ldt <- lazy_dt(tibble(x = c(1, 1, 1), ya = 1:3, yb = 4:6), "DT") out <- nest(ldt, y = starts_with("y"), .names_sep = "") expect_equal( show_query(out), expr(DT[, .(y = .(data.table(a = ya, b = yb))), by = .(x)]) ) expect_named(collect(out)$y[[1]], c("a", "b")) }) test_that("can nest multiple columns", { ldt <- lazy_dt(tibble(x = 1, a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") out <- ldt %>% nest(a = c(a1, a2), b = c(b1, b2)) expect_equal( show_query(out), expr(DT[, .(a = .(data.table(a1, a2)), b = .(data.table(b1, b2))), by = .(x)]) ) expect_equal(out$vars, c("x", "a", "b")) }) test_that("nesting no columns nests all inputs", { # included only for backward compatibility ldt <- lazy_dt(tibble(a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") expect_warning(out <- nest(ldt), "must not be empty") expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) }) dtplyr/tests/testthat/test-step-subset-slice.R0000644000176200001440000002071014372711230021245 0ustar liggesuserstest_that("can slice", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_equal( dt %>% slice() %>% show_query(), expr(DT) ) expect_equal( dt %>% slice(c(1, 2)) %>% show_query(), expr(DT[{ .rows <- c(1, 2) .rows[between(.rows, -.N, .N)] }]) ) expect_equal( dt %>% slice(1, 2, 3) %>% show_query(), expr(DT[{ .rows <- c(1, 2, 3) .rows[between(.rows, -.N, .N)] }]) ) }) test_that("can slice when grouped", { dt1 <- lazy_dt(data.table(x = c(1, 1, 2, 2), y = c(1, 2, 3, 4)), "DT") dt2 <- dt1 %>% group_by(x) %>% slice(1) expect_equal( dt2 %>% show_query(), expr(DT[DT[, .I[{ .rows <- 1 .rows[between(.rows, -.N, .N)] }], by = .(x)]$V1]) ) expect_equal(as_tibble(dt2), tibble(x = c(1, 2), y = c(1, 3))) }) test_that("can use `.by`", { dt1 <- 
lazy_dt(data.table(x = c(1, 1, 2, 2), y = c(1, 2, 3, 4)), "DT") dt2 <- dt1 %>% slice(1, .by = x) expect_equal( dt2 %>% show_query(), expr(DT[DT[, .I[{ .rows <- 1 .rows[between(.rows, -.N, .N)] }], by = .(x)]$V1]) ) expect_equal(collect(dt2), tibble(x = c(1, 2), y = c(1, 3))) }) test_that("slicing doesn't sorts groups", { dt <- lazy_dt(data.table(x = 2:1)) expect_equal( dt %>% group_by(x) %>% slice(1) %>% pull(x), 2:1 ) }) test_that("doesn't return excess rows, #10", { dt <- lazy_dt(data.table(x = 1:2)) expect_equal( dt %>% slice(1:3) %>% pull(x), 1:2 ) }) # variants ---------------------------------------------------------------- test_that("functions silently truncate results", { dt <- lazy_dt(data.frame(x = 1:5)) expect_equal(dt %>% slice_head(n = 6) %>% as_tibble() %>% nrow(), 5) expect_equal(dt %>% slice_tail(n = 6) %>% as_tibble() %>% nrow(), 5) expect_equal(dt %>% slice_sample(n = 6) %>% as_tibble() %>% nrow(), 5) expect_equal(dt %>% slice_min(x, n = 6) %>% as_tibble() %>% nrow(), 5) expect_equal(dt %>% slice_max(x, n = 6) %>% as_tibble() %>% nrow(), 5) expect_equal(dt %>% slice_head(n = -6) %>% as_tibble() %>% nrow(), 0) expect_equal(dt %>% slice_tail(n = -6) %>% as_tibble() %>% nrow(), 0) expect_equal(dt %>% slice_sample(n = -6) %>% as_tibble() %>% nrow(), 0) expect_equal(dt %>% slice_min(x, n = -6) %>% as_tibble() %>% nrow(), 0) expect_equal(dt %>% slice_max(x, n = -6) %>% as_tibble() %>% nrow(), 0) }) test_that("proportion rounds down", { dt <- lazy_dt(data.frame(x = 1:10)) expect_equal(dt %>% slice_head(prop = 0.11) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_tail(prop = 0.11) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_sample(prop = 0.11) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_min(x, prop = 0.11) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_max(x, prop = 0.11) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_min(x, prop = 0.11, with_ties = FALSE) %>% as_tibble() %>% nrow(), 1) expect_equal(dt %>% slice_max(x, prop = 0.11, with_ties = FALSE) %>% as_tibble() %>% nrow(), 1) }) test_that("min and max return ties by default", { dt <- lazy_dt(data.frame(x = c(1, 1, 1, 2, 2))) expect_equal(dt %>% slice_min(x) %>% collect() %>% nrow(), 3) expect_equal(dt %>% slice_max(x) %>% collect() %>% nrow(), 2) expect_equal(dt %>% slice_min(x, with_ties = FALSE) %>% collect() %>% nrow(), 1) expect_equal(dt %>% slice_max(x, with_ties = FALSE) %>% collect() %>% nrow(), 1) }) test_that("min and max work with character", { dt <- lazy_dt(data.table(x = c("b", "a", "d", "c"))) expect_equal(dt %>% slice_min(x) %>% pull(x), "a") expect_equal(dt %>% slice_max(x) %>% pull(x), "d") }) test_that("min and max reorder results and auto-convert data.tables", { dt <- lazy_dt(data.frame(id = 1:4, x = c(2, 3, 1, 2))) expect_equal(dt %>% slice_min(x, n = 2) %>% pull(id), c(3, 1, 4)) expect_equal(dt %>% slice_min(x, n = 2, with_ties = FALSE) %>% pull(id), c(3, 1)) expect_equal(dt %>% slice_max(x, n = 2) %>% pull(id), c(2, 1, 4)) expect_equal(dt %>% slice_max(x, n = 2, with_ties = FALSE) %>% pull(id), c(2, 1)) dt <- data.table(id = 1:4, x = c(2, 3, 1, 2)) expect_equal(dt %>% slice_min(x, n = 2) %>% pull(id), c(3, 1, 4)) expect_equal(dt %>% slice_max(x, n = 2) %>% pull(id), c(2, 1, 4)) }) test_that("min and max ignore NA's (#4826)", { dt <- lazy_dt(data.frame(id = 1:4, x = c(2, NA, 1, 2), y = c(NA, NA, NA, NA))) expect_equal(dt %>% slice_min(x, n = 2) %>% pull(id), c(3, 1, 4)) expect_equal(dt %>% slice_min(y, n = 2) %>% pull(id), integer()) expect_equal(dt 
%>% slice_max(x, n = 2) %>% pull(id), c(1, 4)) expect_equal(dt %>% slice_max(y, n = 2) %>% pull(id), integer()) }) test_that("arguments to sample are passed along", { dt <- lazy_dt(data.frame(x = 1:100, wt = c(1, rep(0, 99)))) expect_equal(dt %>% slice_sample(n = 1, weight_by = wt) %>% pull(x), 1) expect_equal(dt %>% slice_sample(n = 2, weight_by = wt, replace = TRUE) %>% pull(x), c(1, 1)) }) test_that("slice_*() checks for empty ...", { dt <- lazy_dt(data.frame(x = 1:10)) expect_snapshot(error = TRUE, { slice_head(dt, 5) slice_tail(dt, 5) slice_min(dt, x, 5) slice_max(dt, x, 5) slice_sample(dt, 5) }) expect_snapshot(error = TRUE, { slice_min(dt) slice_max(dt) }) }) test_that("slice_*() checks for constant n= and prop=", { dt <- lazy_dt(data.frame(x = 1:10)) expect_error(slice_head(dt, n = n()), "constant") expect_error(slice_head(dt, prop = n()), "constant") expect_error(slice_tail(dt, n = n()), "constant") expect_error(slice_tail(dt, prop = n()), "constant") expect_error(slice_min(dt, x, n = n()), "constant") expect_error(slice_min(dt, x, prop = n()), "constant") expect_error(slice_max(dt, x, n = n()), "constant") expect_error(slice_max(dt, x, prop = n()), "constant") expect_error(slice_sample(dt, n = n()), "constant") expect_error(slice_sample(dt, prop = n()), "constant") }) test_that("check_slice_catches common errors", { dt <- lazy_dt(data.frame(x = 1:10)) expect_snapshot(error = TRUE, { slice_head(dt, n = 1, prop = 1) slice_head(dt, n = "a") slice_head(dt, prop = "a") slice_head(dt, n = NA) slice_head(dt, prop = NA) }) }) test_that("slice_head/slice_tail correctly slice ungrouped dt when n < 0", { dt <- lazy_dt(data.frame(x = 1:10)) expect_equal( slice_head(dt, n = -2) %>% as_tibble(), slice_head(dt, n = nrow(dt) - 2) %>% as_tibble() ) expect_equal( slice_tail(dt, n = -2) %>% as_tibble(), slice_tail(dt, n = nrow(dt) - 2) %>% as_tibble() ) }) test_that("slice_head/slice_tail correctly slice grouped dt when n < 0", { dt <- data.frame(x = 1:10, g = c(rep(1, 8), rep(2, 2))) %>% lazy_dt() %>% group_by(g) expect_equal( slice_head(dt, n = -3) %>% as_tibble(), slice(dt, rlang::seq2(1L, n() - 3)) %>% as_tibble() ) expect_equal( n_groups(slice_head(dt, n = -3)), 1L ) expect_equal( slice_tail(dt, n = -3) %>% as_tibble(), slice(dt, rlang::seq2(3 + 1, n())) %>% as_tibble() ) expect_equal( n_groups(slice_tail(dt, n = -3)), 1L ) }) test_that("Non-integer number of rows computed correctly", { expect_equal(eval_tidy(get_slice_size(n = 1.6), list(.N = 10)), 1) expect_equal(eval_tidy(get_slice_size(prop = 0.16), list(.N = 10)), 1) expect_equal(eval_tidy(get_slice_size(n = -1.6), list(.N = 10)), 9) expect_equal(eval_tidy(get_slice_size(prop = -0.16), list(.N = 10)), 9) }) test_that("variants work with `by`", { df <- lazy_dt(data.table(x = 1:3, y = c("a", "a", "b")), "DT") expect_equal( df %>% slice_head(n = 1, by = y) %>% collect(), tibble(x = c(1, 3), y = c("a", "b")) ) expect_equal( df %>% slice_tail(n = 1, by = y) %>% as_tibble(), tibble(x = c(2, 3), y = c("a", "b")) ) expect_equal( df %>% slice_min(n = 1, x, by = y) %>% as_tibble(), tibble(x = c(1, 3), y = c("a", "b")) ) expect_equal( df %>% slice_max(n = 1, x, by = y) %>% as_tibble(), tibble(x = c(3, 2), y = c("b", "a")) ) }) # sample ------------------------------------------------------------------ test_that("basic usage generates expected calls", { dt <- lazy_dt(data.table(x = 1:5, y = 1), "DT") expect_equal( dt %>% sample_n(3) %>% show_query(), expr(DT[sample(.N, 3)]) ) expect_equal( dt %>% sample_frac(0.5) %>% show_query(), expr(DT[sample(.N, 
.N * 0.5)]) ) expect_equal( dt %>% sample_n(3, replace = TRUE) %>% show_query(), expr(DT[sample(.N, 3, replace = TRUE)]) ) expect_equal( dt %>% sample_n(3, weight = y) %>% show_query(), expr(DT[sample(.N, 3, prob = y)]) ) }) dtplyr/tests/testthat/test-step-colorder.R0000644000176200001440000000307614126601265020465 0ustar liggesuserstest_that("can reorder columns", { dt <- lazy_dt(data.frame(x = 1:3, y = 1), "DT") expect_equal( dt %>% step_colorder(c("y", "x")) %>% show_query(), expr(setcolorder(copy(DT), !!c("y", "x"))) ) expect_named( dt %>% step_colorder(c("y", "x")) %>% collect(), c("y", "x") ) expect_equal( dt %>% step_colorder(c(2L, 1L)) %>% show_query(), expr(setcolorder(copy(DT), !!c(2L, 1L))) ) expect_named( dt %>% step_colorder(c(2L, 1L)) %>% collect(), c("y", "x") ) }) test_that("can handle duplicate column names", { dt <- lazy_dt(data.table(x = 3, x = 2, y = 1), "DT") expect_snapshot_error(dt %>% step_colorder(c("y", "x"))) expect_equal( dt %>% step_colorder(c(3L, 2L)) %>% show_query(), expr(setcolorder(copy(DT), !!c(3L, 2L))) ) expect_equal( dt %>% step_colorder(c(3L, 2L)) %>% as.data.table(), data.table(y = 1, x = 2, x = 3) ) }) test_that("checks col_order", { dt <- lazy_dt(data.frame(x = 1:3, y = 1), "DT") expect_snapshot_error(dt %>% step_colorder(c("y", "y"))) expect_snapshot_error(dt %>% step_colorder(c(1L, 1L))) }) test_that("works for empty input", { dt <- lazy_dt(data.frame(x = 1), "DT") expect_equal(dt %>% step_colorder(character()), dt) expect_equal(dt %>% step_colorder(integer()), dt) }) test_that("doesn't add step if not necessary", { dt <- lazy_dt(data.frame(x = 1, y = 2), "DT") expect_equal(dt %>% step_colorder(c("x", "y")), dt) expect_equal(dt %>% step_colorder("x"), dt) expect_equal(dt %>% step_colorder(1:2), dt) expect_equal(dt %>% step_colorder(1L), dt) }) dtplyr/tests/testthat/test-tidyeval.R0000644000176200001440000002371414372711230017522 0ustar liggesuserstest_that("simple expressions left as is", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) expect_equal(capture_dot(dt, NULL), NULL) expect_equal(capture_dot(dt, 10), 10) expect_equal(capture_dot(dt, x), quote(x)) expect_equal(capture_dot(dt, x + y), quote(x + y)) expect_equal(capture_dot(dt, x[[1]]), quote(x[[1]])) # logicals expect_equal(eval(capture_dot(dt, T), globalenv()), TRUE) expect_equal(eval(capture_dot(dt, F), globalenv()), FALSE) expect_equal(capture_dot(dt, TRUE), TRUE) expect_equal(capture_dot(dt, FALSE), FALSE) }) test_that("existing non-variables get inlined", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) n <- 10 expect_equal(capture_dot(dt, x + n), quote(x + 10)) expect_equal(capture_dot(dt, x + m), quote(x + m)) # even if they start with "." (#386) .n <- 20 expect_equal(capture_dot(dt, x + .n), quote(x + 20)) }) test_that("unless we're operating in the global environment", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) quo <- new_quosure(quote(x + n), globalenv()) expect_equal(capture_dot(dt, !!quo), quote(x + ..n)) expect_equal(capture_dot(dt, !!quo, j = FALSE), quote(x + n)) }) test_that("using environment of inlined quosures", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) n <- 10 quo <- new_quosure(quote(x + n), env(n = 20)) expect_equal(capture_dot(dt, f(!!quo)), quote(f(x + 20))) expect_equal(capture_dot(dt, f(!!quo), j = FALSE), quote(f(x + 20))) }) test_that(". 
gets converted to .SD", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) expect_equal(capture_dot(dt, .), quote(.SD)) expect_equal(capture_dot(dt, .SD), quote(.SD)) }) test_that("translate context functions", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) expect_equal(capture_dot(dt, cur_data()), quote(.SD)) expect_error(capture_dot(dt, cur_data_all()), "not available") expect_equal(capture_dot(dt, cur_group()), quote(.BY)) expect_equal(capture_dot(dt, cur_group_id()), quote(.GRP)) expect_equal(capture_dot(dt, cur_group_rows()), quote(.I)) }) test_that("translates if_else()/ifelse()", { df <- data.frame(x = 1:5) expect_equal( capture_dot(df, ifelse(x < 0, 1, 2)), expr(fifelse(x < 0, 1, 2)) ) expect_equal( capture_dot(df, if_else(x < 0, 1, 2)), expr(fifelse(x < 0, 1, 2)) ) # Handles unusual argument names/order suppressWarnings({ expect_equal( capture_dot(df, ifelse(x < 0, n = 2, yes = 1)), expr(fifelse(x < 0, 1, 2)) ) expect_equal( capture_dot(df, if_else(x < 0, f = 2, true = 1)), expr(fifelse(x < 0, 1, 2)) ) }) # tidyeval works inside if_else, #220 expect_equal( capture_dot(df, if_else(.data$x < 3, 1, 2)), expr(fifelse(x < 3, 1, 2)) ) }) test_that("translates coalesce()", { df <- data.frame(x = 1:5) expect_equal( capture_dot(df, coalesce(x, 1)), expr(fcoalesce(x, 1)) ) }) test_that("can use local variable with coalesce() and replace_na()", { dt <- lazy_dt(data.frame(x = c(1, NA)), "dt") n <- 10 expect_equal( capture_dot(dt, coalesce(x, n)), expr(fcoalesce(x, 10)) ) expect_equal( capture_dot(dt, replace_na(x, n)), expr(fcoalesce(x, 10)) ) }) test_that("translates case_when()", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) expect_equal( capture_dot(dt, case_when(x1 ~ y1, x2 ~ y2, x3 ~ TRUE, TRUE ~ y4)), quote(fcase(x1, y1, x2, y2, x3, TRUE, rep(TRUE, .N), y4)) ) # can use T for default, #272 expect_equal( capture_dot(dt, case_when(x1 ~ y1, x2 ~ y2, x3 ~ TRUE, T ~ y4)), quote(fcase(x1, y1, x2, y2, x3, TRUE, rep(TRUE, .N), y4)) ) # translates recursively expect_equal( capture_dot(dt, case_when(x == 1 ~ n())), quote(fcase(x == 1, .N)) ) }) test_that("translates lag()/lead()", { df <- data.frame(x = 1:5, y = 1:5) expect_equal( capture_dot(df, lag(x)), expr(shift(x, type = "lag")) ) expect_equal( capture_dot(df, lead(x, 2, default = 3)), expr(shift(x, n = 2, fill = 3, type = "lead")) ) # Errors with order_by expect_snapshot_error( capture_dot(df, lag(x, order_by = y)), ) }) test_that("can use local variable with lag()/lead()", { dt <- lazy_dt(data.frame(x = c(1, NA)), "dt") n <- 10 expect_equal( capture_dot(dt, lag(x, n)), expr(shift(x, n = 10, type = "lag")) ) }) test_that("can process many expressions in one go", { dt <- lazy_dt(data.frame(x = 1:10, y = 1:10)) n <- 10 dots <- capture_dots(dt, x = x + n, y = y) expect_named(dots, c("x", "y")) expect_equal(dots$x, quote(x + 10)) }) test_that("can use anonymous functions", { dt <- lazy_dt(data.frame(x = 1:2, y = 1)) expect_equal( capture_dot(dt, x = sapply(x, function(x) x)) %>% deparse(), "sapply(x, function(x) x)" ) }) test_that("can splice a data frame", { df <- data.frame(b = rep(2, 3), c = rep(3, 3)) dots <- capture_dots(df, !!!df) expect_equal(dots, as.list(df)) }) test_that("can use glue, (#344)", { df <- data.table(a = letters[1:3], b = letters[1:3]) expect_equal( capture_dot(df, glue::glue("{a}_{b}")), quote(glue::glue("{a}_{b}", .envir = .SD)) ) expect_equal( capture_dot(df, glue::glue("{a}_{b}"), j = FALSE), quote(glue::glue("{a}_{b}")) ) out <- df %>% transmute(a_b = glue::glue("{a}_{b}")) %>% collect() expect_equal(out$a_b, 
c("a_a", "b_b", "c_c")) }) test_that("properly handles anonymous functions, #362", { df <- data.table(a = list(1, 1, 1)) expect_equal( capture_dot(df, sapply(a, function(x) x + n())), quote(sapply(a, function(x) x + .N)) ) }) # evaluation -------------------------------------------------------------- test_that("can access functions in local env", { dt <- lazy_dt(data.frame(g = c(1, 1, 2), x = 1:3)) f <- function(x) 100 expect_equal(dt %>% summarise(n = f()) %>% pull(), 100) }) test_that("can disambiguate using .data and .env", { dt <- lazy_dt(data.frame(x = 1)) x <- 2 expect_equal(capture_dot(dt, .data$x), quote(x)) expect_equal(capture_dot(dt, .env$x), quote(..x)) out <- dt %>% summarise(data = .data$x, env = .env$x) %>% as_tibble() expect_equal(out, tibble(data = 1, env = 2)) var <- "x" out <- dt %>% summarise(data = .data[[var]], env = .env[[var]]) %>% collect() expect_equal(out, tibble(data = 1, env = 2)) }) test_that("locals are executed before call", { dt <- lazy_dt(data.frame(x = 1, y = 2)) expect_equal( dt %>% step_locals(exprs(a = 1, b = 2, c = a + b), "c") %>% dt_eval(), 3 ) }) test_that("errors when `where()` is used, #271/#368", { dt <- lazy_dt(data.frame(x = 1, y = 2)) expect_snapshot_error( select(dt, where(is.numeric)) ) expect_snapshot_error( mutate(dt, across(!where(is.character), ~ .x + 1)) ) }) # dplyr verbs ------------------------------------------------------------- test_that("n() is equivalent to .N", { dt <- lazy_dt(data.frame(g = c(1, 1, 2), x = 1:3)) expect_equal( dt %>% summarise(n = n()) %>% pull(), 3L ) expect_equal( dt %>% group_by(g) %>% summarise(n = n()) %>% pull(), c(2L, 1L) ) }) test_that("row_number() is equivalent seq_len(.N)", { dt <- lazy_dt(data.frame(g = c(1, 1, 2), x = 1:3)) expect_equal( dt %>% mutate(n = row_number()) %>% pull(), 1:3L ) expect_equal( dt %>% group_by(g) %>% mutate(n = row_number()) %>% pull(), c(1:2, 1) ) }) test_that("row_number(x) is equivalent to rank", { dt <- lazy_dt(data.frame(x = c(10, 30, 20))) expect_equal( dt %>% mutate(n = row_number(x)) %>% pull(), c(1L, 3L, 2L) ) }) test_that("ranking functions are translated", { df <- lazy_dt(tibble(x = c(1, 2, NA, 1, 0, NaN))) res <- df %>% mutate(percent_rank = percent_rank(x), min_rank = min_rank(x), dense_rank = dense_rank(x), cume_dist = cume_dist(x)) expect_equal(pull(res, percent_rank), c(1 / 3, 1, NA, 1 / 3, 0, NA)) expect_equal(pull(res, min_rank), c(2L, 4L, NA, 2L, 1L, NA)) expect_equal(pull(res, dense_rank), c(2L, 3L, NA, 2L, 1L, NA)) expect_equal(pull(res, cume_dist), c(.75, 1, NA, .75, .25, NA)) }) test_that("scoped verbs produce nice output", { dt <- lazy_dt(data.table(x = 1:5), "DT") expect_equal( dt %>% summarise_all(mean) %>% show_query(), expr(DT[, .(x = mean(x))]) ) expect_equal( dt %>% summarise_all(~ mean(.)) %>% show_query(), expr(DT[, .(x = mean(x))]) ) expect_equal( dt %>% summarise_all(row_number) %>% show_query(), expr(DT[, .(x = frank(x, ties.method = "first", na.last = "keep"))]) ) expect_equal( dt %>% summarise_all(~ n()) %>% show_query(), expr(DT[, .(x = .N)]) ) }) test_that("non-Gforce verbs work", { dt <- lazy_dt(data.table(x = 1:2), "DT") add <- function(x) sum(x) expect_equal(dt %>% summarise_at(vars(x), add) %>% pull(), 3) expect_equal(dt %>% mutate_at(vars(x), add) %>% pull(), c(3, 3)) }) test_that("`desc(col)` is translated to `-col` inside arrange", { dt <- lazy_dt(data.table(x = c("a", "b")), "DT") step <- arrange(dt, desc(x)) out <- collect(step) expect_equal(show_query(step), expr(DT[order(-x)])) expect_equal(out$x, c("b", "a")) }) 
test_that("desc() checks the number of arguments", { expect_snapshot(error = TRUE, capture_dot(df, desc(a, b))) }) test_that("n_distinct() is translated to uniqueN()", { # Works with multiple inputs expect_equal( dt_squash(expr(n_distinct(c(1, 1, 2), c(1, 2, 1)))), expr(uniqueN(data.table(c(1, 1, 2), c(1, 2, 1)))) ) # Works with single column selection (in summarise()) expect_equal( dt_squash(expr(n_distinct(x))), expr(uniqueN(x)) ) dt <- lazy_dt(data.table(x = c("a", "a", "b", NA)), "DT") step <- summarise(dt, num = n_distinct(x, na.rm = TRUE)) out <- collect(step) expect_equal( show_query(step), expr(DT[, .(num = uniqueN(x, na.rm = TRUE))]) ) expect_equal(out$num, 2) }) # fun_name ---------------------------------------------------------------- test_that("finds name of functions with GForce implementations", { expect_equal(fun_name(mean), expr(mean)) # unless overridden mean <- function() {} expect_equal(fun_name(mean), NULL) }) dtplyr/tests/testthat/test-step-group.R0000644000176200001440000000620314150757413020007 0ustar liggesuserstest_that("grouping and ungrouping adjust groups field", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% .$groups, character()) expect_equal(dt %>% group_by(x) %>% .$groups, "x") expect_equal(dt %>% group_by(a = x) %>% .$groups, "a") expect_equal(dt %>% group_by(x) %>% group_by(y) %>% .$groups, "y") expect_equal(dt %>% group_by(x) %>% ungroup() %>% .$groups, character()) }) test_that("ungroup can remove variables from grouping", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) %>% group_by(x, y) expect_equal(dt %>% ungroup(y) %>% group_vars(), "x") }) test_that("can use across", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% group_by(across(everything())) %>% .$groups, c("x", "y")) }) test_that("can add groups if requested", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") expect_equal( dt %>% group_by(x) %>% group_by(y, .add = TRUE) %>% .$groups, c("x", "y") ) expect_snapshot({ . 
<- dt %>% group_by(x) %>% group_by(y, add = TRUE) }) }) test_that("grouping can compute new variables if needed", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") expect_equal( dt %>% group_by(xy = x + y) %>% show_query(), expr(copy(DT)[, `:=`(xy = x + y)]) ) # also works when RHS is only a symbol expect_equal( dt %>% group_by(z = x) %>% show_query(), expr(copy(DT)[, `:=`(z = x)]) ) expect_equal( dt %>% group_by(xy = x + y) %>% summarise(x = mean(x)) %>% show_query(), expr(copy(DT)[, `:=`(xy = x + y)][, .(x = mean(x)), keyby = .(xy)]) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% group_by(x) %>% .$vars, c("x", "y")) }) test_that("`key` switches between keyby= and by=", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") dt1 <- lazy_dt(mtcars, "DT1") expect_equal( dt %>% group_by(xy = x + y, arrange = FALSE) %>% summarize(x = mean(x)) %>% show_query(), expr(copy(DT)[, `:=`(xy = x + y)][, .(x = mean(x)), by = .(xy)]) ) expect_equal( dt1 %>% group_by(cyl, arrange = FALSE) %>% summarize(mean_mpg = mean(mpg)) %>% show_query(), expr(DT1[, .(mean_mpg = mean(mpg)), by = .(cyl)]) ) expect_equal( dt1 %>% group_by(cyl) %>% summarize(mean_mpg = mean(mpg)) %>% show_query(), expr(DT1[, .(mean_mpg = mean(mpg)), keyby = .(cyl)]) ) }) test_that("emtpy and NULL group_by ungroups", { dt <- lazy_dt(data.frame(x = 1)) %>% group_by(x) expect_equal(group_by(dt) %>% group_vars(), character()) expect_equal(group_by(dt, NULL) %>% group_vars(), character()) expect_equal(group_by(dt, !!!list()) %>% group_vars(), character()) }) test_that("only adds step if necessary", { dt <- lazy_dt(data.table(x = 1, y = 1), "DT") expect_equal(dt %>% group_by(), dt) expect_equal(dt %>% ungroup(), dt) expect_equal(dt %>% ungroup(x), dt) dt_grouped <- dt %>% group_by(x) dt_grouped2 <- dt_grouped %>% group_by(x) expect_equal(dt_grouped, dt_grouped2) expect_equal(dt_grouped %>% ungroup(y), dt_grouped) out <- dt_grouped %>% mutate(y = y - mean(y)) %>% group_by() expect_s3_class(out, "dtplyr_step_group") expect_equal(group_vars(out), character()) }) dtplyr/tests/testthat/test-step-subset-arrange.R0000644000176200001440000000521014300165007021557 0ustar liggesuserstest_that("arrange orders variables", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% arrange(x) %>% show_query(), expr(DT[order(x)]) ) }) test_that("arrange doesn't use, but still preserves, grouping", { dt <- group_by(lazy_dt(data.table(x = 1, y = 2), "DT"), x) step <- arrange(dt, y) expect_equal(step$groups, "x") expect_equal(dt_call(step), expr(DT[order(y)])) step2 <- arrange(dt, y, .by_group = TRUE) expect_equal(dt_call(step2), expr(DT[order(x, y)])) }) test_that("empty arrange returns input unchanged", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_true(identical(arrange(dt), dt)) }) test_that("can use with across", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% arrange(across(x:y)) %>% show_query(), expr(DT[order(x, y)]) ) }) test_that("vars set correctly", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% arrange(x) %>% .$vars, c("x", "y")) }) test_that("desc works with internal quosure", { dt <- lazy_dt(data.table(x = c(4,3,9,7), y = 1:4)) desc_df <- dt %>% arrange(desc(!!quo(x))) %>% collect() expect_equal(desc_df$x, c(9,7,4,3)) }) test_that("desc works .data pronoun", { dt <- lazy_dt(data.table(x = c(4,3,9,7), y = 1:4)) desc_df <- dt %>% arrange(desc(.data$x)) %>% collect() expect_equal(desc_df$x, c(9,7,4,3)) }) 
test_that("only add step if necessary", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3)) expect_equal(dt %>% arrange(), dt) expect_equal(dt %>% arrange(!!!list()), dt) }) test_that("uses setorder when there is already a copy", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") # Works with implicit copy step_implicit <- dt %>% filter(x < 4) %>% arrange(x, y) expect_equal( show_query(step_implicit), expr(setorder(DT[x < 4], x, y, na.last = TRUE)) ) # Works with explicit copy step_explicit <- dt %>% mutate(x = x * 2) %>% arrange(x, -y) expect_equal( show_query(step_explicit), expr(setorder(copy(DT)[, `:=`(x = x * 2)], x, -y, na.last = TRUE)) ) }) test_that("setorder places NAs last", { dt <- lazy_dt(tibble(x = c("b", NA, "a")), "DT") dt$needs_copy <- TRUE # Works with implicit copy res <- dt %>% arrange(x) %>% as.data.table() expect_equal(res$x, c("a", "b", NA)) }) test_that("works with a transmute expression", { dt <- lazy_dt(data.frame(x = 1:3, y = 1:3), "DT") step <- dt %>% arrange(x + 1) expect_equal(show_query(step), expr(DT[order(x + 1)])) # Works with complex expression step <- dt %>% arrange(-(x + y)) expect_equal(show_query(step), expr(DT[order(-(x + y))])) }) dtplyr/tests/testthat/test-replace_na.R0000644000176200001440000000210314021443044017753 0ustar liggesusers# lazy data.tables ----------------------------------------------------------- test_that("empty call does nothing", { tbl <- tibble(x = c(1, NA)) dt <- lazy_dt(tbl, "DT") out <- collect(replace_na(dt)) expect_equal(out, tbl) }) test_that("missing values are replaced", { tbl <- tibble(x = c(1, NA)) dt <- lazy_dt(tbl, "DT") step <- replace_na(dt, list(x = 0)) out <- collect(step) expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = fcoalesce(x, 0))])) expect_equal(out$x, c(1, 0)) }) test_that("don't complain about variables that don't exist", { tbl <- tibble(a = c(1, NA)) dt <- lazy_dt(tbl, "DT") out <- collect(replace_na(dt, list(a = 100, b = 0))) expect_equal(out, tibble(a = c(1, 100))) }) # Inside mutate() ----------------------------------------------------------- test_that("missing values are replaced", { tbl <- tibble(x = c(1, NA)) dt <- lazy_dt(tbl, "DT") step <- mutate(dt, x = replace_na(x, 0)) out <- collect(step) expect_equal(show_query(step), expr(copy(DT)[, `:=`(x = fcoalesce(x, 0))])) expect_equal(out$x, c(1, 0)) }) dtplyr/tests/testthat/test-step-subset-do.R0000644000176200001440000000052414006775461020563 0ustar liggesuserstest_that("basic operation as expected", { dt <- lazy_dt(data.frame(g = c(1, 1, 2), x = 1:3), "DT") expect_equal( dt %>% do(y = ncol(.)) %>% show_query(), expr(DT[, .(y = .(ncol(.SD)))]) ) expect_equal( dt %>% group_by(g) %>% do(y = ncol(.)) %>% show_query(), expr(DT[, .(y = .(ncol(.SD))), keyby = .(g)]) ) }) dtplyr/tests/testthat/test-step-subset-filter.R0000644000176200001440000000573214126601265021445 0ustar liggesuserstest_that("can filter by value", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% filter() %>% show_query(), expr(DT) ) expect_equal( dt %>% filter(x) %>% show_query(), expr(DT[(x)]) ) expect_equal( dt %>% filter(x > 1) %>% show_query(), expr(DT[x > 1]) ) expect_equal( dt %>% filter(x > 1, y > 2) %>% show_query(), expr(DT[x > 1 & y > 2]) ) }) test_that("can filter with logical columns", { dt <- lazy_dt(data.table(x = c(TRUE, FALSE)), "DT") expect_equal( dt %>% filter(x) %>% show_query(), expr(DT[(x)]) ) expect_equal( dt %>% filter(!x) %>% show_query(), expr(DT[(!x)]) ) }) test_that("inlines external variables", { dt <- lazy_dt(data.table(x = 1), 
"DT") l <- c(1, 10) expect_equal( dt %>% filter(x %in% l) %>% show_query(), quote(DT[x %in% !!l]) ) # Except in the global environment # But I can't figure out how to test this - it's not too important # as it only affects the quality of the translation not the correctness }) test_that("can use with across", { dt <- lazy_dt(data.table(x = 1, y = 1, z = 1), "DT") expect_equal( dt %>% filter(across(x:y, ~ . > 0)) %>% show_query(), expr(DT[x > 0 & y > 0]) ) expect_equal( dt %>% filter(if_all(x:y, ~ . > 0)) %>% show_query(), expr(DT[x > 0 & y > 0]) ) expect_equal( dt %>% filter(if_any(x:y, ~ . > 0)) %>% show_query(), expr(DT[x > 0 | y > 0]) ) # .cols defaults to everything() expect_equal( dt %>% filter(if_all(.fns = ~ . > 0)) %>% show_query(), expr(DT[x > 0 & y > 0 & z > 0]) ) expect_equal( dt %>% filter(if_any(.fns = ~ . > 0)) %>% show_query(), expr(DT[x > 0 | y > 0 | z > 0]) ) }) test_that("can filter when grouped", { dt1 <- lazy_dt(data.table(x = c(1, 1, 2, 2), y = c(1, 2, 3, 4)), "DT") dt2 <- dt1 %>% group_by(x) %>% filter(sum(y) == 3) expect_equal( dt2 %>% show_query(), expr(DT[DT[, .I[sum(y) == 3], by = .(x)]$V1]) ) expect_equal(as_tibble(dt2), tibble(x = c(1, 1), y = c(1, 2))) }) test_that("grouped filter doesn't reorder", { dt1 <- lazy_dt(data.frame(x = c(2, 2, 1, 1), y = 1:4), "DT") dt2 <- dt1 %>% group_by(x) %>% filter(TRUE) expect_equal( dt2 %>% show_query(), expr(DT[DT[, .I[TRUE], by = .(x)]$V1]) ) expect_equal(dt2 %>% as_tibble(), as_tibble(dt1)) }) test_that("only adds step if dots are not empty", { dt <- lazy_dt(data.table(x = 1), "DT") expect_equal(dt %>% filter(), dt) expect_equal(dt %>% filter(!!!list()), dt) }) test_that("errors for named input", { dt <- lazy_dt(data.table(x = 1, y = 2), "DT") expect_snapshot(error = TRUE, filter(dt, x = 1)) expect_snapshot(error = TRUE, filter(dt, y > 1, x = 1)) }) test_that("allows named constants that resolve to logical vectors", { dt <- lazy_dt(mtcars, "DT") filters <- mtcars %>% transmute( cyl %in% 6:8, hp / drat > 50 ) expect_equal( filter(dt, !!!filters), filter(dt, !!!unname(filters)) ) }) dtplyr/tests/testthat.R0000644000176200001440000000007012667402221014714 0ustar liggesuserslibrary(testthat) library(dtplyr) test_check("dtplyr") dtplyr/vignettes/0000755000176200001440000000000014406337714013611 5ustar liggesusersdtplyr/vignettes/translation.Rmd0000644000176200001440000001764514406327117016624 0ustar liggesusers--- title: "Translation" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{translation} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ## Introduction This vignette shows the details of how dtplyr translates dplyr expressions into the equivalent [data.table](http://r-datatable.com/) code. If you see places where you think I could generate better data.table code, please [let me know](https://github.com/tidyverse/dtplyr/issues)! This document assumes that you're familiar with the basics of data.table; if you're not, I recommend starting at `vignette("datatable-intro.html")`. ```{r setup, message = FALSE} library(dtplyr) library(data.table) library(dplyr) ``` ## The basics To get started, I'll create a simple lazy table with `lazy_dt()`: ```{r} df <- data.frame(a = 1:5, b = 1:5, c = 1:5, d = 1:5) dt <- lazy_dt(df) ``` The actual data doesn't matter here since we're just looking at the translation. When you print a lazy frame, it tells you that it's a local data table with four rows. 
It also prints the call that dtplyr will evaluate when we execute the lazy table. In this case it's very simple: ```{r} dt ``` If we just want to see the generated code, you can use `show_query()`. I'll use that a lot in this vignette. ```{r} dt %>% show_query() ``` ## Simple verbs Many dplyr verbs have a straightforward translation to either the `i` or `j` component of `[.data.table`. ### `filter()` and `arrange()` `filter()` and `arrange()` become elements of `i`: ```{r} dt %>% arrange(a, b, c) %>% show_query() dt %>% filter(b == c) %>% show_query() dt %>% filter(b == c, c == d) %>% show_query() ``` ### `select()`, `summarise()`, `transmute()` `select()`, `summarise()` and `transmute()` all become elements of `j`: ```{r} dt %>% select(a:b) %>% show_query() dt %>% summarise(a = mean(a)) %>% show_query() dt %>% transmute(a2 = a * 2) %>% show_query() ``` `mutate()` also uses the `j` component with data.table's special `:=` operator: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` Note that dtplyr will not modify the input data by default; it automatically creates a `copy()` when needed (see below for more details). `mutate()` allows you to refer to variables that you just created using an "extended `j`" expression: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ``` `transmute()` works similarly: ```{r} dt %>% transmute(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ``` ## Other calls Other verbs require calls to other functions: ### `rename()` `rename()` uses `setnames()`: ```{r} dt %>% rename(x = a, y = b) %>% show_query() ``` ### `distinct()` `distinct()` uses `unique()`: ```{r} dt %>% distinct() %>% show_query() dt %>% distinct(a, b) %>% show_query() dt %>% distinct(a, b, .keep_all = TRUE) %>% show_query() ``` `distinct()` on a computed column uses an intermediate mutate: ```{r} dt %>% distinct(c = a + b) %>% show_query() dt %>% distinct(c = a + b, .keep_all = TRUE) %>% show_query() ``` ### Joins Most joins use the `[.data.table` equivalent: ```{r} dt2 <- lazy_dt(data.frame(a = 1)) dt %>% inner_join(dt2, by = "a") %>% show_query() dt %>% right_join(dt2, by = "a") %>% show_query() dt %>% left_join(dt2, by = "a") %>% show_query() dt %>% anti_join(dt2, by = "a") %>% show_query() ``` But `full_join()` uses `merge()`: ```{r} dt %>% full_join(dt2, by = "a") %>% show_query() ``` In some cases, extra calls to `data.table::setcolorder()` and `data.table::setnames()` are required to ensure the correct column order and names: ```{r} dt3 <- lazy_dt(data.frame(b = 1, a = 1)) dt %>% left_join(dt3, by = "a") %>% show_query() dt %>% full_join(dt3, by = "b") %>% show_query() ``` Semi-joins are a little more complex: ```{r} dt %>% semi_join(dt2, by = "a") %>% show_query() ``` ### Set operations Set operations use the fast data.table alternatives: ```{r} dt %>% intersect(dt2) %>% show_query() dt %>% setdiff(dt2) %>% show_query() dt %>% union(dt2) %>% show_query() ``` ## Grouping Just like in dplyr, `group_by()` doesn't do anything by itself, but instead modifies the operation of downstream verbs. This generally just involves using the `keyby` argument: ```{r} dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query() ``` You may use `by` instead of `keyby` if you set `arrange = FALSE`: ```{r} dt %>% group_by(a, arrange = FALSE) %>% summarise(b = mean(b)) %>% show_query() ``` Often, there won't be too much of a difference between these, but for larger grouped operations, the overhead of reordering data may become significant. In these situations, using `arrange = FALSE` becomes preferable.
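If you want to see what that difference looks like on your own machine, a rough sketch along the following lines compares the two translations. The table `big` and its dimensions are invented purely for illustration, and the chunk is not evaluated here so the vignette stays fast:

```{r, eval = FALSE}
# A table large enough that sorting by the grouping values is noticeable
big <- lazy_dt(data.table(
  g = sample(1e4, 1e7, replace = TRUE),
  x = runif(1e7)
))

bench::mark(
  keyed   = big %>% group_by(g) %>% summarise(x = mean(x)) %>% as.data.table(),
  unkeyed = big %>% group_by(g, arrange = FALSE) %>% summarise(x = mean(x)) %>% as.data.table(),
  check = FALSE # same rows, but the keyed result is sorted by `g`
)
```

The exact numbers will depend on your hardware and on the number of groups; the `by =` translation simply skips the final sort by `g` that `keyby =` performs.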
The primary exception is grouped `filter()`, which requires the use of `.SD`: ```{r} dt %>% group_by(a) %>% filter(b < mean(b)) %>% show_query() ``` ## Combinations dtplyr tries to generate data.table code as close as possible to what you'd write by hand, as this tends to unlock data.table's tremendous speed. For example, if you `filter()` and then `select()`, dtplyr generates a single `[`: ```{r} dt %>% filter(a == 1) %>% select(-a) %>% show_query() ``` And similarly when combining filtering and summarising: ```{r} dt %>% group_by(a) %>% filter(b < mean(b)) %>% summarise(c = max(c)) %>% show_query() ``` This is particularly nice when joining two tables together because you can select variables after you have joined and data.table will only carry those into the join: ```{r} dt3 <- lazy_dt(data.frame(x = 1, y = 2)) dt4 <- lazy_dt(data.frame(x = 1, a = 2, b = 3, c = 4, d = 5, e = 7)) dt3 %>% left_join(dt4) %>% select(x, a:c) %>% show_query() ``` Note, however, that `select()`ing and then `filter()`ing must generate two separate calls to `[`, because data.table evaluates `i` before `j`. ```{r} dt %>% select(X = a, Y = b) %>% filter(X == 1) %>% show_query() ``` Similarly, a `filter()` and `mutate()` can't be combined because `dt[a == 1, b2 := b * 2]` would modify the selected rows in place: ```{r} dt %>% filter(a == 1) %>% mutate(b2 = b * 2) %>% show_query() ``` ## Copies By default dtplyr avoids mutating the input data, automatically creating a `copy()` if needed: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` Note that dtplyr does its best to avoid needless copies, so it won't explicitly copy if there's already an implicit copy produced by `[`, `head()`, `merge()` or similar: ```{r} dt %>% filter(x == 1) %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` You can choose to opt out of this copy, and take advantage of data.table's reference semantics (see `vignette("datatable-reference-semantics")` for more details). Do this by setting `immutable = FALSE` on construction: ```{r} dt2 <- data.table(a = 1:10) dt_inplace <- lazy_dt(dt2, immutable = FALSE) dt_inplace %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` ## Performance There are two components to the performance of dtplyr: how long it takes to generate the translation, and how well the translation performs. Given my explorations so far, I'm reasonably confident that we're generating high-quality data.table code, so most of the cost should be in the translation itself. The following code briefly explores the performance of a few different translations. A significant amount of work is done by the dplyr verbs, so we benchmark the whole process. ```{r} bench::mark( filter = dt %>% filter(a == b, c == d), mutate = dt %>% mutate(a2 = a * 2, a4 = a2 * 2, a8 = a4 * 2) %>% show_query(), summarise = dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query(), check = FALSE )[1:6] ``` These translations all take less than a millisecond, suggesting that the performance overhead of dtplyr should be negligible for realistic data sizes. Note that dtplyr run-time scales with the complexity of the pipeline, not the size of the data, so these timings should apply regardless of the size of the underlying data[^copy]. [^copy]: Unless a copy is performed. dtplyr/R/0000755000176200001440000000000014406335651012000 5ustar liggesusersdtplyr/R/unite.R0000644000176200001440000000261314372711230013242 0ustar liggesusers#' Unite multiple columns into one by pasting strings together.
#' #' @description #' This is a method for the tidyr `unite()` generic. #' #' @inheritParams tidyr::unite #' @examples #' library(tidyr) #' #' df <- lazy_dt(expand_grid(x = c("a", NA), y = c("b", NA))) #' df #' #' df %>% unite("z", x:y, remove = FALSE) #' #' # Separate is almost the complement of unite #' df %>% #' unite("xy", x:y) %>% #' separate(xy, c("x", "y")) #' # (but note `x` and `y` contain now "NA" not NA) # exported onLoad unite.dtplyr_step <- function(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE) { if (is_true(na.rm)) { abort("`na.rm` is not implemented in dtplyr") } .col <- as_name(enquo(col)) dots <- enquos(...) if (length(dots) == 0) { .cols <- data$vars locs <- seq_along(.cols) } else { locs <- tidyselect::eval_select(expr(c(!!!dots)), data, allow_rename = FALSE) .cols <- data$vars[locs] } out <- mutate(ungroup(data), !!.col := paste(!!!syms(.cols), sep = sep)) remove <- is_true(remove) if (remove) { .drop_cols <- setdiff(.cols, .col) out <- select(out, -tidyselect::all_of(.drop_cols)) } group_vars <- data$groups if (remove && any(.cols %in% group_vars)) { group_vars <- setdiff(group_vars, .cols) } out <- relocate(out, !!.col, .before = min(locs)) if (length(group_vars) > 0) { out <- group_by(out, !!!syms(group_vars)) } out } dtplyr/R/count.R0000644000176200001440000000425414406335651013260 0ustar liggesusers#' Count observations by group #' #' This is a method for the dplyr [count()] generic. It is translated using #' `.N` in the `j` argument, and supplying groups to `keyby` as appropriate. #' #' @param x A [lazy_dt()] #' @inheritParams dplyr::count #' @importFrom dplyr count #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(dplyr::starwars) #' dt %>% count(species) #' dt %>% count(species, sort = TRUE) #' dt %>% count(species, wt = mass, sort = TRUE) count.dtplyr_step <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) { if (!missing(...)) { out <- group_by(x, ..., .add = TRUE) .groups <- "drop" } else { out <- x .groups <- "keep" } out <- tally_count(out, {{ wt }}, sort, name, .groups) out } #' @importFrom dplyr add_count #' @export add_count.dtplyr_step <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) { if (!missing(...)) { out <- group_by(x, ..., .add = TRUE) } else { out <- x } out <- dplyr::add_tally(out, wt = !!enquo(wt), sort = sort, name = name) out <- group_by(out, !!!syms(group_vars(x))) out } #' @importFrom dplyr tally #' @export tally.dtplyr_step <- function(x, wt = NULL, sort = FALSE, name = NULL) { tally_count(x, {{ wt }}, sort, name, "drop_last") } # Helpers ----------------------------------------------------------------- tally_count <- function(.data, wt = NULL, sort = FALSE, name = NULL, .groups = "drop_last") { wt <- enquo(wt) if (quo_is_null(wt)) { n <- expr(n()) } else { n <- expr(sum(!!wt, na.rm = TRUE)) } name <- check_name(name, .data$groups) out <- summarise(.data, !!name := !!n, .groups = .groups) if (sort) { out <- arrange(out, desc(!!sym(name))) } out } check_name <- function(name, vars) { if (is.null(name)) { name <- n_name(vars) if (name != "n") { inform(c( glue::glue("Storing counts in `{name}`, as `n` already present in input"), i = "Use `name = \"new_name\"` to pick a new name." 
)) } } else if (!is_string(name)) { abort("`name` must be a string") } name } n_name <- function(x) { name <- "n" while (name %in% x) { name <- paste0("n", name) } name } dtplyr/R/step-setnames.R0000644000176200001440000000151314126601265014707 0ustar liggesusersstep_setnames <- function(x, old, new, in_place, rename_groups = FALSE) { stopifnot(is_step(x)) stopifnot(is.character(old) || is.integer(old)) stopifnot(is.character(new)) stopifnot(length(old) == length(new)) stopifnot(is_bool(in_place)) stopifnot(is_bool(rename_groups)) if (is.integer(old)) { locs <- old } else { locs <- vctrs::vec_match(old, x$vars) } name_changed <- x$vars[locs] != new old <- old[name_changed] new <- new[name_changed] locs <- locs[name_changed] if (length(old) == 0) { return(x) } new_vars <- x$vars new_vars[locs] <- new out <- step_call(x, "setnames", args = list(old, new), vars = new_vars, in_place = in_place ) if (rename_groups) { groups <- rename_groups(x$groups, set_names(old, new)) out <- step_group(out, groups) } out } dtplyr/R/step-mutate.R0000644000176200001440000001476214375142710014402 0ustar liggesusersstep_mutate <- function(parent, new_vars = list(), use_braces = FALSE, by = new_by()) { vars <- union(parent$vars, names(new_vars)) var_is_null <- map_lgl(new_vars, is_null) is_last <- !duplicated(names(new_vars), fromLast = TRUE) vars <- setdiff(vars, names(new_vars)[var_is_null & is_last]) if (by$uses_by) { parent$groups <- by$names } out <- new_step( parent, vars = vars, groups = parent$groups, arrange = parent$arrange, needs_copy = parent$needs_copy || !parent$implicit_copy, new_vars = new_vars, use_braces = use_braces, class = "dtplyr_step_mutate" ) if (by$uses_by) { out <- ungroup(out) } out } dt_call.dtplyr_step_mutate <- function(x, needs_copy = x$needs_copy) { # i is always empty because we never mutate a subset if (is_empty(x$new_vars)) { j <- quote(.SD) } else if (!x$use_braces) { j <- call2(":=", !!!x$new_vars) } else { mutate_list <- mutate_with_braces(x$new_vars) j <- call2(":=", call2("c", !!!mutate_list$new_vars), mutate_list$expr) } out <- call2("[", dt_call(x$parent, needs_copy), , j) add_grouping_param(out, x, arrange = FALSE) } mutate_with_braces <- function(mutate_vars) { assign <- map2(syms(names(mutate_vars)), mutate_vars, function(x, y) call2("<-", x, y)) new_vars <- unique(names(mutate_vars)) output <- call2(".", !!!syms(new_vars)) list( expr = call2("{", !!!assign, output), new_vars = new_vars ) } # dplyr methods ----------------------------------------------------------- #' Create and modify columns #' #' This is a method for the dplyr [mutate()] generic. It is translated to #' the `j` argument of `[.data.table`, using `:=` to modify "in place". If #' `.before` or `.after` is provided, the new columns are relocated with a call #' to [data.table::setcolorder()]. #' #' @param .data A [lazy_dt()]. #' @inheritParams dplyr::mutate #' @param .keep #' Control which columns from `.data` are retained in the output. Grouping #' columns and columns created by `...` are always kept. #' #' * `"all"` retains all columns from `.data`. This is the default. #' * `"used"` retains only the columns used in `...` to create new #' columns. This is useful for checking your work, as it displays inputs #' and outputs side-by-side. #' * `"unused"` retains only the columns _not_ used in `...` to create new #' columns. This is useful if you generate new columns, but no longer need #' the columns used to generate them. #' * `"none"` doesn't retain any extra columns from `.data`. 
Only the grouping #' variables and columns created by `...` are kept. #' #' Note: With dtplyr `.keep` will only work with column names passed as symbols, and won't #' work with other workflows (e.g. `eval(parse(text = "x + 1"))`) #' @importFrom dplyr mutate #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(data.frame(x = 1:5, y = 5:1)) #' dt %>% #' mutate(a = (x + y) / 2, b = sqrt(x^2 + y^2)) #' #' # It uses a more sophisticated translation when newly created variables #' # are used in the same expression #' dt %>% #' mutate(x1 = x + 1, x2 = x1 + 1) mutate.dtplyr_step <- function(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL) { by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") all_dots <- capture_new_vars(.data, ..., .by = by) trivial_dot <- imap(all_dots, ~ is_symbol(.x) && sym(.y) == .x && .y %in% .data$vars) dots <- all_dots[!as.vector(trivial_dot, "logical")] dots_list <- process_new_vars(.data, dots) dots <- dots_list$dots if (is_null(dots) || is_empty(dots)) { out <- .data } else { out <- step_mutate(.data, dots, dots_list$use_braces, by) .before <- enquo(.before) .after <- enquo(.after) if (!quo_is_null(.before) || !quo_is_null(.after)) { # Only change the order of new columns new <- setdiff(names(dots), .data$vars) out <- relocate(out, !!new, .before = !!.before, .after = !!.after) } if (dots_list$need_removal_step) { out <- select(out, -tidyselect::all_of(dots_list$vars_removed)) } } .keep <- arg_match(.keep) if (.keep != "all") { cols_retain <- keep_vars(.data, out, all_dots, .keep) out <- select(out, tidyselect::all_of(cols_retain)) } out } nested_vars <- function(.data, dots, all_vars) { new_vars <- character() all_new_vars <- unique(names(dots)) init <- 0L for (i in seq_along(dots)) { cur_var <- names(dots)[[i]] used_vars <- all_names(get_expr(dots[[i]])) if (any(used_vars %in% new_vars)) { return(TRUE) } else { new_vars <- c(new_vars, cur_var) } } FALSE } # Helpers ----------------------------------------------------------------- all_names <- function(x) { if (is.name(x)) return(as.character(x)) if (!is.call(x)) return(NULL) unique(unlist(lapply(x[-1], all_names), use.names = FALSE)) } process_new_vars <- function(.data, dots) { # identify where var = NULL is being used to remove a variable var_is_null <- map_lgl(dots, is.null) is_last <- !duplicated(names(dots), fromLast = TRUE) var_removals <- var_is_null & is_last vars_removed <- names(var_removals)[var_removals] nested <- nested_vars(.data, dots, .data$vars) repeated <- anyDuplicated(names(dots)) use_braces <- nested | repeated grouped <- !is_empty(group_vars(.data)) need_removal_step <- any(var_removals) && (use_braces | grouped) if (need_removal_step) { dots <- dots[!var_removals] } list( dots = dots, use_braces = use_braces, need_removal_step = need_removal_step, vars_removed = vars_removed ) } keep_vars <- function(.data, out, dots, .keep) { used <- unique(unlist(map(dots, all_names))) %||% character() used <- set_names(out$vars %in% used, out$vars) cols_data <- .data$vars cols_group <- .data$groups cols_expr <- names(dots) cols_expr_modified <- intersect(cols_expr, cols_data) cols_expr_new <- setdiff(cols_expr, cols_expr_modified) cols_used <- setdiff(cols_data, c(cols_group, cols_expr_modified, names(used)[!used])) cols_unused <- setdiff(cols_data, c(cols_group, cols_expr_modified, names(used)[used])) cols_out <- out$vars if (.keep == "used") { cols_retain <- setdiff(cols_out, cols_unused) } else if (.keep == 
"unused") { cols_retain <- setdiff(cols_out, cols_used) } else if (.keep == "none") { cols_retain <- setdiff(cols_out, c(cols_used, cols_unused)) } cols_retain } dtplyr/R/utils.R0000644000176200001440000000137614126601265013266 0ustar liggesuserscat_line <- function(...) cat(paste(..., "\n", collapse = "", sep = "")) # nocov start - compat-purrr.R imap <- function(.x, .f, ...) { map2(.x, names(.x) %||% seq_along(.x), .f, ...) } map2 <- function(.x, .y, .f, ...) { .f <- as_function(.f, env = global_env()) out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) if (length(out) == length(.x)) { set_names(out, names(.x)) } else { set_names(out, NULL) } } # nocov end # nocov start - compat-tidyr.R strip_names <- function(df, base, names_sep) { base <- paste0(base, names_sep) names <- names(df) has_prefix <- startsWith(names, base) names[has_prefix] <- substr(names[has_prefix], nchar(base) + 1, nchar(names[has_prefix])) set_names(df, names) } # nocov end dtplyr/R/zzz.R0000644000176200001440000000277614300165007012761 0ustar liggesusers# nocov start .onLoad <- function(...) { register_s3_method("dplyr", "filter", "dtplyr_step") register_s3_method("dplyr", "intersect", "dtplyr_step") register_s3_method("dplyr", "setdiff", "dtplyr_step") register_s3_method("dplyr", "union", "dtplyr_step") register_s3_method("tidyr", "complete", "dtplyr_step") register_s3_method("tidyr", "drop_na", "dtplyr_step") register_s3_method("tidyr", "expand", "dtplyr_step") register_s3_method("tidyr", "fill", "dtplyr_step") register_s3_method("tidyr", "pivot_longer", "dtplyr_step") register_s3_method("tidyr", "pivot_wider", "dtplyr_step") register_s3_method("tidyr", "replace_na", "dtplyr_step") register_s3_method("tidyr", "nest", "dtplyr_step") register_s3_method("tidyr", "separate", "dtplyr_step") register_s3_method("tidyr", "unite", "dtplyr_step") } register_s3_method <- function(pkg, generic, class, fun = NULL) { stopifnot(is.character(pkg), length(pkg) == 1) stopifnot(is.character(generic), length(generic) == 1) stopifnot(is.character(class), length(class) == 1) if (is.null(fun)) { fun <- get(paste0(generic, ".", class), envir = parent.frame()) } else { stopifnot(is.function(fun)) } if (pkg %in% loadedNamespaces()) { registerS3method(generic, class, fun, envir = asNamespace(pkg)) } # Always register hook in case package is later unloaded & reloaded setHook( packageEvent(pkg, "onLoad"), function(...) 
{ registerS3method(generic, class, fun, envir = asNamespace(pkg)) } ) } # nocov end dtplyr/R/dtplyr-package.R0000644000176200001440000000050214375676066015044 0ustar liggesusers#' @import rlang #' @importFrom data.table data.table as.data.table is.data.table #' @importFrom lifecycle deprecated #' @importFrom glue glue #' @keywords internal "_PACKAGE" #' dtplyr is data.table aware #' #' @keywords internal #' @export .datatable.aware <- TRUE globalVariables(c(".SD", ".N", ".BY", ".I", "desc")) dtplyr/R/step-set.R0000644000176200001440000000404014300165007013652 0ustar liggesusersstep_set <- function(x, y, style) { stopifnot(is_step(x)) stopifnot(is_step(y)) stopifnot(is.character(style)) new_step( parent = x, parent2 = y, locals = utils::modifyList(x$locals, y$locals), style = style, class = "dtplyr_step_set", ) } #' @export dt_sources.dtplyr_step_set <- function(x) { dt_sources.dtplyr_step_join(x) } #' @export dt_call.dtplyr_step_set <- function(x, needs_copy = x$needs_copy) { lhs <- dt_call(x$parent, needs_copy) rhs <- dt_call(x$parent2) call <- switch(x$style, intersect = call2("fintersect", lhs, rhs), union = call2("funion", lhs, rhs), union_all = call2("funion", lhs, rhs, all = TRUE), setdiff = call2("fsetdiff", lhs, rhs), ) call } # dplyr verbs ------------------------------------------------------------- #' Set operations #' #' These are methods for the dplyr generics [intersect()], [union()], #' [union_all()], and [setdiff()]. They are translated to #' [data.table::fintersect()], [data.table::funion()], and #' [data.table::fsetdiff()]. #' #' @importFrom dplyr intersect #' @param x,y A pair of [lazy_dt()]s. #' @param ... Ignored #' @examples #' dt1 <- lazy_dt(data.frame(x = 1:4)) #' dt2 <- lazy_dt(data.frame(x = c(2, 4, 6))) #' #' intersect(dt1, dt2) #' union(dt1, dt2) #' setdiff(dt1, dt2) #' # Exported onload intersect.dtplyr_step <- function(x, y, ...) { if (!is_step(y)) { y <- lazy_dt(y) } step_set(x, y, style = "intersect") } #' @importFrom dplyr union #' @rdname intersect.dtplyr_step # Exported onload union.dtplyr_step <- function(x, y, ...) { if (!is_step(y)) { y <- lazy_dt(y) } step_set(x, y, style = "union") } #' @importFrom dplyr union_all #' @rdname intersect.dtplyr_step #' @export union_all.dtplyr_step <- function(x, y, ...) { if (!is_step(y)) { y <- lazy_dt(y) } step_set(x, y, style = "union_all") } #' @importFrom dplyr setdiff #' @rdname intersect.dtplyr_step # Exported onload setdiff.dtplyr_step <- function(x, y, ...) { if (!is_step(y)) { y <- lazy_dt(y) } step_set(x, y, style = "setdiff") } dtplyr/R/step-subset-filter.R0000644000176200001440000000343314375142710015664 0ustar liggesusers #' Subset rows using column values #' #' This is a method for the dplyr [arrange()] generic. It is translated to #' the `i` argument of `[.data.table` #' #' @param .data A [lazy_dt()]. #' @param .preserve Ignored #' @inheritParams dplyr::filter #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(mtcars) #' dt %>% filter(cyl == 4) #' dt %>% filter(vs, am) #' #' dt %>% #' group_by(cyl) %>% #' filter(mpg > mean(mpg)) #' @importFrom dplyr filter # exported onLoad filter.dtplyr_step <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_filter(...) 
by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") dots <- capture_dots(.data, ..., .j = FALSE, .by = by) if (filter_by_lgl_col(dots)) { # Suppress data.table warning when filtering with a logical variable i <- call2("(", dots[[1]]) } else { i <- Reduce(function(x, y) call2("&", x, y), dots) } step_subset_i(.data, i, by) } filter_by_lgl_col <- function(dots) { if (length(dots) > 1) { return(FALSE) } dot <- dots[[1]] if (is_symbol(dot)) { return(TRUE) } # catch expressions of form `!x` is_call(dot, name = "!", n = 1) && is_symbol(dot[[2]]) } check_filter <- function(...) { dots <- enquos(...) named <- have_name(dots) for (i in which(named)) { quo <- dots[[i]] # only allow named logical vectors, anything else # is suspicious expr <- quo_get_expr(quo) if (!is.logical(expr)) { abort(c( glue::glue("Problem with `filter()` input `..{i}`."), x = glue::glue("Input `..{i}` is named."), i = glue::glue("This usually means that you've used `=` instead of `==`."), i = glue::glue("Did you mean `{name} == {as_label(expr)}`?", name = names(dots)[i]) ), call = caller_env()) } } } dtplyr/R/step-subset-expand.R0000644000176200001440000000666714300165007015662 0ustar liggesusers#' Expand data frame to include all possible combinations of values. #' #' @description #' This is a method for the tidyr `expand()` generic. It is translated to #' [data.table::CJ()]. #' #' @param ... Specification of columns to expand. Columns can be atomic vectors #' or lists. #' #' * To find all unique combinations of `x`, `y` and `z`, including those not #' present in the data, supply each variable as a separate argument: #' `expand(df, x, y, z)`. #' * To find only the combinations that occur in the #' data, use `nesting`: `expand(df, nesting(x, y, z))`. #' * You can combine the two forms. For example, #' `expand(df, nesting(school_id, student_id), date)` would produce #' a row for each present school-student combination for all possible #' dates. #' #' Unlike the data.frame method, this method does not use the full set of #' levels, just those that appear in the data. #' #' When used with continuous variables, you may need to fill in values #' that do not appear in the data: to do so use expressions like #' `year = 2010:2020` or `year = full_seq(year,1)`. #' @param data A [lazy_dt()]. #' @inheritParams tidyr::expand #' @examples #' library(tidyr) #' #' fruits <- lazy_dt(tibble( #' type = c("apple", "orange", "apple", "orange", "orange", "orange"), #' year = c(2010, 2010, 2012, 2010, 2010, 2012), #' size = factor( #' c("XS", "S", "M", "S", "S", "M"), #' levels = c("XS", "S", "M", "L") #' ), #' weights = rnorm(6, as.numeric(size) + 2) #' )) #' #' # All possible combinations --------------------------------------- #' # Note that only present levels of the factor variable `size` are retained. 
#' fruits %>% expand(type) #' fruits %>% expand(type, size) #' #' # This is different from the data frame behaviour: #' fruits %>% dplyr::collect() %>% expand(type, size) #' #' # Other uses ------------------------------------------------------- #' fruits %>% expand(type, size, 2010:2012) #' #' # Use `anti_join()` to determine which observations are missing #' all <- fruits %>% expand(type, size, year) #' all #' all %>% dplyr::anti_join(fruits) #' #' # Use with `right_join()` to fill in missing rows #' fruits %>% dplyr::right_join(all) # exported onLoad expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { dots <- capture_dots(data, ..., .j = FALSE) dots <- dots[!map_lgl(dots, is_null)] if (length(dots) == 0) { return(data) } named_dots <- have_name(dots) if (any(!named_dots)) { # Auto-names generated by enquos() don't always work with the CJ() step ## Ex: `1:3` # Replicates the "V" naming convention data.table uses symbol_dots <- map_lgl(dots, is_symbol) needs_v_name <- !symbol_dots & !named_dots v_names <- paste0("V", 1:length(dots)) names(dots)[needs_v_name] <- v_names[needs_v_name] names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name) } names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair) dots_names <- names(dots) out <- step_subset_j( data, vars = union(data$groups, dots_names), j = expr(CJ(!!!dots, unique = TRUE)) ) # Delete duplicate columns if group vars are expanded if (any(dots_names %in% out$groups)) { group_vars <- out$groups expanded_group_vars <- dots_names[dots_names %in% group_vars] out <- step_subset( out, groups = character(), j = expr(!!expanded_group_vars := NULL) ) out <- group_by(out, !!!syms(group_vars)) } out } dtplyr/R/step-subset-select.R0000644000176200001440000000562214372711230015654 0ustar liggesusers #' Subset columns using their names #' #' This is a method for the dplyr [select()] generic. It is translated to #' the `j` argument of `[.data.table`. #' #' @param .data A [lazy_dt()]. #' @inheritParams dplyr::select #' @importFrom dplyr select #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(data.frame(x1 = 1, x2 = 2, y1 = 3, y2 = 4)) #' #' dt %>% select(starts_with("x")) #' dt %>% select(ends_with("2")) #' dt %>% select(z1 = x1, z2 = x2) select.dtplyr_step <- function(.data, ...) { locs <- tidyselect::eval_select(expr(c(...)), .data) locs <- ensure_group_vars(locs, .data$vars, .data$groups) vars <- set_names(.data$vars[locs], names(locs)) if (length(vars) == 0) { j <- 0L groups <- .data$groups is_unnamed <- TRUE } else { groups <- rename_groups(.data$groups, vars) vars <- simplify_names(vars) is_unnamed <- all(!have_name(vars)) if (is_unnamed && identical(unname(vars), .data$vars)) { return(.data) } j <- call2(".", !!!syms(vars)) } if (is_copied(.data) && is_unnamed && !can_merge_subset(.data)) { # Drop columns by reference if: # * Data has been copied (implicitly or explicitly) # * There is no renaming in the select statement # * The selection can't be combined with a prior `i` step. 
Ex: dt[x < 7, .(x, y)] vars_drop <- setdiff(.data$vars, vars) out <- remove_vars(.data, vars_drop) out <- step_colorder(out, vars) } else { out <- step_subset_j(.data, vars = names(locs), groups = character(), j = j) } step_group(out, groups) } #' @importFrom tidyselect tidyselect_data_proxy #' @exportS3Method tidyselect_data_proxy.dtplyr_step <- function(x) { simulate_vars(x) } #' @importFrom tidyselect tidyselect_data_has_predicates #' @exportS3Method tidyselect_data_has_predicates.dtplyr_step <- function(x) { FALSE } simulate_vars <- function(x, drop_groups = FALSE) { if (drop_groups) { vars <- setdiff(x$vars, x$groups) } else { vars <- x$vars } as_tibble(rep_named(vars, list(logical())), .name_repair = "minimal") } ensure_group_vars <- function(loc, names, groups) { group_loc <- match(groups, names) missing <- setdiff(group_loc, loc) if (length(missing) > 0) { vars <- names[missing] inform(paste0( "Adding missing grouping variables: ", paste0("`", names[missing], "`", collapse = ", ") )) loc <- c(set_names(missing, vars), loc) } loc } rename_groups <- function(groups, vars) { old2new <- set_names(names(vars), vars) groups[groups %in% names(old2new)] <- old2new[groups] groups } simplify_names <- function(vars) { names(vars)[vars == names(vars)] <- "" vars } remove_vars <- function(.data, vars) { if (is_empty(vars)) { return(.data) } out <- step_subset( .data, groups = character(), j = expr(!!unique(vars) := NULL), vars = setdiff(.data$vars, vars) ) group_by(out, !!!syms(.data$groups)) } dtplyr/R/step-call.R0000644000176200001440000001300114372711230013773 0ustar liggesusersstep_call <- function(parent, fun, args = list(), vars = parent$vars, in_place = FALSE) { stopifnot(is_step(parent)) stopifnot(is.character(fun)) stopifnot(is.list(args)) new_step( parent = parent, vars = vars, groups = parent$groups, implicit_copy = !in_place, needs_copy = in_place || parent$needs_copy, fun = fun, args = args, class = "dtplyr_step_call" ) } dt_call.dtplyr_step_call <- function(x, needs_copy = x$needs_copy) { call2(x$fun, dt_call(x$parent, needs_copy), !!!x$args) } # dplyr verbs ------------------------------------------------------------- #' Subset first or last rows #' #' These are methods for the base generics [head()] and [tail()]. They #' are not translated. #' #' @param x A [lazy_dt()] #' @param n Number of rows to select. Can use a negative number to instead #' drop rows from the other end. #' @param ... Passed on to [head()]/[tail()]. #' @importFrom utils head #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' dt <- lazy_dt(data.frame(x = 1:10)) #' #' # first three rows #' head(dt, 3) #' # last three rows #' tail(dt, 3) #' #' # drop first three rows #' tail(dt, -3) head.dtplyr_step <- function(x, n = 6L, ...) { step_call(x, "head", args = list(n = n)) } #' @importFrom utils tail #' @export #' @rdname head.dtplyr_step tail.dtplyr_step <- function(x, n = 6L, ...) { step_call(x, "tail", args = list(n = n)) } #' Rename columns using their names #' #' These are methods for the dplyr generics [rename()] and [rename_with()]. #' They are both translated to [data.table::setnames()]. #' #' @param .data A [lazy_dt()] #' @inheritParams dplyr::rename #' @importFrom dplyr rename #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) #' dt %>% rename(new_x = x, new_y = y) #' dt %>% rename_with(toupper) rename.dtplyr_step <- function(.data, ...) 
{ locs <- tidyselect::eval_rename(expr(c(...)), .data) step_setnames(.data, .data$vars[locs], names(locs), in_place = TRUE, rename_groups = TRUE) } #' @importFrom dplyr rename_with #' @importFrom tidyselect everything #' @rdname rename.dtplyr_step #' @export rename_with.dtplyr_step <- function(.data, .fn, .cols = everything(), ...) { if (!missing(...)) { abort("`dtplyr::rename_with() doesn't support ...") } fn_expr <- enexpr(.fn) if (is_symbol(fn_expr)) { fn <- fn_expr } else if (is_string(fn_expr)) { fn <- sym(fn_expr) } else if (is_call(fn_expr, "~")) { env <- caller_env() call <- dt_squash_formula( fn_expr, env = env, data = .data, j = FALSE, replace = quote(x) ) fn <- new_function(exprs(x =), call, env) } else { abort("`.fn` must be a function name or formula") } # Still have to compute the new variable names for the table metadata # But this should be fast, so doing it twice shouldn't matter .fn <- as_function(.fn) locs <- unname(tidyselect::eval_select(enquo(.cols), .data)) old_vars <- .data$vars[locs] new_vars <- .fn(old_vars) vars <- .data$vars vars[locs] <- new_vars if (identical(locs, seq_along(.data$vars))) { out <- step_call(.data, "setnames", args = list(fn), vars = vars, in_place = TRUE ) } else { out <- step_call(.data, "setnames", args = list(old_vars, fn), vars = vars, in_place = TRUE ) } groups <- rename_groups(.data$groups, set_names(new_vars, old_vars)) step_group(out, groups) } #' Subset distinct/unique rows #' #' This is a method for the dplyr [distinct()] generic. It is translated to #' [data.table::unique.data.table()]. #' #' @importFrom dplyr distinct #' @param .data A [lazy_dt()] #' @inheritParams dplyr::distinct #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' df <- lazy_dt(data.frame( #' x = sample(10, 100, replace = TRUE), #' y = sample(10, 100, replace = TRUE) #' )) #' #' df %>% distinct(x) #' df %>% distinct(x, y) #' df %>% distinct(x, .keep_all = TRUE) distinct.dtplyr_step <- function(.data, ..., .keep_all = FALSE) { dots <- capture_dots(.data, ...) if (length(dots) > 0) { only_syms <- all(map_lgl(dots, is_symbol)) if (.keep_all) { if (only_syms) { by <- union(.data$groups, names(dots)) } else { .data <- mutate(.data, !!!dots) by <- names(.data$new_vars) } } else { if (only_syms) { .data <- select(.data, !!!dots) } else { .data <- transmute(.data, !!!dots) } by <- NULL } } else { by <- NULL } args <- list() args$by <- by step_call(.data, "unique", args = args) } #' @export unique.dtplyr_step <- function(x, incomparables = FALSE, ...) { if (!missing(incomparables)) { abort("`incomparables` not supported by `unique.dtplyr_step()`") } distinct(x) } # tidyr verbs ------------------------------------------------------------- #' Drop rows containing missing values #' #' @description #' This is a method for the tidyr `drop_na()` generic. It is translated to #' `data.table::na.omit()` #' #' @param data A [lazy_dt()]. #' @inheritParams tidyr::drop_na #' @examples #' library(dplyr) #' library(tidyr) #' #' dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) #' dt %>% drop_na() #' dt %>% drop_na(x) #' #' vars <- "y" #' dt %>% drop_na(x, any_of(vars)) # exported onLoad drop_na.dtplyr_step <- function(data, ...) 
{ locs <- names(tidyselect::eval_select(expr(c(...)), data)) args <- list() if (length(locs) > 0) { args$cols <- locs } step_call(data, "na.omit", args = args) } dtplyr/R/step-colorder-relocate.R0000644000176200001440000000145314372711230016475 0ustar liggesusers#' Relocate variables using their names #' #' This is a method for the dplyr [relocate()] generic. It is translated to #' the `j` argument of `[.data.table`. #' #' @param .data A [lazy_dt()]. #' @inheritParams dplyr::relocate #' @importFrom dplyr relocate #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) #' #' dt %>% relocate(z) #' dt %>% relocate(y, .before = x) #' dt %>% relocate(y, .after = y) relocate.dtplyr_step <- function(.data, ..., .before = NULL, .after = NULL) { new_vars <- names(tidyselect::eval_relocate( expr(c(...)), .data, before = enquo(.before), after = enquo(.after), before_arg = ".before", after_arg = ".after" )) out <- step_colorder(.data, new_vars) step_group(out, .data$groups) } dtplyr/R/by.R0000644000176200001440000000275414372711230012536 0ustar liggesuserscompute_by <- function(by, data, ..., by_arg = "by", data_arg = "data", error_call = caller_env()) { check_dots_empty0(...) by <- enquo(by) check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call) names <- eval_select_by(by, data, error_call = error_call) if (length(names) == 0) { uses_by <- FALSE } else { uses_by <- TRUE } new_by(uses_by = uses_by, names = names) } is_grouped_dt <- function(data) { !is_empty(group_vars(data)) } check_by <- function(by, data, ..., by_arg = "by", data_arg = "data", error_call = caller_env()) { check_dots_empty0(...) if (quo_is_null(by)) { return(invisible(NULL)) } if (is_grouped_dt(data)) { message <- paste0( "Can't supply {.arg {by_arg}} when ", "{.arg {data_arg}} is a grouped data frame." ) cli::cli_abort(message, call = error_call) } invisible(NULL) } eval_select_by <- function(by, data, error_call = caller_env()) { out <- tidyselect::eval_select( expr = by, data = data, allow_rename = FALSE, error_call = error_call ) names(out) } new_by <- function(uses_by = FALSE, names = character()) { structure(list(uses_by = uses_by, names = names), class = "dtplyr_by") } dtplyr/R/step-call-pivot_wider.R0000644000176200001440000001232614372711230016335 0ustar liggesusers#' Pivot data from long to wide #' #' @description #' This is a method for the tidyr `pivot_wider()` generic. It is translated to #' [data.table::dcast()] #' #' @param data A [lazy_dt()]. #' @inheritParams tidyr::pivot_wider #' @param values_fn A function, the default is `length()`. Note this is different #' behavior than `tidyr::pivot_wider()`, which returns a list column by default. 
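#'   For example, `values_fn = mean` is passed on to `dcast()` as
#'   `fun.aggregate = mean` (see the final example below).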
#' @examples #' library(tidyr) #' #' fish_encounters_dt <- lazy_dt(fish_encounters) #' fish_encounters_dt #' fish_encounters_dt %>% #' pivot_wider(names_from = station, values_from = seen) #' # Fill in missing values #' fish_encounters_dt %>% #' pivot_wider(names_from = station, values_from = seen, values_fill = 0) #' #' # Generate column names from multiple variables #' us_rent_income_dt <- lazy_dt(us_rent_income) #' us_rent_income_dt #' us_rent_income_dt %>% #' pivot_wider(names_from = variable, values_from = c(estimate, moe)) #' #' # When there are multiple `names_from` or `values_from`, you can use #' # use `names_sep` or `names_glue` to control the output variable names #' us_rent_income_dt %>% #' pivot_wider( #' names_from = variable, #' names_sep = ".", #' values_from = c(estimate, moe) #' ) #' #' # Can perform aggregation with values_fn #' warpbreaks_dt <- lazy_dt(as_tibble(warpbreaks[c("wool", "tension", "breaks")])) #' warpbreaks_dt #' warpbreaks_dt %>% #' pivot_wider( #' names_from = wool, #' values_from = breaks, #' values_fn = mean #' ) # exported onLoad pivot_wider.dtplyr_step <- function(data, id_cols = NULL, names_from = name, names_prefix = "", names_sep = "_", names_glue = NULL, names_sort = FALSE, names_repair = "check_unique", values_from = value, values_fill = NULL, values_fn = NULL, ...) { names_from <- names(tidyselect::eval_select(enquo(names_from), data)) values_from <- names(tidyselect::eval_select(enquo(values_from), data)) id_cols <- enquo(id_cols) if (quo_is_null(id_cols)) { id_cols <- setdiff(data$vars, c(names_from, values_from)) } else { id_cols <- names(tidyselect::eval_select(id_cols, data)) } if (length(names_from) > 1) { new_vars <- mutate(shallow_dt(data), .names_from = paste(!!!syms(names_from), sep = names_sep)) new_vars <- unique(pull(new_vars, .names_from)) } else { new_vars <- unique(pull(data, !!sym(names_from))) new_vars <- as.character(new_vars) } new_vars <- vctrs::vec_assign(new_vars, is.na(new_vars), "NA") if (!is.null(names_glue)) { glue_df <- as.data.table(distinct(ungroup(data), !!!syms(names_from))) glue_df <- vctrs::vec_rep(glue_df, length(values_from)) glue_df$.value <- vctrs::vec_rep_each(values_from, length(new_vars)) glue_vars <- as.character(glue::glue_data(glue_df, names_glue)) } if (length(values_from) > 1) { new_vars <- lapply(values_from, function(.x) paste(.x, new_vars, sep = names_sep)) new_vars <- unlist(new_vars) } no_id <- length(id_cols) == 0 if (no_id) { lhs <- "..." # using symbol causes dcast() to fail new_vars <- c(".", new_vars) } else { lhs <- call_reduce(syms(id_cols), "+") } rhs <- call_reduce(syms(names_from), "+") vars <- c(id_cols, new_vars) args <- list( formula = call2("~", lhs, rhs), value.var = values_from, fun.aggregate = values_fn, sep = names_sep, fill = values_fill ) # Clean up call args if defaults are used args <- args[!map_lgl(args, is.null)] if (names_sep == "_") { args$sep <- NULL } out <- step_call(data, "dcast", args = args, vars = vars) if (no_id && names_sort) { new_vars <- new_vars[new_vars != "."] cols_sorted <- sort(new_vars) out <- select(out, !!!syms(cols_sorted)) } else if (no_id) { new_vars <- new_vars[new_vars != "."] out <- select(out, -.) 
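    # (the `.` dropped above is the placeholder id column dcast() produces when
    # there are no `id_cols`; see the "..." left-hand side built earlier)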
} if (!is.null(names_glue)) { out <- step_setnames(out, new_vars, glue_vars, in_place = FALSE) # In case of names_sort = TRUE new_vars <- glue_vars } else if (nchar(names_prefix) > 0) { new_names <- paste0(names_prefix, new_vars) out <- step_setnames(out, new_vars, new_names, in_place = FALSE) # In case of names_sort = TRUE new_vars <- new_names } if (names_sort && !no_id) { cols_sorted <- c(id_cols, sort(new_vars)) out <- step_colorder(out, cols_sorted) } out <- step_repair(out, repair = names_repair) out } globalVariables(c(".", ".names_from", "name", "value", "pivot_wider")) step_repair <- function(data, repair = "check_unique", in_place = TRUE) { sim_data <- simulate_vars(data) data_names <- names(sim_data) repaired_names <- vctrs::vec_as_names(data_names, repair = repair) if (any(data_names != repaired_names)) { data <- step_setnames(data, seq_along(data_names), repaired_names, in_place = in_place) } data } shallow_dt <- function(x) { filter(x, TRUE) } call_reduce <- function(x, fun) { Reduce(function(x, y) call2(fun, x, y), x) } dtplyr/R/step.R0000644000176200001440000001372314406335651013104 0ustar liggesusers# We use a hybrid approach where most of the computation is done on # construction. This avoids the deeply recursive approach of dbplyr, which # should improve performance because we're not repeatedly re-computing the # same values. # # dt_call() is managed separately because it involves much more code (which # which dilute the intent of the constructor), and should only be called # relatively few times. new_step <- function(parent, vars = parent$vars, groups = parent$groups, locals = parent$locals, implicit_copy = parent$implicit_copy, needs_copy = parent$needs_copy, env = parent$env, ..., class = character()) { stopifnot(is.data.table(parent) || is_step(parent)) stopifnot(is.character(vars)) stopifnot(is.character(groups)) structure( list( parent = parent, vars = vars, groups = groups, locals = locals, implicit_copy = implicit_copy, needs_copy = needs_copy, env = env, ... ), class = c(class, "dtplyr_step") ) } #' @export dim.dtplyr_step <- function(x) { c(NA, length(x$vars)) } #' @importFrom dplyr tbl_vars #' @export tbl_vars.dtplyr_step <- function(x) { x$vars } #' @importFrom dplyr groups #' @export groups.dtplyr_step <- function(x) { syms(x$groups) } #' @importFrom dplyr group_vars #' @export group_vars.dtplyr_step <- function(x) { x$groups } #' @importFrom dplyr group_size #' @export group_size.dtplyr_step <- function(x) { collect(summarise(x, n = .N))$n } #' @importFrom dplyr n_groups #' @export n_groups.dtplyr_step <- function(x) { length(group_size(x)) } #' Force computation of a lazy data.table #' #' * `collect()` returns a tibble, grouped if needed. #' * `compute()` generates an intermediate assignment in the translation. #' * `as.data.table()` returns a data.table. #' * `as.data.frame()` returns a data frame. #' * `as_tibble()` returns a tibble. #' #' @export #' @param x A [lazy_dt] #' @param ... Arguments used by other methods. 
#' @importFrom dplyr collect #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(mtcars) #' #' # Generate translation #' avg_mpg <- dt %>% #' filter(am == 1) %>% #' group_by(cyl) %>% #' summarise(mpg = mean(mpg)) #' #' # Show translation and temporarily compute result #' avg_mpg #' #' # compute and return tibble #' avg_mpg_tb <- as_tibble(avg_mpg) #' avg_mpg_tb #' #' # compute and return data.table #' avg_mpg_dt <- data.table::as.data.table(avg_mpg) #' avg_mpg_dt #' #' # modify translation to use intermediate assignment #' compute(avg_mpg) #' collect.dtplyr_step <- function(x, ...) { # for consistency with dbplyr::collect() out <- as_tibble(x) if (length(x$groups) > 0) { out <- group_by(out, !!!syms(x$groups)) } out } #' @rdname collect.dtplyr_step #' @param name Name of intermediate data.table. #' @export #' @importFrom dplyr compute compute.dtplyr_step <- function(x, name = unique_name(), ...) { if (!dt_has_computation(x)) { return(x) } step_locals(x, set_names(list(dt_call(x)), name), name) } #' @rdname collect.dtplyr_step #' @export #' @param keep.rownames Ignored as dplyr never preserves rownames. as.data.table.dtplyr_step <- function(x, keep.rownames = FALSE, ...) { dt_eval(x)[] } #' @rdname collect.dtplyr_step #' @export as.data.frame.dtplyr_step <- function(x, ...) { as.data.frame(dt_eval(x)) } #' @rdname collect.dtplyr_step #' @export #' @importFrom tibble as_tibble #' @param .name_repair Treatment of problematic column names as_tibble.dtplyr_step <- function(x, ..., .name_repair = "check_unique") { out <- as_tibble(dt_eval(x), .name_repair = .name_repair) attr(out, ".internal.selfref") <- NULL attr(out, "sorted") <- NULL out } #' @export #' @importFrom dplyr pull pull.dtplyr_step <- function(.data, var = -1, name = NULL, ...) { var <- sym(tidyselect::vars_pull(.data$vars, !!enquo(var))) .data <- ungroup(.data) name <- enquo(name) if (quo_is_null(name)) { .data <- select(.data, !! var) .data <- collect(.data) .data[[1]] } else { name <- sym(tidyselect::vars_pull(.data$vars, !!name)) .data <- select(.data, !! var, !! name) .data <- collect(.data) set_names(.data[[1]], .data[[2]]) } } #' @export print.dtplyr_step <- function(x, ...) { dt <- as.data.table(x) cat_line(cli::style_bold("Source: "), "local data table ", dplyr::dim_desc(dt)) if (length(x$groups) > 0) { cat_line(cli::style_bold("Groups: "), paste(x$groups, collapse = ", ")) } if (length(x$locals) > 0) { cat_line(cli::style_bold("Call:")) for (var in names(x$locals)) { cat_line(" ", var, " <- ", expr_deparse(x$locals[[var]])) } cat_line(" ", expr_text(dt_call(x))) } else { cat_line(cli::style_bold("Call: "), expr_text(dt_call(x))) } cat_line() cat_line(format(as_tibble(dt, .name_repair = "minimal"), n = 6)[-1]) # Hack to remove "A tibble" line cat_line() cat_line(cli::col_silver( "# Use as.data.table()/as.data.frame()/as_tibble() to access results" )) invisible(x) } #' @importFrom dplyr glimpse #' @export glimpse.dtplyr_step <- function(x, width = NULL, ...) { glimpse(collect(x), width = width, ...) } #' @importFrom dplyr show_query #' @export show_query.dtplyr_step <- function(x, ...) { dt_call(x) } is_step <- function(x) inherits(x, "dtplyr_step") # Returns a named list of data.tables: most just dispatch to their # parent. The only exceptions are dt_step_first() and the two-table verbs. 
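# A rough sketch of how these generics fit together (illustrative only; the
# `name` argument of lazy_dt() is used here so the source name is predictable):
if (FALSE) {
  dt <- lazy_dt(data.table::data.table(x = 1:3, y = 3:1), name = "DT")
  step <- dplyr::filter(dt, x > 1)

  dt_call(step)    # unevaluated data.table code built on the symbol `DT`
  dt_sources(step) # named list mapping "DT" to the underlying data.table
}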
dt_sources <- function(x) {
  UseMethod("dt_sources")
}
#' @export
dt_sources.dtplyr_step <- function(x) {
  dt_sources(x$parent)
}

dt_call <- function(x, needs_copy = x$needs_copy) {
  UseMethod("dt_call")
}
#' @export
dt_call.dtplyr_step <- function(x, needs_copy = x$needs_copy) {
  dt_call(x$parent, needs_copy)
}

dt_has_computation <- function(x) {
  UseMethod("dt_has_computation")
}
#' @export
dt_has_computation.dtplyr_step <- function(x) {
  TRUE
}
dtplyr/R/step-first.R0000644000176200001440000001027714406333051014222 0ustar liggesusers#' Create a "lazy" data.table for use with dplyr verbs
#'
#' @description
#' A lazy data.table captures the intent of dplyr verbs, only actually
#' performing computation when requested (with [collect()], [pull()],
#' [as.data.frame()], [data.table::as.data.table()], or [tibble::as_tibble()]).
#' This allows dtplyr to convert dplyr verbs into as few data.table expressions
#' as possible, which leads to a high performance translation.
#'
#' See `vignette("translation")` for the details of the translation.
#'
#' @param x A data table (or something that can be coerced to a data table).
#' @param immutable If `TRUE`, `x` is treated as immutable and will never
#'   be modified by any code generated by dtplyr. Alternatively, you can set
#'   `immutable = FALSE` to allow dtplyr to modify the input object.
#' @param name Optionally, supply a name to be used in generated expressions.
#'   For expert use only.
#' @param key_by Set keys for data frame, using [select()] semantics (e.g.
#'   `key_by = c(key1, key2)`).
#'
#'   This uses [data.table::setkey()] to sort the table and build an index.
#'   This will considerably improve performance for subsets, summaries, and
#'   joins that use the keys.
#'
#'   See `vignette("datatable-keys-fast-subset")` for more details.
#' @export
#' @aliases tbl_dt grouped_dt
#' @examples
#' library(dplyr, warn.conflicts = FALSE)
#'
#' # If you have a data.table, using it with any dplyr generic will
#' # automatically convert it to a lazy_dt object
#' dt <- data.table::data.table(x = 1:10, y = 10:1)
#' dt %>% filter(x == y)
#' dt %>% mutate(z = x + y)
#'
#' # Note that dtplyr will avoid mutating the input data.table, so the
#' # previous translation includes an automatic copy().
You can avoid this #' # with a manual call to lazy_dt() #' dt %>% #' lazy_dt(immutable = FALSE) %>% #' mutate(z = x + y) #' #' # If you have a data frame, you can use lazy_dt() to convert it to #' # a data.table: #' mtcars2 <- lazy_dt(mtcars) #' mtcars2 #' mtcars2 %>% select(mpg:cyl) #' mtcars2 %>% select(x = mpg, y = cyl) #' mtcars2 %>% filter(cyl == 4) %>% select(mpg) #' mtcars2 %>% select(mpg, cyl) %>% filter(cyl == 4) #' mtcars2 %>% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2) #' mtcars2 %>% transmute(cyl2 = cyl * 2, vs2 = vs * 2) #' mtcars2 %>% filter(cyl == 8) %>% mutate(cyl2 = cyl * 2) #' #' # Learn more about translation in vignette("translation") #' by_cyl <- mtcars2 %>% group_by(cyl) #' by_cyl %>% summarise(mpg = mean(mpg)) #' by_cyl %>% mutate(mpg = mean(mpg)) #' by_cyl %>% #' filter(mpg < mean(mpg)) %>% #' summarise(hp = mean(hp)) lazy_dt <- function(x, name = NULL, immutable = TRUE, key_by = NULL) { # in case `x` has an `as.data.table()` method but not a `group_vars()` method groups <- tryCatch(group_vars(x), error = function(e) character()) if (!is.data.table(x)) { if (!immutable) { abort("`immutable` must be `TRUE` when `x` is not already a data table.") } x <- as.data.table(x) copied <- TRUE } else { copied <- FALSE } key_by <- enquo(key_by) key_vars <- unname(tidyselect::vars_select(names(x), !!key_by)) if (length(key_vars)) { if (immutable && !copied) { x <- data.table::copy(x) } data.table::setkeyv(x, key_vars) } step_first(x, name = name, groups = groups, immutable = immutable, env = caller_env()) } #' @export dim.dtplyr_step_first <- function(x) { dim(x$parent) } step_first <- function(parent, name = NULL, groups = character(), immutable = TRUE, env = caller_env()) { stopifnot(is.data.table(parent)) if (is.null(name)) { name <- unique_name() } new_step(parent, vars = names(parent), groups = groups, locals = list(), implicit_copy = !immutable, needs_copy = FALSE, name = sym(name), env = env, class = "dtplyr_step_first" ) } #' @export dt_call.dtplyr_step_first <- function(x, needs_copy = FALSE) { if (needs_copy) { expr(copy(!!x$name)) } else { x$name } } #' @export dt_sources.dtplyr_step_first <- function(x) { stats::setNames(list(x$parent), as.character(x$name)) } #' @export dt_has_computation.dtplyr_step_first <- function(x) { FALSE } unique_name <- local({ i <- 0 function() { i <<- i + 1 paste0("_DT", i) } }) dtplyr/R/step-subset-do.R0000644000176200001440000000133414300165007014767 0ustar liggesusers#' @importFrom dplyr do #' @export do.dtplyr_step <- function(.data, ...) { # This is a partial implementation, because I don't think that many # people are likely to use it, given that do() is marked as questioning # Problems: # * doesn't handle unnamed case # * doesn't set .SDcols so `.SD` will only refer to non-groups # * can duplicating group vars (#5) dots <- capture_dots(.data, ...) 
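  # capture_dots() quotes and translates the ...; unnamed expressions are
  # rejected below because the columns they would create can't be determined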
if (any(names2(dots) == "")) { # I can't see any way to figure out what the variables are abort("Unnamed do() not supported by dtplyr") } new_vars <- lapply(dots, function(x) call2(".", x)) j <- call2(".", !!!new_vars) vars <- union(.data$vars, names(dots)) step_subset_j(.data, vars = vars, j = j) } dtplyr/R/step-assign.R0000644000176200001440000000064714006775461014372 0ustar liggesusersstep_locals <- function(parent, locals, name) { stopifnot(is_step(parent)) stopifnot(is.list(locals)) stopifnot(is_string(name)) new_step( parent = parent, locals = utils::modifyList(parent$locals, locals), implicit_copy = TRUE, needs_copy = FALSE, name = name, class = "dtplyr_step_assign", ) } #' @export dt_call.dtplyr_step_assign <- function(x, needs_copy = FALSE) { sym(x$name) } dtplyr/R/step-colorder.R0000644000176200001440000000177214300165007014701 0ustar liggesusersstep_colorder <- function(x, col_order) { stopifnot(is_step(x)) stopifnot(is.character(col_order) || is.integer(col_order)) if (any(duplicated(col_order))) { abort("Every element of `col_order` must be unique.") } col_order <- unname(col_order) if (is.integer(col_order)) { if (identical(col_order, seq_along(col_order))) { return(x) } vars <- x$vars[col_order] } else { vars_selected <- x$vars[x$vars %in% col_order] vars_count <- vctrs::vec_count(vars_selected) vars_problematic <- vars_count$key[vars_count$count != 1] if (!is_empty(vars_problematic)) { vars_error <- paste0(vars_problematic, collapse = ", ") msg <- paste0("The column(s) ", vars_error, " do not uniquely match a column in `x`.") abort(msg) } if (identical(col_order, x$vars[seq_along(col_order)])) { return(x) } vars <- col_order } step_call(x, "setcolorder", args = list(col_order), vars = vars, in_place = !x$implicit_copy ) } dtplyr/R/step-nest.R0000644000176200001440000000422514372711230014041 0ustar liggesusers#' Nest #' #' @description #' This is a method for the tidyr [tidyr::nest()] generic. It is translated #' using the non-nested variables in the `by` argument and `.SD` in the `j` #' argument. #' #' @inheritParams tidyr::nest #' @param ... <[`tidy-select`][tidyr::tidyr_tidy_select]> Columns to nest, specified #' using name-variable pairs of the form `new_col = c(col1, col2, col3)`. #' The right hand side can be any valid tidy select expression. #' @param .key Not supported. #' @param data A [lazy_dt()]. #' @examples #' if (require("tidyr", quietly = TRUE)) { #' dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) #' dt %>% nest(data = y) #' #' dt %>% dplyr::group_by(x) %>% nest() #' } # exported onLoad nest.dtplyr_step <- function(.data, ..., .names_sep = NULL, .key = deprecated()) { if (lifecycle::is_present(.key)) { abort(c( "`nest()` for lazy data.tables doesn't support the `.key` argument.", i = "Use a name in the `...` argument instead." )) } cols <- eval_nest_dots(.data, ...) cols <- lapply(cols, set_names) if (!is.null(.names_sep)) { cols <- imap(cols, strip_names, .names_sep) } if (length(cols) == 1 && is.null(.names_sep)) { # use `.SD` as it is shorter and faster nm <- names(cols) j_exprs <- exprs(!!nm := .(.SD)) } else { j_exprs <- imap( cols, function(x, name) { x <- simplify_names(x) expr(.(data.table(!!!syms(x)))) } ) } asis <- setdiff(.data$vars, unlist(cols)) out <- step_subset_j( .data, vars = c(asis, names(cols)), j = expr(.(!!!j_exprs)), groups = asis, arrange = FALSE ) groups <- intersect(out$vars, group_vars(.data)) group_by(out, !!!syms(groups)) } eval_nest_dots <- function(.data, ...) 
{ if (missing(...)) { groups <- group_vars(.data) if (is_empty(groups)) { warn(paste0( "`...` must not be empty for ungrouped data frames.\n", "Did you want `data = everything()`?" )) } nest_vars <- setdiff(.data$vars, groups) list(data = nest_vars) } else { cols <- enquos(...) lapply(cols, function(.x) names(tidyselect::eval_select(.x, .data))) } } dtplyr/R/step-subset-slice.R0000644000176200001440000002116014406335651015476 0ustar liggesusers #' Subset rows using their positions #' #' @description #' These are methods for the dplyr [slice()], `slice_head()`, `slice_tail()`, #' `slice_min()`, `slice_max()` and `slice_sample()` generics. They are #' translated to the `i` argument of `[.data.table`. #' #' Unlike dplyr, `slice()` (and `slice()` alone) returns the same number of #' rows per group, regardless of whether or not the indices appear in each #' group. #' #' @importFrom dplyr slice #' @param .data A [lazy_dt()]. #' @inheritParams dplyr::slice #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(mtcars) #' dt %>% slice(1, 5, 10) #' dt %>% slice(-(1:4)) #' #' # First and last rows based on existing order #' dt %>% slice_head(n = 5) #' dt %>% slice_tail(n = 5) #' #' # Rows with minimum and maximum values of a variable #' dt %>% slice_min(mpg, n = 5) #' dt %>% slice_max(mpg, n = 5) #' #' # slice_min() and slice_max() may return more rows than requested #' # in the presence of ties. Use with_ties = FALSE to suppress #' dt %>% slice_min(cyl, n = 1) #' dt %>% slice_min(cyl, n = 1, with_ties = FALSE) #' #' # slice_sample() allows you to random select with or without replacement #' dt %>% slice_sample(n = 5) #' dt %>% slice_sample(n = 5, replace = TRUE) #' #' # you can optionally weight by a variable - this code weights by the #' # physical weight of the cars, so heavy cars are more likely to get #' # selected #' dt %>% slice_sample(weight_by = wt, n = 5) slice.dtplyr_step <- function(.data, ..., .by = NULL) { dots <- capture_dots(.data, ..., .j = FALSE) by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") if (length(dots) == 0) { i <- NULL } else { if (length(dots) == 1) { .rows <- dots[[1]] } else { .rows <- call2("c", !!!dots) } # Update logic once data.table #4353 is merged # https://github.com/Rdatatable/data.table/pull/4353 assign_rows_var <- expr(.rows <- !!.rows) subset_valid_rows <- expr(.rows[between(.rows, -.N, .N)]) i <- call2("{", assign_rows_var, subset_valid_rows) } step_subset_i(.data, i, by) } #' @rdname slice.dtplyr_step #' @importFrom dplyr slice_head #' @inheritParams dplyr::slice #' @export slice_head.dtplyr_step <- function(.data, ..., n, prop, by = NULL) { check_dots_empty() by <- compute_by({{ by }}, .data, by_arg = "by", data_arg = ".data") size <- get_slice_size(n, prop, "slice_head") i <- expr(rlang::seq2(1L, !!size)) step_subset_i(.data, i = i, by) } #' @rdname slice.dtplyr_step #' @importFrom dplyr slice_tail #' @export slice_tail.dtplyr_step <- function(.data, ..., n, prop, by = NULL) { check_dots_empty() by <- compute_by({{ by }}, .data, by_arg = "by", data_arg = ".data") size <- get_slice_size(n, prop, "slice_tail") i <- expr(rlang::seq2(.N - !!size + 1L, .N)) step_subset_i(.data, i = i, by) } #' @rdname slice.dtplyr_step #' @importFrom dplyr slice_min #' @inheritParams dplyr::slice #' @export slice_min.dtplyr_step <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) { if (missing(order_by)) { abort("argument `order_by` is missing, with no default.") } slice_min_max( .data, order_by = {{ 
order_by }}, decreasing = FALSE, ..., n = n, prop = prop, by = {{ by }}, with_ties = with_ties, .slice_fn = "slice_min" ) } #' @rdname slice.dtplyr_step #' @importFrom dplyr slice_max #' @export slice_max.dtplyr_step <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) { if (missing(order_by)) { abort("argument `order_by` is missing, with no default.") } slice_min_max( .data, order_by = {{ order_by }}, decreasing = TRUE, ..., n = n, prop = prop, by = {{ by }}, with_ties = with_ties, .slice_fn = "slice_max" ) } slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, by = NULL, with_ties = TRUE, .slice_fn = "slice_min_max") { check_dots_empty() size <- get_slice_size(n, prop, .slice_fn) by <- compute_by({{ by }}, .data, by_arg = "by", data_arg = ".data") order_by <- capture_dot(.data, {{ order_by }}, j = FALSE) if (decreasing) { order_by <- expr(desc(!!order_by)) } if (with_ties) { ties.method <- "min" } else { ties.method <- "first" } i <- expr(!!smaller_ranks(!!order_by, !!size, ties.method = ties.method)) out <- step_subset_i(.data, i, by) arrange(out, !!order_by, .by_group = TRUE) } smaller_ranks <- function(x, y, ties.method = "min") { x <- enexpr(x) y <- enexpr(y) # `frank()` by group is much slower than rank # https://github.com/Rdatatable/data.table/issues/3988 # also https://github.com/Rdatatable/data.table/issues/4284 expr(rank(!!x, ties.method = !!ties.method, na.last = "keep") <= !!y) } #' @importFrom dplyr slice_sample #' @inheritParams dplyr::slice #' @export slice_sample.dtplyr_step <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) { check_dots_empty() size <- get_slice_size(n, prop, "slice_sample") wt <- enexpr(weight_by) i <- sample_int(.N, !!size, replace = replace, wt = wt) step_subset_i(.data, i) } sample_int <- function(n, size, replace = FALSE, wt = NULL) { n <- enexpr(n) size <- enexpr(size) if (replace) { out <- expr(sample.int(!!n, !!size, replace = TRUE)) } else { out <- expr(sample.int(!!n, min(!!size, !!n))) } if (!is.null(wt)) { out$prob <- wt } out } # sample_ ----------------------------------------------------------------- #' @importFrom dplyr sample_n #' @export sample_n.dtplyr_step <- function(tbl, size, replace = FALSE, weight = NULL, .env = NULL, ... ) { weight <- enexpr(weight) step_subset_i(tbl, i = sample_call(size, replace, weight)) } #' @importFrom dplyr sample_frac #' @export sample_frac.dtplyr_step <- function(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ... 
) { weight <- enexpr(weight) step_subset_i(tbl, i = sample_call(expr(.N * !!size), replace, weight)) } # helpers ----------------------------------------------------------------- check_constant <- function(x, name, fn) { withCallingHandlers(force(x), error = function(e) { abort(c( glue("`{name}` must be a constant in `{fn}()`."), x = conditionMessage(e) ), parent = e) }) } check_slice_size <- function(n, prop, .slice_fn = "check_slice_size", call = caller_env()) { if (missing(n) && missing(prop)) { list(type = "n", n = 1L) } else if (!missing(n) && missing(prop)) { n <- check_constant(n, "n", .slice_fn) if (!is.numeric(n) || length(n) != 1 || is.na(n)) { abort("`n` must be a single number.", call = call) } list(type = "n", n = as.integer(n)) } else if (!missing(prop) && missing(n)) { prop <- check_constant(prop, "prop", .slice_fn) if (!is.numeric(prop) || length(prop) != 1 || is.na(prop)) { abort("`prop` must be a single number.", call = call) } list(type = "prop", prop = prop) } else { abort("Must supply exactly one of `n` and `prop` arguments.", call = call) } } get_slice_size <- function(n, prop, .slice_fn = "get_slice_size") { slice_input <- check_slice_size(n, prop, .slice_fn, call = caller_env()) if (slice_input$type == "n") { if (slice_input$n < 0) { expr(max(.N + !!slice_input$n, 0L)) } else { expr(min(!!slice_input$n, .N)) } } else if (slice_input$type == "prop") { if (slice_input$prop < 0) { expr(max(.N + as.integer(!!slice_input$prop * .N), 0L)) } else { expr(min(as.integer(!!slice_input$prop * .N), .N)) } } } sample_call <- function(size, replace = FALSE, weight = NULL) { call <- expr(sample(.N, !!size)) if (replace) { call$replace <- TRUE } call$prob <- weight call } dtplyr/R/step-subset-arrange.R0000644000176200001440000000245614300165007016012 0ustar liggesusers#' Arrange rows by column values #' #' This is a method for dplyr generic [arrange()]. It is translated to #' an [order()] call in the `i` argument of `[.data.table`. #' #' @param .data A [lazy_dt()]. #' @inheritParams dplyr::arrange #' @importFrom dplyr arrange #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(mtcars) #' dt %>% arrange(vs, cyl) #' dt %>% arrange(desc(vs), cyl) #' dt %>% arrange(across(mpg:disp)) arrange.dtplyr_step <- function(.data, ..., .by_group = FALSE) { dots <- capture_dots(.data, ..., .j = FALSE) if (.by_group) { dots <- c(syms(.data$groups), dots) } if (length(dots) == 0) { return(.data) } no_transmute <- all(map_lgl(dots, is_simple_arrange)) # Order without grouping then restore dots <- set_names(dots, NULL) if (is_copied(.data) && no_transmute) { dots <- c(dots, na.last = TRUE) step <- step_call(.data, "setorder", dots) } else { step <- step_subset(.data, i = call2("order", !!!dots), groups = character()) } step_group(step, groups = .data$groups) } is_copied <- function(x) { x$implicit_copy || x$needs_copy } is_simple_arrange <- function(x) { out <- FALSE if (is_symbol(x)) { out <- TRUE } else if (is_call(x, "-", 1)) { if (is_symbol(x[[2]])) { out <- TRUE } } out } dtplyr/R/step-call-pivot_longer.R0000644000176200001440000002255114372711230016512 0ustar liggesusers#' Pivot data from wide to long #' #' @description #' This is a method for the tidyr `pivot_longer()` generic. It is translated to #' [data.table::melt()] #' #' @param data A [lazy_dt()]. #' @inheritParams tidyr::pivot_longer #' @param names_ptypes,names_transform,values_ptypes,values_transform #' Not currently supported by dtplyr. 
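#'   Supplying a non-`NULL` value for any of these is an error.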
#' @examples #' library(tidyr) #' #' # Simplest case where column names are character data #' relig_income_dt <- lazy_dt(relig_income) #' relig_income_dt %>% #' pivot_longer(!religion, names_to = "income", values_to = "count") #' #' # Slightly more complex case where columns have common prefix, #' # and missing missings are structural so should be dropped. #' billboard_dt <- lazy_dt(billboard) #' billboard %>% #' pivot_longer( #' cols = starts_with("wk"), #' names_to = "week", #' names_prefix = "wk", #' values_to = "rank", #' values_drop_na = TRUE #' ) #' #' # Multiple variables stored in column names #' lazy_dt(who) %>% #' pivot_longer( #' cols = new_sp_m014:newrel_f65, #' names_to = c("diagnosis", "gender", "age"), #' names_pattern = "new_?(.*)_(.)(.*)", #' values_to = "count" #' ) #' #' # Multiple observations per row #' anscombe_dt <- lazy_dt(anscombe) #' anscombe_dt %>% #' pivot_longer( #' everything(), #' names_to = c(".value", "set"), #' names_pattern = "(.)(.)" #' ) # exported onLoad pivot_longer.dtplyr_step <- function(data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL, names_repair = "check_unique", values_to = "value", values_drop_na = FALSE, values_ptypes = NULL, values_transform = NULL, ...) { if (!is.null(names_ptypes)) { abort("`names_ptypes` is not supported by dtplyr") } if (!is.null(names_transform)) { abort("`names_transform` is not supported by dtplyr") } if (!is.null(values_ptypes)) { abort("`values_ptypes` is not supported by dtplyr") } if (!is.null(values_transform)) { abort("`values_transform` is not supported by dtplyr") } measure_vars <- names(tidyselect::eval_select(enquo(cols), data)) if (length(measure_vars) == 0) { abort("`cols` must select at least one column.") } multiple_names_to <- length(names_to) > 1 uses_dot_value <- ".value" %in% names_to variable_name <- "variable" if (uses_dot_value) { if (!is.null(names_sep)) { names_to_setup <- str_separate(measure_vars, into = names_to, sep = names_sep) } else if (!is.null(names_pattern)) { names_to_setup <- str_extract(measure_vars, into = names_to, names_pattern) } else { abort("If you use '.value' in `names_to` you must also supply `names_sep' or `names_pattern") } .value <- names_to_setup$.value v_fct <- factor(.value, levels = unique(.value)) measure_vars <- split(measure_vars, v_fct) values_to <- names(measure_vars) names(measure_vars) <- NULL if (multiple_names_to) { variable_name <- names_to[!names_to == ".value"] .value_ids <- split(names_to_setup[[variable_name]], v_fct) .value_id <- .value_ids[[1]] # Make sure data is "balanced" # https://github.com/Rdatatable/data.table/issues/2575 # The list passed to measure.vars also needs the same number of column names per element equal_ids <- map_lgl( .value_ids[-1], function(.x) isTRUE(all.equal(.value_id, .x)) ) if (all(equal_ids)) { .value_id <- vctrs::vec_rep_each(.value_id, length(pull(data))) } else { abort("`data.table::melt()` doesn't currently support melting of unbalanced datasets.") } } } else if (multiple_names_to) { if (is.null(names_sep) && is.null(names_pattern)) { abort("If you supply multiple names in `names_to` you must also supply `names_sep` or `names_pattern`") } else if (!is.null(names_sep) && !is.null(names_pattern)) { abort("only one of names_sep or names_pattern should be provided") } } else { variable_name <- names_to } args <- list( measure.vars = measure_vars, variable.name = variable_name, value.name = values_to, na.rm = values_drop_na, 
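    # melt() returns a factor `variable` column by default; keep it character
    # to match tidyr::pivot_longer()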
variable.factor = FALSE ) # Clean up call args if defaults are used if (variable_name == "variable") { args$variable.name <- NULL } if (identical(values_to, "value")) { args$value.name <- NULL } if (is_false(values_drop_na)) { args$na.rm <- NULL } id_vars <- setdiff(data$vars, unlist(measure_vars)) out <- step_call( data, "melt", args = args, vars = c(id_vars, variable_name, values_to) ) if (!is.null(names_prefix)) { out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name))) } if (multiple_names_to && uses_dot_value) { out <- mutate(out, !!variable_name := !!.value_id) } else if (multiple_names_to && !uses_dot_value) { if (!is.null(names_sep)) { into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep) } else { into_cols <- str_extract(pull(out, !!sym(variable_name)), into = names_to, regex = names_pattern) } out <- mutate(out, !!!into_cols) # Need to drop variable_name and move names_to vars to correct position # Recreates relocate logic so only select is necessary, not relocate + select out_vars <- out$vars var_idx <- which(out_vars == variable_name) before_vars <- out_vars[seq_along(out_vars) < var_idx] after_vars <- out_vars[seq_along(out_vars) > var_idx] out <- select(out, !!!syms(before_vars), !!!syms(names_to), !!!syms(after_vars)) } else if (!multiple_names_to && uses_dot_value) { out <- mutate(out, variable = NULL) } step_repair(out, repair = names_repair) } # ============================================================================== # inlined from tidyr # https://github.com/tidyverse/tidyr/issues/1103 # ============================================================================== # nocov start # str_extract() ----------------------------------------------------------------- str_extract <- function(x, into, regex, convert = FALSE) { stopifnot( is_string(regex), is_character(into) ) out <- str_match_first(x, regex) if (length(out) != length(into)) { stop( "`regex` should define ", length(into), " groups; ", ncol(out), " found.", call. 
= FALSE ) } # Handle duplicated names if (anyDuplicated(into)) { pieces <- split(out, into) into <- names(pieces) out <- lapply(pieces, pmap_chr, paste0, sep = "") } into <- as_utf8_character(into) non_na_into <- !is.na(into) out <- out[non_na_into] names(out) <- into[non_na_into] if (convert) { out[] <- lapply(out, utils::type.convert, as.is = TRUE) } out } str_match_first <- function(string, regex) { loc <- regexpr(regex, string, perl = TRUE) loc <- group_loc(loc) out <- lapply( seq_len(loc$matches), function(i) substr(string, loc$start[, i], loc$end[, i]) ) out[-1] } group_loc <- function(x) { start <- cbind(as.vector(x), attr(x, "capture.start")) end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L no_match <- start == -1L start[no_match] <- NA end[no_match] <- NA list(matches = ncol(start), start = start, end = end) } # str_separate() ----------------------------------------------------------------- str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "warn") { if (!is.character(into)) { abort("`into` must be a character vector") } if (is.numeric(sep)) { out <- strsep(x, sep) } else if (is_character(sep)) { out <- data.table::tstrsplit(x, sep, fixed = TRUE, names = TRUE) out <- as_tibble(out) } else { abort("`sep` must be either numeric or character") } names(out) <- as_utf8_character(into) out <- out[!is.na(names(out))] if (convert) { out[] <- lapply(out, utils::type.convert, as.is = TRUE) } out } strsep <- function(x, sep) { nchar <- nchar(x) pos <- lapply(sep, function(i) { if (i >= 0) return(i) pmax(0, nchar + i) }) pos <- c(list(0), pos, list(nchar)) lapply(1:(length(pos) - 1), function(i) { substr(x, pos[[i]] + 1, pos[[i + 1]]) }) } str_split_n <- function(x, pattern, n_max = -1) { if (is.factor(x)) { x <- as.character(x) } m <- gregexpr(pattern, x, perl = TRUE) if (n_max > 0) { m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max)) } regmatches(x, m, invert = TRUE) } slice_match <- function(x, i) { structure( x[i], match.length = attr(x, "match.length")[i], index.type = attr(x, "index.type"), useBytes = attr(x, "useBytes") ) } list_indices <- function(x, max = 20) { if (length(x) > max) { x <- c(x[seq_len(max)], "...") } paste(x, collapse = ", ") } # pmap_chr() ----------------------------------------------------------------- pmap_chr <- function(.l, .f, ...) { as.character(pmap(.l, .f, ...)) } # nocov end dtplyr/R/complete.R0000644000176200001440000000157714300165007013732 0ustar liggesusers#' Complete a data frame with missing combinations of data #' #' @description #' This is a method for the tidyr `complete()` generic. This is a wrapper #' around `dtplyr` translations for `expand()`, `full_join()`, and `replace_na()` #' that's useful for completing missing combinations of data. #' #' @param data A [lazy_dt()]. #' @inheritParams tidyr::complete #' @examples #' library(tidyr) #' tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) #' dt <- lazy_dt(tbl) #' #' dt %>% #' complete(x, y) #' #' dt %>% #' complete(x, y, fill = list(z = 10L)) # exported onLoad complete.dtplyr_step <- function(data, ..., fill = list()) { dots <- enquos(...) 
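  # drop NULL column specifications so `complete()` with nothing to expand
  # is a no-op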
dots <- dots[!map_lgl(dots, quo_is_null)] if (length(dots) == 0) { return(data) } full <- tidyr::expand(data, !!!dots) full <- dplyr::full_join(full, data, by = full$vars) full <- tidyr::replace_na(full, replace = fill) full } dtplyr/R/step-subset-separate.R0000644000176200001440000000550714300165007016177 0ustar liggesusers#' Separate a character column into multiple columns with a regular #' expression or numeric locations #' #' @description #' This is a method for the [tidyr::separate()] generic. It is translated to #' [data.table::tstrsplit()] in the `j` argument of `[.data.table`. #' #' @param data A [lazy_dt()]. #' @param col Column name or position. #' #' This argument is passed by expression and supports quasiquotation #' (you can unquote column names or column positions). #' @param into Names of new variables to create as character vector. #' Use `NA` to omit the variable in the output. #' @param sep Separator between columns. #' The default value is a regular expression that matches any sequence of non-alphanumeric values. #' @param remove If TRUE, remove the input column from the output data frame. #' @param convert If TRUE, will run type.convert() with as.is = TRUE on new columns. #' This is useful if the component columns are integer, numeric or logical. #' #' NB: this will cause string "NA"s to be converted to NAs. #' @param ... Arguments passed on to methods #' @examples #' library(tidyr) #' # If you want to split by any non-alphanumeric value (the default): #' df <- lazy_dt(data.frame(x = c(NA, "x.y", "x.z", "y.z")), "DT") #' df %>% separate(x, c("A", "B")) #' #' # If you just want the second variable: #' df %>% separate(x, c(NA, "B")) #' #' # Use regular expressions to separate on multiple characters: #' df <- lazy_dt(data.frame(x = c(NA, "x?y", "x.z", "y:z")), "DT") #' df %>% separate(x, c("A","B"), sep = "([.?:])") #' #' # convert = TRUE detects column classes: #' df <- lazy_dt(data.frame(x = c("x:1", "x:2", "y:4", "z", NA)), "DT") #' df %>% separate(x, c("key","value"), ":") %>% str #' df %>% separate(x, c("key","value"), ":", convert = TRUE) %>% str # exported onLoad separate.dtplyr_step <- function(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, ...) { if (!vctrs::vec_is(into, character())) { abort("`into` must be a character vector.") } if (!vctrs::vec_is(sep, character())) { abort("`sep` must be a character vector.") } col <- sym(tidyselect::vars_pull(data$vars, !!enquo(col))) into_length <- length(into) not_na_into <- !is.na(into) keep <- seq_along(into)[not_na_into] into <- into[not_na_into] t_str_split <- call2("tstrsplit", col, split = sep) if (length(keep) < into_length) { t_str_split$keep <- keep } if (isTRUE(convert)) { t_str_split$type.convert <- TRUE } out <- step_subset( data, vars = union(data$vars, into), j = call2(":=", into, t_str_split), needs_copy = data$needs_copy || !data$implicit_copy ) if (remove && !as.character(col) %in% into) { out <- select(out, -!!col) } out } dtplyr/R/step-subset-summarise.R0000644000176200001440000000620314375142710016402 0ustar liggesusers #' Summarise each group to one row #' #' This is a method for the dplyr [summarise()] generic. It is translated to #' the `j` argument of `[.data.table`. #' #' @param .data A [lazy_dt()]. 
#' @inheritParams dplyr::summarise #' @importFrom dplyr summarise #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(mtcars) #' #' dt %>% #' group_by(cyl) %>% #' summarise(vs = mean(vs)) #' #' dt %>% #' group_by(cyl) %>% #' summarise(across(disp:wt, mean)) summarise.dtplyr_step <- function(.data, ..., .by = NULL, .groups = NULL) { by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") if (by$uses_by) { group_vars <- by$names .groups <- "drop" } else { group_vars <- .data$groups } dots <- capture_dots(.data, ..., .by = by) check_summarise_vars(dots) if (length(dots) == 0) { if (length(group_vars) == 0) { out <- step_subset_j(.data, vars = character(), j = 0L) } else { # Acts like distinct on grouping vars out <- distinct(.data, !!!syms(group_vars)) } } else { out <- step_subset_j( .data, vars = union(group_vars, names(dots)), j = call2(".", !!!dots), by = by ) } replaced_group_vars <- intersect(group_vars, names(dots)) if (!is_empty(replaced_group_vars)) { out <- step_subset( out, groups = character(), j = expr(!!replaced_group_vars := NULL) ) } out_groups <- summarise_groups(.data, .groups, caller_env()) step_group(out, groups = out_groups) } # For each expression, check if it uses any newly created variables check_summarise_vars <- function(dots) { for (i in seq_along(dots)) { used_vars <- all_names(get_expr(dots[[i]])) cur_vars <- names(dots)[seq_len(i - 1)] if (any(used_vars %in% cur_vars)) { abort(paste0( "`", names(dots)[[i]], "` ", "refers to a variable created earlier in this summarise().\n", "Do you need an extra mutate() step?" ), call = caller_env()) } } } summarise_groups <- function(.data, .groups, env_caller) { if (!is.null(.groups) && !.groups %in% c("drop_last", "drop", "keep")) { abort(c( paste0( "`.groups` can't be ", as_label(.groups), if (.groups == "rowwise") " in dtplyr" ), i = 'Possible values are NULL (default), "drop_last", "drop", and "keep"' ), call = caller_env()) } group_vars <- .data$groups n <- length(group_vars) verbose <- summarise_verbose(.groups, env_caller) if (verbose && n > 1) { new_groups <- glue::glue_collapse(paste0("'", group_vars[-n], "'"), sep = ", ") summarise_inform("has grouped output by {new_groups}") } .groups <- .groups %||% "drop_last" switch(.groups, drop_last = group_vars[-n], keep = group_vars, drop = character() ) } summarise_verbose <- function(.groups, .env) { is.null(.groups) && is_reference(topenv(.env), global_env()) && !identical(getOption("dplyr.summarise.inform"), FALSE) } summarise_inform <- function(..., .env = parent.frame()) { inform(paste0( "`summarise()` ", glue::glue(..., .envir = .env), '. You can override using the `.groups` argument.' )) } dtplyr/R/replace_na.R0000644000176200001440000000254214300165007014204 0ustar liggesusers#' Replace NAs with specified values #' #' @description #' This is a method for the tidyr `replace_na()` generic. It is translated to #' [data.table::fcoalesce()]. #' #' Note that unlike `tidyr::replace_na()`, `data.table::fcoalesce()` cannot #' replace `NULL` values in lists. #' #' @inheritParams tidyr::replace_na #' @param data A [lazy_dt()]. 
#' @examples #' library(tidyr) #' #' # Replace NAs in a data frame #' dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) #' dt %>% replace_na(list(x = 0, y = "unknown")) #' #' # Replace NAs using `dplyr::mutate()` #' dt %>% dplyr::mutate(x = replace_na(x, 0)) # exported onLoad replace_na.dtplyr_step <- function(data, replace = list()) { stopifnot(is.list(replace)) if (length(replace) == 0) { return(data) } sim_data <- simulate_vars(data) replace_vars <- intersect(names(replace), names(sim_data)) replace_calls <- vector("list", length(replace_vars)) names(replace_calls) <- replace_vars for (i in seq_along(replace_vars)) { var <- replace_vars[[i]] check_replacement(replace[[i]], var) replace_calls[[i]] <- call2("fcoalesce", sym(var), replace[[i]]) } mutate(data, !!!replace_calls) } check_replacement <- function(x, var) { n <- length(x) if (n == 1) { return() } abort(glue::glue("Replacement for `{var}` is length {n}, not length 1"), call = caller_env()) } dtplyr/R/fill.R0000644000176200001440000000506214300165007013041 0ustar liggesusers#' Fill in missing values with previous or next value #' #' @description #' This is a method for the tidyr `fill()` generic. It is translated to #' [data.table::nafill()]. Note that `data.table::nafill()` currently only #' works for integer and double columns. #' #' @inheritParams tidyr::fill #' @examples #' library(tidyr) #' #' # Value (year) is recorded only when it changes #' sales <- lazy_dt(tibble::tribble( #' ~quarter, ~year, ~sales, #' "Q1", 2000, 66013, #' "Q2", NA, 69182, #' "Q3", NA, 53175, #' "Q4", NA, 21001, #' "Q1", 2001, 46036, #' "Q2", NA, 58842, #' "Q3", NA, 44568, #' "Q4", NA, 50197, #' "Q1", 2002, 39113, #' "Q2", NA, 41668, #' "Q3", NA, 30144, #' "Q4", NA, 52897, #' "Q1", 2004, 32129, #' "Q2", NA, 67686, #' "Q3", NA, 31768, #' "Q4", NA, 49094 #' )) #' #' # `fill()` defaults to replacing missing data from top to bottom #' sales %>% fill(year) #' #' # Value (n_squirrels) is missing above and below within a group #' squirrels <- lazy_dt(tibble::tribble( #' ~group, ~name, ~role, ~n_squirrels, #' 1, "Sam", "Observer", NA, #' 1, "Mara", "Scorekeeper", 8, #' 1, "Jesse", "Observer", NA, #' 1, "Tom", "Observer", NA, #' 2, "Mike", "Observer", NA, #' 2, "Rachael", "Observer", NA, #' 2, "Sydekea", "Scorekeeper", 14, #' 2, "Gabriela", "Observer", NA, #' 3, "Derrick", "Observer", NA, #' 3, "Kara", "Scorekeeper", 9, #' 3, "Emily", "Observer", NA, #' 3, "Danielle", "Observer", NA #' )) #' #' # The values are inconsistently missing by position within the group #' # Use .direction = "downup" to fill missing values in both directions #' squirrels %>% #' dplyr::group_by(group) %>% #' fill(n_squirrels, .direction = "downup") %>% #' dplyr::ungroup() #' #' # Using `.direction = "updown"` accomplishes the same goal in this example # exported onLoad fill.dtplyr_step <- function(data, ..., .direction = c("down", "up", "downup", "updown")) { dots <- enquos(...) 
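  # nafill() only fills in one direction ("locf" = down, "nocb" = up), so the
  # "downup"/"updown" cases below chain two passes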
.direction <- arg_match(.direction) if (.direction %in% c("down", "up")) { type <- switch(.direction, "down" = "locf", "up" = "nocb") mutate(data, dplyr::across(c(!!!dots), nafill, type)) } else { if (.direction == "downup") { type1 <- "locf" type2 <- "nocb" } else { type1 <- "nocb" type2 <- "locf" } mutate(data, dplyr::across(c(!!!dots), ~ nafill(nafill(.x, type1), type2))) } } dtplyr/R/step-modify.R0000644000176200001440000000402514406335651014364 0ustar liggesusersstep_modify <- function(parent, fun, args) { new_step( parent, groups = parent$groups, arrange = parent$arrange, implicit_copy = TRUE, fun = fun, args = args, class = "dtplyr_step_modify" ) } #' @export dt_call.dtplyr_step_modify <- function(x, needs_copy = x$needs_copy) { j <- call2(x$fun, quote(.SD), quote(.BY), !!!x$args) out <- call2("[", dt_call(x$parent, needs_copy), , j) add_grouping_param(out, x, arrange = FALSE) } # dplyr methods ----------------------------------------------------------- #' Apply a function to each group #' #' These are methods for the dplyr [group_map()] and [group_modify()] generics. #' They are both translated to `[.data.table`. #' #' @param .data A [lazy_dt()] #' @param .f The name of a two argument function. The first argument is passed #' `.SD`,the data.table representing the current group; the second argument #' is passed `.BY`, a list giving the current values of the grouping #' variables. The function should return a list or data.table. #' @param ... Additional arguments passed to `.f` #' @param keep Not supported for [lazy_dt]. #' @returns `group_map()` applies `.f` to each group, returning a list. #' `group_modify()` replaces each group with the results of `.f`, returning a #' modified [lazy_dt()]. #' @importFrom dplyr group_modify #' @export #' @examples #' library(dplyr) #' #' dt <- lazy_dt(mtcars) #' #' dt %>% #' group_by(cyl) %>% #' group_modify(head, n = 2L) #' #' dt %>% #' group_by(cyl) %>% #' group_map(head, n = 2L) group_modify.dtplyr_step <- function(.data, .f, ..., keep = FALSE) { if (!missing(keep)) { abort("`keep` is not supported for lazy data tables") } .f <- ensym(.f) args <- enquos(...) step_modify(.data, fun = .f, args = args) } #' @importFrom dplyr group_map #' @rdname group_modify.dtplyr_step #' @export group_map.dtplyr_step <- function(.data, .f, ..., keep = FALSE) { .f <- as_function(.f, caller_env()) dt <- as.data.table(.data) dt[, list(list(.f(.SD, .BY, ...))), by = eval(.data$groups)]$V1 } dtplyr/R/tidyeval-across.R0000644000176200001440000001163214372711230015230 0ustar liggesuserscapture_across <- function(data, x, j = TRUE) { x <- enquo(x) dt_squash_across(get_expr(x), get_env(x), data, j) } dt_squash_across <- function(call, env, data, j = j, is_top = TRUE) { call <- match.call(dplyr::across, call, expand.dots = FALSE, envir = env) out <- across_setup(data, call, env, allow_rename = TRUE, j = j, fn = "across()") if (is_false(is_top)) { out <- call2("data.table", !!!out) } out } capture_if_all <- function(data, x, j = TRUE) { x <- enquo(x) dt_squash_if(get_expr(x), get_env(x), data, j) } dt_squash_if <- function(call, env, data, j = j, reduce = "&") { call <- match.call(dplyr::if_any, call, expand.dots = FALSE, envir = env) if (reduce == "&") { fn <- "if_all()" } else { fn <- "if_any()" } out <- across_setup(data, call, env, allow_rename = FALSE, j = j, fn = fn) Reduce(function(x, y) call2(reduce, x, y), out) } across_funs <- function(funs, env, data, j, dots, names_spec, fn) { if (is.null(funs)) { fns <- list(`1` = function(x, ...) 
x) names_spec <- names_spec %||% "{.col}" return(list(fns = fns, names = names_spec)) } else if (is_symbol(funs) || is_function(funs)) { fns <- list(`1` = across_fun(funs, env, data, j = j, dots = dots, fn = fn)) names_spec <- names_spec %||% "{.col}" } else if (is_call(funs, "~")) { fns <- list(`1` = across_fun(funs, env, data, j = j, dots = dots, fn = fn)) names_spec <- names_spec %||% "{.col}" } else if (is_call(funs, "list")) { args <- call_args(funs) fns <- lapply(args, across_fun, env, data, j = j, dots = dots, fn = fn) names_spec <- names_spec %||% "{.col}_{.fn}" } else if (!is.null(env)) { # Try evaluating once, just in case funs <- eval(funs, env) return(across_funs(funs, NULL, j = j, dots = dots, names_spec = NULL, fn = fn)) } else { abort("`.fns` argument to dtplyr::across() must be a NULL, a function, formula, or list") } list(fns = fns, names = names_spec) } across_fun <- function(fun, env, data, j, dots, fn) { if (is_symbol(fun) || is_string(fun) || is_call(fun, "function") || is_function(fun)) { function(x) call2(fun, x, !!!dots) } else if (is_call(fun, "~")) { if (!is_empty(dots)) { msg <- c( paste0("`dtplyr::", fn, "` does not support `...` when a purrr-style lambda is used in `.fns`."), i = "Use a lambda instead.", i = "Or inline them via a purrr-style lambda." ) abort(msg) } call <- dt_squash_formula(fun, env, data, j = j, replace = quote(!!.x)) function(x) inject(expr(!!call), child_env(empty_env(), .x = x, expr = rlang::expr)) } else { abort(c( ".fns argument to dtplyr::across() must contain a function or a formula", x = paste0("Problem with ", expr_deparse(fun)) )) } } dt_squash_formula <- function(x, env, data, j = TRUE, replace = quote(!!.x)) { call <- f_rhs(x) call <- replace_dot(call, replace) if (is_call(call)) { call <- dt_squash_call(call, env, data, j = j) } call } across_setup <- function(data, call, env, allow_rename, j, fn) { .cols <- call$.cols %||% expr(everything()) .cols <- as_quosure(.cols, env) tbl <- simulate_vars(data, drop_groups = TRUE) locs <- tidyselect::eval_select( .cols, lazy_dt(tbl), error_call = call2(fn), allow_rename = allow_rename ) vars <- syms(names(tbl))[locs] if (allow_rename) { names_vars <- names(locs) } else { names_vars <- names(tbl)[locs] } dots <- lapply(call$..., dt_squash, env = env, data = data, j = j) names_spec <- eval(call$.names, env) funs_across_data <- across_funs( funs = call$.fns, env = env, data = data, j = j, dots = dots, names_spec = names_spec, fn = fn ) fns_is_null <- funs_across_data$fns_is_null fns <- funs_across_data$fns names_spec <- funs_across_data$names # make sure fns has names, use number to replace unnamed names_fns <- names2(fns) empties <- which(names_fns == "") if (length(empties)) { names_fns[empties] <- empties } glue_mask <- across_glue_mask(env, .col = rep(names_vars, each = length(fns)), .fn = rep(names_fns , length(vars)) ) names_out <- vctrs::vec_as_names(glue(names_spec, .envir = glue_mask), repair = "check_unique") across_apply_fns(vars, fns, names_out) } across_apply_fns <- function(vars, fns, names) { out <- vector("list", length(vars) * length(fns)) out <- set_names(out, names) k <- 1 for (i in seq_along(vars)) { for (j in seq_along(fns)) { out[[k]] <- exec(fns[[j]], vars[[i]]) k <- k + 1 } } out } across_glue_mask <- function(.col, .fn, .caller_env) { glue_mask <- env(.caller_env, .col = .col, .fn = .fn) # TODO: we can make these bindings louder later env_bind_active( glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn ) glue_mask } 
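# Illustrative sketch (comments only, not run): for a lazy_dt `dt` with
# columns x and y, capture_across(dt, across(c(x, y), mean)) is squashed into
# a named list roughly equivalent to list(x = mean(x), y = mean(y)), which the
# calling verb then splices into the `j` expression it builds.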
dtplyr/R/step-subset.R0000644000176200001440000000675214372711230014404 0ustar liggesusersstep_subset <- function(parent, vars = parent$vars, groups = parent$groups, locals = parent$locals, arrange = parent$arrange, i = NULL, j = NULL, on = character(), allow_cartesian = NULL, needs_copy = FALSE ) { stopifnot(is_step(parent)) stopifnot(is_expression(i) || is_call(i) || is_step(i)) stopifnot(is_expression(j) || is_call(j)) stopifnot(is.character(on)) new_step( parent = parent, vars = vars, groups = groups, locals = locals, arrange = arrange, i = i, j = j, on = on, allow_cartesian = allow_cartesian, implicit_copy = !is.null(i) || !is.null(j), needs_copy = needs_copy || parent$needs_copy, class = "dtplyr_step_subset" ) } # Grouped i needs an intermediate assignment for maximum efficiency step_subset_i <- function(parent, i, by = new_by()) { if (is_empty(i)) { return(parent) } if (by$uses_by) { parent$groups <- by$names } if (length(parent$groups) > 0) { parent <- compute(parent) nm <- sym(parent$name) i <- expr((!!nm)[, .I[!!i]]) # dt[, .I[]] i <- add_grouping_param(i, parent, FALSE) # dt[, .I[], by = ()] i <- call("$", i, quote(V1)) # dt[, .I[], by = ()]$V1 } if (by$uses_by) { parent <- ungroup(parent) } step_subset(parent, i = i) } # When adding a subset that contains only j, it may be possible to merge # the previous step. step_subset_j <- function(parent, vars = parent$vars, groups = parent$groups, arrange = parent$arrange, j = NULL, by = new_by()) { if (can_merge_subset(parent)) { i <- parent$i on <- parent$on parent <- parent$parent } else { i <- NULL on <- character() } if (by$uses_by) { parent$groups <- by$names } out <- step_subset( parent, vars = vars, groups = groups, arrange = arrange, i = i, j = j, on = on ) if (by$uses_by) { out <- ungroup(out) } out } can_merge_subset <- function(x) { # Can only merge subsets if (!inherits(x, "dtplyr_step_subset")) { return(FALSE) } # Don't need to check that groups are identical because the only # dplyr functions that generate expression in i are # filter/slice/sample/arrange/join and don't affect groups is.null(x$j) } #' @export dt_sources.dtplyr_step_subset <- function(x) { # TODO: need to throw error if same name refers to different tables. if (is_step(x$i)) { utils::modifyList(dt_sources(x$parent), dt_sources(x$i)) } else { dt_sources(x$parent) } } #' @export dt_call.dtplyr_step_subset <- function(x, needs_copy = x$needs_copy) { if (is.null(x$i) && is.null(x$j)) { return(dt_call(x$parent)) } i <- if (is_step(x$i)) dt_call(x$i) else x$i parent <- dt_call(x$parent, needs_copy) if (is.null(i) && is.null(x$j)) { out <- parent } else if (is.null(i) && !is.null(x$j)) { out <- call2("[", parent, , x$j) } else if (!is.null(i) && is.null(x$j)) { out <- call2("[", parent, i) } else { out <- call2("[", parent, i, x$j) } if (!is.null(x$j)) { out <- add_grouping_param(out, x) } if (length(x$on) > 0) { out$on <- call2(".", !!!syms(x$on)) out$allow.cartesian <- x$allow_cartesian } out } dtplyr/R/step-subset-transmute.R0000644000176200001440000000310014300165007016400 0ustar liggesusers#' Create new columns, dropping old #' #' This is a method for the dplyr [transmute()] generic. It is translated to #' the `j` argument of `[.data.table`. #' #' @param .data A [lazy_dt()]. #' @inheritParams mutate.dtplyr_step #' @importFrom dplyr transmute #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' dt <- lazy_dt(dplyr::starwars) #' dt %>% transmute(name, sh = paste0(species, "/", homeworld)) transmute.dtplyr_step <- function(.data, ...) 
{ dots <- capture_new_vars(.data, ...) dots_list <- process_new_vars(.data, dots) dots <- dots_list$dots groups <- group_vars(.data) if (!is_empty(groups)) { # TODO could check if there is actually anything mutated, e.g. to avoid # DT[, .(x = x)] is_group_var <- names(dots) %in% groups group_dots <- dots[is_group_var] .data <- mutate(ungroup(.data), !!!group_dots) .data <- group_by(.data, !!!syms(groups)) dots <- dots[!is_group_var] } if (is_empty(dots)) { # grouping variables have been removed from `dots` so `select()` would # produce a message "Adding grouping vars". # As `dplyr::transmute()` doesn't generate a message when adding group vars # we can also leave it away here return(select(.data, !!!group_vars(.data))) } if (!dots_list$use_braces) { j <- call2(".", !!!dots) } else { j <- mutate_with_braces(dots)$expr } vars <- union(group_vars(.data), names(dots)) out <- step_subset_j(.data, vars = vars, j = j) if (dots_list$need_removal_step) { out <- select(out, -tidyselect::all_of(dots_list$vars_removed)) } out } dtplyr/R/step-group.R0000644000176200001440000000626214406335651014236 0ustar liggesusersstep_group <- function(parent, groups = parent$groups, arrange = parent$arrange) { if (can_step_group_return_early(parent, groups, arrange)) { return(parent) } new_step( parent, vars = parent$vars, groups = groups, class = "dtplyr_step_group", arrange = arrange, name = parent$name ) } #' @export dt_has_computation.dtplyr_step_group <- function(x) { dt_has_computation(x$parent) } add_grouping_param <- function(call, step, arrange = step$arrange) { if (length(step$groups) == 0) { return(call) } arrange <- arrange %||% TRUE using <- if (isTRUE(arrange)) "keyby" else "by" call[[using]] <- call2(".", !!!syms(step$groups)) call } # dplyr methods ----------------------------------------------------------- #' Group and ungroup #' #' These are methods for dplyr's [group_by()] and [ungroup()] generics. #' Grouping is translated to the either `keyby` and `by` argument of #' `[.data.table` depending on the value of the `arrange` argument. #' #' @inheritParams dplyr::group_by #' @param .data A [lazy_dt()] #' @param arrange If `TRUE`, will automatically arrange the output of #' subsequent grouped operations by group. If `FALSE`, output order will be #' left unchanged. In the generated data.table code this switches between #' using the `keyby` (`TRUE`) and `by` (`FALSE`) arguments. #' @param .add,add When `FALSE`, the default, `group_by()` will #' override existing groups. To add to the existing groups, use #' `.add = TRUE`. #' #' This argument was previously called `add`, but that prevented #' creating a new grouping variable called `add`, and conflicts with #' our naming conventions. 
#' @importFrom dplyr group_by #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' dt <- lazy_dt(mtcars) #' #' # group_by() is usually translated to `keyby` so that the groups #' # are ordered in the output #' dt %>% #' group_by(cyl) %>% #' summarise(mpg = mean(mpg)) #' #' # use `arrange = FALSE` to instead use `by` so the original order #' # or groups is preserved #' dt %>% #' group_by(cyl, arrange = FALSE) %>% #' summarise(mpg = mean(mpg)) group_by.dtplyr_step <- function(.data, ..., .add = FALSE, arrange = TRUE) { dots <- capture_dots(.data, ..., .j = TRUE) dots <- dots[!map_lgl(dots, is.null)] # need `eval(expr(...))` to trigger warning for `add` groups <- eval(expr(dplyr::group_by_prepare(.data, !!!dots, .add = .add))) arranged <- if (!is.null(.data$arrange)) .data$arrange && arrange else arrange step_group(groups$data, as.character(groups$groups), arranged) } can_step_group_return_early <- function(parent, groups, arrange) { if (is_empty(groups)) { return(is_empty(parent$groups)) } same_arrange <- (is_false(arrange) || identical(arrange, parent$arrange)) same_groups <- identical(groups, parent$groups) same_arrange && same_groups } #' @importFrom dplyr ungroup #' @export #' @rdname group_by.dtplyr_step ungroup.dtplyr_step <- function(x, ...) { if (missing(...)) { step_group(x, groups = character()) } else { old_groups <- group_vars(x) to_remove <- tidyselect::vars_select(x$vars, ...) new_groups <- setdiff(old_groups, to_remove) step_group(x, groups = new_groups) } } dtplyr/R/step-join.R0000644000176200001440000002151114300165007014020 0ustar liggesusersstep_join <- function(x, y, on, style, copy, suffix = c(".x", ".y")) { stopifnot(is_step(x)) y <- dtplyr_auto_copy(x, y, copy = copy) stopifnot(is_step(y)) stopifnot(is.null(on) || is.character(on)) style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti")) if (is_character(on, 0)) { return(cross_join(x, y)) } on <- dplyr::common_by(on, x, y) vars_out_dt <- dt_join_vars(x$vars, y$vars, on$x, on$y, suffix = suffix, style = style) colorder <- dt_join_colorder(x$vars, y$vars, on$x, on$y, style) # TODO suppress warning in merge # "column names ... are duplicated in the result out <- new_step( parent = if (style == "left") y else x, implicit_copy = TRUE, parent2 = if (style == "left") x else y, vars = vars_out_dt, on = if (style %in% c("left", "full")) on else list(x = on$y, y = on$x), style = style, locals = utils::modifyList(x$locals, y$locals), suffix = suffix, class = "dtplyr_step_join" ) if (style %in% c("anti", "semi")) { return(out) } out <- step_colorder(out, colorder) x_sim <- simulate_vars(x) y_sim <- simulate_vars(y) vars <- dplyr_join_vars(x_sim, y_sim, on$x, on$y, suffix = suffix) if (any(duplicated(vars_out_dt))) { step_setnames(out, colorder, vars, in_place = FALSE) } else { step_setnames(out, vars_out_dt[colorder], vars, in_place = FALSE) } } cross_join <- function(x, y) { xy <- left_join( mutate(x, .cross_join_col = 1), mutate(y, .cross_join_col = 1), by = ".cross_join_col" ) # use custom select to produce way shorter query step_subset_j( xy, vars = setdiff(xy$vars, ".cross_join_col"), j = expr(!".cross_join_col") ) } #' @export dt_sources.dtplyr_step_join <- function(x) { # TODO: need to throw error if same name refers to different tables. 
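  # Merge the named list of source tables from both parents so that dt_eval()
  # can bind every source referenced by the generated join call.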
utils::modifyList(dt_sources(x$parent), dt_sources(x$parent2)) } #' @export dt_call.dtplyr_step_join <- function(x, needs_copy = x$needs_copy) { lhs <- dt_call(x$parent, needs_copy) rhs <- dt_call(x$parent2) on2 <- simplify_names(stats::setNames(x$on$x, x$on$y)) on <- call2(".", !!!syms(on2)) join_call <- switch(x$style, full = call2("merge", lhs, rhs, all = TRUE, by.x = x$on$x, by.y = x$on$y, allow.cartesian = TRUE), left = call2("[", lhs, rhs, on = on, allow.cartesian = TRUE), inner = call2("[", lhs, rhs, on = on, nomatch = NULL, allow.cartesian = TRUE), right = call2("[", lhs, rhs, on = on, allow.cartesian = TRUE), anti = call2("[", lhs, call2("!", rhs), on = on), semi = call2("[", lhs, call2("unique", call2("[", lhs, rhs, which = TRUE, nomatch = NULL, on = on))) ) if (x$style == "full") { default_suffix <- c(".x", ".y") if (!identical(x$suffix, default_suffix)) { join_call <- call_modify(join_call, suffixes = x$suffix) } } join_call } # dplyr verbs ------------------------------------------------------------- #' Join data tables #' #' These are methods for the dplyr generics [left_join()], [right_join()], #' [inner_join()], [full_join()], [anti_join()], and [semi_join()]. Left, right, #' inner, and anti join are translated to the `[.data.table` equivalent, #' full joins to [data.table::merge.data.table()]. #' Left, right, and full joins are in some cases followed by calls to #' [data.table::setcolorder()] and [data.table::setnames()] to ensure that column #' order and names match dplyr conventions. #' Semi-joins don't have a direct data.table equivalent. #' #' @param x,y A pair of [lazy_dt()]s. #' @inheritParams dplyr::left_join #' @importFrom dplyr left_join #' @export #' @examples #' library(dplyr, warn.conflicts = FALSE) #' #' band_dt <- lazy_dt(dplyr::band_members) #' instrument_dt <- lazy_dt(dplyr::band_instruments) #' #' band_dt %>% left_join(instrument_dt) #' band_dt %>% right_join(instrument_dt) #' band_dt %>% inner_join(instrument_dt) #' band_dt %>% full_join(instrument_dt) #' #' band_dt %>% semi_join(instrument_dt) #' band_dt %>% anti_join(instrument_dt) left_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) { step_join(x, y, by, style = "left", copy = copy, suffix = suffix) } #' @importFrom dplyr right_join #' @export right_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) { step_join(x, y, by, style = "right", copy = copy, suffix = suffix) } #' @importFrom dplyr inner_join #' @export inner_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) { step_join(x, y, on = by, style = "inner", copy = copy, suffix = suffix) } #' @importFrom dplyr full_join #' @export full_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) { step_join(x, y, on = by, style = "full", copy = copy, suffix = suffix) } #' @importFrom dplyr anti_join #' @export anti_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE) { step_join(x, y, on = by, style = "anti", copy = copy) } #' @importFrom dplyr semi_join #' @export semi_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE) { step_join(x, y, on = by, style = "semi", copy = copy) } # helpers ----------------------------------------------------------------- dtplyr_auto_copy <- function(x, y, copy = copy) { if (is_step(y)) { y } else if (is.data.frame(y)) { # includes data tables lazy_dt(y) } else { dplyr::auto_copy(x, y, copy = copy) } } add_suffixes <- function (x, y, suffix) { x[x %in% 
y] <- paste0(x[x %in% y], suffix) x } dplyr_join_vars <- function(x, y, on_x, on_y, suffix) { colnames(left_join(x, y, by = stats::setNames(on_y, on_x), suffix = suffix)) } dt_join_vars <- function(x, y, on_x, on_y, suffix, style) { style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti")) if (style == "left") { # need to swap `x` and `y` as the data.table left join is `y[x, on]` subset_join_vars(y, x, on_y = on_x) } else if (style %in% c("right", "inner")) { subset_join_vars(x, y, on_y) } else if (style == "full") { merge_vars(x, y, on_x, on_y, suffix) } else { x } } # column names as generated in `x[y, on = on]` subset_join_vars <- function(x, y, on_y) { # `y` variables used for joining are not included again y_out <- setdiff(y, on_y) # remaining `y` columns that are also in `x` get _prefixed_ by "i." y_out[y_out %in% x] <- paste0("i.", y_out[y_out %in% x]) out_names <- c(x, y_out) add_dt_suffix(out_names) } add_dt_suffix <- function(x) { for (i in seq_along(x)) { j <- 1 nm <- x[[i]] first_occurrence <- !nm %in% x[seq(0, i - 1)] if (!first_occurrence) { while (nm %in% x[-i]) { nm <- paste0(x[[i]], ".", j) j <- j + 1 } } x[[i]] <- nm } x } # column names as generated by `merge(x, y, by.x = on_x, by.y = on_y, suffixes = suffix)` merge_vars <- function(x, y, on_x, on_y, suffix = c(".x", ".y")) { x <- setdiff(x, on_x) y <- setdiff(y, on_y) x_out <- add_suffixes(x, y, suffix[[1]]) y_out <- add_suffixes(y, x, suffix[[2]]) c(on_x, x_out, y_out) } dt_join_colorder <- function(x, y, on_x, on_y, style) { style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti")) if (style == "left") { subset_left_join_colorder(x, y, on_x, on_y) } else if (style == "full") { merge_join_colorder(x, y, on_x, on_y) } else { seq(length(x) + length(y) - length(on_x)) } } #' column order of data.table left join `y[x]` compared to `left_join(y, x)` #' @noRd subset_left_join_colorder <- function(x, y, on_x, on_y) { # variable order # y[x, on]: y-vars, x-vars - on_x # left_join(x, y, on): x-vars, y-vars - on_y x_loc <- rep_along(x, NA_integer_) # locations of x-vars not used in `on_x` used_in_on_x <- x %in% on_x x_loc[!used_in_on_x] <- seq_along(x[!used_in_on_x]) + length(y) # locations of x-vars used in `on_x` # They have a matching column in `y`. Map `x-vars` according to `on_x` and `on_y` x <- dplyr::recode(x, !!!set_names(on_y, on_x)) x_loc[used_in_on_x] <- vctrs::vec_match(x[used_in_on_x], y) y_out_dt <- setdiff(y, on_y) y_loc <- vctrs::vec_match(y_out_dt, y) c(x_loc, y_loc) } merge_join_colorder <- function(x, y, on_x, on_y) { # variable order # merge(x, y, on_x, on_y): on_x, x-vars - on_x, y-vars - on_y # full_join(x, y, on): x-vars, y-vars - on_y x_out_dt <- setdiff(x, on_x) x_loc <- vctrs::vec_match(x, x_out_dt) + length(on_x) x_loc[is.na(x_loc)] <- vctrs::vec_match(x[is.na(x_loc)], on_x) n_x <- length(x) n_y_out <- length(y) - length(on_x) c(x_loc, n_x + seq2(1, n_y_out)) } #' @importFrom dplyr same_src #' @export same_src.dtplyr_step <- function(x, y) { is_step(y) } #' @importFrom dplyr auto_copy #' @export auto_copy.dtplyr_step <- function(x, y, copy = FALSE, ...) { lazy_dt(as.data.frame(y)) } # Needed to test auto_copy #' @export tbl_vars.foo <- function(x) "x" #' @export as.data.frame.foo <- function(x, ...) 
data.frame(x = 1:10) dtplyr/R/compat-purrr.R0000644000176200001440000001154514300165007014551 0ustar liggesusers# nocov start - compat-purrr.R # Latest version: https://github.com/r-lib/rlang/blob/master/R/compat-purrr.R # This file provides a minimal shim to provide a purrr-like API on top of # base R functions. They are not drop-in replacements but allow a similar style # of programming. # # Changelog: # 2020-04-14: # * Removed `pluck*()` functions # * Removed `*_cpl()` functions # * Used `as_function()` to allow use of `~` # * Used `.` prefix for helpers # # 2021-05-21: # * Fixed "object `x` not found" error in `imap()` (@mgirlich) # # 2021-12-15: # * `transpose()` now supports empty lists. map <- function(.x, .f, ...) { .f <- as_function(.f, env = global_env()) lapply(.x, .f, ...) } walk <- function(.x, .f, ...) { map(.x, .f, ...) invisible(.x) } map_lgl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, logical(1), ...) } map_int <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, integer(1), ...) } map_dbl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, double(1), ...) } map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } .rlang_purrr_map_mold <- function(.x, .f, .mold, ...) { .f <- as_function(.f, env = global_env()) out <- vapply(.x, .f, .mold, ..., USE.NAMES = FALSE) names(out) <- names(.x) out } map2 <- function(.x, .y, .f, ...) { .f <- as_function(.f, env = global_env()) out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) if (length(out) == length(.x)) { set_names(out, names(.x)) } else { set_names(out, NULL) } } map2_lgl <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "logical") } map2_int <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "integer") } map2_dbl <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "double") } map2_chr <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "character") } imap <- function(.x, .f, ...) { map2(.x, names(.x) %||% seq_along(.x), .f, ...) } pmap <- function(.l, .f, ...) { .f <- as.function(.f) args <- .rlang_purrr_args_recycle(.l) do.call("mapply", c( FUN = list(quote(.f)), args, MoreArgs = quote(list(...)), SIMPLIFY = FALSE, USE.NAMES = FALSE )) } .rlang_purrr_args_recycle <- function(args) { lengths <- map_int(args, length) n <- max(lengths) stopifnot(all(lengths == 1L | lengths == n)) to_recycle <- lengths == 1L args[to_recycle] <- map(args[to_recycle], function(x) rep.int(x, n)) args } keep <- function(.x, .f, ...) { .x[.rlang_purrr_probe(.x, .f, ...)] } discard <- function(.x, .p, ...) { sel <- .rlang_purrr_probe(.x, .p, ...) .x[is.na(sel) | !sel] } map_if <- function(.x, .p, .f, ...) { matches <- .rlang_purrr_probe(.x, .p) .x[matches] <- map(.x[matches], .f, ...) .x } .rlang_purrr_probe <- function(.x, .p, ...) { if (is_logical(.p)) { stopifnot(length(.p) == length(.x)) .p } else { .p <- as_function(.p, env = global_env()) map_lgl(.x, .p, ...) } } compact <- function(.x) { Filter(length, .x) } transpose <- function(.l) { if (!length(.l)) { return(.l) } inner_names <- names(.l[[1]]) if (is.null(inner_names)) { fields <- seq_along(.l[[1]]) } else { fields <- set_names(inner_names) } map(fields, function(i) { map(.l, .subset2, i) }) } every <- function(.x, .p, ...) { .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (!rlang::is_true(.p(.x[[i]], ...))) return(FALSE) } TRUE } some <- function(.x, .p, ...) 
{ .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (rlang::is_true(.p(.x[[i]], ...))) return(TRUE) } FALSE } negate <- function(.p) { .p <- as_function(.p, env = global_env()) function(...) !.p(...) } reduce <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) Reduce(f, .x, init = .init) } reduce_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE) } accumulate <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) Reduce(f, .x, init = .init, accumulate = TRUE) } accumulate_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE, accumulate = TRUE) } detect <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(.x[[i]]) } } NULL } detect_index <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(i) } } 0L } .rlang_purrr_index <- function(x, right = FALSE) { idx <- seq_along(x) if (right) { idx <- rev(idx) } idx } # nocov end dtplyr/R/tidyeval.R0000644000176200001440000002277514375142710013756 0ustar liggesusersdt_eval <- function(x) { env <- as_environment(dt_sources(x), x$env) add_dt_wrappers(env) for (var in names(x$locals)) { env[[var]] <- eval(x$locals[[var]], env) } quo <- new_quosure(dt_call(x), env) eval_tidy(quo) } # Make sure data.table functions are available so dtplyr still works # even when data.table isn't attached dt_funs <- c( "between", "CJ", "copy", "data.table", "dcast", "melt", "nafill", "fcase", "fcoalesce", "fifelse", "fintersect", "frank", "frankv", "fsetdiff", "funion", "setcolorder", "setnames", "setorder", "shift", "tstrsplit", "uniqueN" ) dt_symbols <- c(".SD", ".BY", ".N", ".I", ".GRP", ".NGRP") add_dt_wrappers <- function(env) { env_bind(env, !!!env_get_list(ns_env("data.table"), dt_funs)) } globalVariables(dt_funs) # These functions attempt to simulate tidy eval as much as possible within # data.table. The goal is to get the majority of real-world code to work, # without aiming for 100% compliance. 
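# A few of the rewrites performed below, shown as comments only:
#   n()              becomes  .N
#   lag(x)           becomes  shift(x, type = "lag")
#   coalesce(x, y)   becomes  fcoalesce(x, y)
#   case_when(a ~ b) becomes  fcase(a, b)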
capture_dots <- function(.data, ..., .j = TRUE, .by = new_by()) { if (.by$uses_by) { .data$groups <- .by$names } dots <- enquos(..., .named = .j) dots <- map(dots, dt_squash, data = .data, j = .j, is_top = TRUE) # Remove names from any list elements is_list <- map_lgl(dots, is.list) names(dots)[is_list] <- "" # Auto-splice list results from dt_squash() dots[!is_list] <- lapply(dots[!is_list], list) unlist(dots, recursive = FALSE) } capture_new_vars <- function(.data, ..., .by = new_by()) { if (.by$uses_by) { .data$groups <- .by$names } dots <- as.list(enquos(..., .named = TRUE)) for (i in seq_along(dots)) { dot <- dots[[i]] dot <- dt_squash(dot, data = .data, is_top = TRUE) if (is.null(dot)) { dots[i] <- list(NULL) } else { dots[[i]] <- dot } .data$vars <- union(.data$vars, names(dot) %||% names(dots)[i]) } # Remove names from any list elements is_list <- map_lgl(dots, is.list) names(dots)[is_list] <- "" # Auto-splice list results from dt_squash() dots[!is_list] <- lapply(dots[!is_list], list) unlist(dots, recursive = FALSE) } capture_dot <- function(.data, x, j = TRUE) { dt_squash(enquo(x), data = .data, j = j) } # squash quosures dt_squash <- function(x, env, data, j = TRUE, is_top = FALSE) { if (is_atomic(x) || is_null(x)) { x } else if (is_symbol(x)) { if (identical(x, quote(.))) { quote(.SD) } else { var <- as.character(x) if (var %in% c("T", "F")) { as.logical(var) } else if (var %in% dt_symbols) { # data table pronouns are bound to NULL x } else if (!var %in% data$vars && env_has(env, var, inherit = TRUE)) { if (is_global(env)) { # Slightly dangerous because the variable might be modified # between creation and execution, but it seems like a reasonable # tradeoff in order to get a more natural translation. if (j) { # use .. to avoid data mask sym(paste0("..", var)) } else { # i doesn't provide a data mask x } } else { eval(x, env) } } else { x } } } else if (is_quosure(x)) { dt_squash(get_expr(x), get_env(x), data, j = j, is_top) } else if (is_call(x, "if_any")) { dt_squash_if(x, env, data, j = j, reduce = "|") } else if (is_call(x, "if_all")) { dt_squash_if(x, env, data, j = j, reduce = "&") } else if (is_call(x, "across")) { dt_squash_across(x, env, data, j = j, is_top) } else if (is_call(x, "pick")) { x[[1]] <- sym("c") call <- call2("across", x) dt_squash_across(call, env, data, j, is_top) } else if (is_call(x)) { dt_squash_call(x, env, data, j = j) } else { abort("Invalid input") } } dt_squash_call <- function(x, env, data, j = TRUE) { if (is_mask_pronoun(x)) { var <- x[[3]] if (is_call(x, "[[")) { var <- sym(eval(var, env)) } if (is_symbol(x[[2]], ".data")) { var } else if (is_symbol(x[[2]], ".env")) { sym(paste0("..", var)) } } else if (is_call(x, c("coalesce", "replace_na"))) { args <- lapply(x[-1], dt_squash, env, data, j) call2("fcoalesce", !!!args) } else if (is_call(x, "case_when")) { # case_when(x ~ y) -> fcase(x, y) args <- unlist(lapply(x[-1], function(x) { list( # Get as "default" case as close as possible # https://github.com/Rdatatable/data.table/issues/4258 if (isTRUE(x[[2]]) || is_symbol(x[[2]], "T")) quote(rep(TRUE, .N)) else x[[2]], x[[3]] ) })) args <- lapply(args, dt_squash, env = env, data = data, j = j) call2("fcase", !!!args) } else if (is_call(x, "cur_data")) { quote(.SD) } else if (is_call(x, "cur_data_all")) { abort("`cur_data_all()` is not available in dtplyr") } else if (is_call(x, "cur_group")) { quote(.BY) } else if (is_call(x, "cur_group_id")) { quote(.GRP) } else if (is_call(x, "cur_group_rows")) { quote(.I) } else if (is_call(x, "desc")) { 
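    # desc(x) is rewritten as -x, so e.g. arrange(desc(x)) translates to
    # DT[order(-x)]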
check_one_arg(x) x[[1]] <- sym("-") x[[2]] <- dt_squash(x[[2]], env, data, j) x } else if (is_call(x, c("if_else", "ifelse"))) { if (is_call(x, "if_else")) { x <- unname(match.call(dplyr::if_else, x)) } else { x <- unname(match.call(ifelse, x)) } x[[1]] <- quote(fifelse) x[-1] <- lapply(x[-1], dt_squash, env, data, j = j) x } else if (is_call(x, c("lag", "lead"))) { if (is_call(x, "lag")) { type <- "lag" call <- match.call(dplyr::lag, x) } else { type <- "lead" call <- match.call(dplyr::lead, x) } call[-1] <- lapply(call[-1], dt_squash, env = env, data = data, j = j) shift_call <- call2("shift", x[[2]]) if (!is_null(call$n)) { shift_call$n <- call$n } if (!is_null(call$default)) { shift_call$fill <- call$default } if (!is_null(call$order_by)) { abort( glue::glue("The `order_by` argument of `{type}()` is not supported by dtplyr") ) } shift_call$type <- type shift_call } else if (is_call(x, "n", n = 0)) { quote(.N) } else if (is_call(x, "n_distinct")) { x <- match.call(dplyr::n_distinct, x, expand.dots = FALSE) dots <- x$... if (length(dots) == 1) { vec <- dots[[1]] } else { vec <- call2("data.table", !!!dots) } call <- call2("uniqueN", vec) if (!is_null(x$na.rm)) { call$na.rm <- x$na.rm } call } else if (is_call(x, "row_number", n = 0)) { quote(seq_len(.N)) } else if (is_call(x, "row_number", n = 1)) { arg <- dt_squash(x[[2]], env, data, j = j) expr(frank(!!arg, ties.method = "first", na.last = "keep")) } else if (is_call(x, "min_rank")) { check_one_arg(x) arg <- dt_squash(x[[2]], env, data, j = j) expr(frank(!!arg, ties.method = "min", na.last = "keep")) } else if (is_call(x, "dense_rank")) { check_one_arg(x) arg <- dt_squash(x[[2]], env, data, j = j) expr(frank(!!arg, ties.method = "dense", na.last = "keep")) } else if (is_call(x, "percent_rank")) { check_one_arg(x) arg <- dt_squash(x[[2]], env, data, j = j) frank_expr <- expr((frank(!!arg, ties.method = "min", na.last = "keep") - 1)) expr(!!frank_expr / (sum(!is.na(!!arg)) - 1)) } else if (is_call(x, "cume_dist")) { check_one_arg(x) arg <- dt_squash(x[[2]], env, data, j = j) frank_expr <- expr(frank(!!arg, ties.method = "max", na.last = "keep")) expr(!!frank_expr / sum(!is.na(!!arg))) } else if (is.function(x[[1]]) || is_call(x, "function")) { simplify_function_call(x, env, data, j = j) } else if (is_call(x, c("glue", "str_glue")) && j) { call <- call_match(x, glue::glue) if (is.null(call$.envir)) { call$.envir <- quote(.SD) } call } else { x[-1] <- lapply(x[-1], dt_squash, env, data, j = j) x } } is_mask_pronoun <- function(x) { is_call(x, c("$", "[["), n = 2) && is_symbol(x[[2]], c(".data", ".env")) } is_global <- function(env) { if (identical(env, globalenv())) { return(TRUE) } # Heuristic for inside pipe if (identical(env_names(env), ".") && identical(env_parent(env), globalenv())) { return(TRUE) } FALSE } simplify_function_call <- function(x, env, data, j = TRUE) { if (inherits(x[[1]], "inline_colwise_function")) { dot_var <- data$vars[[attr(x, "position")]] out <- replace_dot(attr(x[[1]], "formula")[[2]], sym(dot_var)) dt_squash(out, env, data, j = j) } else { name <- fun_name(x[[1]]) if (is_call(x, "function")) { x[[3]] <- dt_squash(x[[3]], env, data, j) return(x) } else if (is.null(name)) { return(x) } attr(x, "position") <- NULL x[[1]] <- name dt_squash(x, env, data, j = j) } } replace_dot <- function(call, sym) { if (is_symbol(call, ".") || is_symbol(call, ".x")) { sym } else if (is_call(call)) { call[] <- lapply(call, replace_dot, sym) call } else { call } } has_gforce <- c( "min", "max", "mean", "median", "var", "sd", "sum", 
"prod", "first", "last", "head", "tail" ) dplyr_trans <- c("n", "row_number") fun_name <- function(fun) { pkg_env <- baseenv() for (x in has_gforce) { if (!env_has(pkg_env, x, inherit = TRUE)) next fun_x <- env_get(pkg_env, x, inherit = TRUE) if (identical(fun, fun_x)) return(sym(x)) } dplyr_env <- pkg_env("dplyr") for (x in dplyr_trans) { fun_x <- env_get(dplyr_env, x, inherit = TRUE) if (identical(fun, fun_x)) return(sym(x)) } NULL } check_one_arg <- function(x) { fun <- as_name(x[[1]]) if (!has_length(x, 2L)) { abort(glue("`{fun}()` expects exactly one argument.")) } } dtplyr/NEWS.md0000644000176200001440000002715714406337674012720 0ustar liggesusers# dtplyr 1.3.1 * Fix for failing R CMD check. * `dtplyr` no longer directly depends on `crayon`. # dtplyr 1.3.0 ## Breaking changes * dplyr and tidyr verbs no longer dispatch to dtplyr translations when used directly on data.table objects. `lazy_dt()` must now explicitly be called by the user (#312). ## New features * `across()` output can now be used as a data frame (#341). * `.by`/`by` has been implemented for `mutate()`, `summarise()`, `filter()`, and the `slice()` family (#399). * New translations for `add_count()`, `pick()` (#341), and `unite()`. * `min_rank()`, `dense_rank()`, `percent_rank()`, & `cume_dist()` are now mapped to their `data.table` equivalents (#396). ## Performance improvements * `arrange()` now utilizes `setorder()` when possible for improved performance (#364). * `select()` now drops columns by reference when possible for improved performance (#367). * `slice()` uses an intermediate variable to reduce computation time of row selection (#377). ## Minor improvements and bug fixes * dtplyr no longer directly depends on `ellipsis`. * Chained operations properly prevent modify-by-reference (#210). * `across()`, `if_any()`, and `if_all()` evaluate the `.cols` argument in the environment from which the function was called. * `count()` properly handles grouping variables (#356). * `desc()` now supports use of `.data` pronoun inside in `arrange()` (#346). * `full_join()` now produces output with correctly named columns when a non-default value for `suffix` is supplied. Previously the `suffix` argument was ignored (#382). * `if_any()` and `if_all()` now work without specifying the `.fns` argument (@mgirlich, #325) and for a list of functions specified in the (@mgirlich, #335). * `pivot_wider()`'s `names_glue` now works even when `names_from` contains `NA`s (#394). * In `semi_join()` the `y` table is again coerced to a lazy table if `copy = TRUE` (@mgirlich, #322). * `mutate()` can now use `.keep`. * `mutate()`/`summarize()` correctly translates anonymous functions (#362). * `mutate()`/`transmute()` now supports `glue::glue()` and `stringr::str_glue()` without specifying `.envir`. * `where()` now clearly errors because dtplyr doesn't support selection by predicate (#271). # dtplyr 1.2.2 * Hot patch release to resolve R CMD check failures. # dtplyr 1.2.1 * Fix for upcoming rlang release. # dtplyr 1.2.0 ## New authors @markfairbanks, @mgirlich, and @eutwt are now dtplyr authors in recognition of their significant and sustained contributions. Along with @eutwt, they supplied the bulk of the improvements in this release! 
## New features * dtplyr gains translations for many more tidyr verbs: * `drop_na()` (@markfairbanks, #194) * `complete()` (@markfairbanks, #225) * `expand()` (@markfairbanks, #225) * `fill()` (@markfairbanks, #197) * `pivot_longer()` (@markfairbanks, #204) * `replace_na()` (@markfairbanks, #202) * `nest()` (@mgirlich, #251) * `separate()` (@markfairbanks, #269) * `tally()` gains a translation (@mgirlich, #201). * `ifelse()` is mapped to `fifelse()` (@markfairbanks, #220). ## Minor improvements and bug fixes * `slice()` helpers (`slice_head()`, `slice_tail()`, `slice_min()`, `slice_max()` and `slice_sample()`) now accept negative values for `n` and `prop`. * `across()` defaults to `everything()` when `.cols` isn't provided (@markfairbanks, #231), and handles named selections (@eutwt #293). It now handles `.fns` arguments in more forms (@eutwt #288): * Anonymous functions, such as `function(x) x + 1` * Formulas which don't require a function call, such as `~ 1` * `arrange(dt, desc(col))` is translated to `dt[order(-col)]` in order to take advantage of data.table's fast order (@markfairbanks, #227). * `count()` applied to data.tables no longer breaks when dtplyr is loaded (@mgirlich, #201). * `case_when()` supports use of `T` to specify the default (#272). * `filter()` errors for named input, e.g. `filter(dt, x = 1)` (@mgirlich, #267) and works for negated logical columns (@mgirlich, #211). * `group_by()` ungroups when no grouping variables are specified (@mgirlich, #248), and supports inline mutation like `group_by(dt, y = x)` (@mgirlich, #246). * `if_else()` named arguments are translated to the correct arguments in `data.table::fifelse()` (@markfairbanks, #234). `if_else()` supports `.data` and `.env` pronouns (@markfairbanks, #220). * `if_any()` and `if_all()` default to `everything()` when `.cols` isn't provided (@eutwt, #294). * `intersect()`/`union()`/`union_all()`/`setdiff()` convert data.table inputs to `lazy_dt()` (#278). * `lag()`/`lead()` are translated to `shift()`. * `lazy_dt()` keeps groups (@mgirlich, #206). * `left_join()` produces the same column order as dplyr (@markfairbanks, #139). * `left_join()`, `right_join()`, `full_join()`, and `inner_join()` perform a cross join for `by = character()` (@mgirlich, #242). * `left_join()`, `right_join()`, and `inner_join()` are always translated to the `[.data.table` equivalent. For simple merges the translation gets a bit longer but thanks to the simpler code base it helps to better handle names in `by` and duplicated variable names produced in the data.table join (@mgirlich, #222). * `mutate()` and `transmute()` work when called without variables (@mgirlich, #248). * `mutate()` gains new experimental arguments `.before` and `.after` that allow you to control where the new columns are placed (to match dplyr 1.0.0) (@eutwt #291). * `mutate()` can modify grouping columns (instead of creating another column with the same name) (@mgirlich, #246). * `n_distinct()` is translated to `uniqueN()`. * `tally()` and `count()` follow the dplyr convention of creating a unique name if the default output `name` (n) already exists (@eutwt, #295). * `pivot_wider()` names the columns correctly when `names_from` is a numeric column (@mgirlich, #214). * `pull()` supports the `name` argument (@mgirlich, #263). * `slice()` no longer returns excess rows (#10). * `slice_*()` functions after `group_by()` are faster (@mgirlich, #216). * `slice_max()` works when ordering by a character column (@mgirlich, #218).
* `summarise()` supports the `.groups` argument (@mgirlich, #245). * `summarise()`, `tally()`, and `count()` can change the value of grouping variables (@eutwt, #295). * `transmute()` doesn't produce duplicate columns when assigning to the same variable (@mgirlich, #249). It correctly flags grouping variables so they are selected (@mgirlich, #246). * `ungroup()` removes variables in `...` from grouping (@mgirlich, #253). # dtplyr 1.1.0 ## New features * All verbs now have (very basic) documentation pointing back to the dplyr generic, and providing a (very rough) description of the translation accompanied with a few examples. * Passing a data.table to a dplyr generic now converts it to a `lazy_dt()`, making it a little easier to move between data.table and dplyr syntax. * dtplyr has been brought up to compatibility with dplyr 1.0.0. This includes new translations for: * `across()`, `if_any()`, `if_all()` (#154). * `count()` (#159). * `relocate()` (@smingerson, #162). * `rename_with()` (#160) * `slice_min()`, `slice_max()`, `slice_head()`, `slice_tail()`, and `slice_sample()` (#174). And `rename()` and `select()` now support dplyr 1.0.0 tidyselect syntax (apart from predicate functions which can't easily work on lazily evaluated data tables). * We have begun the process of adding translations for tidyr verbs beginning with `pivot_wider()` (@markfairbanks, #189). ## Translation improvements * `compute()` now creates an intermediate assignment within the translation. This will generally have little impact on performance but it allows you to use intermediate variables to simplify complex translations. * `case_when()` is now translated to `fcase()` (#190). * `cur_data()` (`.SD`), `cur_group()` (`.BY`), `cur_group_id()` (`.GRP`), and `cur_group_rows()` (`.I`) are now translated to their data.table equivalents (#166). * `filter()` on grouped data now uses a much faster translation using `.I` rather than `.SD` (and requiring an intermediate assignment) (#176). Thanks to suggestions from @myoung3 and @ColeMiller1. * Translation of individual expressions: * `x[[1]]` is now translated correctly. * Anonymous functions are now preserved (@smingerson, #155) * Environment variables used in the `i` argument of `[.data.table` are now correctly inlined when not in the global environment (#164). * `T` and `F` are correctly translated to `TRUE` and `FALSE` (#140). ## Minor improvements and bug fixes * Grouped filter, mutate, and slice no longer affect ordering of output (#178). * `as_tibble()` gains a `.name_repair` argument (@markfairbanks). * `as.data.table()` always calls `[]` so that the result will print (#146). * `print.lazy_dt()` shows total rows, and grouping, if present. * `group_map()` and `group_walk()` are now translated (#108). # dtplyr 1.0.1 * Better handling for `.data` and `.env` pronouns (#138). * dplyr verbs now work with `NULL` inputs (#129). * joins do a better job at determining output variables in the presence of duplicated outputs (#128). When joining based on different variables in `x` and `y`, joins consistently preserve the column from `x`, not `y` (#137). * `lazy_dt()` objects now have a useful `glimpse()` method (#132). * `group_by()` now has an `arrange` parameter which, if set to `FALSE`, sets the data.table translation to use `by` rather than `keyby` (#85). * `rename()` now works without `data.table` attached, as intended (@michaelchirico, #123). * dtplyr has been re-licensed as MIT (#165). # dtplyr 1.0.0 * Converted from eager approach to lazy approach.
You now must use `lazy_dt()` to begin a translation pipeline, and must use `collect()`, `as.data.table()`, `as.data.frame()`, or `as_tibble()` to finish the translation and actually perform the computation (#38). This represents a complete overhaul of the package replacing the eager evaluation used in the previous releases. This unfortunately breaks all existing code that used dtplyr, but frankly the previous version was extremely inefficient so offered little of data.table's impressive speed, and was used by very few people. * dtplyr provides methods for data.tables that warn you that they use the data frame implementation and you should use `lazy_dt()` (#77) * Joins now pass `...` on to data.table's merge method (#41). * `ungroup()` now copies its input (@christophsax, #54). * `mutate()` preserves grouping (@christophsax, #17). * `if_else()` and `coalesce()` are mapped to data.table's `fifelse()` and `fcoalesce()` respectively (@michaelchirico, #112). # dtplyr 0.0.3 - Maintenance release for CRAN checks. - `inner_join()`, `left_join()`, `right_join()`, and `full_join()`: new `suffix` argument which allows you to control what suffix duplicated variable names receive, as introduced in dplyr 0.5 (#40, @christophsax). - Joins use extended `merge.data.table()` and the `on` argument, introduced in data.table 1.9.6. Avoids copy and allows joins by different keys (#20, #21, @christophsax). # dtplyr 0.0.2 - This is a compatibility release. It makes dtplyr compatible with dplyr 0.6.0 in addition to dplyr 0.5.0. # dtplyr 0.0.1 - `distinct()` gains `.keep_all` argument (#30, #31). - Slightly improve test coverage (#6). - Install `devtools` from GitHub on Travis (#32). - Joins return `data.table`. Right and full join are now implemented (#16, #19). - Remove warnings from tests (#4). - Extracted from `dplyr` at revision e5f2952923028803.
dtplyr/MD50000644000176200001440000001722214406577055012121 0ustar liggesusers67267f1bb26d26cb800c1194e078f4e4 *DESCRIPTION 27a5563b3b2e3884613ea75a3305bb2a *LICENSE 88394f3992e1ca1014353cbfd2ab8d80 *NAMESPACE cd43d0676dd51a0a6dcd4417e9096b34 *NEWS.md fbfcbaef3e74f0303cbce1bb4acd6173 *R/by.R 0fa60b49bc17479ffacbdf7e1d738a31 *R/compat-purrr.R 3a4fe53420e9488303637248b4d2b8ba *R/complete.R bdd37939042b67d17f6ba36f5c9cb514 *R/count.R 9095cb89ccd8161667e3375d34fbb8aa *R/dtplyr-package.R 720e9d919f66491e3e9214571c568439 *R/fill.R 75c337831c04ca69b48dc4c3843a3b55 *R/replace_na.R 4edd1654335ff1256a67a002fe1466bc *R/step-assign.R a5a50b644ac55e5e61edeb7b3e0df700 *R/step-call-pivot_longer.R 63c108f2b7713af01ce642f8cf755369 *R/step-call-pivot_wider.R 2df7ffdbf11e95464e37884dcfd271a2 *R/step-call.R 4d1582801fa674c2c124a98f3b8a2b1b *R/step-colorder-relocate.R 4b0318c06dd07cffeb1e3d26cf0fc0ff *R/step-colorder.R 4d6a6227b737c07c30f7f6b87af7df0b *R/step-first.R 6fbfca85606ee6ab9ce872e9fef14764 *R/step-group.R 64d308a2b4e99f40ce812bbad439372c *R/step-join.R 0ea89387e990ef2199820b8548f56c82 *R/step-modify.R ece5932f8adb2f583b41e130c478d56d *R/step-mutate.R ab8f67e80be067f146b6d7c563c2287e *R/step-nest.R 2887ce0256a443459baa60599f42db3d *R/step-set.R edad1c1ed46d0f7f90dde400da56a4fb *R/step-setnames.R 740780a660f04d2078cbc250b88208d8 *R/step-subset-arrange.R cacd91b576528f9702578dd9650f9d5a *R/step-subset-do.R cb07ba632dea6e8a01dc6bc54c15886f *R/step-subset-expand.R fb8c49666f1878fcadcdc646dadc6d9b *R/step-subset-filter.R 32a320d72576330e52ac6c212b6e60c7 *R/step-subset-select.R 6acc75141df93c0fce1911d5609d79d8 *R/step-subset-separate.R f39925f484c7fb726a75b4cf38bf5a54 *R/step-subset-slice.R 4fbf871233e00df0f6a976de498ea975 *R/step-subset-summarise.R 377f9583ef91143ddd7e36303827f2e6 *R/step-subset-transmute.R 071ec44b44736a3c5dbabe1058812bf2 *R/step-subset.R d2c721ac309b533d49bda3b36fc6012d *R/step.R 1eb6de238d81dfa8bef51a6133d3ac75 *R/tidyeval-across.R c9e5387f6d167e9a9262d513deab1768 *R/tidyeval.R 577ff26aa8e09e1368d728c6aff47075 *R/unite.R 0dd6f8a296acc01969a2e5a9a4c70ce9 *R/utils.R 56e50602ff9cb4c527765ad81ac457bb *R/zzz.R 9d51297f98eb4399f891f1636fbb1f19 *README.md c228483ed6fd4b383ade09779551bc63 *build/vignette.rds 0e388eac7156e07062f46f93dbee85cb *inst/doc/translation.R a37504db4e779b0a7040a5d466761cab *inst/doc/translation.Rmd 4193d5f0be2f9a5b1630a52a7d84f8b1 *inst/doc/translation.html f717094e0c8d521ab067adb23ecded4d *man/arrange.dtplyr_step.Rd 0b3cd4884c115e1db0b07ec477b97863 *man/collect.dtplyr_step.Rd 87b60c0a4cceba528101591298c2c244 *man/complete.dtplyr_step.Rd 1248a670783b07f11fddd944394231af *man/count.dtplyr_step.Rd 0e3f5ade09e90033867361ba8d99373b *man/distinct.dtplyr_step.Rd 43fe3890dc9566476549cf1e5b391bf2 *man/dot-datatable.aware.Rd 5d5196c786eb8d076189634cdf04aaf3 *man/drop_na.dtplyr_step.Rd 9bc0104bbed576fdc289a91d29809e4f *man/dtplyr-package.Rd 30827e69472f1333214d67da4eacc302 *man/expand.dtplyr_step.Rd 6dfa31db423be20f202995c4a727a55f *man/figures/logo.png d582de02914acd306dd6bfa1d5300bc0 *man/fill.dtplyr_step.Rd 3a0e9d2e56ce2cb0aa1d3e9f33bb2bce *man/filter.dtplyr_step.Rd b88b4e1c9148291132de3f5d79f8644d *man/group_by.dtplyr_step.Rd 17b4dbc356cccb98d51bee1c35c24903 *man/group_modify.dtplyr_step.Rd c988f1b9dc3d12ee600530c2c928b989 *man/head.dtplyr_step.Rd cec46c1c485bce3a920ee51058065ae3 *man/intersect.dtplyr_step.Rd 2794d05f881b5b40b8f3649a4d1ce3e1 *man/lazy_dt.Rd 2ce20f77fb1c35181aa346303a15d5d6 *man/left_join.dtplyr_step.Rd 8b8386c6b0c016020ed9a1dcee63c137 
*man/mutate.dtplyr_step.Rd 9031321b4eae9ac389dde060d979f6fb *man/nest.dtplyr_step.Rd 80117b19bc79ad99e1761a0a1df881d2 *man/pivot_longer.dtplyr_step.Rd af50a1d1fbec6eef6ff56647af6d6217 *man/pivot_wider.dtplyr_step.Rd 66baf1a806e0c7fcbf97ec20c72409d5 *man/relocate.dtplyr_step.Rd e734ce49d34bb506087b31420fa575cf *man/rename.dtplyr_step.Rd c1a7d764e20f61bf3c2d3f4f1103b7e8 *man/replace_na.dtplyr_step.Rd 0d851ef75e704f465836c5b024d932f8 *man/select.dtplyr_step.Rd 3f6bb8dd80dfa1a4bc78b6a75bfdb9e1 *man/separate.dtplyr_step.Rd 8a05dc2ed5998f8c5170a8bb372df465 *man/slice.dtplyr_step.Rd bee1c64ae067aefb3e80627d8e3f3813 *man/summarise.dtplyr_step.Rd 846fdfb25b0dcaa56ad0cf11648ae4ee *man/transmute.dtplyr_step.Rd 4c8d57dc9652075a1c197841fe116bdf *man/unite.dtplyr_step.Rd 6773cea4a567ea405c039094895d9d36 *tests/testthat.R 914a918abe0653446234f82d15da3340 *tests/testthat/_snaps/count.md 366d136be5b17b000e11260851fe93e3 *tests/testthat/_snaps/step-call-pivot_longer.md 22bbb855065a85c7e7fe66d46634e117 *tests/testthat/_snaps/step-call-pivot_wider.md 75dbd8446db327643337afcc104b349e *tests/testthat/_snaps/step-call.md b40003439384755abe8a3e3acd010550 *tests/testthat/_snaps/step-colorder-relocate.md 2b240a0cd062ae4d0ee47227c71c9724 *tests/testthat/_snaps/step-colorder.md 8ffac872d094056609ada66e5dc43178 *tests/testthat/_snaps/step-group.md 9cf7340c3a778076b6dc816f3e8c5fa2 *tests/testthat/_snaps/step-join.md ac979a68fa7c56d03b3e6e4e24855206 *tests/testthat/_snaps/step-mutate.md a12fe5ce00bd80e5999864826d7fa646 *tests/testthat/_snaps/step-subset-filter.md 4ec8b73ecc259f958be818be83aa8a46 *tests/testthat/_snaps/step-subset-select.md 64671767b1723506810b817b3534d8e1 *tests/testthat/_snaps/step-subset-separate.md cb2bd02e37a12f79e7276f9d21fffce0 *tests/testthat/_snaps/step-subset-slice.md 60089339a240c49b46d2dd75362bfb16 *tests/testthat/_snaps/step-subset-summarise.md 5927e53329bcc9edca12225a9fc510a6 *tests/testthat/_snaps/step.md b4bb468fa12f54313e6fe21e315abf0e *tests/testthat/_snaps/tidyeval-across.md 6c85808c54216d6971ca5d4e94740f5f *tests/testthat/_snaps/tidyeval.md 9754a66fbaeeabfc1b9099343f72bc47 *tests/testthat/_snaps/unite.md dd4a5d5d818d5b0f1d1181ec34a7335c *tests/testthat/helpers-library.R 9f70e4f31ccd2202ac448b04dee0603f *tests/testthat/test-complete.R b2026619c12c4c95264e7f363815aaea *tests/testthat/test-count.R 3d518026af991f3880a0df9868b1b5d2 *tests/testthat/test-fill.R d5bcb455cc02b8ce7e4e7eb3fec1291d *tests/testthat/test-replace_na.R d6892a468d9022325a82ded2c30a14e9 *tests/testthat/test-step-call-pivot_longer.R f8dc7f625fd75d03eab09af9c4a9fdf7 *tests/testthat/test-step-call-pivot_wider.R 8c7f93853bc5ccd3e029c879275a6e9e *tests/testthat/test-step-call.R 815588435cee777704dc642496a5860b *tests/testthat/test-step-colorder-relocate.R 5aae80c3810d2ace501925eb133e990b *tests/testthat/test-step-colorder.R 1c46a397ef02fe401f24cb9b2f2d6a20 *tests/testthat/test-step-first.R 599b362e6504ac191d8c824b64d4e0eb *tests/testthat/test-step-group.R e27f75d8e66f32127ec36bf7465a7d7e *tests/testthat/test-step-join.R f2f86385f70d3aadce0d720793136220 *tests/testthat/test-step-modify.R f96e24bd519c57de462fd4f7385ff42e *tests/testthat/test-step-mutate.R 40c5b5c47702627a4165ce1ba7b69a4a *tests/testthat/test-step-nest.R e2767a1967d22ff66f0c2beb56ebd4cd *tests/testthat/test-step-set.R f57f3bebef27b99e4760f205b89379cc *tests/testthat/test-step-subset-arrange.R d8f9c9ce077db1c71c7ca3b20346ffc0 *tests/testthat/test-step-subset-do.R 3e537d52c81de777013d7a7cd30c6a6d *tests/testthat/test-step-subset-expand.R 
d2adc52a47500cb857eb510f44dd0f9a *tests/testthat/test-step-subset-filter.R 528dfba4899a624507b58e042eb97c21 *tests/testthat/test-step-subset-select.R 68029ee866b0209462755dacc6392f09 *tests/testthat/test-step-subset-separate.R 6709273d70d1321ae0a49dc2b624bd3d *tests/testthat/test-step-subset-slice.R 62886dee75231b7807427c76c747e389 *tests/testthat/test-step-subset-summarise.R 609a5d61e8ea332ec15d6d3e0321f8b9 *tests/testthat/test-step-subset-transmute.R b2760ad16fc8df31e108dd39d9179127 *tests/testthat/test-step-subset.R 49ea920ef6bd0676648afd02b7347d65 *tests/testthat/test-step.R cfd7484d3fb6dd89e012ec4b2b845fba *tests/testthat/test-tidyeval-across.R 5f9c03bb9a6b0a1aa90aeda6a6974051 *tests/testthat/test-tidyeval.R a5a08f2f70eb17cb2fc47302848d1dc8 *tests/testthat/test-unite.R a37504db4e779b0a7040a5d466761cab *vignettes/translation.Rmd dtplyr/inst/0000755000176200001440000000000014406337714012556 5ustar liggesusersdtplyr/inst/doc/0000755000176200001440000000000014406337714013323 5ustar liggesusersdtplyr/inst/doc/translation.R0000644000176200001440000001207414406337714016010 0ustar liggesusers## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message = FALSE--------------------------------------------------- library(dtplyr) library(data.table) library(dplyr) ## ----------------------------------------------------------------------------- df <- data.frame(a = 1:5, b = 1:5, c = 1:5, d = 1:5) dt <- lazy_dt(df) ## ----------------------------------------------------------------------------- dt ## ----------------------------------------------------------------------------- dt %>% show_query() ## ----------------------------------------------------------------------------- dt %>% arrange(a, b, c) %>% show_query() dt %>% filter(b == c) %>% show_query() dt %>% filter(b == c, c == d) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% select(a:b) %>% show_query() dt %>% summarise(a = mean(a)) %>% show_query() dt %>% transmute(a2 = a * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% mutate(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% transmute(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% rename(x = a, y = b) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% distinct() %>% show_query() dt %>% distinct(a, b) %>% show_query() dt %>% distinct(a, b, .keep_all = TRUE) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% distinct(c = a + b) %>% show_query() dt %>% distinct(c = a + b, .keep_all = TRUE) %>% show_query() ## ----------------------------------------------------------------------------- dt2 <- lazy_dt(data.frame(a = 1)) dt %>% inner_join(dt2, by = "a") %>% show_query() dt %>% right_join(dt2, by = "a") %>% show_query() dt %>% left_join(dt2, by = "a") %>% show_query() dt %>% anti_join(dt2, by = "a") %>% show_query() ## ----------------------------------------------------------------------------- dt %>% full_join(dt2, by = "a") %>% show_query() ## 
----------------------------------------------------------------------------- dt3 <- lazy_dt(data.frame(b = 1, a = 1)) dt %>% left_join(dt3, by = "a") %>% show_query() dt %>% full_join(dt3, by = "b") %>% show_query() ## ----------------------------------------------------------------------------- dt %>% semi_join(dt2, by = "a") %>% show_query() ## ----------------------------------------------------------------------------- dt %>% intersect(dt2) %>% show_query() dt %>% setdiff(dt2) %>% show_query() dt %>% union(dt2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% group_by(a, arrange = FALSE) %>% summarise(b = mean(b)) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% group_by(a) %>% filter(b < mean(b)) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% filter(a == 1) %>% select(-a) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% group_by(a) %>% filter(b < mean(b)) %>% summarise(c = max(c)) %>% show_query() ## ----------------------------------------------------------------------------- dt3 <- lazy_dt(data.frame(x = 1, y = 2)) dt4 <- lazy_dt(data.frame(x = 1, a = 2, b = 3, c = 4, d = 5, e = 7)) dt3 %>% left_join(dt4) %>% select(x, a:c) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% select(X = a, Y = b) %>% filter(X == 1) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% filter(a == 1) %>% mutate(b2 = b * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt %>% filter(x == 1) %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ## ----------------------------------------------------------------------------- dt2 <- data.table(a = 1:10) dt_inplace <- lazy_dt(dt2, immutable = FALSE) dt_inplace %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ## ----------------------------------------------------------------------------- bench::mark( filter = dt %>% filter(a == b, c == d), mutate = dt %>% mutate(a = a * 2, a4 = a2 * 2, a8 = a4 * 2) %>% show_query(), summarise = dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query(), check = FALSE )[1:6] dtplyr/inst/doc/translation.Rmd0000644000176200001440000001764514406327117016336 0ustar liggesusers--- title: "Translation" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{translation} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ## Introduction This vignette shows the details of how dtplyr translates dplyr expressions into the equivalent [data.table](http://r-datatable.com/) code. If you see places where you think I could generate better data.table code, please [let me know](https://github.com/tidyverse/dtplyr/issues)! This document assumes that you're familiar with the basics of data.table; if you're not, I recommend starting at `vignette("datatable-intro.html")`. 
```{r setup, message = FALSE} library(dtplyr) library(data.table) library(dplyr) ``` ## The basics To get started, I'll create a simple lazy table with `lazy_dt()`: ```{r} df <- data.frame(a = 1:5, b = 1:5, c = 1:5, d = 1:5) dt <- lazy_dt(df) ``` The actual data doesn't matter here since we're just looking at the translation. When you print a lazy frame, it tells you that it's a local data table with five rows and four columns. It also prints the call that dtplyr will evaluate when we execute the lazy table. In this case it's very simple: ```{r} dt ``` If you just want to see the generated code, you can use `show_query()`. I'll use that a lot in this vignette. ```{r} dt %>% show_query() ``` ## Simple verbs Many dplyr verbs have a straightforward translation to either the `i` or `j` component of `[.data.table`. ### `filter()` and `arrange()` `filter()` and `arrange()` become elements of `i`: ```{r} dt %>% arrange(a, b, c) %>% show_query() dt %>% filter(b == c) %>% show_query() dt %>% filter(b == c, c == d) %>% show_query() ``` ### `select()`, `summarise()`, `transmute()` `select()`, `summarise()` and `transmute()` all become elements of `j`: ```{r} dt %>% select(a:b) %>% show_query() dt %>% summarise(a = mean(a)) %>% show_query() dt %>% transmute(a2 = a * 2) %>% show_query() ``` `mutate()` also uses the `j` component with data.table's special `:=` operator: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` Note that, although `:=` normally modifies a data.table in place, dtplyr will not modify the input data by default; see the discussion of copies below for more details. `mutate()` allows you to refer to variables that you just created using an "extended `j`" expression: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ``` `transmute()` works similarly: ```{r} dt %>% transmute(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query() ``` ## Other calls Other verbs require calls to other functions: ### `rename()` `rename()` uses `setnames()`: ```{r} dt %>% rename(x = a, y = b) %>% show_query() ``` ### `distinct()` `distinct()` uses `unique()`: ```{r} dt %>% distinct() %>% show_query() dt %>% distinct(a, b) %>% show_query() dt %>% distinct(a, b, .keep_all = TRUE) %>% show_query() ``` `distinct()` on a computed column uses an intermediate mutate: ```{r} dt %>% distinct(c = a + b) %>% show_query() dt %>% distinct(c = a + b, .keep_all = TRUE) %>% show_query() ``` ### Joins Most joins use the `[.data.table` equivalent: ```{r} dt2 <- lazy_dt(data.frame(a = 1)) dt %>% inner_join(dt2, by = "a") %>% show_query() dt %>% right_join(dt2, by = "a") %>% show_query() dt %>% left_join(dt2, by = "a") %>% show_query() dt %>% anti_join(dt2, by = "a") %>% show_query() ``` But `full_join()` uses `merge()`: ```{r} dt %>% full_join(dt2, by = "a") %>% show_query() ``` In some cases, extra calls to `data.table::setcolorder()` and `data.table::setnames()` are required to ensure correct column order and names in the output: ```{r} dt3 <- lazy_dt(data.frame(b = 1, a = 1)) dt %>% left_join(dt3, by = "a") %>% show_query() dt %>% full_join(dt3, by = "b") %>% show_query() ``` Semi-joins are a little more complex: ```{r} dt %>% semi_join(dt2, by = "a") %>% show_query() ``` ### Set operations Set operations use the fast data.table alternatives: ```{r} dt %>% intersect(dt2) %>% show_query() dt %>% setdiff(dt2) %>% show_query() dt %>% union(dt2) %>% show_query() ``` ## Grouping Just like in dplyr, `group_by()` doesn't do anything by itself, but instead modifies the operation of downstream verbs.
This generally just involves using the `keyby` argument: ```{r} dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query() ``` You may use `by` instead of `keyby` if you set `arrange = FALSE`: ```{r} dt %>% group_by(a, arrange = FALSE) %>% summarise(b = mean(b)) %>% show_query() ``` Often, there won't be too much of a difference between these, but for larger grouped operations, the overhead of reordering data may become significant. In these situations, using `arrange = FALSE` becomes preferable. The primary exception is grouped `filter()`, which requires a more involved translation built around `.I`: ```{r} dt %>% group_by(a) %>% filter(b < mean(b)) %>% show_query() ``` ## Combinations dtplyr tries to generate data.table code as close as possible to what you'd write by hand, as this tends to unlock data.table's tremendous speed. For example, if you `filter()` and then `select()`, dtplyr generates a single `[`: ```{r} dt %>% filter(a == 1) %>% select(-a) %>% show_query() ``` And similarly when combining filtering and summarising: ```{r} dt %>% group_by(a) %>% filter(b < mean(b)) %>% summarise(c = max(c)) %>% show_query() ``` This is particularly nice when joining two tables together because you can select variables after you have joined and data.table will only carry those into the join: ```{r} dt3 <- lazy_dt(data.frame(x = 1, y = 2)) dt4 <- lazy_dt(data.frame(x = 1, a = 2, b = 3, c = 4, d = 5, e = 7)) dt3 %>% left_join(dt4) %>% select(x, a:c) %>% show_query() ``` Note, however, that `select()`ing and then `filter()`ing must generate two separate calls to `[`, because data.table evaluates `i` before `j`. ```{r} dt %>% select(X = a, Y = b) %>% filter(X == 1) %>% show_query() ``` Similarly, a `filter()` and `mutate()` can't be combined because `dt[a == 1, b2 := b * 2]` would modify the selected rows in place: ```{r} dt %>% filter(a == 1) %>% mutate(b2 = b * 2) %>% show_query() ``` ## Copies By default dtplyr avoids mutating the input data, automatically creating a `copy()` if needed: ```{r} dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` Note that dtplyr does its best to avoid needless copies, so it won't explicitly copy if there's already an implicit copy produced by `[`, `head()`, `merge()` or similar: ```{r} dt %>% filter(x == 1) %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` You can choose to opt out of this copy, and take advantage of data.table's reference semantics (see `vignette("datatable-reference-semantics")` for more details). Do this by setting `immutable = FALSE` on construction: ```{r} dt2 <- data.table(a = 1:10) dt_inplace <- lazy_dt(dt2, immutable = FALSE) dt_inplace %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query() ``` ## Performance There are two components to the performance of dtplyr: how long it takes to generate the translation, and how well the translation performs. Given my explorations so far, I'm reasonably confident that we're generating high-quality data.table code, so most of the cost should be in the translation itself. The following code briefly explores the performance of a few different translations. A significant amount of work is done by the dplyr verbs, so we benchmark the whole process.
```{r} bench::mark( filter = dt %>% filter(a == b, c == d), mutate = dt %>% mutate(a = a * 2, a4 = a2 * 2, a8 = a4 * 2) %>% show_query(), summarise = dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query(), check = FALSE )[1:6] ``` These translations all take less than a millisecond, suggesting that the performance overhead of dtplyr should be negligible for realistic data sizes. Note that dtplyr run-time scales with the complexity of the pipeline, not the size of the data, so these timings should apply regardless of the size of the underlying data[^copy]. [^copy]: Unless a copy is performed. dtplyr/inst/doc/translation.html0000644000176200001440000013405714406337714016561 0ustar liggesusers Translation

Translation

Introduction

This vignette shows the details of how dtplyr translates dplyr expressions into the equivalent data.table code. If you see places where you think I could generate better data.table code, please let me know!

This document assumes that you’re familiar with the basics of data.table; if you’re not, I recommend starting at vignette("datatable-intro").

library(dtplyr)
library(data.table)
library(dplyr)

The basics

To get started, I’ll create a simple lazy table with lazy_dt():

df <- data.frame(a = 1:5, b = 1:5, c = 1:5, d = 1:5)
dt <- lazy_dt(df)

The actual data doesn’t matter here since we’re just looking at the translation.

When you print a lazy frame, it tells you that it’s a local data table with five rows and four columns. It also prints the call that dtplyr will evaluate when we execute the lazy table. In this case it’s very simple:

dt
#> Source: local data table [5 x 4]
#> Call:   `_DT1`
#> 
#>       a     b     c     d
#>   <int> <int> <int> <int>
#> 1     1     1     1     1
#> 2     2     2     2     2
#> 3     3     3     3     3
#> 4     4     4     4     4
#> 5     5     5     5     5
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results

If you just want to see the generated code, you can use show_query(). I’ll use that a lot in this vignette.

dt %>% show_query()
#> `_DT1`

Simple verbs

Many dplyr verbs have a straightforward translation to either the i or j component of [.data.table.
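
As a quick refresher (my own example, not part of the generated translations), those components come from data.table’s general form DT[i, j, by]: i selects rows, j computes or selects columns, and by supplies the grouping.

DT <- data.table(a = 1:5, b = 1:5)
DT[a > 2, .(total = sum(b)), by = a]   # i filters rows, j summarises, by groups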

filter() and arrange()

filter() and arrange() become elements of i:

dt %>% arrange(a, b, c) %>% show_query()
#> `_DT1`[order(a, b, c)]

dt %>% filter(b == c) %>% show_query()
#> `_DT1`[b == c]
dt %>% filter(b == c, c == d) %>% show_query()
#> `_DT1`[b == c & c == d]

select(), summarise(), transmute()

select(), summarise() and transmute() all become elements of j:

dt %>% select(a:b) %>% show_query()
#> `_DT1`[, .(a, b)]
dt %>% summarise(a = mean(a)) %>% show_query()
#> `_DT1`[, .(a = mean(a))]
dt %>% transmute(a2 = a * 2) %>% show_query()
#> `_DT1`[, .(a2 = a * 2)]

mutate() also uses the j component with data.table’s special := operator:

dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query()
#> copy(`_DT1`)[, `:=`(a2 = a * 2, b2 = b * 2)]

Note that, although := normally modifies a data.table in place, dtplyr will not modify the input data by default; see the discussion of copies below for more details.
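
For instance (a quick check of my own, not output shown in the vignette), collecting a mutated result leaves dt itself untouched:

dt %>% mutate(a2 = a * 2) %>% as.data.table()   # result has the new a2 column
dt %>% as.data.table()                          # dt still has only a, b, c and d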

mutate() allows you to refer to variables that you just created using an “extended j” expression:

dt %>% mutate(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query()
#> copy(`_DT1`)[, `:=`(c("a2", "b2", "a4"), {
#>     a2 <- a * 2
#>     b2 <- b * 2
#>     a4 <- a2 * 2
#>     .(a2, b2, a4)
#> })]

transmute() works similarly:

dt %>% transmute(a2 = a * 2, b2 = b * 2, a4 = a2 * 2) %>% show_query()
#> `_DT1`[, {
#>     a2 <- a * 2
#>     b2 <- b * 2
#>     a4 <- a2 * 2
#>     .(a2, b2, a4)
#> }]

Other calls

Other verbs require calls to other functions:

rename()

rename() uses setnames():

dt %>% rename(x = a, y = b) %>% show_query()
#> setnames(copy(`_DT1`), c("a", "b"), c("x", "y"))

distinct()

distinct() uses unique():

dt %>% distinct() %>% show_query()
#> unique(`_DT1`)
dt %>% distinct(a, b) %>% show_query()
#> unique(`_DT1`[, .(a, b)])
dt %>% distinct(a, b, .keep_all = TRUE) %>% show_query()
#> unique(`_DT1`, by = c("a", "b"))

distinct() on a computed column uses an intermediate mutate:

dt %>% distinct(c = a + b) %>% show_query()
#> unique(`_DT1`[, .(c = a + b)])
dt %>% distinct(c = a + b, .keep_all = TRUE) %>% show_query()
#> unique(copy(`_DT1`)[, `:=`(c = a + b)], by = "c")

Joins

Most joins use the [.data.table equivalent:

dt2 <- lazy_dt(data.frame(a = 1))

dt %>% inner_join(dt2, by = "a") %>% show_query()
#> `_DT1`[`_DT2`, on = .(a), nomatch = NULL, allow.cartesian = TRUE]
dt %>% right_join(dt2, by = "a") %>% show_query()
#> `_DT1`[`_DT2`, on = .(a), allow.cartesian = TRUE]
dt %>% left_join(dt2, by = "a") %>% show_query()
#> `_DT2`[`_DT1`, on = .(a), allow.cartesian = TRUE]
dt %>% anti_join(dt2, by = "a") %>% show_query()
#> `_DT1`[!`_DT2`, on = .(a)]

But full_join() uses merge():

dt %>% full_join(dt2, by = "a") %>% show_query()
#> merge(`_DT1`, `_DT2`, all = TRUE, by.x = "a", by.y = "a", allow.cartesian = TRUE)

In some cases, extra calls to data.table::setcolorder() and data.table::setnames() are required to ensure correct column order and names in the output:

dt3 <- lazy_dt(data.frame(b = 1, a = 1))

dt %>% left_join(dt3, by = "a") %>% show_query()
#> setnames(setcolorder(`_DT3`[`_DT1`, on = .(a), allow.cartesian = TRUE], 
#>     c(2L, 3L, 4L, 5L, 1L)), c("i.b", "b"), c("b.x", "b.y"))
dt %>% full_join(dt3, by = "b") %>% show_query()
#> setcolorder(merge(`_DT1`, `_DT3`, all = TRUE, by.x = "b", by.y = "b", 
#>     allow.cartesian = TRUE), c(2L, 1L, 3L, 4L, 5L))

Semi-joins are a little more complex:

dt %>% semi_join(dt2, by = "a") %>% show_query()
#> `_DT1`[unique(`_DT1`[`_DT2`, which = TRUE, nomatch = NULL, on = .(a)])]
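
Reading that translation from the inside out (the annotation is mine, not generated output):

# `_DT1`[`_DT2`, which = TRUE, nomatch = NULL, on = .(a)]  returns the row numbers
#   of `_DT1` that have at least one match in `_DT2`
# unique(...)   keeps each matched row number once, even if it matched several rows
# `_DT1`[...]   then subsets the original table to exactly those rows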

Set operations

Set operations use the fast data.table alternatives:

dt %>% intersect(dt2) %>% show_query()
#> fintersect(`_DT1`, `_DT2`)
dt %>% setdiff(dt2) %>% show_query()
#> fsetdiff(`_DT1`, `_DT2`)
dt %>% union(dt2) %>% show_query()
#> funion(`_DT1`, `_DT2`)

Grouping

Just like in dplyr, group_by() doesn’t do anything by itself, but instead modifies the operation of downstream verbs. This generally just involves using the keyby argument:

dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query()
#> `_DT1`[, .(b = mean(b)), keyby = .(a)]

You may use by instead of keyby if you set arrange = FALSE:

dt %>% group_by(a, arrange = FALSE) %>% summarise(b = mean(b)) %>% show_query()
#> `_DT1`[, .(b = mean(b)), by = .(a)]

Often, there won’t be too much of a difference between these, but for larger grouped operations, the overhead of reordering data may become significant. In these situations, using arrange = FALSE becomes preferable.
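
If you want to measure that difference yourself, something along these lines works (a sketch of mine; the table, its size, and any timings are purely illustrative):

big <- lazy_dt(data.table(g = sample(1e4, 1e7, replace = TRUE), x = runif(1e7)))
bench::mark(
  keyed   = big %>% group_by(g) %>% summarise(x = mean(x)) %>% as.data.table(),
  unkeyed = big %>% group_by(g, arrange = FALSE) %>% summarise(x = mean(x)) %>% as.data.table(),
  check = FALSE   # the two results differ only in row order
)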

The primary exception is grouped filter(), which requires a more involved translation built around .I:

dt %>% group_by(a) %>% filter(b < mean(b)) %>% show_query()
#> `_DT1`[`_DT1`[, .I[b < mean(b)], by = .(a)]$V1]
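
To read that translation (my annotation, not generated output):

# inner:  `_DT1`[, .I[b < mean(b)], by = .(a)]  computes, within each group of a, the
#         row numbers (.I) where b is below the group mean; they come back as V1
# outer:  `_DT1`[ ...$V1 ]                      subsets the original table to those rows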

Combinations

dtplyr tries to generate data.table code as close as possible to what you’d write by hand, as this tends to unlock data.table’s tremendous speed. For example, if you filter() and then select(), dtplyr generates a single [:

dt %>% 
  filter(a == 1) %>% 
  select(-a) %>% 
  show_query()
#> `_DT1`[a == 1, .(b, c, d)]

And similarly when combining filtering and summarising:

dt %>% 
  group_by(a) %>% 
  filter(b < mean(b)) %>% 
  summarise(c = max(c)) %>% 
  show_query()
#> `_DT1`[`_DT1`[, .I[b < mean(b)], by = .(a)]$V1, .(c = max(c)), 
#>     keyby = .(a)]

This is particularly nice when joining two tables together because you can select variables after you have joined and data.table will only carry those into the join:

dt3 <- lazy_dt(data.frame(x = 1, y = 2))
dt4 <- lazy_dt(data.frame(x = 1, a = 2, b = 3, c = 4, d = 5, e = 7))

dt3 %>% 
  left_join(dt4) %>% 
  select(x, a:c) %>% 
  show_query()
#> Joining, by = "x"
#> setcolorder(`_DT5`[`_DT4`, on = .(x), allow.cartesian = TRUE], 
#>     c(1L, 7L, 2L, 3L, 4L, 5L, 6L))[, `:=`(c("y", "d", "e"), NULL)]

Note, however, that select()ing and then filter()ing must generate two separate calls to [, because data.table evaluates i before j.

dt %>% 
  select(X = a, Y = b) %>% 
  filter(X == 1) %>% 
  show_query()
#> `_DT1`[, .(X = a, Y = b)][X == 1]
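
To see why, here is an illustration of my own on a plain data.table (not generated code): a single call would have to evaluate X in i before j has created it.

raw <- as.data.table(df)
# raw[X == 1, .(X = a, Y = b)]   # fails: X is not a column of raw when i is evaluated
raw[, .(X = a, Y = b)][X == 1]   # the two-step form that dtplyr generates instead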

Similarly, a filter() and mutate() can’t be combined because dt[a == 1, b2 := b * 2] would modify the selected rows in place:

dt %>% 
  filter(a == 1) %>% 
  mutate(b2 = b * 2) %>% 
  show_query()
#> `_DT1`[a == 1][, `:=`(b2 = b * 2)]
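
For comparison, here is what the fused form would do on a plain data.table (my illustration, not generated code): it updates by reference and keeps every row, which is not what filter() followed by mutate() means.

raw <- as.data.table(df)
raw[a == 1, b2 := b * 2]   # adds b2 in place; rows where a != 1 get NA
raw                        # still 5 rows, and the original object has been modified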

Copies

By default dtplyr avoids mutating the input data, automatically creating a copy() if needed:

dt %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query()
#> copy(`_DT1`)[, `:=`(a2 = a * 2, b2 = b * 2)]

Note that dtplyr does its best to avoid needless copies, so it won’t explicitly copy if there’s already an implicit copy produced by [, head(), merge() or similar:

dt %>% 
  filter(x == 1) %>% 
  mutate(a2 = a * 2, b2 = b * 2) %>% 
  show_query()
#> `_DT1`[x == 1][, `:=`(a2 = a * 2, b2 = b * 2)]

You can choose to opt out of this copy, and take advantage of data.table’s reference semantics (see vignette("datatable-reference-semantics") for more details). Do this by setting immutable = FALSE on construction:

dt2 <- data.table(a = 1:10)

dt_inplace <- lazy_dt(dt2, immutable = FALSE)
dt_inplace %>% mutate(a2 = a * 2, b2 = b * 2) %>% show_query()
#> `_DT6`[, `:=`(a2 = a * 2, b2 = b * 2)]

Performance

There are two components to the performance of dtplyr: how long it takes to generate the translation, and how well the translation performs. Given my explorations so far, I’m reasonably confident that we’re generating high-quality data.table code, so most of the cost should be in the translation itself.

The following code briefly explores the performance of a few different translations. A significant amount of work is done by the dplyr verbs, so we benchmark the whole process.

bench::mark(
  filter = dt %>% filter(a == b, c == d),
  mutate = dt %>% mutate(a = a * 2, a4 = a2 * 2, a8 = a4 * 2) %>% show_query(),
  summarise = dt %>% group_by(a) %>% summarise(b = mean(b)) %>% show_query(),
  check = FALSE
)[1:6]
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 filter        415µs 432.88µs     2256.    4.75KB     46.8
#> 2 mutate        540µs 573.67µs     1651.   14.39KB     44.2
#> 3 summarise     967µs   1.01ms      969.   24.04KB     44.9

These translations all take around a millisecond or less, suggesting that the performance overhead of dtplyr should be negligible for realistic data sizes. Note that dtplyr’s run time scales with the complexity of the pipeline, not the size of the data, so these timings should apply regardless of the size of the underlying data1.
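
A rough way to check that yourself (a sketch of mine, not part of the benchmark above) is to translate the same pipeline against a tiny and a large table and time only the translation step:

small <- lazy_dt(data.table(a = 1:10, b = 1:10))
large <- lazy_dt(data.table(a = 1:1e7, b = 1:1e7))
bench::mark(
  small = small %>% filter(a == b) %>% mutate(c = a + b) %>% show_query(),
  large = large %>% filter(a == b) %>% mutate(c = a + b) %>% show_query(),
  check = FALSE   # the generated calls differ only in the `_DT` name
)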


  1. Unless a copy is performed.↩︎