rsample/0000755000175000017500000000000014142307722012051 5ustar nileshnileshrsample/MD50000644000175000017500000001632314142307722012366 0ustar nileshnilesh20195ebb6c119048fa238393d44b4db2 *DESCRIPTION f1694fd62840d5b058c62f31a5037ab4 *LICENSE ec453a258212262026f0561350075957 *NAMESPACE c76727624b82d8e4fed544907d444ede *NEWS.md 774b922c7ee61dec82883f10944970e1 *R/0_imports.R f1a91c820f40db11bbb8c5b55808c5dd *R/apparent.R 8b279b8546b76e3dd0b7bc9343322350 *R/boot.R 198ba6b3076e000f5f3eaf135a932e46 *R/bootci.R 3767fa4287b999c094887fa3a5c8a278 *R/caret.R d6b1f4eec648fb1a63569f65eb967c39 *R/compat-dplyr-old.R 23d7a34bbd4a057c105b95ad4612ece8 *R/compat-dplyr.R 8b19e4e0662a1baadc89a883dede82cb *R/compat-vctrs-helpers.R fd4f97fa7e63535cdf10808b18ffa5e2 *R/compat-vctrs.R a45d61784606af3f739eb926553752a5 *R/complement.R bf71f028d43595abd1c1be063b7f164e *R/data.R 1b754219d9921df568d57f32d095f220 *R/form_pred.R 3c0a8b7f34adb45ffad6e10182caf04d *R/gather.R c5e47e4fed8c7000bf3e82402ca3478c *R/groups.R a87db2fe9344fc51eb9084f1ef136d89 *R/initial_split.R ee278221bb23a41f8a827bbb8356f058 *R/labels.R 3fade499c2b3c69d285e15eb7379c95d *R/loo.R a90dcd4a4bee6c04258ba7e02f8244fd *R/make_strata.R ac767f22338d183f321cba98cdab8cee *R/manual.R 62d63231744b0923a77fc2541a382336 *R/mc.R 487d27c9107f5bd64cb596376f0112e2 *R/misc.R c29888d25f81920b84e867e64d1f9ae1 *R/nest.R 23cd1afe6413ea44a1a5d9d4bc20e476 *R/permutations.R 798557da71f16c404380dbcf4c510eae *R/pkg.R 78e5670f483f212f47451975ddee94fe *R/reexports.R 83bfb1da52802a8cec86a9a084c5d594 *R/reg_intervals.R 77309a8b0e6e8318e54d0bcb444807cc *R/rolling_origin.R f07404abb0de02f13beedd7e3121ca4b *R/rset.R 0035183b7de976601c130cd924d0fa02 *R/rsplit.R b5c64dd182d5b9eff09c7a7e78b9e8ab *R/slide.R f1bb73876dd916ef07316160f6451f1f *R/tidy.R f957da90ddac91f1fdc559e67de6e3b0 *R/tidyselect.R d8724c200b66eac4ab854321e68f7c19 *R/validation_split.R 25eb2c91b5c1a821ce41582511376e3d *R/vfold.R f0eb5dff994f8a84fb76575c51e4564d *R/zzz.R 
8a9ffab32835e11f1727a77233cd6aa2 *build/vignette.rds 6afd6910455dd0147aaf8da242a26fd8 *inst/doc/Working_with_rsets.R 6644f35e0e5bc07c99d422a75d9a56e8 *inst/doc/Working_with_rsets.Rmd b74822ee140d185e526f5e9ef90b44c7 *inst/doc/Working_with_rsets.html 1825ddce3bdf68c19b813013be1a3332 *inst/doc/rsample.R aa96774f0224c01314943ce43c5ee241 *inst/doc/rsample.Rmd 82cbc6e04fe78b42fddc4e281a1ce074 *inst/doc/rsample.html 42c70fe8ae2ea003c9dfe1a909df9138 *man/add_resample_id.Rd 629c370ef22fd9174425ee565474afd8 *man/apparent.Rd 9476c5fa15b4a2e22ddf19d24cd6d8a8 *man/as.data.frame.rsplit.Rd be40711b82e543653dc21d18008630cc *man/attrition.Rd f743ac370f9a21f841291d75fc5493e5 *man/bootstraps.Rd 6e343a8933073d14d9f3491da890d6c0 *man/complement.Rd 9f9744ee41170bcb966e8204c59ebbde *man/drinks.Rd cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg 952b59dc07b171b97d5d982924244f61 *man/figures/lifecycle-maturing.svg 27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg 53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg 1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg 8c05929a19a5dbe8ad9adf09216aea58 *man/figures/logo.png 7a9cfa41c920f8f6b8839dfe4ed24f1f *man/form_pred.Rd c2294048d1882d9b587a745274ff298c *man/gather.rset.Rd 1661ee4d77e48a11100e43b8f12181cc *man/get_fingerprint.Rd cb537904af2173b940ac485a3ee6b6bb *man/group_vfold_cv.Rd 124f3eb1ebb6f21e29350c436077bf15 *man/initial_split.Rd cba4fe25edf390b01832046fe000f256 *man/int_pctl.Rd 81f039993c4777417c007979dc1f1d4c *man/labels.rset.Rd f32445dce6cf616bac58685f961f3c79 *man/labels.rsplit.Rd a25a18c3e7d20ec24335bc3ee9997ad3 *man/loo_cv.Rd 45e143d41316b75bb7ea630d3150d2ce *man/make_splits.Rd ef1e0344dd3b89a697ef69aedc24e328 *man/make_strata.Rd 
74913f9c98b800d745b8471f008afb82 *man/manual_rset.Rd 3e33e96a740d43fec24319f656467341 *man/mc_cv.Rd 0e718e90d01c825f4ed7a6a98865410f *man/nested_cv.Rd 727915b9cfa3c480f47f4bab040436c6 *man/new_rset.Rd 72b7a7213e2be46946a2774f2d8e6610 *man/permutations.Rd f1746e72c6c09078c8e1d3a2a4bc1314 *man/populate.Rd 72208acd292c0b102633036f6fc0587c *man/pretty.vfold_cv.Rd eb9bc1651a45f1c638ea6385e92a45f7 *man/reexports.Rd 263e92833183de128bf00d5a467cee0e *man/reg_intervals.Rd 8df3930a51d36266d61c7f2838d590ca *man/rolling_origin.Rd 92307c357590583d2bb1dbd786f1dc5b *man/rsample-dplyr.Rd 43da3abb6c83bd7cac86721c1dfd5308 *man/rsample.Rd d7f0332a6cc5b2c4648ffdc34541a407 *man/rsample2caret.Rd 4c875cb6b40b8b9c0d2bd9e8ea942a13 *man/rset_reconstruct.Rd 93b565585570464b31b4a94639ada80b *man/slide-resampling.Rd cdde013830f957f1ca360716b6d507ca *man/tidy.rsplit.Rd 665395a979a3c6da9e2a78409e980b64 *man/two_class_dat.Rd 0af497acf160916a8cc9b700a04b95bd *man/validation_split.Rd e7ac66344085032bdf57629c639fe2e2 *man/vfold_cv.Rd d3d32f0d08f42f4b2e9d8608824ede11 *tests/testthat.R 582c9befa749517dde8d7f70d2334aad *tests/testthat/helper-s3.R bd66ccd55e85581ab3ba18dac1788589 *tests/testthat/print_test_output/obj_sum 7965df9cf0e8cfde5c82bfe0b2ce6952 *tests/testthat/print_test_output/rsplit 40d7aed4ce58c792f0881132242ea89f *tests/testthat/print_test_output/val_split 59912f6407e1986af942edc29d1f227b *tests/testthat/test-compat-dplyr-old.R 41f0e286e1bf195238daaddc07fea0c4 *tests/testthat/test-compat-dplyr.R 0c7a78941b9add744ee80ea5b06193c8 *tests/testthat/test-compat-vctrs.R d0136038f6ffe96d4bde7f120deb786c *tests/testthat/test-make-splits.R 51796e5ec80e840b9e7ad5dafbeebcf0 *tests/testthat/test-manual.R 0f6235f11f549ef556f90a56230f6046 *tests/testthat/test-print-groups.txt ac7547dc70895f9dd07430e927071e99 *tests/testthat/test-rset.R c6227dcab1feea76d34eb9fa63dfac77 *tests/testthat/test-slide.R 6ce4bcfd572da87a3fb62180a80ea788 *tests/testthat/test_boot.R 91a8418dd38bd75dea5409c542ceaf11 
*tests/testthat/test_bootci.R afbd0a2af303e40aec311d87ee5456b2 *tests/testthat/test_caret.R 4a5502a4755beed06adde55676f7931d *tests/testthat/test_fingerprint.R 04e93e38d7d98db3e288216fe893f6f1 *tests/testthat/test_for_pred.R 71fa9cb150d2b7764d124b57dc973fa3 *tests/testthat/test_group.R b44fa66938bcc2e1d95d26cd138dc221 *tests/testthat/test_initial.R b2e67f2b59b4a2c498a09070aea3c5a4 *tests/testthat/test_labels.R c4e2dfa172ba4f9a83dfde9c58a4c787 *tests/testthat/test_loo.R 35f6002bb7fd4abc03668b7f0b86b0fc *tests/testthat/test_mc.R 0dfd9da9127f2fd1a20ef3df34bd33da *tests/testthat/test_names.R 7197da1a9599da22bf8347be44676a88 *tests/testthat/test_nesting.R bd8348fd82d05d9de0339943d2596700 *tests/testthat/test_permutations.R c5fd01bc816f808d8c466808b4b03826 *tests/testthat/test_rolling.R cc75ad2b78545d6549c1e3159d84d3ec *tests/testthat/test_rset.R 5f73f01930546456fea9b1bfeadb9882 *tests/testthat/test_rsplit.R f33ac5cf0d46e532ed0d3801607f0fa7 *tests/testthat/test_strata.R 67faa03fef7f67328a6fe57940a29aed *tests/testthat/test_tidy.R 905d053ed6dcc37848930d7949655288 *tests/testthat/test_validation.R f2842719bc0a076014e55548f2f1c307 *tests/testthat/test_vfold.R aa5f0911c445846eb9f8fc5de671f3fe *vignettes/Applications/Intervals.Rmd e5d27cb86426c0fc36bbb712154db33a *vignettes/Applications/Recipes_and_rsample.Rmd 6558bad6c697b97c8adacae5965a0af8 *vignettes/Applications/Survival_Analysis.Rmd df1ca90e64750f622cc2fef6a66a0fa4 *vignettes/Applications/diagram.png 6644f35e0e5bc07c99d422a75d9a56e8 *vignettes/Working_with_rsets.Rmd aa96774f0224c01314943ce43c5ee241 *vignettes/rsample.Rmd rsample/NEWS.md0000644000175000017500000001422114142277652013157 0ustar nileshnilesh# rsample 0.1.1 * Updated documentation on stratified sampling (#245). * Changed `make_splits()` to an S3 generic, with the original functionality a method for `list` and a new method for dataframes that allows users to create a split from existing analysis & assessment sets (@LiamBlake, #246). 
* Added `validation_time_split()` for a single validation sample taking the first samples for training (@mine-cetinkaya-rundel, #256). * Escalated the deprecation of the `gather()` method for `rset` objects to a hard deprecation. Use `tidyr::pivot_longer()` instead (#257). * Changed resample "fingerprint" to hash the indices only rather than the entire resample result (including the data object). This is much faster and will still ensure the same resample for the same original data object (#259). # rsample 0.1.0 * Fixed how `mc_cv()`, `initial_split()`, and `validation_split()` use the `prop` argument to first compute the assessment indices, rather than the analysis indices. This is a minor but **breaking change** in some situations; the previous implementation could cause an inconsistency in the sizes of the generated analysis and assessment sets when compared to how `prop` is documented to function (#217, @issactoast). * Fixed problem with creation of `apparent()` (#223) and `caret2rsample()` (#232) resamples. * Re-licensed package from GPL-2 to MIT. See [consent from copyright holders here](https://github.com/tidymodels/rsample/issues/226). * Attempts to stratify on a `Surv` object now error more informatively (#230). * Exposed `pool` argument from `make_strata()` in user-facing resampling functions (#229). * Deprecated the `gather()` method for `rset` objects in favor of `tidyr::pivot_longer()` (#233). * Fixed bug in `make_strata()` for numeric variables with `NA` values (@brian-j-smith, #236). # rsample 0.0.9 * New `rset_reconstruct()`, a developer tool to ease creation of new rset subclasses (#210). * Added `permutations()`, a function for creating permutation resamples by performing column-wise shuffling (@mattwarkentin, #198). * Fixed an issue where empty assessment sets couldn't be created by `make_splits()` (#188). * `rset` objects now contain a "fingerprint" attribute that can be used to check to see if the same object uses the same resamples. 
* The `reg_intervals()` function is a convenience function for `lm()`, `glm()`, `survreg()`, and `coxph()` models (#206). * A few internal functions were exported so that `rsample`-adjacent packages can use the same underlying code. * The `obj_sum()` method for `rsplit` objects was updated (#215). * Changed the inheritance structure for `rsplit` objects from specific to general and simplified the methods for the `complement()` generic (#216). # rsample 0.0.8 * New `manual_rset()` for constructing rset objects manually from custom rsplits (tidymodels/tune#273). * Three new time based resampling functions have been added: `sliding_window()`, `sliding_index()`, and `sliding_period()`, which have more flexibility than the pre-existing `rolling_origin()`. * Correct `alpha` parameter handling for bootstrap CI functions (#179, #184). # rsample 0.0.7 * Lower threshold for pooling strata to 10% (from 15%) (#149). * The `print()` methods for `rsplit` and `val_split` objects were adjusted to show `""` and ``, respectively. * The `drinks`, `attrition`, and `two_class_dat` data sets were removed. They are in the `modeldata` package. * Compatability with `dplyr` 1.0.0. # `rsample` 0.0.6 * Added `validation_set()` for making a single resample. * Correct the tidy method for bootstraps (#115). * Changes for upcoming `tibble release. * Exported constructors for `rset` and `split` objects (#40) * `initial_time_split()` and `rolling_origin()` now have a `lag` parameter that ensures that previous data are available so that lagged variables can be calculated. (#135, #136) # `rsample` 0.0.5 * Added three functions to compute different bootstrap confidence intervals. * A new function (`add_resample_id()`) augments a data frame with columns for the resampling identifier. * Updated `initial_split()`, `mc_cv()`, `vfold_cv()`, `bootstraps()`, and `group_vfold_cv()` to use tidyselect on the stratification variable. 
* Updated `initial_split()`, `mc_cv()`, `vfold_cv()`, `bootstraps()` with new `breaks` parameter that specifies the number of bins to stratify by for a numeric stratification variable. # `rsample` 0.0.4 Small maintenence release. ## Minor improvements and fixes * `fill()` was removed per the deprecation warning. * Small changes were made for the new version of `tibble`. # `rsample` 0.0.3 ## New features * Added function `initial_time_split()` for ordered initial sampling appropriate for time series data. ## Minor improvements and fixes * `fill()` has been renamed `populate()` to avoid a conflict with `tidyr::fill()`. * Changed the R version requirement to be R >= 3.1 instead of 3.3.3. * The `recipes`-related `prepper` function was [moved to the `recipes` package](https://github.com/tidymodels/rsample/issues/48). This makes the `rsample` install footprint much smaller. * `rsplit` objects are shown differently inside of a tibble. * Moved from the `broom` package to the `generics` package. # `rsample` 0.0.2 * `initial_split`, `training`, and `testing` were added to do training/testing splits prior to resampling. * Another resampling method, `group_vfold_cv`, was added. * `caret2rsample` and `rsample2caret` can convert `rset` objects to those used by `caret::trainControl` and vice-versa. * A function called `form_pred` can be used to determine the original names of the predictors in a formula or `terms` object. * A vignette and a function (`prepper`) were included to facilitate using the `recipes` with `rsample`. * A `gather` method was added for `rset` objects. * A `labels` method was added for `rsplit` objects. This can help identify which resample is being used even when the whole `rset` object is not available. * A variety of `dplyr` methods were added (e.g. `filter`, `mutate`, etc) that work without dropping classes or attributes of the `rsample` objects. 
# `rsample` 0.0.1 (2017-07-08) Initial public version on CRAN rsample/DESCRIPTION0000644000175000017500000000305714142307722013564 0ustar nileshnileshPackage: rsample Title: General Resampling Infrastructure Version: 0.1.1 Authors@R: c( person("Julia", "Silge", , "julia.silge@rstudio.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3671-836X")), person("Fanny", "Chow", , "fannybchow@gmail.com", role = "aut"), person("Max", "Kuhn", , "max@rstudio.com", role = "aut"), person("Hadley", "Wickham", , "hadley@rstudio.com", role = "aut"), person("RStudio", role = "cph") ) Maintainer: Julia Silge Description: Classes and functions to create and summarize different types of resampling objects (e.g. bootstrap, cross-validation). License: MIT + file LICENSE URL: https://rsample.tidymodels.org, https://github.com/tidymodels/rsample BugReports: https://github.com/tidymodels/rsample/issues Depends: R (>= 3.2) Imports: dplyr (>= 1.0.0), ellipsis, furrr, generics, lifecycle, methods, purrr, rlang (>= 0.4.10), slider (>= 0.1.5), tibble, tidyr, tidyselect, vctrs (>= 0.3.0) Suggests: broom, covr, ggplot2, knitr, modeldata, recipes (>= 0.1.4), rmarkdown, stats, testthat, utils, xml2 VignetteBuilder: knitr Config/Needs/website: GGally, nlstools, survival, tidymodels, tidyposterior, tidyverse/tidytemplate Encoding: UTF-8 RoxygenNote: 7.1.2 NeedsCompilation: no Packaged: 2021-11-08 20:30:17 UTC; juliasilge Author: Julia Silge [aut, cre] (), Fanny Chow [aut], Max Kuhn [aut], Hadley Wickham [aut], RStudio [cph] Repository: CRAN Date/Publication: 2021-11-08 21:00:02 UTC rsample/man/0000755000175000017500000000000014136023335012622 5ustar nileshnileshrsample/man/bootstraps.Rd0000644000175000017500000000651214066674573015337 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boot.R \name{bootstraps} \alias{bootstraps} \title{Bootstrap Sampling} \usage{ bootstraps( data, times = 25, strata = NULL, breaks = 4, pool = 0.1, apparent = 
FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{times}{The number of bootstrap samples.} \item{strata}{A variable in \code{data} (single character or name) used to conduct stratified sampling. When not \code{NULL}, each resample is created within the stratification variable. Numeric \code{strata} are binned into quartiles.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{apparent}{A logical. Should an extra resample be added where the analysis and holdout subset are the entire data set. This is required for some estimators used by the \code{summary} function that require the apparent error rate.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{bootstraps}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ A bootstrap sample is a sample that is the same size as the original data set that is made using replacement. This results in analysis samples that have multiple replicates of some of the original rows of the data. The assessment set is defined as the rows of the original data that were not included in the bootstrap sample. This is often referred to as the "out-of-bag" (OOB) sample. } \details{ The argument \code{apparent} enables the option of an additional "resample" where the analysis and assessment data sets are the same as the original data set. This can be required for some types of analysis of the bootstrap results. With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. 
This can help ensure that the resamples have equivalent proportions as the original data set. For a categorical variable, sampling is conducted separately within each class. For a numeric stratification variable, \code{strata} is binned into quartiles, which are then used to stratify. Strata below 10\% of the total are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details. } \examples{ bootstraps(mtcars, times = 2) bootstraps(mtcars, times = 2, apparent = TRUE) library(purrr) library(modeldata) data(wa_churn) set.seed(13) resample1 <- bootstraps(wa_churn, times = 3) map_dbl(resample1$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) resample2 <- bootstraps(wa_churn, strata = churn, times = 3) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) resample3 <- bootstraps(wa_churn, strata = tenure, breaks = 6, times = 3) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) } rsample/man/drinks.Rd0000644000175000017500000000071113673171774014422 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \name{drinks} \alias{drinks} \title{Sample Time Series Data} \description{ Sample Time Series Data } \details{ Drink sales. The exact name of the series from FRED is: "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales" These data are now in the \code{modeldata} package. 
} rsample/man/pretty.vfold_cv.Rd0000644000175000017500000000236613755042647016265 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/labels.R \name{pretty.vfold_cv} \alias{pretty.vfold_cv} \alias{pretty.loo_cv} \alias{pretty.apparent} \alias{pretty.rolling_origin} \alias{pretty.sliding_window} \alias{pretty.sliding_index} \alias{pretty.sliding_period} \alias{pretty.mc_cv} \alias{pretty.validation_split} \alias{pretty.nested_cv} \alias{pretty.bootstraps} \alias{pretty.permutations} \alias{pretty.group_vfold_cv} \alias{pretty.manual_rset} \title{Short Descriptions of rsets} \usage{ \method{pretty}{vfold_cv}(x, ...) \method{pretty}{loo_cv}(x, ...) \method{pretty}{apparent}(x, ...) \method{pretty}{rolling_origin}(x, ...) \method{pretty}{sliding_window}(x, ...) \method{pretty}{sliding_index}(x, ...) \method{pretty}{sliding_period}(x, ...) \method{pretty}{mc_cv}(x, ...) \method{pretty}{validation_split}(x, ...) \method{pretty}{nested_cv}(x, ...) \method{pretty}{bootstraps}(x, ...) \method{pretty}{permutations}(x, ...) \method{pretty}{group_vfold_cv}(x, ...) \method{pretty}{manual_rset}(x, ...) } \arguments{ \item{x}{An \code{rset} object} \item{...}{Not currently used.} } \value{ A character vector. } \description{ Produce a character vector describing the resampling method. } \keyword{internal} rsample/man/nested_cv.Rd0000644000175000017500000000423713727757057015115 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nest.R \name{nested_cv} \alias{nested_cv} \title{Nested or Double Resampling} \usage{ nested_cv(data, outside, inside) } \arguments{ \item{data}{A data frame.} \item{outside}{The initial resampling specification. This can be an already created object or an expression of a new object (see the examples below). 
If the latter is used, the \code{data} argument does not need to be specified and, if it is given, will be ignored.} \item{inside}{An expression for the type of resampling to be conducted within the initial procedure.} } \value{ An tibble with \code{nested_cv} class and any other classes that outer resampling process normally contains. The results include a column for the outer data split objects, one or more \code{id} columns, and a column of nested tibbles called \code{inner_resamples} with the additional resamples. } \description{ \code{nested_cv} can be used to take the results of one resampling procedure and conduct further resamples within each split. Any type of resampling used in \code{rsample} can be used. } \details{ It is a bad idea to use bootstrapping as the outer resampling procedure (see the example below) } \examples{ ## Using expressions for the resampling procedures: nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5)) ## Using an existing object: folds <- vfold_cv(mtcars) nested_cv(mtcars, folds, inside = bootstraps(times = 5)) ## The dangers of outer bootstraps: set.seed(2222) bad_idea <- nested_cv(mtcars, outside = bootstraps(times = 5), inside = vfold_cv(v = 3)) first_outer_split <- bad_idea$splits[[1]] outer_analysis <- as.data.frame(first_outer_split) sum(grepl("Volvo 142E", rownames(outer_analysis))) ## For the 3-fold CV used inside of each bootstrap, how are the replicated ## `Volvo 142E` data partitioned? 
first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]] inner_analysis <- as.data.frame(first_inner_split) inner_assess <- as.data.frame(first_inner_split, data = "assessment") sum(grepl("Volvo 142E", rownames(inner_analysis))) sum(grepl("Volvo 142E", rownames(inner_assess))) } rsample/man/mc_cv.Rd0000644000175000017500000000535514066674573014232 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mc.R \name{mc_cv} \alias{mc_cv} \title{Monte Carlo Cross-Validation} \usage{ mc_cv(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, pool = 0.1, ...) } \arguments{ \item{data}{A data frame.} \item{prop}{The proportion of data to be retained for modeling/analysis.} \item{times}{The number of times to repeat the sampling.} \item{strata}{A variable in \code{data} (single character or name) used to conduct stratified sampling. When not \code{NULL}, each resample is created within the stratification variable. Numeric \code{strata} are binned into quartiles.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{mc_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ One resample of Monte Carlo cross-validation takes a random sample (without replacement) of the original data set to be used for analysis. All other data points are added to the assessment set. 
} \details{ With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For a categorical variable, sampling is conducted separately within each class. For a numeric stratification variable, \code{strata} is binned into quartiles, which are then used to stratify. Strata below 10\% of the total are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details. } \examples{ mc_cv(mtcars, times = 2) mc_cv(mtcars, prop = .5, times = 2) library(purrr) data(wa_churn, package = "modeldata") set.seed(13) resample1 <- mc_cv(wa_churn, times = 3, prop = .5) map_dbl(resample1$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) resample2 <- mc_cv(wa_churn, strata = churn, times = 3, prop = .5) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) resample3 <- mc_cv(wa_churn, strata = tenure, breaks = 6, times = 3, prop = .5) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) } rsample/man/rolling_origin.Rd0000644000175000017500000000610713727757057016156 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rolling_origin.R \name{rolling_origin} \alias{rolling_origin} \title{Rolling Origin Forecast Resampling} \usage{ rolling_origin( data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, lag = 0, ... ) } \arguments{ \item{data}{A data frame.} \item{initial}{The number of samples used for analysis/modeling in the initial resample.} \item{assess}{The number of samples used for each assessment resample.} \item{cumulative}{A logical. 
Should the analysis resample grow beyond the size specified by \code{initial} at each resample?.} \item{skip}{A integer indicating how many (if any) \emph{additional} resamples to skip to thin the total amount of data points in the analysis resample. See the example below.} \item{lag}{A value to include a lag between the assessment and analysis set. This is useful if lagged predictors will be used during training and testing.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{rolling_origin}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ This resampling method is useful when the data set has a strong time component. The resamples are not random and contain data points that are consecutive values. The function assumes that the original data set are sorted in time order. } \details{ The main options, \code{initial} and \code{assess}, control the number of data points from the original data that are in the analysis and assessment set, respectively. When \code{cumulative = TRUE}, the analysis set will grow as resampling continues while the assessment set size will always remain static. \code{skip} enables the function to not use every data point in the resamples. When \code{skip = 0}, the resampling data sets will increment by one position. Suppose that the rows of a data set are consecutive days. Using \code{skip = 6} will make the analysis data set to operate on \emph{weeks} instead of days. The assessment set size is not affected by this option. 
} \examples{ set.seed(1131) ex_data <- data.frame(row = 1:20, some_var = rnorm(20)) dim(rolling_origin(ex_data)) dim(rolling_origin(ex_data, skip = 2)) dim(rolling_origin(ex_data, skip = 2, cumulative = FALSE)) # You can also roll over calendar periods by first nesting by that period, # which is especially useful for irregular series where a fixed window # is not useful. This example slides over 5 years at a time. library(dplyr) library(tidyr) data(drinks, package = "modeldata") drinks_annual <- drinks \%>\% mutate(year = as.POSIXlt(date)$year + 1900) \%>\% nest(-year) multi_year_roll <- rolling_origin(drinks_annual, cumulative = FALSE) analysis(multi_year_roll$splits[[1]]) assessment(multi_year_roll$splits[[1]]) } \seealso{ \code{\link[=sliding_window]{sliding_window()}}, \code{\link[=sliding_index]{sliding_index()}}, and \code{\link[=sliding_period]{sliding_period()}} for additional time based resampling functions. } rsample/man/group_vfold_cv.Rd0000644000175000017500000000344714124441103016131 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/groups.R \name{group_vfold_cv} \alias{group_vfold_cv} \title{Group V-Fold Cross-Validation} \usage{ group_vfold_cv(data, group = NULL, v = NULL, ...) } \arguments{ \item{data}{A data frame.} \item{group}{A variable in \code{data} (single character or name) used for grouping observations with the same value to either the analysis or assessment set within a fold.} \item{v}{The number of partitions of the data set. If let \code{NULL}, \code{v} will be set to the number of unique values in the group.} \item{...}{Not currently used.} } \value{ A tibble with classes \code{group_vfold_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and an identification variable. 
} \description{ Group V-fold cross-validation creates splits of the data based on some grouping variable (which may have more than a single row associated with it). The function can create as many splits as there are unique values of the grouping variable or it can create a smaller set of splits where more than one value is left out at a time. A common use of this kind of resampling is when you have repeated measures of the same subject. } \examples{ set.seed(3527) test_data <- data.frame(id = sort(sample(1:20, size = 80, replace = TRUE))) test_data$dat <- runif(nrow(test_data)) set.seed(5144) split_by_id <- group_vfold_cv(test_data, group = "id") get_id_left_out <- function(x) unique(assessment(x)$id) library(purrr) table(map_int(split_by_id$splits, get_id_left_out)) set.seed(5144) split_by_some_id <- group_vfold_cv(test_data, group = "id", v = 7) held_out <- map(split_by_some_id$splits, get_id_left_out) table(unlist(held_out)) # number held out per resample: map_int(held_out, length) } rsample/man/reexports.Rd0000644000175000017500000000204713755042647015164 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexports.R, R/tidyselect.R \docType{import} \name{reexports} \alias{reexports} \alias{tidy} \alias{gather} \alias{contains} \alias{select_helpers} \alias{ends_with} \alias{everything} \alias{matches} \alias{num_range} \alias{starts_with} \alias{last_col} \alias{any_of} \alias{all_of} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ \item{generics}{\code{\link[generics]{tidy}}} \item{tidyr}{\code{\link[tidyr]{gather}}} \item{tidyselect}{\code{\link[tidyselect]{all_of}}, \code{\link[tidyselect:all_of]{any_of}}, \code{\link[tidyselect:starts_with]{contains}}, \code{\link[tidyselect:starts_with]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect:everything]{last_col}}, \code{\link[tidyselect:starts_with]{matches}}, \code{\link[tidyselect:starts_with]{num_range}}, \code{\link[tidyselect]{starts_with}}} }} rsample/man/gather.rset.Rd0000644000175000017500000000401714122424444015342 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gather.R \name{gather.rset} \alias{gather.rset} \title{Gather an \code{rset} Object} \usage{ \method{gather}{rset}( data, key = NULL, value = NULL, ..., na.rm = TRUE, convert = FALSE, factor_key = TRUE ) } \arguments{ \item{data}{An \code{rset} object.} \item{key, value, ...}{Not specified in this method and will be ignored. Note that this means that selectors are ignored if they are passed to the function.} \item{na.rm}{If \code{TRUE}, will remove rows from output where the value column in \code{NA}.} \item{convert}{If \code{TRUE} will automatically run \code{type.convert()} on the key column. This is useful if the column names are actually numeric, integer, or logical.} \item{factor_key}{If FALSE, the default, the key values will be stored as a character vector. If \code{TRUE}, will be stored as a factor, which preserves the original ordering of the columns.} } \value{ A data frame with the ID columns, a column called \code{model} (with the previous column names), and a column called \code{statistic} (with the values). 
} \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This method uses \code{gather} on an \code{rset} object to stack all of the non-ID or split columns in the data and is useful for stacking model evaluation statistics. The resulting data frame has a column based on the column names of \code{data} and another for the values. This method is now deprecated in favor of using \code{\link[tidyr:pivot_longer]{tidyr::pivot_longer()}} directly. } \examples{ library(rsample) cv_obj <- vfold_cv(mtcars, v = 10) cv_obj$lm_rmse <- rnorm(10, mean = 2) cv_obj$nnet_rmse <- rnorm(10, mean = 1) ## now deprecated for rset objects: ## gather(cv_obj) ## instead of gather, use tidyr::pivot_longer: library(tidyr) library(dplyr) cv_obj \%>\% select(-splits) \%>\% pivot_longer(-id) } \keyword{internal} rsample/man/int_pctl.Rd0000644000175000017500000000647114010267142014732 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bootci.R \name{int_pctl} \alias{int_pctl} \alias{int_t} \alias{int_bca} \title{Bootstrap confidence intervals} \usage{ int_pctl(.data, statistics, alpha = 0.05) int_t(.data, statistics, alpha = 0.05) int_bca(.data, statistics, alpha = 0.05, .fn, ...) } \arguments{ \item{.data}{A data frame containing the bootstrap resamples created using \code{bootstraps()}. For t- and BCa-intervals, the \code{apparent} argument should be set to \code{TRUE}. Even if the \code{apparent} argument is set to \code{TRUE} for the percentile method, the apparent data is never used in calculating the percentile confidence interval.} \item{statistics}{An unquoted column name or \code{dplyr} selector that identifies a single column in the data set that contains the individual bootstrap estimates. This can be a list column of tidy tibbles (that contains columns \code{term} and \code{estimate}) or a simple numeric column. 
For t-intervals, a standard tidy column (usually called \code{std.err}) is required. See the examples below.} \item{alpha}{Level of significance} \item{.fn}{A function to calculate statistic of interest. The function should take an \code{rsplit} as the first argument and the \code{...} are required.} \item{...}{Arguments to pass to \code{.fn}.} } \value{ Each function returns a tibble with columns \code{.lower}, \code{.estimate}, \code{.upper}, \code{.alpha}, \code{.method}, and \code{term}. \code{.method} is the type of interval (eg. "percentile", "student-t", or "BCa"). \code{term} is the name of the estimate. Note the \code{.estimate} returned from \code{int_pctl()} is the mean of the estimates from the bootstrap resamples and not the estimate from the apparent model. } \description{ Calculate bootstrap confidence intervals using various methods. } \details{ Percentile intervals are the standard method of obtaining confidence intervals but require thousands of resamples to be accurate. T-intervals may need fewer resamples but require a corresponding variance estimate. Bias-corrected and accelerated intervals require the original function that was used to create the statistics of interest and are computationally taxing. } \examples{ \donttest{ library(broom) library(dplyr) library(purrr) library(tibble) lm_est <- function(split, ...) 
{ lm(mpg ~ disp + hp, data = analysis(split)) \%>\% tidy() } set.seed(52156) car_rs <- bootstraps(mtcars, 500, apparent = TRUE) \%>\% mutate(results = map(splits, lm_est)) int_pctl(car_rs, results) int_t(car_rs, results) int_bca(car_rs, results, .fn = lm_est) # putting results into a tidy format rank_corr <- function(split) { dat <- analysis(split) tibble( term = "corr", estimate = cor(dat$sqft, dat$price, method = "spearman"), # don't know the analytical std.err so no t-intervals std.err = NA_real_ ) } set.seed(69325) data(Sacramento, package = "modeldata") bootstraps(Sacramento, 1000, apparent = TRUE) \%>\% mutate(correlations = map(splits, rank_corr)) \%>\% int_pctl(correlations) } } \references{ Davison, A., & Hinkley, D. (1997). \emph{Bootstrap Methods and their Application}. Cambridge: Cambridge University Press. doi:10.1017/CBO9780511802843 \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html} } \seealso{ \code{\link[=reg_intervals]{reg_intervals()}} } rsample/man/validation_split.Rd0000644000175000017500000000476214122413046016464 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/validation_split.R \name{validation_split} \alias{validation_split} \alias{validation_time_split} \title{Create a Validation Set} \usage{ validation_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) validation_time_split(data, prop = 3/4, lag = 0, ...) } \arguments{ \item{data}{A data frame.} \item{prop}{The proportion of data to be retained for modeling/analysis.} \item{strata}{A variable in \code{data} (single character or name) used to conduct stratified sampling. When not \code{NULL}, each resample is created within the stratification variable. 
Numeric \code{strata} are binned into quartiles.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{...}{Not currently used.} \item{lag}{A value to include a lag between the assessment and analysis set. This is useful if lagged predictors will be used during training and testing.} } \value{ An tibble with classes \code{validation_split}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ \code{validation_split()} takes a single random sample (without replacement) of the original data set to be used for analysis. All other data points are added to the assessment set (to be used as the validation set). \code{validation_time_split()} does the same, but takes the \emph{first} \code{prop} samples for training, instead of a random selection. } \details{ With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For a categorical variable, sampling is conducted separately within each class. For a numeric stratification variable, \code{strata} is binned into quartiles, which are then used to stratify. Strata below 10\% of the total are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details. 
} \examples{ validation_split(mtcars, prop = .9) data(drinks, package = "modeldata") validation_time_split(drinks) } rsample/man/initial_split.Rd0000644000175000017500000000556714066674573016014 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initial_split.R \name{initial_split} \alias{initial_split} \alias{initial_time_split} \alias{training} \alias{testing} \title{Simple Training/Test Set Splitting} \usage{ initial_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) initial_time_split(data, prop = 3/4, lag = 0, ...) training(x) testing(x) } \arguments{ \item{data}{A data frame.} \item{prop}{The proportion of data to be retained for modeling/analysis.} \item{strata}{A variable in \code{data} (single character or name) used to conduct stratified sampling. When not \code{NULL}, each resample is created within the stratification variable. Numeric \code{strata} are binned into quartiles.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{...}{Not currently used.} \item{lag}{A value to include a lag between the assessment and analysis set. This is useful if lagged predictors will be used during training and testing.} \item{x}{An \code{rsplit} object produced by \code{initial_split}} } \value{ An \code{rsplit} object that can be used with the \code{training} and \code{testing} functions to extract the data in each split. } \description{ \code{initial_split} creates a single binary split of the data into a training set and testing set. \code{initial_time_split} does the same, but takes the \emph{first} \code{prop} samples for training, instead of a random selection. 
\code{training} and \code{testing} are used to extract the resulting data. } \details{ With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For a categorical variable, sampling is conducted separately within each class. For a numeric stratification variable, \code{strata} is binned into quartiles, which are then used to stratify. Strata below 10\% of the total are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details. } \examples{ set.seed(1353) car_split <- initial_split(mtcars) train_data <- training(car_split) test_data <- testing(car_split) data(drinks, package = "modeldata") drinks_split <- initial_time_split(drinks) train_data <- training(drinks_split) test_data <- testing(drinks_split) c(max(train_data$date), min(test_data$date)) # no lag # With 12 period lag drinks_lag_split <- initial_time_split(drinks, lag = 12) train_data <- training(drinks_lag_split) test_data <- testing(drinks_lag_split) c(max(train_data$date), min(test_data$date)) # 12 period lag } rsample/man/make_strata.Rd0000644000175000017500000000475614066674573015442 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/make_strata.R \name{make_strata} \alias{make_strata} \title{Create or Modify Stratification Variables} \usage{ make_strata(x, breaks = 4, nunique = 5, pool = 0.1, depth = 20) } \arguments{ \item{x}{An input vector.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{nunique}{An integer for the number of unique value threshold in the algorithm.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. 
We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{depth}{An integer that is used to determine the best number of percentiles that should be used. The number of bins are based on \code{min(5, floor(n / depth))} where \code{n = length(x)}. If \code{x} is numeric, there must be at least 40 rows in the data set (when \code{depth = 20}) to conduct stratified sampling.} } \value{ A factor vector. } \description{ This function can create strata from numeric data and make non-numeric data more conducive for stratification. } \details{ For numeric data, if the number of unique levels is less than \code{nunique}, the data are treated as categorical data. For categorical inputs, the function will find levels of \code{x} than occur in the data with percentage less than \code{pool}. The values from these groups will be randomly assigned to the remaining strata (as will data points that have missing values in \code{x}). For numeric data with more unique values than \code{nunique}, the data will be converted to being categorical based on percentiles of the data. The percentile groups will have no more than 20 percent of the data in each group. Again, missing values in \code{x} are randomly assigned to groups. 
} \examples{ set.seed(61) x1 <- rpois(100, lambda = 5) table(x1) table(make_strata(x1)) set.seed(554) x2 <- rpois(100, lambda = 1) table(x2) table(make_strata(x2)) # small groups are randomly assigned x3 <- factor(x2) table(x3) table(make_strata(x3)) # `oilType` data from `caret` x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2)) table(x4) table(make_strata(x4)) table(make_strata(x4, pool = 0.1)) table(make_strata(x4, pool = 0.0)) # not enough data to stratify x5 <- rnorm(20) table(make_strata(x5)) set.seed(483) x6 <- rnorm(200) quantile(x6, probs = (0:10)/10) table(make_strata(x6, breaks = 10)) } rsample/man/reg_intervals.Rd0000644000175000017500000000410514010267142015752 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reg_intervals.R \name{reg_intervals} \alias{reg_intervals} \title{A convenience function for confidence intervals with linear-ish parametric models} \usage{ reg_intervals( formula, data, model_fn = "lm", type = "student-t", times = NULL, alpha = 0.05, filter = term != "(Intercept)", keep_reps = FALSE, ... ) } \arguments{ \item{formula}{An R model formula with one outcome and at least one predictor.} \item{data}{A data frame.} \item{model_fn}{The model to fit. Allowable values are "lm", "glm", "survreg", and "coxph". The latter two require that the \code{survival} package be installed.} \item{type}{The type of bootstrap confidence interval. Values of "student-t" and "percentile" are allowed.} \item{times}{A single integer for the number of bootstrap samples. 
If left NULL, 1,001 are used for t-intervals and 2,001 for percentile intervals.} \item{alpha}{Level of significance.} \item{filter}{A logical expression used to remove rows from the final result, or \code{NULL} to keep all rows.} \item{keep_reps}{Should the individual parameter estimates for each bootstrap sample be retained?} \item{...}{Options to pass to the model function (such as \code{family} for \code{glm()}).} } \value{ A tibble with columns "term", ".lower", ".estimate", ".upper", ".alpha", and ".method". If \code{keep_reps = TRUE}, an additional list column called ".replicates" is also returned. } \description{ A convenience function for confidence intervals with linear-ish parametric models } \examples{ \donttest{ set.seed(1) reg_intervals(mpg ~ I(1/sqrt(disp)), data = mtcars) set.seed(1) reg_intervals(mpg ~ I(1/sqrt(disp)), data = mtcars, keep_reps = TRUE) } } \references{ Davison, A., & Hinkley, D. (1997). \emph{Bootstrap Methods and their Application}. Cambridge: Cambridge University Press. doi:10.1017/CBO9780511802843 \emph{Bootstrap Confidence Intervals}, \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html} } \seealso{ \code{\link[=int_pctl]{int_pctl()}}, \code{\link[=int_t]{int_t()}} } rsample/man/attrition.Rd0000644000175000017500000000121713673171774015147 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \name{attrition} \alias{attrition} \title{Job Attrition} \description{ Job Attrition } \details{ These data are from the IBM Watson Analytics Lab. The website describes the data with \dQuote{Uncover the factors that lead to employee attrition and explore important questions such as \sQuote{show me a breakdown of distance from home by job role and attrition} or \sQuote{compare average monthly income by education and attrition}. This is a fictional data set created by IBM data scientists.}. There are 1470 rows. These data are now in the \code{modeldata} package. 
} rsample/man/rset_reconstruct.Rd0000644000175000017500000000302714010267142016520 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compat-vctrs-helpers.R \name{rset_reconstruct} \alias{rset_reconstruct} \title{Extending rsample with new rset subclasses} \usage{ rset_reconstruct(x, to) } \arguments{ \item{x}{A data frame to restore to an rset subclass.} \item{to}{An rset subclass to restore to.} } \value{ \code{x} restored to the rset subclass of \code{to}. } \description{ \code{rset_reconstruct()} encapsulates the logic for allowing new rset subclasses to work properly with vctrs (through \code{vctrs::vec_restore()}) and dplyr (through \code{dplyr::dplyr_reconstruct()}). It is intended to be a developer tool, and is not required for normal usage of rsample. } \details{ rset objects are considered "reconstructable" after a vctrs/dplyr operation if: \itemize{ \item \code{x} and \code{to} both have an identical column named \code{"splits"} (column and row order do not matter). \item \code{x} and \code{to} both have identical columns prefixed with \code{"id"} (column and row order do not matter). } } \examples{ to <- bootstraps(mtcars, times = 25) # Imitate a vctrs/dplyr operation, # where the class might be lost along the way x <- tibble::as_tibble(to) # Say we added a new column to `x`. Here we mock a `mutate()`. x$foo <- "bar" # This is still reconstructable to `to` rset_reconstruct(x, to) # Say we lose the first row x <- x[-1,] # This is no longer reconstructable to `to`, as `x` is no longer an rset # bootstraps object with 25 bootstraps if one is lost! rset_reconstruct(x, to) } rsample/man/manual_rset.Rd0000644000175000017500000000227713727757057015457 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/manual.R \name{manual_rset} \alias{manual_rset} \title{Manual resampling} \usage{ manual_rset(splits, ids) } \arguments{ \item{splits}{A list of \code{"rsplit"} objects. 
It is easiest to create these using \code{\link[=make_splits]{make_splits()}}.} \item{ids}{A character vector of ids. The length of \code{ids} must be the same as the length of \code{splits}.} } \description{ \code{manual_rset()} is used for constructing the most minimal rset possible. It can be useful when you have custom rsplit objects built from \code{\link[=make_splits]{make_splits()}}, or when you want to create a new rset from splits contained within an existing rset. } \examples{ df <- data.frame(x = c(1, 2, 3, 4, 5, 6)) # Create an rset from custom indices indices <- list( list(analysis = c(1L, 2L), assessment = 3L), list(analysis = c(4L, 5L), assessment = 6L) ) splits <- lapply(indices, make_splits, data = df) manual_rset(splits, c("Split 1", "Split 2")) # You can also use this to create an rset from a subset of an # existing rset resamples <- vfold_cv(mtcars) best_split <- resamples[5,] manual_rset(best_split$splits, best_split$id) } rsample/man/vfold_cv.Rd0000644000175000017500000000640314136023335014716 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/vfold.R \name{vfold_cv} \alias{vfold_cv} \title{V-Fold Cross-Validation} \usage{ vfold_cv(data, v = 10, repeats = 1, strata = NULL, breaks = 4, pool = 0.1, ...) } \arguments{ \item{data}{A data frame.} \item{v}{The number of partitions of the data set.} \item{repeats}{The number of times to repeat the V-fold partitioning.} \item{strata}{A variable in \code{data} (single character or name) used to conduct stratified sampling. When not \code{NULL}, each resample is created within the stratification variable. Numeric \code{strata} are binned into quartiles.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group. 
We do not recommend decreasing this argument below its default of 0.1 because of the dangers of stratifying groups that are too small.} \item{...}{Not currently used.} } \value{ A tibble with classes \code{vfold_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and one or more identification variables. For a single repeat, there will be one column called \code{id} that has a character string with the fold identifier. For repeats, \code{id} is the repeat number and an additional column called \code{id2} that contains the fold information (within repeat). } \description{ V-fold cross-validation (also known as k-fold cross-validation) randomly splits the data into V groups of roughly equal size (called "folds"). A resample of the analysis data consisted of V-1 of the folds while the assessment set contains the final fold. In basic V-fold cross-validation (i.e. no repeats), the number of resamples is equal to V. } \details{ With more than one repeat, the basic V-fold cross-validation is conducted each time. For example, if three repeats are used with \code{v = 10}, there are a total of 30 splits: three groups of 10 that are generated separately. With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For a categorical variable, sampling is conducted separately within each class. For a numeric stratification variable, \code{strata} is binned into quartiles, which are then used to stratify. Strata below 10\% of the total are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details. 
} \examples{ vfold_cv(mtcars, v = 10) vfold_cv(mtcars, v = 10, repeats = 2) library(purrr) data(wa_churn, package = "modeldata") set.seed(13) folds1 <- vfold_cv(wa_churn, v = 5) map_dbl(folds1$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) folds2 <- vfold_cv(wa_churn, strata = churn, v = 5) map_dbl(folds2$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) set.seed(13) folds3 <- vfold_cv(wa_churn, strata = tenure, breaks = 6, v = 5) map_dbl(folds3$splits, function(x) { dat <- as.data.frame(x)$churn mean(dat == "Yes") }) } rsample/man/two_class_dat.Rd0000644000175000017500000000055313727757057015766 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \name{two_class_dat} \alias{two_class_dat} \title{Two Class Data} \description{ Two Class Data } \details{ There are artificial data with two predictors (\code{A} and \code{B}) and a factor outcome variable (\code{Class}). These data are now in the \code{modeldata} package. } rsample/man/permutations.Rd0000644000175000017500000000437713755042647015673 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/permutations.R \name{permutations} \alias{permutations} \title{Permutation sampling} \usage{ permutations(data, permute = NULL, times = 25, apparent = FALSE, ...) } \arguments{ \item{data}{A data frame.} \item{permute}{One or more columns to shuffle. This argument supports \code{tidyselect} selectors. Multiple expressions can be combined with \code{c()}. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be used to select a range of variables. See \code{\link[tidyselect]{language}} for more details.} \item{times}{The number of permutation samples.} \item{apparent}{A logical. 
Should an extra resample be added where the analysis is the standard data set.} \item{...}{Not currently used.} } \value{ A \code{tibble} with classes \code{permutations}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ A permutation sample is the same size as the original data set and is made by permuting/shuffling one or more columns. This results in analysis samples where some columns are in their original order and some columns are permuted to a random order. Unlike other sampling functions in \code{rsample}, there is no assessment set and calling \code{assessment()} on a permutation split will throw an error. } \details{ The argument \code{apparent} enables the option of an additional "resample" where the analysis data set is the same as the original data set. Permutation-based resampling can be especially helpful for computing a statistic under the null hypothesis (e.g. t-statistic). This forms the basis of a permutation test, which computes a test statistic under all possible permutations of the data. } \examples{ permutations(mtcars, mpg, times = 2) permutations(mtcars, mpg, times = 2, apparent = TRUE) library(purrr) resample1 <- permutations(mtcars, starts_with("c"), times = 1) resample1$splits[[1]] \%>\% analysis() resample2 <- permutations(mtcars, hp, times = 10, apparent = TRUE) map_dbl(resample2$splits, function(x) { t.test(hp ~ vs, data = analysis(x))$statistic }) } rsample/man/rsample-dplyr.Rd0000644000175000017500000001040113673171774015720 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compat-dplyr.R \name{rsample-dplyr} \alias{rsample-dplyr} \title{Compatibility with dplyr} \description{ rsample should be fully compatible with dplyr 1.0.0. 
With older versions of dplyr, there is partial support for the following verbs: \code{mutate()}, \code{arrange()}, \code{filter()}, \code{rename()}, \code{select()}, and \code{slice()}. We strongly recommend updating to dplyr 1.0.0 if possible to get more complete integration with dplyr. } \section{Version Specific Behavior}{ rsample performs somewhat differently depending on whether you have dplyr >= 1.0.0 (new) or dplyr < 1.0.0 (old). Additionally, version 0.0.7 of rsample (new) introduced some changes to how rsample objects work with dplyr, even on old dplyr. Most of these changes influence the return value of a dplyr verb and determine whether it will be a tibble or an rsample rset subclass. The table below attempts to capture most of these changes. These examples are not exhaustive and may not capture some edge-cases. \subsection{Joins}{ The following affect all of the dplyr joins, such as \code{left_join()}, \code{right_join()}, \code{full_join()}, and \code{inner_join()}. Joins that alter the rows of the original rset object:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{join(rset, tbl)} \tab error \tab error \tab tibble \cr } The idea here is that, if there are less rows in the result, the result should not be an rset object. For example, you can't have a 10-fold CV object without 10 rows. Joins that keep the rows of the original rset object:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{join(rset, tbl)} \tab error \tab error \tab rset \cr } As with the logic above, if the original rset object (defined by the split column and the id column(s)) is left intact, the results should be an rset. } \subsection{Row Subsetting}{ As mentioned above, this should result in a tibble if any rows are removed or added. Simply reordering rows still results in a valid rset with new rsample. 
Cases where rows are removed or added:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{rset[ind,]} \tab tibble \tab tibble \tab tibble \cr \code{slice(rset)} \tab rset \tab tibble \tab tibble \cr \code{filter(rset)} \tab rset \tab tibble \tab tibble \cr } Cases where all rows are kept, but are possibly reordered:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{rset[ind,]} \tab tibble \tab rset \tab rset \cr \code{slice(rset)} \tab rset \tab rset \tab rset \cr \code{filter(rset)} \tab rset \tab rset \tab rset \cr \code{arrange(rset)} \tab rset \tab rset \tab rset \cr } } \subsection{Column Subsetting}{ When the \code{splits} column or any \code{id} columns are dropped or renamed, the result should no longer be considered a valid rset. Cases when the required columns are removed or renamed:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{rset[,ind]} \tab tibble \tab tibble \tab tibble \cr \code{select(rset)} \tab rset \tab tibble \tab tibble \cr \code{rename(rset)} \tab tibble \tab tibble \tab tibble \cr } Cases when no required columns are affected:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{rset[,ind]} \tab tibble \tab rset \tab rset \cr \code{select(rset)} \tab rset \tab rset \tab rset \cr \code{rename(rset)} \tab rset \tab rset \tab rset \cr } } \subsection{Other Column Operations}{ Cases when the required columns are altered:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr \code{mutate(rset)} \tab rset \tab tibble \tab tibble \cr } Cases when no required columns are affected:\tabular{lccc}{ operation \tab old rsample + old dplyr \tab new rsample + old dplyr \tab new rsample + new dplyr \cr 
\code{mutate(rset)} \tab rset \tab rset \tab rset \cr } } } rsample/man/new_rset.Rd0000644000175000017500000000163114010267142014735 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rset.R \name{new_rset} \alias{new_rset} \title{Constructor for new rset objects} \usage{ new_rset(splits, ids, attrib = NULL, subclass = character()) } \arguments{ \item{splits}{A list column of \code{rsplits} or a tibble with a single column called "splits" with a list column of \code{rsplits}.} \item{ids}{A character vector or a tibble with one or more columns that begin with "id".} \item{attrib}{An optional named list of attributes to add to the object.} \item{subclass}{A character vector of subclasses to add.} } \value{ An \code{rset} object. } \description{ Constructor for new rset objects } \details{ Once the new \code{rset} is constructed, an additional attribute called "fingerprint" is added that is a hash of the \code{rset}. This can be used to make sure other objects have the exact same resamples. } \keyword{internal} rsample/man/make_splits.Rd0000644000175000017500000000152414066703504015433 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/misc.R \name{make_splits} \alias{make_splits} \alias{make_splits.default} \alias{make_splits.list} \alias{make_splits.data.frame} \title{Constructors for split objects} \usage{ make_splits(x, ...) \method{make_splits}{default}(x, ...) \method{make_splits}{list}(x, data, class = NULL, ...) \method{make_splits}{data.frame}(x, assessment, ...) 
} \arguments{ \item{x}{A list of integers with names "analysis" and "assessment", or a data frame of analysis or training data.} \item{...}{Further arguments passed to or from other methods (not currently used).} \item{data}{A data frame.} \item{class}{An optional class to give the object.} \item{assessment}{A data frame of assessment or testing data, which can be empty.} } \description{ Constructors for split objects } rsample/man/as.data.frame.rsplit.Rd0000644000175000017500000000225713653053433017044 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rsplit.R \name{as.data.frame.rsplit} \alias{as.data.frame.rsplit} \alias{analysis} \alias{assessment} \title{Convert an \code{rsplit} object to a data frame} \usage{ \method{as.data.frame}{rsplit}(x, row.names = NULL, optional = FALSE, data = "analysis", ...) analysis(x, ...) assessment(x, ...) } \arguments{ \item{x}{An \code{rsplit} object.} \item{row.names}{\code{NULL} or a character vector giving the row names for the data frame. Missing values are not allowed.} \item{optional}{A logical: should the column names of the data be checked for legality?} \item{data}{Either "analysis" or "assessment" to specify which data are returned.} \item{...}{Additional arguments to be passed to or from methods. Not currently used.} } \description{ The analysis or assessment code can be returned as a data frame (as dictated by the \code{data} argument) using \code{as.data.frame.rsplit}. \code{analysis} and \code{assessment} are shortcuts. 
} \examples{ library(dplyr) set.seed(104) folds <- vfold_cv(mtcars) model_data_1 <- folds$splits[[1]] \%>\% analysis() holdout_data_1 <- folds$splits[[1]] \%>\% assessment() } rsample/man/labels.rsplit.Rd0000644000175000017500000000104213755042647015701 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/labels.R \name{labels.rsplit} \alias{labels.rsplit} \title{Find Labels from rsplit Object} \usage{ \method{labels}{rsplit}(object, ...) } \arguments{ \item{object}{An \code{rsplit} object} \item{...}{Not currently used.} } \value{ A tibble. } \description{ Produce a tibble of identification variables so that single splits can be linked to a particular resample. } \examples{ cv_splits <- vfold_cv(mtcars) labels(cv_splits$splits[[1]]) } \seealso{ add_resample_id } rsample/man/tidy.rsplit.Rd0000644000175000017500000000456013653053433015410 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tidy.R \name{tidy.rsplit} \alias{tidy.rsplit} \alias{tidy.rset} \alias{tidy.vfold_cv} \alias{tidy.nested_cv} \title{Tidy Resampling Object} \usage{ \method{tidy}{rsplit}(x, unique_ind = TRUE, ...) \method{tidy}{rset}(x, ...) \method{tidy}{vfold_cv}(x, ...) \method{tidy}{nested_cv}(x, ...) } \arguments{ \item{x}{A \code{rset} or \code{rsplit} object} \item{unique_ind}{Should unique row identifiers be returned? For example, if \code{FALSE} then bootstrapping results will include multiple rows in the sample for the same row in the original data.} \item{...}{Not currently used.} } \value{ A tibble with columns \code{Row} and \code{Data}. The latter has possible values "Analysis" or "Assessment". For \code{rset} inputs, identification columns are also returned but their names and values depend on the type of resampling. \code{vfold_cv} contains a column "Fold" and, if repeats are used, another called "Repeats". \code{bootstraps} and \code{mc_cv} use the column "Resample". 
} \description{ The \code{tidy} function from the \pkg{broom} package can be used on \code{rset} and \code{rsplit} objects to generate tibbles with which rows are in the analysis and assessment sets. } \details{ Note that for nested resampling, the rows of the inner resample, named \code{inner_Row}, are \emph{relative} row indices and do not correspond to the rows in the original data set. } \examples{ library(ggplot2) theme_set(theme_bw()) set.seed(4121) cv <- tidy(vfold_cv(mtcars, v = 5)) ggplot(cv, aes(x = Fold, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() set.seed(4121) rcv <- tidy(vfold_cv(mtcars, v = 5, repeats = 2)) ggplot(rcv, aes(x = Fold, y = Row, fill = Data)) + geom_tile() + facet_wrap(~Repeat) + scale_fill_brewer() set.seed(4121) mccv <- tidy(mc_cv(mtcars, times = 5)) ggplot(mccv, aes(x = Resample, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() set.seed(4121) bt <- tidy(bootstraps(mtcars, time = 5)) ggplot(bt, aes(x = Resample, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() dat <- data.frame(day = 1:30) # Resample by week instead of day ts_cv <- rolling_origin(dat, initial = 7, assess = 7, skip = 6, cumulative = FALSE) ts_cv <- tidy(ts_cv) ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) + geom_tile() + scale_fill_brewer() } rsample/man/rsample2caret.Rd0000644000175000017500000000203113653053433015656 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/caret.R \name{rsample2caret} \alias{rsample2caret} \alias{caret2rsample} \title{Convert Resampling Objects to Other Formats} \usage{ rsample2caret(object, data = c("analysis", "assessment")) caret2rsample(ctrl, data = NULL) } \arguments{ \item{object}{An \code{rset} object. 
Currently, \code{nested_cv} is not supported.} \item{data}{The data that was originally used to produce the \code{ctrl} object.} \item{ctrl}{An object produced by \code{trainControl} that has had the \code{index} and \code{indexOut} elements populated by integers. One method of getting this is to extract the \code{control} objects from an object produced by \code{train}.} } \value{ \code{rsample2caret} returns a list that mimics the \code{index} and \code{indexOut} elements of a \code{trainControl} object. \code{caret2rsample} returns an \code{rset} object of the appropriate class. } \description{ These functions can convert resampling objects between \pkg{rsample} and \pkg{caret}. } rsample/man/get_fingerprint.Rd0000644000175000017500000000177514010267142016306 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/misc.R \name{.get_fingerprint} \alias{.get_fingerprint} \alias{.get_fingerprint.default} \alias{.get_fingerprint.rset} \title{Obtain a identifier for the resamples} \usage{ .get_fingerprint(x, ...) \method{.get_fingerprint}{default}(x, ...) \method{.get_fingerprint}{rset}(x, ...) } \arguments{ \item{x}{An \code{rset} or \code{tune_results} object.} \item{...}{Not currently used.} } \value{ A character value or \code{NA_character_} if the object was created prior to \code{rsample} version 0.1.0. } \description{ This function returns a hash (or NA) for an attribute that is created when the \code{rset} was initially constructed. This can be used to compare with other resampling objects to see if they are the same. 
} \examples{ set.seed(1) .get_fingerprint(vfold_cv(mtcars)) set.seed(1) .get_fingerprint(vfold_cv(mtcars)) set.seed(2) .get_fingerprint(vfold_cv(mtcars)) set.seed(1) .get_fingerprint(vfold_cv(mtcars, repeats = 2)) } rsample/man/labels.rset.Rd0000644000175000017500000000131413755042647015343 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/labels.R \name{labels.rset} \alias{labels.rset} \alias{labels.vfold_cv} \title{Find Labels from rset Object} \usage{ \method{labels}{rset}(object, make_factor = FALSE, ...) \method{labels}{vfold_cv}(object, make_factor = FALSE, ...) } \arguments{ \item{object}{An \code{rset} object} \item{make_factor}{A logical for whether the results should be a character or a factor.} \item{...}{Not currently used.} } \value{ A single character or factor vector. } \description{ Produce a vector of resampling labels (e.g. "Fold1") from an \code{rset} object. Currently, \code{nested_cv} is not supported. } \examples{ labels(vfold_cv(mtcars)) } rsample/man/rsample.Rd0000644000175000017500000000304313727757057014600 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pkg.R \docType{package} \name{rsample} \alias{rsample} \title{rsample: General Resampling Infrastructure for R} \description{ \pkg{rsample} has functions to create variations of a data set that can be used to evaluate models or to estimate the sampling distribution of some statistic. } \section{Terminology}{ \itemize{ \item A \strong{resample} is the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. The data structure \code{rsplit} is used to store a single resample. 
\item When the data are split in two, the portion that is used to estimate the model or calculate the statistic is called the \strong{analysis} set here. In machine learning this is sometimes called the "training set" but this would be poorly named since it might conflict with any initial split of the original data. \item Conversely, the other data in the split are called the \strong{assessment} data. In bootstrapping, these data are often called the "out-of-bag" samples. \item A collection of resamples is contained in an \code{rset} object. } } \section{Basic Functions}{ The main resampling functions are: \code{\link[=vfold_cv]{vfold_cv()}}, \code{\link[=bootstraps]{bootstraps()}}, \code{\link[=mc_cv]{mc_cv()}}, \code{\link[=rolling_origin]{rolling_origin()}}, and \code{\link[=nested_cv]{nested_cv()}}. } rsample/man/loo_cv.Rd0000644000175000017500000000141313653053433014376 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/loo.R \name{loo_cv} \alias{loo_cv} \title{Leave-One-Out Cross-Validation} \usage{ loo_cv(data, ...) } \arguments{ \item{data}{A data frame.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{loo_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and one column called \code{id} that has a character string with the resample identifier. } \description{ Leave-one-out (LOO) cross-validation uses one data point in the original set as the assessment data and all other data points as the analysis set. A LOO resampling set has as many resamples as rows in the original data set. 
} \examples{ loo_cv(mtcars) } rsample/man/complement.Rd0000644000175000017500000000246014012611402015245 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/complement.R \name{complement} \alias{complement} \alias{complement.rsplit} \alias{complement.rof_split} \alias{complement.sliding_window_split} \alias{complement.sliding_index_split} \alias{complement.sliding_period_split} \alias{complement.apparent_split} \title{Determine the Assessment Samples} \usage{ complement(x, ...) \method{complement}{rsplit}(x, ...) \method{complement}{rof_split}(x, ...) \method{complement}{sliding_window_split}(x, ...) \method{complement}{sliding_index_split}(x, ...) \method{complement}{sliding_period_split}(x, ...) \method{complement}{apparent_split}(x, ...) } \arguments{ \item{x}{An \code{rsplit} object} \item{...}{Not currently used} } \value{ A integer vector. } \description{ This method and function help find which data belong in the analysis and assessment sets. } \details{ Given an \code{rsplit} object, \code{complement()} will determine which of the data rows are contained in the assessment set. To save space, many of the \code{rsplit} objects will not contain indices for the assessment split. } \examples{ set.seed(28432) fold_rs <- vfold_cv(mtcars) head(fold_rs$splits[[1]]$in_id) fold_rs$splits[[1]]$out_id complement(fold_rs$splits[[1]]) } \seealso{ \code{\link[=populate]{populate()}} } rsample/man/apparent.Rd0000644000175000017500000000176413727757057014757 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/apparent.R \name{apparent} \alias{apparent} \title{Sampling for the Apparent Error Rate} \usage{ apparent(data, ...) } \arguments{ \item{data}{A data frame.} \item{...}{Not currently used.} } \value{ A tibble with a single row and classes \code{apparent}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. 
The results include a column for the data split objects and one column called \code{id} that has a character string with the resample identifier. } \description{ When building a model on a data set and re-predicting the same data, the performance estimate from those predictions is often called the "apparent" performance of the model. This estimate can be wildly optimistic. "Apparent sampling" here means that the analysis and assessment samples are the same. These resamples are sometimes used in the analysis of bootstrap samples and should otherwise be avoided like old sushi. } \examples{ apparent(mtcars) } rsample/man/figures/0000755000175000017500000000000014036340014014261 5ustar nileshnileshrsample/man/figures/lifecycle-defunct.svg0000644000175000017500000000170414036340014020371 0ustar nileshnileshlifecyclelifecycledefunctdefunct rsample/man/figures/logo.png0000644000175000017500000007111113653053433015742 0ustar nileshnileshPNG  IHDRagAMA a cHRMz&u0`:pQ< pHYs  iTXtXML:com.adobe.xmp 1 1 2 ؀@IDATx}u{. 
JĂY5[Q1vF͗/!&N""EDإzw{{텥 }3Ϝ9šiWo ~j3:'tNyr=}gO:~x;.i֞f!غx2;3-kopJy[b>;@?jKMsz>Sk mI^ }q}ʹ5:xٗX8lX-))XVawk#["7cNÐ.phf8 ښ+l}+{bJXVzr/?7Wxnu"iW m @nê/K*-.ꝶgOIxzmoqO0R".,oxYf1N)pQ_Ri3r'Ju51XGohG1mY'e#/duȜnho+#pUaNoC_Xs2KuݽgO:NuIYԃ/|V Y, -BAtxp]: wvaVf..g6 4N6D~O]Z8E9:@CH-6Ul6H *ϢD Vpj/m֦sVY)֡mڴ8*D[inVE5`q:{XKF^LvDGFID ar,V|Q*Vn'.x-ti\ iY9wOJR$?9jgT).S:&ޢ8+A--%22R\.degIVn{^2e @u Et<Ώ1`QşS4lq \ *bZPn X$6&V%@@2$ 1F+JdBEfH'2ko᪼A^18^ mc$\Т[Bi9$1.^0) W!//O%E Vȼ!.X$2xF3=y;.},hhHr=@NjTFQ.su5Sܥn7C0hH,tcn>\55<"B.C2-El9_La׀CdY>MEd^"sجکZOZJO~jү!aS|MJiJg#"`V`"7 +**,p\tDNPpk^xX {<buBNh񔀩y١y6KсsN$Cná+/E9Od/fJ~Q!g:xg] ORjH>:]Z98oM(ir]+3Y<Ӭ8/E@^Rך5^ ' /InndBl(B2̰:y#1oΈy1es y3ͰhOFJ0^["@rV?J%{R5"r+/tRMfW"녏{j}(3O{`JfA%ZJ~JK#l.B |r+]9H\wws$'?Owu,(,n[8+pe!^7ӽy꿠8q%vc85݅gr fS+L%1@ɭ9U:֗(z"x ]7D HK~6ζ]eg@`"Ccu*`(.t"BTf'5D9 11 線rV UھcLOɻϗ)$vK iڨ󖹿Ej=>w =Uح-5L.i@4,ai<4j!g `E Pp$Lu asJjFauWޱ\qr\))~Vʞr>JPw!NόUvOtW ?{7FCLRn( %Mb` BPUs/塚])R˼5+tCw`ijgo>+A~8XˁA'TdL0T ,aZf ٬OM<^3&]kr0P+,dNlV)HOCmm|,%*[픲ԎG#Ɨo+%E7Ew9XKGcŜ"`  Qyo lPe;.A39OL@wx ͩK[-u( 2x[)RJNJcZS 4..] V8=,G!L}O.9F0ˆRkJdj27:5X+*)V]ʼ!W4rR5 {?zL@Rl\k l29o-^;\R(+=c%Z0"c>jV,tG|<]5Q~kn|?: @Ǧ1Ы]&MnwsUd˖l=&aw`nRhfVdڃLU0SCBʭ$GA! v[#|rB !"H sE؝ἷ>Z"-f"ilq|ąayM:uI,Du}ZxɁ nA˙=w+q$*2@B(.;n9{_4֡:+)rm_-'G}d U J&˩WrVCJ1r {?iݶG{;we]'x!bg"\j8M{w"{!#l)l`Wu?aqmP @T^,FdS`*2+UXJnEM,]e~2 \{ׁreޓI`5(yݳk>tX'u6 F/ U nP "Rw=hϖijlSڴXSQ"ԥR73&У^GfU@@Qr%w5~$B k>~Iy7Uq2mV4뇮:663#I.Ud2L@ Pu+nwݩ/\uja곹MlF.M5Rk+"w,41Ѩ79Qrpy[A |(Z|WpV.kb&+8C-E:/ߔSݬ,_rW.QF Fep_4ecFO[L]{brG[vLƹV^ "A@Le2}+uYp")U93;HGF~ȱV2`UV#`>6laRopեSS?*F4È ?V~7&؃P`C͘À&7]hv>Lyj"OmD{laZoCA жlBz b̀QK/ \xʬS _NEcECJs p̪qڃ:=gӕicْ}L.'ڷSE?qCzHjc$WW_>)D{a:Uxc^l@uan>&(W}0 0 QV&6?w'g\{J?5(h 8VJyjnW v7e~A :H.1q1JINfP5I]5^+{9)[X|4g=+χHʅRĆ1@ p͞BCҍ##\`2l{USDPzsu$4:݇w/M~r2{雋!2RI ZqWtC׻Q#ve2Ii<%(!r+Eq!@5`_ T+G Q@-.Q57``mFeDnJYpck!vʱO "z~B )g@x߃ţFʾx}a1G3>x['0)?6Q5;M Պy)_퐣DȼE[cz=;;޹hZgp Z! 
x0v{'=t"r0zh'@ Օ"LdNzD80VLlLrg{Ɩ [T@ʅ8)%HգXnlC}_띘ׯ70oN EZB"\\NzЫ AAN 5?ϬXଉe*LF֠eKY,=kTI,v{Cf=4`8 UXз+2*a4Ѥ5tH4PV+e$;X/]vh( 4*385~bО#RXP$a'9,+nwnw^UВR𡿳4z:{m3l(? d U6 gKm0pJTEXOId2S]Dw\Ң@@-2 VrC֘(f Qo+!g`_ t $6:Fq$G̔wȊͫe^'"Sd܀Q0]UәJ[q  c.K yVYtQTlX^zɼ Sml3˩ oM6!e+/bb.?Ҫ%xBH|80KNw!tAOn>*@ ]=hx4]򳗵۝:r\V*,ĂrEQ>0r"|cxˉ0ﻯ䱸?>dTv̒۶ŋȘArѰQPؔnqXQKZ\s*uko!?\)9Y.3DorhRTLٱX/W阚* q!cws>41 0>H=db pki|Ob+;=bY\gq{_9WbF +uQ,EXVVdJe^=T SG0  d<8h" Et9ع$Œ.L'Gw i4cэ|Cz7po^ywʵN-Ctrǫ yp)Tة[.q 7}/{{EM 1#RmJ̚49y"@V~@HW+3iV'hoP+{*l/bìԼ_%Aco]!Γ+RzOLzu6İ~ځr˝䳾eʙKal( n%Hjr$%9E1 P;[մS{\mZ.=˨l JȤ\RTqp3_%2X" ,[VV?}ě-*ἁ K܇\AVM:V`E F9h' 837c$t.{5Yx9sTL0y;~%22oavi$[^l۶U&}\6uco*\v43[VSu^+7\{ 2Xq| 2e3nЮ`\B.v:B/ʻ\9AkuPgȠܞoi&V:߹\QXWA_DTU.9(i9粵+2s1YɛzUA/rFƢV`1rM1rÔkߒsѼ5eK:%ͻW?7n{]&OUWo#W/F~usr(yn>t ?+7tހFM|K ڈtuNO]2fw_qg8 ~vxȭA{3S/|{|㐺j-{໫daۑ #;Y˾  i7HCztjI|l.=Z+5AL>r?`/l1gi<0];v}<$HD]0'=?H/ZDTD7EJN Hj9oj z;Z_5=E(jS7PYGӨ?dVB; VT Ab#1sl2 sȱu"6ȦI~Ҷ][ АDGNwtYo䊫m:W+}mi02lL9y2`hJ4Ջa7Yn2zph-8408x(<B-@{-!x}2"X,{wb qV9:uFJLيoLF3$w%XQV:93:L"ZDC7#6Z>kFK9;_YM;Kŧk.$ʆA ޒޚ*n;DbOiwHeksN'sd6)&C=? |Rt+Y!^aE~`Crrh0R:gP0&U쟟Bp}+] ܤ?Fޘ>AjŶ"+نw\]2W(Li&2֖"g'˹+幧zXDW5ϫ_z^p!' 
peG{;Tgf^J]RJr iZ"5@HAW,ƶ\cݔjQlA|TH/ p1(=/5a F4.C3{/3#*{~/nANs,WƗ[ۅ K(2vL8Bu\&B<' rRg.J9_' [r Cbc <9AX\+wT!:3F &(*𩾔R!K!&/Aq^!3#Vx|5n9xL6yŗg}Xr4pl8<&##Cv%+K&#oNZ< o@WsIt'˕K \(nq6t qi"MK +Hh(++fsϏQ!*x/2Eȅy,~)}?&2h8aY $þ fl?DF)GEd L*/K*r"RsmJCqE00xB!^F +VixB ,PA~V~~^>>= :lnO豣e{U[ʂ.{a8\('|Dc;PlDWWHaXXfX⬤uƕ绒@L6 kY%ظuffAWq>}9,E nϒ1үe;iݹUYNAx$q0u+{__MեU] ?C.60#{mT\]{* QLE"efE=E|75Wt)m!i XIH]FM8+YWߵ\K8dLDI  k8=иNMbdƀ"T{DȮDڟ9Y_圑En!!$*4z-RsZI78˸ǔߚ(!k5#8 蟠T#U 292' v%+ҥsu8JAsk]Z紌UΤ @r&"&J \),fc1imjޜl9I!L<$Ή-% h&"rgIQ$@[ 4~ʰ `27coU3~X&SP&@6QYpj1 UUdf 6KIo΋Qv!b d\itY&ܽS^,jD>W`, UaET85f yOv.Wl E0aPthR㫦f:L"hjwVr\И)˗ދO߮Ӷo٦Nܥ ֲ%w)8p˾W_k_-I," $(|fI(F`?8lp0kP& .wlQz&$)Y)rtPtw:CpE.xNL ̉k'"0ɀ)Oe<:!Goc9vvX^Kf}&={8}{Ɏ%==]vۍC{KKKa1^v+m[x, eylW8W`E9zr{Ph$G =Wq&`V#7k'\FF6Q,_ĶURK Z#㉕:uPc BuYbGjF4-@1~Grg匉.+=xOG?{N3CY"9*:cl61|ߦԜYD!ץWykiqӻ?א1 ?f(X{U4\a{'䡂Qgc 2p|Աl[M +WU6ZWd2(12K4 iqǟ&4*ҡC+`kO䕪3mOohHѰ%6k2VQZ"X9X6k7j@6jRB"ņhl#FNv?L U=v ~"io]VfWu6w6"ʮzISj#hUX]>ųrU7ZϬ \(Ǟ~\>bdT`kU#=i}<{ o: ,0hh[5mO=nd)1HDCd09z܈d'lp6ܶ|35col/XE)K*@+Lv|oA|^V"CAlξTcru eRtK%M~q5}}r`3x`> 4QUS<Z*)% cFb+teZ=6 "q00ӑq!9{A[[SLľNhK}&6:TˮfH/ߑ)c&[nvع^DU2_s^}# m-Isf/_b8 s0V nkKYkx L_-%Pu_F1BduaV lHUKWu=2 VB!wjE9ЁjAN)o,|WrY'1tY#Eʈm?L'7HOY*$dk[C {wp~$pBcQ)2E/"H#AɜďObr8SҪlaL2z"a8(qS$iU iFF]8Z.i?xH8ҥ@kȿLT NЖSwO حS_3ڹKܝה0Q^v"T<0fnFugVp,{ 9x0C`oKZ "s@Vp\ܦ3c|yG@gY} LΪu^.MƤKM-CTkىj"l;/Y"].AFjzUVKYf,9jxȻ1RPR \e&حpض ;܎4v/꘠N> BsY#灀)jO)aFN 46iАA2`9z!(pO[x!ٴi,YLY9+ u:hśKS߯sM',Nav ?8M 2red[\tFH8y+2P<8yYa$ 60FSs`:Չ*6i8seȈ2!r믐(mNHUs[xD5L!(⶛G1[U?ea;oQݠ6! 
O6iyIUvVn'y3lOʍ@yS2 Yva5VŠ 4&[zψXR__ ^6(sՕW#y87?\EОl%<\ϥ8- w:5٣g 6 wDʓJwgV-D>2"oȐEP'5{䴁m\=k *L:>1l8%EPW38j¬Ҫe+bz}%¨znNy*yh7锕i|/(nGLl?R U|g~x<\PfT^e\M㞄 a+;"S/qܨ/HtmS4=tVS:'9Y;UiAҦXejG3X&e)8Ѓq3=58322d-J ~xkS9^5"ƩXU X$lN)4>߶,+ O=tW?#kSeLYh2r\tDlB*qȴ;+n L;sPFP*6P+ `/f e v1|b)M2:S E }5a>u`z6iή@%R@83]>'Ʋ"q,|o$hU QNl̜Q0`>Uźp2|HP_ڜKgpsDryщT?AO(68-v9.X[\rJꦢ\勵5 PvU>haZ~i8P:;6J7|sT?}wP@CYʸq/rWwΜ}y*ʿ; (͛rfY8p!47;~E7|_#hiO !?g̃a|fVCeWe a'8{o3L_eI4q㣫,._显+"ty5;^;<3e#U>iU k.gV,wGѲ¹s!$|<`oShaF`3 ޱT,hdc@HXTȓPAϰ|FhLyM&sw u>YRU\z5'cYJarCݥbשy3ӧ2,Ӊ8U:mP3qT1]άJAZ>if+GAVEH$YqɲRLSzp:78[ 7?Dg %,BlfYb6 ,6q L&T;#!PH})_ɰۡۄTo+LkKYAp$(q+B˱ӡx07=AuI48؈7r% 鈼? ae,w(NyJǂ{\_1D#Kf^E3_#2'qOddRe 5ȜW BoMS9 3WS @\- d`BXݗɂ#ke+[ma*%K6ا5Z;\)]g C--/eҺ8j+m[`B8[Ȇ\LovP@~ȩte3-qRxUb/.9"h@7"b:H3FrJ 笭^ntGڡJeHRczK߸. qan+f"2BL##?N9bYWzT KTUߑ_(102̯0UN^/IDATw|:L!;J&hD;ȘV}tUE -Z}0~)o2mo$-xR9wȐuL+)ʓe7&+z](=Zu8e> %#L=.o/H<?<a$'V1-$/^Hdus?ըrT/u:ʤTʤ55 >mzKB$ecdV?7/"({]&#; AȳaD}f9Y^LPTeo\c䦁SEd<|[hNV]C1sI~&~\6YZǶ´2sϒo6ϗ׳Qh@jxV@6"` ,XAvI8սege!]Q AFuGâ$ ݐ/uYkiep=zМ@JT@{\t@6[Gݪ*ވlI3 F9KҽEgyh3wT2;DŽIT-嶤[sbs8 "*xʀut+AG68IUXngHvb]| P/|Iժ+7t @;U')pxu'N/mzJud g+qck=9'`)_SMS 76kRZ#\$ÒTe1?`oj+9T"=Y'y%EErK~k`<\eȹ-nkNv!Y9\xL"mo޲/|.-walWr0S 63D4b?A(GsVQ2gܬ ˬH.**W%ak:\Ⱦ+!:x;w:Wg㬔j|hu]]-_/g` &EeŰA(؈rvp_A~t~ш[b'x|˥ !*wd"v}WrXrTLcޟe_!e#DX&EuU$Dظm?is\EMIB.HN 1pԾdّ$T2 ^~ .*Sa Iqn5?>+ sע|xxr}#om)eRXfkjadYLH(հg; t4uUl~#bK<I G/eC:& $#|IUӠȞ6*ſO ˫iI Qm3@.*O̫vCfA<5ynrF0/NكAɝI2ug,w}&gʀܝ$apsߨ[w&{oD)DQƑ9]Ҡ[}_]FGtTp|OorQ/eѫ/tQZ}^)4 s'EǕraPSjH왽HN2팫82}_?P^ 0?`=|cF/AEW~.HvHdZk e{@L.! 
l I8LQd4_R<b `.}${}L9A5Uw,q#`aHXWvj@ ۖS$ceB#c8c,Ugb[K,zN<ư(75,20hCD4t^[i*Ao^rnT 6`d?FhX֥VWyeRl_,F @ - r3#⁡G.RcX!#Uqt>.RC}hyP'/GM*'|VJ$0[*r*3 e d\wx&hTfҴ,$!j5rDԩ`+*- [ڮDpɁx\R"8 ,1Ip2C2&db12O;(WϤ*n рݫvƲ=PV1aɏ 냰 3Naq=dA,F::n}=.{fDb0G;`|◢$EJٟstLht"/Bp%m/*"!hpwM]͕?Be_=^>ܱ@w(ڨ=ROPU==J/i91Y\EqntE$&'F|fg3*@ p,) *G+rvl a\)O5W> =zK@TeWl{UuЋs"IJ`\vfRM&ɇᘏn0RB|0Ι8\nV`_ 8<ɌہHoSӵ5fCj9B|ZK5|= iIװe悉aʰ8R1[r A{+ n)PY޷ОmVs]?kkr< h;ʨ-/k櫑Y~i"* jw l[qX_Buۊ xbp%sV.v _Z;> E¢  0O^MgT"WG=Z5a @6dA"@r50x51|slVxT>)\W2}?)6-rs@YSJGFwkz^$RQa5^Ø 3HH'1kA1ˀp4]'7s yP>-mj?;~G|)gGtݘg6AC`?k'2m_.YM ƾ`rY]# Z組'jUN F#C_2Sܐ;J rgi\Kv!%AYܛoW̓L{wsp,sSԠ94,`.bE *7OF3<~teL[e_̾yoGV3'#j6ڮ&F[Pir@[ɆŅWIW;VF0.4^\> Wd晷KO `DZ!ǘg8E+Y榼1n_+Vx3gnTHȶ=(gv%,־=v` NwQElھ-RTH~HbOpUDo%^Fj'-ҏl˞, . TGuiWQxxd;_`ǚ_\UXƂ4ةݴ@\ T.b!k}RS\/qs'P<#ichL_dޡfl0E3s?IVMDYrA3E{pH2>AriCOPt]% 6ح;QVemVs{ ,:0"h޾:)̎RhoK[)>8+ȅ%qc76g@J" n7weHAQuMK%>$[~Ş/eG-8kTEJ ] }y'pZp8T $chV0.hё}bu⼒`\"7B kŇa5/ZŊcXyn_aNT߶d7(o1Ų0k4T.qoDNDd}1; 0s`x̷u  rZ@֋UIvH)e( vɽC0Tnseaarx{X% Wv@czVX{P.d:Ҟ؝#oFST\PnV` @~0𦶦 kYj쐞Vy6cV_fƞ@u%4 LQB< pF]>hw 9"QJ%TAXvQ 㱜3``@3K.9,kGSn8=a:xBT,\G Jöbe\DW48uLG`|>Ɗ1 rQwþ꙽E(wpt:].dBTDz +O"SL:\uldeB}-uϚyޘh?c> pȮj6,F@%7j.>r4Fe^ 4 ΞApoPt$f;t|Q$X6V!Я(?r.:V/@}0N7@.pwN+.Rw#6\%[k_yȉIK}4{k4TҁDAR(W:}Nݛ@׌* {Nqe. =lwYVbU Z2^!VF8p[<f|U>HUrl=X/fSMp(qx x7]&[ 3Z =2ӮV(`ĉLLp"ر N(Z%p<<npH`QSk@`@- d{\bXs{罴oƜ"9M %55~'t[Ý^8ݐg MȢ"<wB̡Oߛ)&yWs;a`TIƔyh{U`lW.rk4>}mr)j]@/r@aT<. =r"ыe\(ӌlO!ixCVfDerH.Qͧ Ēda"|-dː h܈bd1}BK(HR#:Na*%yrgVdnB=uƘ/ȧOKz rEޭWI* ړ,b-Yvb>R±x%uVC(|1-?..n&2R臙ƻ܀p^ѨąJe>ƙ <]-dY[PsI;k+Y],[BPˋyA{+jLsE]7Ur@8Q)IV=J){ v)IXkfE"fl3/'J PNheZL?e{a22:Ej'?gѲ᳝_A|\/QVa;3a?GnP` @oֶ1{j_{`lPXRpN) 4GSH E؏Đl=%rS M}eJH)+1eXAFe4\>)yT׏CvLv䤅y ʥ{XPO)f+ B;\)foH`^zؔ.FNHh=9/!엇M{GLUF@#K'dwA9~E-(.b. 
@ߧ`?^ c}m;Hyr4)ʵ)FAsr33@MdЯ% m"#{5ofQ )$0Cw?diC' #4}ZTS3ƽѕ&1!D~/Zr  >XN"GYa2c6::#tYFJ'Y›_m"jxBK A%hXyp>27(GXGbDp9ru|#]Z`&ݩ>66VP ׭+Wx_ҫgEWw4Vz32fsB!yԺnTHgWRbvA[C&͐#>;2$We a&Z'P0+ IFbRF_1Z6fMU)edxp.sClLE> 1L<%DKXb*#D@Q %̐f=w湞mw=+j0c 4ó)g=b!DAy!ʆ DqIܤح90aceڬp+I-l~9_;Fvq{wAYY-]V|˔҉w!?n+w"j2|(UǶ" J(@aj fݸBhOBQ4gpoy踋`3Kෲf%ߪЇoqӇgN֩ N ~2!i$MJxi蜈z|Ļ, ^mZ%Nu!B] q!fks-al8 IPPe0Z=q[)Ԯ4 nP~K ނdZVMmlZ{Zh]<)pլ#aJoAx[|~cd@99V~wYT2>rd`=kՑmo}O@V~V],+>0_UcAMFöiێA_nkwUvlE|q35m7ꎼ kwwn{u).be9Owrl*d=ocDDH,P  gD~{0ꒆկv52qie8H[97>u9n қat2YfʺEo[D}j}uB:,]R iIP5=  ҪTzXS&ˏ>}~+ٱPAےڸOX 31U,Ľ[$lZ(~[l[v9ݺuNjBAdp,~|phJXauͰu[sh[SYYL'O7=7=h a@!)'šYuT4hiȉЌHIENDB`rsample/man/figures/lifecycle-archived.svg0000644000175000017500000000170714036340014020531 0ustar nileshnilesh lifecyclelifecyclearchivedarchived rsample/man/figures/lifecycle-stable.svg0000644000175000017500000000167414036340014020221 0ustar nileshnileshlifecyclelifecyclestablestable rsample/man/figures/lifecycle-questioning.svg0000644000175000017500000000171414036340014021307 0ustar nileshnileshlifecyclelifecyclequestioningquestioning rsample/man/figures/lifecycle-experimental.svg0000644000175000017500000000171614036340014021441 0ustar nileshnileshlifecyclelifecycleexperimentalexperimental rsample/man/figures/lifecycle-maturing.svg0000644000175000017500000000170614036340014020571 0ustar nileshnileshlifecyclelifecyclematuringmaturing rsample/man/figures/lifecycle-superseded.svg0000644000175000017500000000171314036340014021104 0ustar nileshnilesh lifecyclelifecyclesupersededsuperseded rsample/man/figures/lifecycle-deprecated.svg0000644000175000017500000000171214036340014021040 0ustar nileshnileshlifecyclelifecycledeprecateddeprecated rsample/man/slide-resampling.Rd0000644000175000017500000002214614045275374016370 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/slide.R \name{slide-resampling} \alias{slide-resampling} \alias{sliding_window} \alias{sliding_index} \alias{sliding_period} 
\title{Time-based Resampling} \usage{ sliding_window( data, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L ) sliding_index( data, index, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L ) sliding_period( data, index, period, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L, every = 1L, origin = NULL ) } \arguments{ \item{data}{A data frame.} \item{...}{These dots are for future extensions and must be empty.} \item{lookback}{The number of elements to look back from the current element when computing the resampling indices of the analysis set. The current row is always included in the analysis set. \itemize{ \item For \code{sliding_window()}, a single integer defining the number of rows to look back from the current row. \item For \code{sliding_index()}, a single object that will be subtracted from the \code{index} as \code{index - lookback} to define the boundary of where to start searching for rows to include in the current resample. This is often an integer value corresponding to the number of days to look back, or a lubridate Period object. \item For \code{sliding_period()}, a single integer defining the number of groups to look back from the current group, where the groups were defined from breaking up the \code{index} according to the \code{period}. } In all cases, \code{Inf} is also allowed to force an expanding window.} \item{assess_start, assess_stop}{This combination of arguments determines how far into the future to look when constructing the assessment set. Together they construct a range of \verb{[index + assess_start, index + assess_stop]} to search for rows to include in the assessment set. 
Generally, \code{assess_start} will always be \code{1} to indicate that the first value to potentially include in the assessment set should start one element after the current row, but it can be increased to a larger value to create "gaps" between the analysis and assessment set if you are worried about high levels of correlation in short term forecasting. \itemize{ \item For \code{sliding_window()}, these are both single integers defining the number of rows to look forward from the current row. \item For \code{sliding_index()}, these are single objects that will be added to the \code{index} to compute the range to search for rows to include in the assessment set. This is often an integer value corresponding to the number of days to look forward, or a lubridate Period object. \item For \code{sliding_period()}, these are both single integers defining the number of groups to look forward from the current group, where the groups were defined from breaking up the \code{index} according to the \code{period}. }} \item{complete}{A single logical. When using \code{lookback} to compute the analysis sets, should only complete windows be considered? If set to \code{FALSE}, partial windows will be used until it is possible to create a complete window (based on \code{lookback}). This is a way to use an expanding window up to a certain point, and then switch to a sliding window.} \item{step}{A single positive integer. After computing the resampling indices, \code{step} is used to thin out the results by selecting every \code{step}-th result by subsetting the indices with \code{seq(1L, n_indices, by = step)}. \code{step} is applied after \code{skip}. Note that \code{step} is independent of any time \code{index} used.} \item{skip}{A single positive integer, or zero. After computing the resampling indices, the first \code{skip} results will be dropped by subsetting the indices with \code{seq(skip + 1L, n_indices)}. 
This can be especially useful when combined with \code{lookback = Inf}, which creates an expanding window starting from the first row. By skipping forward, you can drop the first few windows that have very few data points. \code{skip} is applied before \code{step}. Note that \code{skip} is independent of any time \code{index} used.} \item{index}{The index to compute resampling indices relative to, specified as a bare column name. This must be an existing column in \code{data}. \itemize{ \item For \code{sliding_index()}, this is commonly a date vector, but is not required. \item For \code{sliding_period()}, it is required that this is a Date or POSIXct vector. } The \code{index} must be an \emph{increasing} vector, but duplicate values are allowed. Additionally, the index cannot contain any missing values.} \item{period}{The period to group the \code{index} by. This is specified as a single string, such as \code{"year"} or \code{"month"}. See the \code{.period} argument of \code{\link[slider:slide_index]{slider::slide_index()}} for the full list of options and further explanation.} \item{every}{A single positive integer. The number of periods to group together. For example, if the \code{period} was set to \code{"year"} with an \code{every} value of 2, then the years 1970 and 1971 would be placed in the same group.} \item{origin}{The reference date time value. The default when left as \code{NULL} is the epoch time of \verb{1970-01-01 00:00:00}, \emph{in the time zone of the index}. This is generally used to define the anchor time to count from, which is relevant when the \code{every} value is \verb{> 1}.} } \description{ These resampling functions are focused on various forms of \emph{time series} resampling. \itemize{ \item \code{sliding_window()} uses the row number when computing the resampling indices. It is independent of any time index, but is useful with completely regular series. 
\item \code{sliding_index()} computes resampling indices relative to the \code{index} column. This is often a Date or POSIXct column, but doesn't have to be. This is useful when resampling irregular series, or for using irregular lookback periods such as \code{lookback = lubridate::years(1)} with daily data (where the number of days in a year may vary). \item \code{sliding_period()} first breaks up the \code{index} into less granular groups based on \code{period}, and then uses that to construct the resampling indices. This is extremely useful for constructing rolling monthly or yearly windows from daily data. } } \examples{ library(vctrs) library(tibble) library(modeldata) data("Chicago") index <- new_date(c(1, 3, 4, 7, 8, 9, 13, 15, 16, 17)) df <- tibble(x = 1:10, index = index) df # Look back two rows beyond the current row, for a total of three rows # in each analysis set. Each assessment set is composed of the two rows after # the current row. sliding_window(df, lookback = 2, assess_stop = 2) # Same as before, but step forward by 3 rows between each resampling slice, # rather than just by 1. rset <- sliding_window(df, lookback = 2, assess_stop = 2, step = 3) rset analysis(rset$splits[[1]]) analysis(rset$splits[[2]]) # Now slide relative to the `index` column in `df`. This time we look back # 2 days from the current row's `index` value, and 2 days forward from # it to construct the assessment set. Note that this series is irregular, # so it produces different results than `sliding_window()`. Additionally, # note that it is entirely possible for the assessment set to contain no # data if you have a highly irregular series and "look forward" into a # date range where no data points actually exist! sliding_index(df, index, lookback = 2, assess_stop = 2) # With `sliding_period()`, we can break up our date index into less granular # chunks, and slide over them instead of the index directly. 
Here we'll use # the Chicago data, which contains daily data spanning 16 years, and we'll # break it up into rolling yearly chunks. Three years worth of data will # be used for the analysis set, and one years worth of data will be held out # for performance assessment. sliding_period( Chicago, date, "year", lookback = 2, assess_stop = 1 ) # Because `lookback = 2`, three years are required to form a "complete" # window of data. To allow partial windows, set `complete = FALSE`. # Here that first constructs two expanding windows until a complete three # year window can be formed, at which point we switch to a sliding window. sliding_period( Chicago, date, "year", lookback = 2, assess_stop = 1, complete = FALSE ) # Alternatively, you could break the resamples up by month. Here we'll # use an expanding monthly window by setting `lookback = Inf`, and each # assessment set will contain two months of data. To ensure that we have # enough data to fit our models, we'll `skip` the first 4 expanding windows. # Finally, to thin out the results, we'll `step` forward by 2 between # each resample. sliding_period( Chicago, date, "month", lookback = Inf, assess_stop = 2, skip = 4, step = 2 ) } \seealso{ \code{\link[=rolling_origin]{rolling_origin()}} \code{\link[slider:slide]{slider::slide()}}, \code{\link[slider:slide_index]{slider::slide_index()}}, and \code{\link[slider:slide_period]{slider::slide_period()}}, which power these resamplers. } rsample/man/add_resample_id.Rd0000644000175000017500000000203113755042647016216 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/labels.R \name{add_resample_id} \alias{add_resample_id} \title{Augment a data set with resampling identifiers} \usage{ add_resample_id(.data, split, dots = FALSE) } \arguments{ \item{.data}{A data frame} \item{split}{A single \code{rset} object.} \item{dots}{A single logical: should the id columns be prefixed with a "." 
to avoid name conflicts with \code{.data}?} } \value{ An updated data frame. } \description{ For a data set, \code{add_resample_id()} will add at least one new column that identifies which resample that the data came from. In most cases, a single column is added but for some resampling methods, two or more are added. } \examples{ library(dplyr) set.seed(363) car_folds <- vfold_cv(mtcars, repeats = 3) analysis(car_folds$splits[[1]]) \%>\% add_resample_id(car_folds$splits[[1]]) \%>\% head() car_bt <- bootstraps(mtcars) analysis(car_bt$splits[[1]]) \%>\% add_resample_id(car_bt$splits[[1]]) \%>\% head() } \seealso{ labels.rsplit } rsample/man/form_pred.Rd0000644000175000017500000000167213726435071015104 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/form_pred.R \name{form_pred} \alias{form_pred} \title{Extract Predictor Names from Formula or Terms} \usage{ form_pred(object, ...) } \arguments{ \item{object}{A model formula or \code{\link[stats:terms]{stats::terms()}} object.} \item{...}{Arguments to pass to \code{\link[=all.vars]{all.vars()}}} } \value{ A character vector of names } \description{ \code{all.vars} returns all variables used in a formula. This function only returns the variables explicitly used on the right-hand side (i.e., it will not resolve dots unless the object is terms with a data set specified). } \examples{ form_pred(y ~ x + z) form_pred(terms(y ~ x + z)) form_pred(y ~ x + log(z)) form_pred(log(y) ~ x + z) form_pred(y1 + y2 ~ x + z) form_pred(log(y1) + y2 ~ x + z) # will fail: # form_pred(y ~ .) form_pred(terms(mpg ~ (.)^2, data = mtcars)) form_pred(terms( ~ (.)^2, data = mtcars)) } rsample/man/populate.Rd0000644000175000017500000000135613727757057014773 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/complement.R \name{populate} \alias{populate} \title{Add Assessment Indices} \usage{ populate(x, ...) 
} \arguments{ \item{x}{A \code{rsplit} and \code{rset} object.} \item{...}{Not currently used} } \value{ An object of the same kind with the integer indices. } \description{ Many \code{rsplit} and \code{rset} objects do not contain indicators for the assessment samples. \code{populate()} can be used to fill the slot for the appropriate indices. } \examples{ set.seed(28432) fold_rs <- vfold_cv(mtcars) fold_rs$splits[[1]]$out_id complement(fold_rs$splits[[1]]) populate(fold_rs$splits[[1]])$out_id fold_rs_all <- populate(fold_rs) fold_rs_all$splits[[1]]$out_id } rsample/vignettes/0000755000175000017500000000000014142304331014052 5ustar nileshnileshrsample/vignettes/rsample.Rmd0000644000175000017500000000625513743327400016201 0ustar nileshnilesh--- title: "Introduction to rsample" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Introduction to rsample} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ``` ## Terminology We define a _resample_ as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling. ## `rset` Objects Contain Many Resamples The main class in the package (`rset`) is for a _set_ or _collection_ of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data. Like [`modelr`](https://cran.r-project.org/package=modelr), the resamples are stored in data-frame-like `tibble` object. 
As a simple example, here is a small set of bootstraps of the `mtcars` data: ```{r mtcars_bt, message=FALSE} library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ``` ## Individual Resamples are `rsplit` Objects The resamples are stored in the `splits` column in an object that has class `rsplit`. In this package we use the following terminology for the two partitions that comprise a resample: * The _analysis_ data are those that we selected in the resample. For a bootstrap, this is the sample with replacement. For 10-fold cross-validation, this is the 90% of the data. These data are often used to fit a model or calculate a statistic in traditional bootstrapping. * The _assessment_ data are usually the section of the original data not covered by the analysis set. Again, in 10-fold CV, this is the 10% held out. These data are often used to evaluate the performance of a model that was fit to the analysis data. (Aside: While some might use the term "training" and "testing" for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done _before_ resampling. The training/test split can be conducted using the `initial_split` function in this package.) Let's look at one of the `rsplit` objects ```{r rsplit} first_resample <- bt_resamples$splits[[1]] first_resample ``` This indicates that there were `r dim(bt_resamples$splits[[1]])["analysis"]` data points in the analysis set, `r dim(bt_resamples$splits[[1]])["assessment"]` instances were in the assessment set, and that the original data contained `r dim(bt_resamples$splits[[1]])["n"]` data points. These results can also be determined using the `dim` function on an `rsplit` object. To obtain either of these data sets from an `rsplit`, the `as.data.frame` function can be used. 
By default, the analysis set is returned but the `data` option can be used to return the assessment data: ```{r rsplit_df} head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") ``` Alternatively, you can use the shortcuts `analysis(first_resample)` and `assessment(first_resample)`. rsample/vignettes/Applications/0000755000175000017500000000000014122424444016506 5ustar nileshnileshrsample/vignettes/Applications/Intervals.Rmd0000644000175000017500000002233414020231061021111 0ustar nileshnilesh--- title: "Bootstrap Confidence Intervals" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Bootstrap Confidence Intervals} output: knitr:::html_vignette: toc: yes --- ```{r setup, include=FALSE} library(tidymodels) library(nlstools) library(GGally) theme_set(theme_bw()) ``` The bootstrap was originally intended for estimating confidence intervals for complex statistics whose variance properties are difficult to analytically derive. Davison and Hinkley's [_Bootstrap Methods and Their Application_](https://www.cambridge.org/core/books/bootstrap-methods-and-their-application/ED2FD043579F27952363566DC09CBD6A) is a great resource for these methods. `rsample` contains a few function to compute the most common types of intervals. ## A nonlinear regression example To demonstrate the computations for the different types of intervals, we'll use a nonlinear regression example from [Baty _et al_ (2015)](https://www.jstatsoft.org/article/view/v066i05). They showed data that monitored oxygen uptake in a patient with rest and exercise phases (in the data frame `O2K`). ```{r O2K-dat} library(tidymodels) library(nlstools) library(GGally) data(O2K) ggplot(O2K, aes(x = t, y = VO2)) + geom_point() ``` The authors fit a segmented regression model where the transition point was known (this is the time when exercise commenced). 
Their model was: ```{r O2K-fit} nonlin_form <- as.formula( VO2 ~ (t <= 5.883) * VO2rest + (t > 5.883) * (VO2rest + (VO2peak - VO2rest) * (1 - exp(-(t - 5.883) / mu))) ) # Starting values from visual inspection start_vals <- list(VO2rest = 400, VO2peak = 1600, mu = 1) res <- nls(nonlin_form, start = start_vals, data = O2K) tidy(res) ``` `broom::tidy()` returns our analysis object in a standardized way. The column names shown here are used for most types of objects and this allows us to use the results more easily. For `rsample`, we'll rely on the `tidy()` method to work with bootstrap estimates when we need confidence intervals. There's an example at the end of a univariate statistic that isn't automatically formatted with `tidy()`. To run our model over different bootstraps, we'll write a function that uses the `split` object as input and produces a tidy data frame: ```{r model-info} # Will be used to fit the models to different bootstrap data sets: fit_fun <- function(split, ...) { # We could check for convergence, make new parameters, etc. nls(nonlin_form, data = analysis(split), ...) %>% tidy() } ``` First, let's create a set of resamples and fit separate models to each. The options `apparent = TRUE` will be set. This creates a final resample that is a copy of the original (unsampled) data set. This is required for some of the interval methods. 
```{r resample} set.seed(462) nlin_bt <- bootstraps(O2K, times = 2000, apparent = TRUE) %>% mutate(models = map(splits, ~ fit_fun(.x, start = start_vals))) nlin_bt nlin_bt$models[[1]] ``` Let's look at the data and see if there any outliers or aberrant results: ```{r extract} library(tidyr) nls_coef <- nlin_bt %>% dplyr::select(-splits) %>% # Turn it into a tibble by stacking the `models` col unnest() %>% # Get rid of unneeded columns dplyr::select(id, term, estimate) head(nls_coef) ``` Now let's create a scatterplot matrix: ```{r splom} nls_coef %>% # Put different parameters in columns tidyr::spread(term, estimate) %>% # Keep only numeric columns dplyr::select(-id) %>% ggscatmat(alpha = .25) ``` One potential outlier on the right for `VO2peak` but we'll leave it in. The univariate distributions are: ```{r hists} nls_coef %>% ggplot(aes(x = estimate)) + geom_histogram(bins = 20, col = "white") + facet_wrap(~ term, scales = "free_x") ``` ### Percentile intervals The most basic type of interval uses _percentiles_ of the resampling distribution. To get the percentile intervals, the `rset` object is passed as the first argument and the second argument is the list column of tidy results: ```{r pctl} p_ints <- int_pctl(nlin_bt, models) p_ints ``` When overlaid with the univariate distributions: ```{r pctl-plot} nls_coef %>% ggplot(aes(x = estimate)) + geom_histogram(bins = 20, col = "white") + facet_wrap(~ term, scales = "free_x") + geom_vline(data = p_ints, aes(xintercept = .lower), col = "red") + geom_vline(data = p_ints, aes(xintercept = .upper), col = "red") ``` How do these intervals compare to the parametric asymptotic values? 
```{r int-compare} parametric <- tidy(res, conf.int = TRUE) %>% dplyr::select( term, .lower = conf.low, .estimate = estimate, .upper = conf.high ) %>% mutate( .alpha = 0.05, .method = "parametric" ) intervals <- bind_rows(parametric, p_ints) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` The percentile intervals are wider than the parametric intervals (which assume asymptotic normality). Do the estimates appear to be normally distributed? We can look at quantile-quantile plots: ```{r qqplot} nls_coef %>% ggplot(aes(sample = estimate)) + stat_qq() + stat_qq_line(alpha = .25) + facet_wrap(~ term, scales = "free") ``` ### t-intervals Bootstrap _t_-intervals are estimated by computing intermediate statistics that are _t_-like in structure. To use these, we require the estimated variance _for each individual resampled estimate_. In our example, this comes along with the fitted model object. We can extract the standard errors of the parameters. Luckily, most `tidy()` provide this in a column named `std.error`. The arguments for these intervals are the same: ```{r t-ints} t_stats <- int_t(nlin_bt, models) intervals <- bind_rows(intervals, t_stats) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` ### Bias-corrected and accelerated intervals For bias-corrected and accelerated (BCa) intervals, an additional argument is required. The `.fn` argument is a function that computes the statistic of interest. The first argument should be for the `rsplit` object and other arguments can be passed in using the ellipses. These intervals use an internal leave-one-out resample to compute the Jackknife statistic and will recompute the statistic for _every bootstrap resample_. If the statistic is expensive to compute, this may take some time. For those calculations, we use the `furrr` package so these can be computed in parallel if you have set up a parallel processing plan (see `?future::plan`). 
The user-facing function takes an argument for the function and the ellipses. ```{r bca-comp} bias_corr <- int_bca(nlin_bt, models, .fn = fit_fun, start = start_vals) intervals <- bind_rows(intervals, bias_corr) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` ## No existing tidy method In this case, your function can emulate the minimum results: * a character column called `term`, * a numeric column called `estimate`, and, optionally, * a numeric column called `std.error`. The last column is only needed for `int_t`. Suppose we just want to estimate the fold-increase in the outcome between the 90th and 10th percentiles over the course of the experiment. Our function might look like: ```{r fold-foo} fold_incr <- function(split, ...) { dat <- analysis(split) quants <- quantile(dat$VO2, probs = c(.1, .9)) tibble( term = "fold increase", estimate = unname(quants[2]/quants[1]), # We don't know the analytical formula for this std.error = NA_real_ ) } ``` Everything else works the same as before: ```{r fold-ci} nlin_bt <- nlin_bt %>% mutate(folds = map(splits, fold_incr)) int_pctl(nlin_bt, folds) int_bca(nlin_bt, folds, .fn = fold_incr) ``` ## Intervals for linear(ish) parametric intervals `rsample` also contains the `reg_intervals()` function that can be used for linear regression (via `lm()`), generalized linear models (`glm()`), or log-linear survival models (`survival::survreg()` or `survival::coxph()`). This function makes it easier to get intervals for these models. 
A simple example is a logistic regression using the dementia data from the `modeldata` package: ```{r ad-data} data(ad_data, package = "modeldata") ``` Let's fit a model with a few predictors: ```{r ad-model} lr_mod <- glm(Class ~ male + age + Ab_42 + tau, data = ad_data, family = binomial) glance(lr_mod) tidy(lr_mod) ``` Let's use this model with student-t intervals: ```{r ad-t-int} set.seed(29832) lr_int <- reg_intervals(Class ~ male + age + Ab_42 + tau, data = ad_data, model_fn = "glm", family = binomial) lr_int ``` We can also save the resamples for plotting: ```{r ad-t-int-plot} set.seed(29832) lr_int <- reg_intervals(Class ~ male + age + Ab_42 + tau, data = ad_data, keep_reps = TRUE, model_fn = "glm", family = binomial) lr_int ``` Now we can unnest the data to use in a ggplot: ```{r ad-plot} lr_int %>% select(term, .replicates) %>% unnest(cols = .replicates) %>% ggplot(aes(x = estimate)) + geom_histogram(bins = 30) + facet_wrap(~ term, scales = "free_x") + geom_vline(data = lr_int, aes(xintercept = .lower), col = "red") + geom_vline(data = lr_int, aes(xintercept = .upper), col = "red") + geom_vline(xintercept = 0, col = "green") ``` rsample/vignettes/Applications/diagram.png0000755000175000017500000266010313653053433020637 0ustar nileshnileshPNG  IHDRoNse pHYs+ IDATx||D IDAT p IDATw IDATZP| IDAT||!X9 IDAT|I IDAT/ IDATS IDAT|| J IDATk IDAT39t IDATL IDAT||auq IDAT' IDAT IDAT%%%$$$???>>>6665 IDAT%%%>>>>>>RRR^^^ttt??? ÍMMMjjj||CCCKKK DDD;;;]]] چddd,,,fff ```FFFVVV IDATJJJ***]]]```^^^QQQDDDsssvvvbbb| IDATZZZxxxZZZJJJmmmlll 000!!!***\j IDATlllݛDcڙqڙqkkk111ݱrLLL...׬ ggj[111>>>Ѩ-Z6DV999((( ˦9X1@Y=== JA IDAT>>>Š:k-6 ===BBB(((羢I)))||㸣MN$ 1V:ȡ???CCC ? 
IDAT  " IDATv/7641/3D^|Ǘ\ɬ}:~Hqſ %Is=tGk!l oQ𾗙jO|x9ۇl oQ )))*** 8,$<|A˂VD:Cki\d!|A˂VD:+++555FFFү&: ү&:333q IDAT>>>xxx1 yuVq XtyuVq Xthhh===صq;rQ+' ï +' ï>>>===V<$ 4IXz;JcRQZ>%74\@fIG-bJcRQZ>%7???EEE   >>>2FH IDATwww___OOO___llllll||>>> NNNTTTNNNAb* IDATJJJSSSWWWcccsssmmmfffyyy!!!DDDCCCnnn!!!?h> IDATbbbWWWGGG UUU^^^ͬ 唔ooo+++הlll$$$xxx>>>>>>222_j IDATlllMMM>>>>>>"""RRR(((OOO 555```VVV000KKK^^^```ZZZ))) IDAT^^^```^^^"""OOO```||^^^)))  IDAT999```^^^000###```^^^777(((vgx IDAT___^^^???222___^^^FFF===___* IDAT^^^MMMHHH```^^^TTTSSS```riH IDAT ^^^>>>:::ccciii>>>Ӷ444||HHHʮGGG >>>A( IDATOOO***JJJ```:::444666UUU   000666UUU$  " "!Vϒ IDAT """...OOO333  $"$###   ...,,,v{v868 OOO &)-)868 ɕ-5-"&" 868 j IDAT 770 **+335!(.(*(*! !77/889))*HTHƽ202))#HHJ=F=EBEDD;RRT -4-USU33-DDF¿ $ ZXZN@ IDAT33*!!''+$$%"!%!a_a??5))- (-(QNQ<<2EE<(-( ' ,,#GG>6>6"'"||%~ * IDAT $$ RRHEPE18155.FF<=G=xzGqewif_-1@n8njbw<  3[  Ԁ8S#Pss:܌l hLnTfz-fh-/@n5knwifh-/@n5ǒbw< 3[  ϘxFI̍W7&Y^`o[j}+p шW[5L ՘^TZ?3 B<u>΂a@9pԈW[2MpԈW[2L ֘^TZ?3 ¿7.>TjX>8863#6F3,F^8K8  ?E.կ&9 2F3,2F3,B^4K8 08>?;AN  gI˭ijP 컁~Qj a|ˡpC IDAT >: $''%''#-ARa}7 4=س +% Ư0=ط  3lDcG)1=kmg()n& '[UpUFRR%a^zAA"+9FFJ J]RZc<%2#aXzKA!+9AFJ  CC;   s\Z5 QQTDMD s\Z~5   0/0 縂%׶d-   縊%׶d-  IDATEwx*IRp Epx*IRj   ' %     `b IDAT   !!! !  ++-   IDAT334)() ||668/./979668767668&%&CACbbb335%%' 868 IDAT+)+:::(((``` --0334545OOO$$$ZZZ OOQECEƼŻDCDQQQ XXX001QQRGFGSSSTTTBBD447'#'/./UUUPPP1 IDATSSU##( !*)*B?BWWWLLLNNQ !TQTYYYJJJXXZ++0))*pppvuv_]_ZZZDDDFFIMMQ;;;ZZZjjjFDF濳VTV\\\ BBB޹ք IDAT++#MMOӓ9A9IFI```ddd##$448ٺ؞ MMOeee535%%IFI```:::66/##$557|||+++𼿼!(!$)$%!%%$%dddaaa%% 557 ~~~/6/ 757ddd___  $$%~~~GGG/6/868ddd]]]Sg IDAT!!     
!~~~  656#"#eee[[[||  666 %#%&&&dddYYYށ444mmm "      dddXXX%%%   eeeUUUq] IDAToooRRRXXX"""dddTTT@@@RRR   qqqJJJMMM溺cccrrrLLLZZZΠppptttMMMAi IDATЖiii SSS훛eeeLLLܗnnn,,,xxx뛛dddKKK閖iii999zzz霜dddIIIhhh<<<999{{{TTT݂^^^ EEE}}}WWW" IDATͥ[[[HHHSSS㜜dddDDD [[[888SSS✜dddAAA...[[['''SSSߜddd@@@PPP[[[SSSޜddd>>>1 IDAT夤[[["""ܜddd<<<ԥ\\\OOOSSSڜddd:::||[[[???SSS؜eee888ᰰAAASSS+++SSSכeee666PPPPPPSSSY IDAT՛ddd555䰰;;;PPP111SSSӜeee222PPPPPP ֊(((GGGVVVћddd111簰555QQQ333]]]lllϜeee///®PPPPPP !!!222Λddd---### J IDATָ444PPP666hhh,,,˜ eee+++666 縸 HHH ZZZXXX555555ʐ CCCHHHHHH###)))eeeFFFyyyJJJjjjeeeSSS)))SSSǗH IDATooo Ҹ666HHH???GGGnnn@@@RRR򉉉333PPPGGGqqqHHHGGG JJJ mmm (((HHHxxxMMMZZZKKKMMM~~~___AAABBBnnn ...```'''DDD DDD@@@䌌/// -p IDAT+++!!!|| [ IDATR'574.%+Ir꼂ΘmO3:ڑ:+574,%1S~ڙqj9%;XlkT-)Uv/772)%8]ڙqy>7QerteD$9~ȹe4O[J^KxTjxXC8893#-s??AN gIǭiyjIg,p>0<>??AN1Pisa`hTC'5=?)?DE.I+ryuVq Xt'5=?)?D߮vb9V$ ;XmuKoYYc:}0+8=???G  W<ʼ|BJ^muw׈I'5=?)?DmfTg.+= &'''''%0ARh}<"+ ϱܽ7(>= &'''''%Iv\ى*D($'''''&# (8I\r~j +' ïع̳/9*D($'''''&# yb0])  %V{B4?&'''''&!  ,%7%(ka+9RyoIL6&hOԑZQɈR< 0dlTLcboFQ5,zz)~W&-Vo XfUM^5!),%(ka+9RyoIL6&hO]S*'JOڐHbEEEHHHfCzzzIIIfC                   Tz 1v% 1v%He! IDAT '##(#(||P!o IDAT LLLQQQϵKKK111XXX---!!! 
666www@@@444"""ѳKKK111LLL& IDAT˔!!!lll̢kkkZZZrrrVVV888픔lll666ܔ|||%%%|||XXXܔlll%%%픔lll666ǽ|||fff$$$˄ lll```888666؛,,,___uuu555<<>>GGG/g IDATyyy;;;BBBEEEBBBIIIYYYVVVIII&&&VVVFFFZZZШjjjJJJWWW gggXXXkkk{{{VVV fff쮮 xxxiii蔔lll111ggg}}}ttt~~~DDD IDATѐllllllBBBxxx[[[---ppp666///nnn777lll HHH BBB999>>>@@@$$$LLLCCCSSSOOO{{{QQQAAA||TTTܴddd```^^^bbbRRR IDATeeeuuu qqqGGGsssbbb╕ppp+++ΰKKKؔlll!!!kkk===ގYYY(((|||***kkkRRR***===mmmzzzggg999000ZZZ>>>NNNKKKt IDAT---]]]vvvKKK;;;rrr222(((JJJ>>>Ծ...bbb$$$̔lll999 :::///[[[!!!nnn\\\JJJ)))lll777ݔlll&&&nnn!!!KKKXXXlllbbb222000'''nnnݔlll&&&큁\\\sssoooBBBggg|||___~~~RI IDATKKKGGG QQQjjjeee???NNN:::vvv<<<HHH^^^ 555___򯯯///___EEEDDDכ???III+++OOOYYY]]]SSS<<@???++(../../...(+(')''*'!#!@>@???**'../../---||Ƕ141:?::>:HFHRRR..,..*..*.., ../778>>> ʴ.1.:?::>:IGIPPP ..,..*--*..+U5 IDAT//0556???ͳ*-*9>9;?; GFGQPQ **(..*..+..+../667>>>"%"6<6;?;УPOP^]^ռ@@;**(,,)ம++,DDEEEE;@;;?;ФKJK]\]ػBB<((&..+ᮮ(()CCDDDD =C=;>; ХEDE]\]ڼCC=((&--*ᮮ##%EEFCCC .3.X^X!646qpq﫫DD>DD@̼778JJK???*.*V]V"$"0.0pop򩩩 EE?DD@ͽ¼223KKL@@@$($U\U&(&*(*nmn DD>EEAл../KKLBBBqsq-3-IOIljlHGHZZSBB>hhj999rtr&,&HNH`^`LKL]]V@@<hhj===rsr & GMG¾§VSVPOP__X??;hhj@@@-0-hhaUUOݘhhj000y{y252hhaYYSᘘggi444npn696􊉊hha\\V䘘ggi777?y IDATϑ/&"-! Žőuus@@9__Y՘--1iijԋ/&"-! rro;;5bb\ؙ%%(mmo׈/&"-! 
ƪool550ee^ڙ rrtáxDܙyլGKGNWNjmjbb___YYM缐..+ ɗxDܙyլ@D@[f[egeXXUbb\YM缐++(}ЍxDܙyլ:>:jtj^a^LLIee^YM缐((&%%zzw鋍ȪW{$DZ4744;4興n}߼]]W00(ȪW{$DZ-/-?G?||yn}߼UUP<<3xxvstsȪW{$DZ&(<Lppmn}߼MMIFF>oolwwŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 SXSvkv%%%hhfᵮ_gb`u) ߱uc/,0~p#a%2t8 Ěo{ /9q<֝lu0%BSa%2t6iR AOzwW Ěo{ /9q<ۘNp@#]e& r(Xr;KKG❝ccdxxŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 MRM{q{+++aa`ᵮ_gb`u) ߱uc/,0~p#a%2t8 㽈m*Pc- ֝lu0%BVuc/,0~niR AOzwW Ěo{ /9q<ۘNp@#]e& r(Xr;EEA嗗iijwwŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 GKG111[[Yᵮ_gb`u) ߱uc/,0~p#a%2t8 㽈m*Pc- ֝lu0%BVuc/,0~p#f]*1bf(DĚo{ /9q<ۘNp@#]e& r(Xr;@@<撒nnp8:8KMuWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YOȋDWi@vڑFc֨YOnFspGwq@>k% IDATKMuWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YOȋDWi@vڑFc֨YO֩[F֬^GqWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YO𒛒ȋDWi@vڑFc֨YO֩[F֬^Gqj262"" ֯!V<2X0æ98wt}.>jὶͪ>7ul.>j%@=  *19A2 #ὶͪ =C1 ( &)&ᴭWE *OA֧2F3,޻sbghg:a&ܬ_C98wt+>j252"" ֯!V<2X0æ98wt}.>jٹì>7ul98wt}%@=  *19A2 #ὶͪ =C1 (  ᴭWE *OA֧2F3,޻sbghg:a&ܬ_C98wt+>j252!!֯!V<2X0æ98wt}.>jٹì>7ul98wt}6E1 )4;B)(ὶͪ =C1 ( ݲU>.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij"0.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij>&5=;)?@A+I)n15qŞaz +"0.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij>&5=;)?@A+I)n15qŞaz 7"0K !4E!0-\6-A,3C=A>S4(8D(2#&-A,3CO A>S4 ***((( ծ,ўHޠ\mU\$_ʹ%w 'QMpz?t!1E!.-\8 -=,/NQ434778&&$ͤQނ:ӅI>K !4E!0-\6-A,3CM(D(M/(8D(2#*!4E!0-\+A>S4 ((("""ծ,ўHޠ\mU\$_ʹ%w 'QMpz?t!1E!.-\8 -=,/NQ434667,,)ͤQނ:ӅI>K !4E!0-\6-A,3CM(D(M/(8D(2#*!4E!0-\7 A>S4 )))>>>.0.Ҧ%XhCPiSS8a5V_D2 NVi;) dk/$ lmg('h&YQYbbc__\BB>è?e  $QKz'~g2$ l kg()n& sUR0) MZӔUt08vlg()n& >sUR0) M[>]"o[G9lA==@=<=252Ҧ%XhCPiSS8a5V_D2 NVi;) dk/$ lmg('h&MDMbbcWWUGGCè?e  $QKz'~g2$ l kg()n&  RqoTL2&hBUt08vdc2$ l >sUR0) M[>]"o[G9lA225=;=8<8Ҧ%XhCPiSS8a5V_D2 NVi;) M IDATdk/$ lmg('h&BZBbbcPPNMMHè?e  $QKz'~g2$ l kg()n&  RqoTL2&hBUt08vdc2$ l >sUR0) M[>]"o[G9lA((,"""SVS ۟_z V  fefhhe         ssuGJG ۟_z V  mkm__\       ||<><ķ ۟_z V  sqsVVS       Ά̜Ӱ |v9½__]qqsҐӰ |v9ȴUUT vvy؄Ӱ |v9ЦKKJ{{~؂ " ۅppm11,gg`˙??B``a~~ " Όmmj,,'gg`Ι99>:))'ggi555V]V+.+NMNqpqEE?CC? 
EEFJJK222U\U.1.IHIonoDD?EE@!!¼DDEJJK555  U\U141BABnmn DD?DD?$$"ļDDEIIJ888š 7<7;?;ѫ^]^]\]⻻AA<))&.., ⫫DDEDDE>>>Ǣ9>9<@<ҧ^]^]\] 似DD?((%.., ⬬@@ADDE@@@ʢ8;! !HFHQQQ..+//+..,../556???&)&(*(&)&%#%@?@??? ...//0//0... Կ'*'&(&&)&@?@@@@ IDAT)))..///0/// %)%')'&)&@?@???"""//0..////!#!373  *)*)())))///%%"$$$ 474   *)**)****...""%%%585 *)**)*)))...!!&&&)+)((( )+) )))!!  *,* )))""       \ IDAT\ IDAT@ IDAT||- IDAT1 IDAT{-U IDAT, G IDAT|| | IDAT IDAT IDAT9k IDAT||@ IDATLio IDATr- IDATf{IDATc%G'oIENDB`rsample/vignettes/Applications/Recipes_and_rsample.Rmd0000644000175000017500000001366114045536131023121 0ustar nileshnilesh--- title: "Recipes with rsample" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Recipes with rsample} output: knitr:::html_vignette: toc: yes --- ```{r setup, include = FALSE} options(digits = 3) library(rsample) library(recipes) library(purrr) ``` The [`recipes`](https://topepo.github.io/recipes/) package contains a data preprocessor that can be used to avoid the potentially expensive formula methods as well as providing a richer set of data manipulation tools than base R can provide. This document uses version `r packageDescription("recipes")$Version` of `recipes`. In many cases, the preprocessing steps might contain quantities that require statistical estimation of parameters, such as * signal extraction using principal component analysis * imputation of missing values * transformations of individual variables (e.g. Box-Cox transformations) It is critical that any complex preprocessing steps be contained _inside_ of resampling so that the model performance estimates take into account the variability of these steps. Before discussing how `rsample` can use recipes, let's look at an example recipe for the Ames housing data. ## An Example Recipe For illustration, the Ames housing data will be used. 
There are sale prices of homes along with various other descriptors for the property: ```{r ames-data, message=FALSE} data(ames, package = "modeldata") ``` Suppose that we will again fit a simple regression model with the formula: ```{r form, eval = FALSE} log10(Sale_Price) ~ Neighborhood + House_Style + Year_Sold + Lot_Area ``` The distribution of the lot size is right-skewed: ```{r build} library(ggplot2) theme_set(theme_bw()) ggplot(ames, aes(x = Lot_Area)) + geom_histogram(binwidth = 5000, col = "red", fill ="red", alpha = .5) ``` It might benefit the model if we estimate a transformation of the data using the Box-Cox procedure. Also, note that the frequencies of the neighborhoods can vary: ```{r hood} ggplot(ames, aes(x = Neighborhood)) + geom_bar() + coord_flip() + xlab("") ``` When these are resampled, some neighborhoods will not be included in the test set and this will result in a column of dummy variables with zero entries. The same is true for the `House_Style` variable. We might want to collapse rarely occurring values into "other" categories. To define the design matrix, an initial recipe is created: ```{r rec_setup, message=FALSE, warning=FALSE} library(recipes) rec <- recipe(Sale_Price ~ Neighborhood + House_Style + Year_Sold + Lot_Area, data = ames) %>% # Log the outcome step_log(Sale_Price, base = 10) %>% # Collapse rarely occurring jobs into "other" step_other(Neighborhood, House_Style, threshold = 0.05) %>% # Dummy variables on the qualitative predictors step_dummy(all_nominal()) %>% # Unskew a predictor step_BoxCox(Lot_Area) %>% # Normalize step_center(all_predictors()) %>% step_scale(all_predictors()) rec ``` This recreates the work that the formula method traditionally uses with the additional steps. While the original data object `ames` is used in the call, it is only used to define the variables and their characteristics so a single recipe is valid across all resampled versions of the data. 
The recipe can be estimated on the analysis component of the resample. If we execute the recipe on the entire data set: ```{r recipe-all} rec_training_set <- prep(rec, training = ames) rec_training_set ``` To get the values of the data, the `bake` function can be used: ```{r baked} # By default, the selector `everything()` is used to # return all the variables. Other selectors can be used too. bake(rec_training_set, new_data = head(ames)) ``` Note that there are fewer dummy variables for `Neighborhood` and `House_Style` than in the data. Also, the above code using `prep` benefits from the default argument of `retain = TRUE`, which keeps the processed version of the data set so that we don't have to reapply the steps to extract the processed values. For the data used to train the recipe, we would have used: ```{r juiced} juice(rec_training_set) %>% head ``` The next section will explore recipes and bootstrap resampling for modeling: ```{r boot} library(rsample) set.seed(7712) bt_samples <- bootstraps(ames) bt_samples bt_samples$splits[[1]] ``` ## Working with Resamples We can add a recipe column to the tibble. `recipes` has a convenience function called `prepper` that can be used to call `prep` but has the split object as the first argument (for easier purrring): ```{r col-pred} library(purrr) bt_samples$recipes <- map(bt_samples$splits, prepper, recipe = rec) bt_samples bt_samples$recipes[[1]] ``` Now, to fit the model, the fit function only needs the recipe as input. This is because the above code implicitly used the `retain = TRUE` option in `prep`. Otherwise, the split objects would also be needed to `bake` the recipe (as it will in the prediction function below). ```{r cols-fit} fit_lm <- function(rec_obj, ...) lm(..., data = juice(rec_obj, everything())) bt_samples$lm_mod <- map( bt_samples$recipes, fit_lm, Sale_Price ~ . 
) bt_samples ``` To get predictions, the function needs three arguments: the splits (to get the assessment data), the recipe (to process them), and the model. To iterate over these, the function `purrr::pmap` is used: ```{r cols-pred} pred_lm <- function(split_obj, rec_obj, model_obj, ...) { mod_data <- bake( rec_obj, new_data = assessment(split_obj), all_predictors(), all_outcomes() ) out <- mod_data %>% select(Sale_Price) out$predicted <- predict(model_obj, newdata = mod_data %>% select(-Sale_Price)) out } bt_samples$pred <- pmap( lst( split_obj = bt_samples$splits, rec_obj = bt_samples$recipes, model_obj = bt_samples$lm_mod ), pred_lm ) bt_samples ``` Calculating the RMSE: ```{r cols-rmse} library(yardstick) results <- map_dfr(bt_samples$pred, rmse, Sale_Price, predicted) results mean(results$.estimate) ``` rsample/vignettes/Applications/Survival_Analysis.Rmd0000644000175000017500000001257114122424444022636 0ustar nileshnilesh--- title: "Survival Analysis Example" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Survival Analysis Example} output: knitr:::html_vignette: toc: yes --- ```{r setup, include = FALSE} options(digits = 3) library(survival) library(purrr) library(rsample) library(dplyr) library(tidyposterior) library(ggplot2) library(tidyr) ``` In this article, a parametric analysis of censored data is conducted and `rsample` is used to measure the importance of predictors in the model. The data that will be used is the NCCTG lung cancer data contained in the `survival` package: ```{r lung} library(survival) str(lung) ``` `status` is an indicator for which patients are censored (`status = 1`) or an actual event (`status = 2`). 
The help file `?survreg` has the following model fit: ```{r example-model} lung_mod <- survreg(Surv(time, status) ~ ph.ecog + age + strata(sex), data = lung) summary(lung_mod) ``` Note that the stratification on gender only affects the scale parameter; the estimates above are from a log-linear model for the scale parameter even though they are listed with the regression variables for the other parameter. `coef` gives results that are more clear: ```{r coef} coef(lung_mod) ``` To resample these data, it would be a good idea to try to maintain the same censoring rate across the splits. To do this, stratified resampling can be used where each analysis/assessment split is conducted within each value of the status indicator. To demonstrate, Monte Carlo resampling is used where 75% of the data are in the analysis set. A total of 100 splits are created. ```{r splits} library(rsample) set.seed(9666) mc_samp <- mc_cv(lung, strata = "status", times = 100) library(purrr) cens_rate <- function(x) mean(analysis(x)$status == 1) summary(map_dbl(mc_samp$splits, cens_rate)) ``` To demonstrate the use of resampling with censored data, the parametric model shown above will be fit with different variable sets to characterize how important each predictor is to the outcome. To do this, a set of formulas are created for the different variable sets: ```{r forms} three_fact <- as.formula(Surv(time, status) ~ ph.ecog + age + strata(sex)) rm_ph.ecog <- as.formula(Surv(time, status) ~ age + strata(sex)) rm_age <- as.formula(Surv(time, status) ~ ph.ecog + strata(sex)) rm_sex <- as.formula(Surv(time, status) ~ ph.ecog + age ) ``` The model fitting function will take the formula as an argument: ```{r fit-func} mod_fit <- function(x, form, ...) survreg(form, data = analysis(x), ...) ``` To calculate the efficacy of the model, the concordance statistic is used (see `?survConcordance`): ```{r concord} get_concord <- function(split, mod, ...) 
{ pred_dat <- assessment(split) pred_dat$pred <- predict(mod, newdata = pred_dat) concordance(Surv(time, status) ~ pred, pred_dat, ...)$concordance } ``` With these functions, a series of models are created for each variable set. ```{r models} mc_samp$mod_full <- map(mc_samp$splits, mod_fit, form = three_fact) mc_samp$mod_ph.ecog <- map(mc_samp$splits, mod_fit, form = rm_ph.ecog) mc_samp$mod_age <- map(mc_samp$splits, mod_fit, form = rm_age) mc_samp$mod_sex <- map(mc_samp$splits, mod_fit, form = rm_sex) ``` Similarly, the concordance values are computed for each model: ```{r concord-est} mc_samp$full <- map2_dbl(mc_samp$splits, mc_samp$mod_full, get_concord) mc_samp$ph.ecog <- map2_dbl(mc_samp$splits, mc_samp$mod_ph.ecog, get_concord) mc_samp$age <- map2_dbl(mc_samp$splits, mc_samp$mod_age, get_concord) mc_samp$sex <- map2_dbl(mc_samp$splits, mc_samp$mod_sex, get_concord) ``` The distributions of the resampling estimates ```{r concord-df} library(dplyr) concord_est <- mc_samp %>% dplyr::select(-matches("^mod")) library(tidyr) library(ggplot2) concord_est %>% select(-splits) %>% pivot_longer(-id) %>% ggplot(aes(x = value, color = name)) + geom_line(stat = "density") + theme_bw() + theme(legend.position = "top") ``` It looks as though the model missing `ph.ecog` has larger concordance values than the other models. As one might expect, the full model and the model absent `sex` are very similar; the difference in these models should only be the scale parameters estimates. To more formally test this, the `tidyposterior` package is used to create a Bayesian model for the concordance statistics. ```{r perf-mod, warning = FALSE, message = FALSE} library(tidyposterior) concord_est <- perf_mod(concord_est, seed = 6507, iter = 5000) concord_est$stan ``` To summarize the posteriors for each model: ```{r post} ggplot(tidy(concord_est)) + theme_bw() ``` While this seems clear-cut, let's assume that a difference in the concordance statistic of 0.1 is a real effect. 
To compute the posteriors for the difference in models, the full model will be contrasted with the others: ```{r diffs} comparisons <- contrast_models( concord_est, list_1 = rep("full", 3), list_2 = c("ph.ecog", "age", "sex"), seed = 4654 ) ``` The posterior distributions show that, statistically, `ph.ecog` has real importance ot the model. However, since these distributions are mostly with +/- 0.05, they are unlikely to be real differences. ```{r diff-post} ggplot(comparisons, size = 0.05) + theme_bw() ``` The ROPE statistics quantify the practical effects: ```{r diff-sum} summary(comparisons, size = 0.05) %>% dplyr::select(contrast, starts_with("pract")) ``` rsample/vignettes/Working_with_rsets.Rmd0000644000175000017500000001673613673171774020453 0ustar nileshnilesh--- title: "Working with rsets" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Working with rsets} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ``` ## Introduction `rsample` can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis. For illustration, the `attrition` data are used. From the help file: > These data are from the IBM Watson Analytics Lab. The website describes the data with "Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or 'compare average monthly income by education and attrition'. This is a fictional data set created by IBM data scientists." There are 1470 rows. 
The data can be accessed using ```{r attrition, message=FALSE} library(rsample) data("attrition", package = "modeldata") names(attrition) table(attrition$Attrition) ``` ## Model Assessment Let's fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income. If we were fitting the model to the entire data set, we might model attrition using ```r glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial) ``` For convenience, we'll create a formula object that will be used later: ```{r form, message=FALSE} mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ``` To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model. First, let's make the splits of the data: ```{r model_vfold, message=FALSE} library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ``` Now let's write a function that will, for each resample: 1. obtain the analysis data set (i.e. the 90% used for modeling) 1. fit a logistic regression model 1. predict the assessment data (the other 10% not used for the model) using the `broom` package 1. determine if each sample was predicted correctly. Here is our function: ```{r lm_func} ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) 
{ # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ``` For example: ```{r onefold, warning = FALSE} example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ``` For this model, the `.fitted` value is the linear predictor in log-odds units. To compute this data set for each of the 100 resamples, we'll use the `map` function from the `purrr` package: ```{r model_purrr, warning=FALSE} library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ``` Now we can compute the accuracy values for all of the assessment data sets: ```{r model_acc} rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ``` Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is `r round(mean(attrition$Attrition == "No"), 3)`. Not a great model so far. ## Using the Bootstrap to Make Comparisons Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals). For example, are there differences in the median monthly income between genders? 
```{r type_plot} ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ``` If we wanted to compare the genders, we could conduct a _t_-test or rank-based test. Instead, let's use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample: ```{r mean_diff} median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ``` Now we would create a large number of bootstrap samples (say 2000+). For illustration, we'll only do 500 in this document. ```{r boot_mean_diff} set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ``` This function is then computed across each resample: ```{r stats} bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ``` The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution: ```{r stats_plot} ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ``` The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be: ```{r ci} quantile(bt_resamples$wage_diff, probs = c(0.025, 0.975)) ``` The calculated 95% confidence interval contains zero, so we don't have evidence for a difference in median income between these genders at a confidence level of 95%. ## Bootstrap Estimates of Model Coefficients Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The [`broom` package](https://cran.r-project.org/package=broom) package has a `tidy` function that will save the coefficients in a data frame. 
Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, `purrr::map` can be used to estimate and save these values for each split. ```{r coefs} glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ``` ## Keeping Tidy As previously mentioned, the [`broom` package](https://cran.r-project.org/package=broom) contains a class called `tidy` that created representations of objects that can be easily used for analysis, plotting, etc. `rsample` contains `tidy` methods for `rset` and `rsplit` objects. For example: ```{r tidy_rsplit} first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ``` and ```{r tidy_rset} class(bt_resamples) tidy(bt_resamples) ``` rsample/build/0000755000175000017500000000000014142304331013141 5ustar nileshnileshrsample/build/vignette.rds0000644000175000017500000000036414142304331015503 0ustar nileshnileshuP 0Lm} "Aol`$Rm݈=d3Iv]BHxC.B t+Յ>Wٞ4VAua) +!2B%p)B#C5 l]]auiehƕ%&L#BЛsx^pnlcg,c"Y[.Eyd JgwEQ<# vcjhtTDlrsample/tests/0000755000175000017500000000000014122413046013206 5ustar nileshnileshrsample/tests/testthat/0000755000175000017500000000000014142307722015053 5ustar nileshnileshrsample/tests/testthat/test-print-groups.txt0000644000175000017500000000036014123136426021241 0ustar nileshnilesh> print(group_vfold_cv(warpbreaks, "tension"), n = 2) # Group 3-fold cross-validation # A tibble: 3 x 2 splits id 1 Resample1 2 Resample2 # ... 
with 1 more row rsample/tests/testthat/test_initial.R0000644000175000017500000000375214122413046017670 0ustar nileshnileshcontext("Initial splitting") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) data(drinks, package = "modeldata") test_that('default param', { set.seed(11) rs1 <- initial_split(dat1) expect_equal(class(rs1), c("mc_split", "rsplit")) tr1 <- training(rs1) ts1 <- testing(rs1) expect_equal(nrow(tr1), nrow(dat1)*3/4) expect_equal(nrow(ts1), nrow(dat1)/4) }) test_that('default time param', { rs1 <- initial_time_split(dat1) expect_equal(class(rs1), "rsplit") tr1 <- training(rs1) ts1 <- testing(rs1) expect_equal(nrow(tr1), floor(nrow(dat1) * 3/4)) expect_equal(nrow(ts1), ceiling(nrow(dat1) / 4)) expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3/4))) }) test_that('default time param with lag', { rs1 <- initial_time_split(dat1, lag = 5) expect_equal(class(rs1), "rsplit") tr1 <- training(rs1) ts1 <- testing(rs1) expect_equal(nrow(tr1), floor(nrow(dat1) * 3/4)) expect_equal(nrow(ts1), ceiling(nrow(dat1) / 4) + 5) expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3/4)) ) expect_equal(ts1, dat1[(floor(nrow(dat1) * 3/4) + 1 - 5):nrow(dat1),] ) # Whole numbers only expect_error(initial_time_split(drinks, lag = 12.5), "must be a whole number") # Lag must be less than number of training observations expect_error(initial_time_split(drinks, lag = 500), "must be less than or equal to the number") }) test_that("`prop` computes the proportion for analysis (#217)", { set.seed(11) props <- c(.1, .9) for (prop in props) { # Not stratified split <- initial_split(airquality, prop = prop) actual <- nrow(analysis(split)) expect <- as.integer(floor(nrow(airquality) * prop)) expect_identical(actual, expect) # Stratified split <- initial_split(airquality, prop = prop, strata = Month) actual <- nrow(analysis(split)) expect <- as.integer(sum(floor(table(airquality$Month) * prop)) ) expect_identical(actual, expect) 
} }) rsample/tests/testthat/test-compat-dplyr.R0000644000175000017500000002222213673171774020604 0ustar nileshnilesh# Skip entire file if dplyr < 1.0.0 skip_if(dplyr_pre_1.0.0()) library(dplyr) # ------------------------------------------------------------------------------ # dplyr_reconstruct() test_that("dplyr_reconstruct() returns an rset subclass if `x` retains rset structure", { for (x in rset_subclasses) { expect_identical(dplyr_reconstruct(x, x), x) expect_s3_class_rset(dplyr_reconstruct(x, x)) } }) test_that("dplyr_reconstruct() returns bare tibble if `x` loses rset structure", { for (x in rset_subclasses) { col <- x[1] row <- x[0,] expect_s3_class_bare_tibble(dplyr_reconstruct(col, x)) expect_s3_class_bare_tibble(dplyr_reconstruct(row, x)) } }) test_that("dplyr_reconstruct() retains extra attributes of `to` when not falling back", { for (x in rset_subclasses) { to <- x attr(to, "foo") <- "bar" x_tbl <- x[1] expect_identical(attr(dplyr_reconstruct(x, to), "foo"), "bar") expect_identical(attr(dplyr_reconstruct(x_tbl, to), "foo"), NULL) expect_s3_class_rset(dplyr_reconstruct(x, to)) expect_s3_class_bare_tibble(dplyr_reconstruct(x_tbl, to)) } }) # ------------------------------------------------------------------------------ # dplyr_col_modify() test_that("can add columns and retain rset class", { for (x in rset_subclasses) { cols <- list(x = rep(1, vec_size(x))) result <- dplyr_col_modify(x, cols) expect_s3_class_rset(result) expect_identical(result$x, cols$x) } }) test_that("modifying rset columns removes rset class", { for (x in rset_subclasses) { cols <- list(splits = rep(1, vec_size(x))) result <- dplyr_col_modify(x, cols) expect_s3_class_bare_tibble(result) expect_identical(result$splits, cols$splits) } for (x in rset_subclasses) { cols <- list(id = rep(1, vec_size(x))) result <- dplyr_col_modify(x, cols) expect_s3_class_bare_tibble(result) expect_identical(result$id, cols$id) } }) test_that("replacing rset columns with the exact same column retains rset 
class", { for (x in rset_subclasses) { cols <- list(splits = x$splits) result <- dplyr_col_modify(x, cols) expect_s3_class_rset(result) expect_identical(result, x) } }) test_that("for nested_cv, `inner_resamples` is also a protected column", { x <- rset_subclasses$nested_cv cols <- list(inner_resamples = rep(1, vec_size(x))) expect_s3_class_bare_tibble(dplyr_col_modify(x, cols)) }) # ------------------------------------------------------------------------------ # dplyr_row_slice() test_that("row slicing generally removes the rset subclass", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(dplyr_row_slice(x, 0)) } }) test_that("row slicing and duplicating any rows removes the rset subclass", { # Remove rsets with only 1 row subclasses <- rset_subclasses subclasses$apparent <- NULL subclasses$validation_split <- NULL for (x in subclasses) { loc <- seq_len(nrow(x)) loc[length(loc)] <- 1L expect_s3_class_bare_tibble(dplyr_row_slice(x, loc)) } }) test_that("row slicing and selecting everything keeps the rset subclass", { for (x in rset_subclasses) { loc <- seq_len(nrow(x)) expect_s3_class_rset(dplyr_row_slice(x, loc)) } }) test_that("rset subclass is kept if row order is changed but all rows are present", { for (x in rset_subclasses) { loc <- rev(seq_len(nrow(x))) expect_s3_class_rset(dplyr_row_slice(x, loc)) } }) # ------------------------------------------------------------------------------ # summarise() test_that("summarise() always drops the rset class", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(summarise(x, y = 1)) expect_s3_class_bare_tibble(summarise(x, splits = splits[1], id = id[1])) } }) # ------------------------------------------------------------------------------ # group_by() test_that("group_by() always returns a bare grouped-df or bare tibble", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(group_by(x)) expect_s3_class(group_by(x, splits), c("grouped_df", "tbl_df", "tbl", "data.frame"), exact = TRUE) } }) # 
------------------------------------------------------------------------------ # ungroup() test_that("ungroup() returns a rset", { for (x in rset_subclasses) { expect_s3_class_rset(ungroup(x)) } }) # ------------------------------------------------------------------------------ # relocate() test_that("can relocate() and keep the class", { for (x in rset_subclasses) { x <- relocate(x, id) expect_s3_class_rset(x) } }) # ------------------------------------------------------------------------------ # distinct() test_that("distinct() keeps the class if everything is intact", { for (x in rset_subclasses) { expect_s3_class_rset(distinct(x)) } }) test_that("distinct() drops the class if any rset columns are lost", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(distinct(x, splits)) } }) # ------------------------------------------------------------------------------ # left_join() test_that("left_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { expect_s3_class_rset(left_join(x, x, by = names(x))) y <- tibble(id = x$id[[1]], x = 1) expect_s3_class_rset(left_join(x, y, by = "id")) } }) test_that("left_join() can lose rset class if rows are added", { for (x in rset_subclasses) { y <- tibble(id = x$id[[1]], x = 1:2) expect_s3_class_bare_tibble(left_join(x, y, by = "id")) } }) # ------------------------------------------------------------------------------ # right_join() test_that("right_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { expect_s3_class_rset(right_join(x, x, by = names(x))) x_names <- names(x) id_names <- x_names[col_starts_with_id(x_names)] y <- mutate(select(x, all_of(id_names)), x = 1) expect_s3_class_rset(right_join(x, y, by = id_names)) } }) test_that("right_join() can lose rset class if rows are added", { for (x in rset_subclasses) { y <- tibble(id = x$id[[1]], x = 1:2) expect_s3_class_bare_tibble(right_join(x, y, by = "id")) } }) test_that("right_join() restores to the 
type of first input", { for (x in rset_subclasses) { y <- tibble(id = x$id[[1]], x = 1) # technically rset structure is intact, but `y` is a bare tibble! expect_s3_class_bare_tibble(right_join(y, x, by = "id")) } }) # ------------------------------------------------------------------------------ # full_join() test_that("full_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { expect_s3_class_rset(full_join(x, x, by = names(x))) } }) test_that("full_join() can lose rset class if rows are added", { for (x in rset_subclasses) { y <- tibble(id = "foo", x = 1) expect_s3_class_bare_tibble(full_join(x, y, by = "id")) } }) # ------------------------------------------------------------------------------ # anti_join() test_that("anti_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { y <- tibble(id = "foo") expect_s3_class_rset(anti_join(x, y, by = "id")) } }) test_that("anti_join() can lose rset class if rows are removed", { for (x in rset_subclasses) { y <- tibble(id = x$id[[1]], x = 1) expect_s3_class_bare_tibble(anti_join(x, y, by = "id")) } }) # ------------------------------------------------------------------------------ # semi_join() test_that("semi_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { expect_s3_class_rset(semi_join(x, x, by = names(x))) } }) test_that("semi_join() can lose rset class if rows are removed", { for (x in rset_subclasses) { y <- tibble(id = "foo", x = 1) expect_s3_class_bare_tibble(semi_join(x, y, by = "id")) } }) # ------------------------------------------------------------------------------ # nest_join() test_that("nest_join() can keep rset class if rset structure is intact", { for (x in rset_subclasses) { y <- mutate(x, foo = "bar") expect_s3_class_rset(nest_join(x, y, by = names(x))) } }) # ------------------------------------------------------------------------------ # bind_rows() test_that("bind_rows() keeps the class if 
there are no new rows/cols and the first object is an rset subclass", { for (x in rset_subclasses) { expect_s3_class_rset(bind_rows(x)) expect_s3_class_rset(bind_rows(x, tibble())) expect_s3_class_bare_tibble(bind_rows(tibble(), x)) } }) test_that("bind_rows() drops the class with new rows", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(bind_rows(x, x)) } }) # ------------------------------------------------------------------------------ # bind_cols() test_that("bind_cols() keeps the class if there are no new rows and the first object is an rset subclass", { for (x in rset_subclasses) { expect_s3_class_rset(bind_cols(x)) expect_s3_class_rset(bind_cols(x, tibble(x = 1))) expect_s3_class_bare_tibble(bind_cols(tibble(x = 1), x)) } }) test_that("bind_cols() drops the class with new rows", { # Use rset subclass with 1 row, these get recycled x <- rset_subclasses$apparent expect_s3_class_bare_tibble(bind_cols(x, tibble(x = 1:2))) }) rsample/tests/testthat/test_validation.R0000644000175000017500000000715414122413046020371 0ustar nileshnileshcontext("Validation sets") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) data(drinks, package = "modeldata") test_that('default param', { set.seed(11) rs1 <- validation_split(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 15)) expect_true(all(sizes1$assessment == 5)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('default time param', { set.seed(11) rs1 <- validation_time_split(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 15)) expect_true(all(sizes1$assessment == 5)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, 
x$out_id)) == 0 }) expect_true(all(good_holdout)) tr1 <- training(rs1$splits[[1]]) expect_equal(nrow(tr1), floor(nrow(dat1) * 3/4)) expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3/4))) }) test_that('default time param with lag', { rs1 <- validation_time_split(dat1, lag = 5) expect_s3_class(rs1, "validation_split") tr1 <- training(rs1$splits[[1]]) expect_equal(nrow(tr1), floor(nrow(dat1) * 3/4)) expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3/4)) ) expect_error(validation_time_split(drinks, lag = 12.5)) # Whole numbers only expect_error(validation_time_split(drinks, lag = 500)) # Lag must be less than number of training observations }) test_that('different percent', { set.seed(11) rs2 <- validation_split(dat1, prop = .5) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 10)) expect_true(all(sizes2$assessment == 10)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('strata', { set.seed(11) rs3 <- validation_split(warpbreaks, strata = "tension") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 39)) expect_true(all(sizes3$assessment == 15)) rate <- map_dbl(rs3$splits, function(x) { dat <- as.data.frame(x)$tension mean(dat == "M") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('bad args', { expect_error(validation_split(warpbreaks, strata = warpbreaks$tension)) expect_error(validation_split(warpbreaks, strata = c("tension", "wool"))) }) test_that('printing', { expect_output(print(validation_split(warpbreaks)), "Validation Set Split") expect_output(print(validation_time_split(drinks)), "Validation Set Split") }) test_that('printing', { expect_output(print(validation_split(warpbreaks)$splits[[1]]), 
"Training/Validation/Total") }) test_that('rsplit labels', { rs <- validation_split(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test-compat-dplyr-old.R0000644000175000017500000000773613673171774021375 0ustar nileshnilesh# These tests should pass on all supported versions of dplyr. Both pre and # post dplyr 1.0.0 should work. # When `compat-dplyr-old.R` is removed and support for dplyr < 1.0.0 is # deprecated, these tests should move to `test-compat-dplyr.R` instead. # Do not just delete them, as they are important tests and are not repeated in # `compat-dplyr.R`. library(dplyr) # ------------------------------------------------------------------------------ # mutate() test_that("mutate() can keep rset class", { for (x in rset_subclasses) { expect_s3_class_rset(mutate(x, x = 1)) expect_identical(mutate(x, x = 1)$x, rep(1, vec_size(x))) } }) test_that("mutate() drops rset class if any rset columns are touched", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(mutate(x, splits = 1)) expect_s3_class_bare_tibble(mutate(x, id = 1)) expect_identical(mutate(x, splits = 1)$splits, rep(1, vec_size(x))) expect_identical(mutate(x, id = 1)$id, rep(1, vec_size(x))) } }) test_that("mutate() keeps rset class if replacement rset column is same as original", { for (x in rset_subclasses) { expect_s3_class_rset(mutate(x, splits = splits)) expect_s3_class_rset(mutate(x, id = id)) } }) test_that("adding a column that looks like an `id` drops the class", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(mutate(x, id9 = 1)) } }) # ------------------------------------------------------------------------------ # arrange() test_that("arrange() keeps rset class when row order is modified", { for (x in rset_subclasses) { x <- mutate(x, rn = row_number()) expect_s3_class_rset(arrange(x, desc(rn))) } }) test_that("arrange() keeps rset class when row order is untouched", 
{ for (x in rset_subclasses) { expect_s3_class_rset(arrange(x)) x <- mutate(x, rn = row_number()) expect_s3_class_rset(arrange(x, rn)) } }) # ------------------------------------------------------------------------------ # filter() test_that("filter() drops rset class when rows are modified", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(filter(x, 0 == 1)) expect_s3_class_bare_tibble(filter(x, is.numeric(id))) } }) test_that("filter() keeps rset class if row order is untouched", { for (x in rset_subclasses) { expect_s3_class_rset(filter(x)) expect_s3_class_rset(filter(x, is.character(id))) } }) # ------------------------------------------------------------------------------ # rename() test_that("renaming can keep the rset class", { for (x in rset_subclasses) { x <- mutate(x, a = 1) x <- rename(x, b = a) expect_s3_class_rset(x) } }) test_that("renaming `id` at all drops the rset class", { for (x in rset_subclasses) { x <- rename(x, id9 = id) expect_s3_class_bare_tibble(x) } }) test_that("renaming `id` to a non-id name drops the rset class", { for (x in rset_subclasses) { x <- rename(x, stuff = id) expect_s3_class_bare_tibble(x) } }) test_that("for nested_cv, renaming `inner_resamples` drops the rset class", { x <- rset_subclasses$nested_cv x <- rename(x, inner_stuff = inner_resamples) expect_s3_class_bare_tibble(x) }) # ------------------------------------------------------------------------------ # select() test_that("select() can keep rset class", { for (x in rset_subclasses) { expect_s3_class_rset(select(x, everything())) } }) test_that("select() drops rset class if any rset columns aren't selected", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(select(x, id)) expect_s3_class_bare_tibble(select(x, splits)) } }) # ------------------------------------------------------------------------------ # slice() test_that("slice() drops rset class when rows are modified", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(slice(x, 0)) } }) 
test_that("slice() keeps rset class when rows are untouched", { for (x in rset_subclasses) { expect_s3_class_rset(slice(x)) expect_s3_class_rset(slice(x, seq_len(nrow(x)))) } }) rsample/tests/testthat/test_for_pred.R0000644000175000017500000000243113726435071020043 0ustar nileshnileshcontext("Predictor extraction") library(testthat) test_that('no dots', { expect_equal(form_pred(y ~ x + z), c("x", "z")) expect_equal(form_pred(terms(y ~ x + z)), c("x", "z")) expect_equal(form_pred(y ~ x + log(z)), c("x", "z")) expect_equal(form_pred(terms(y ~ x + log(z))), c("x", "z")) expect_equal(form_pred(log(y) ~ x + z), c("x", "z")) expect_equal(form_pred(terms(log(y) ~ x + z)), c("x", "z")) expect_equal(form_pred(y1 + y2 ~ x + z), c("x", "z")) expect_equal(form_pred(terms(y1 + y2 ~ x + z)), c("x", "z")) expect_equal(form_pred(log(y1) + y2 ~ x + z), c("x", "z")) expect_equal(form_pred(terms(log(y1) + y2 ~ x + z)), c("x", "z")) expect_equal(form_pred(~ x + z), c("x", "z")) expect_equal(form_pred(terms(~ x + z)), c("x", "z")) expect_equal(form_pred(~ x), "x") expect_equal(form_pred(terms(~ x)), "x") expect_equal(form_pred(y ~ x), "x") expect_equal(form_pred(terms(y ~ x)), "x") }) test_that('dots', { expect_error(form_pred(y ~ .)) expect_error(form_pred(terms(y ~ .))) expect_error(form_pred(y ~ (.)^2)) expect_error(form_pred(terms(y ~ (.)^2))) expect_equal(form_pred(terms(mpg ~ (.)^2, data = mtcars)), names(mtcars)[2:11]) expect_equal(form_pred(terms(~ (.)^2, data = mtcars)), names(mtcars)) }) rsample/tests/testthat/test_boot.R0000644000175000017500000000460713726435071017215 0ustar nileshnileshcontext("Bootstrapping") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- bootstraps(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == nrow(dat1))) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, 
function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('apparent', { rs2 <- bootstraps(dat1, apparent = TRUE) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == nrow(dat1))) expect_true(all(sizes2$assessment[nrow(sizes2)] == nrow(dat1))) expect_equal(sizes2$assessment[sizes2$id == "Apparent"], nrow(dat1)) res2 <- as.data.frame(rs2$splits[[nrow(sizes2)]], data = "assessment") expect_equal(res2, dat1) }) test_that('strata', { set.seed(11) rs4 <- bootstraps(warpbreaks, strata = "tension") sizes4 <- dim_rset(rs4) expect_true(all(sizes4$analysis == nrow(warpbreaks))) rate <- map_dbl(rs4$splits, function(x) { dat <- as.data.frame(x)$tension mean(dat == "M") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs4$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) rs5 <- bootstraps(warpbreaks, apparent = TRUE, strata = "tension") sizes5 <- dim_rset(rs5) expect_true(all(sizes5$analysis == nrow(warpbreaks))) expect_true(all(sizes5$assessment[nrow(sizes5)] == nrow(warpbreaks))) expect_equal(sizes5$assessment[sizes5$id == "Apparent"], nrow(warpbreaks)) res5 <- as.data.frame(rs5$splits[[nrow(sizes5)]], data = "assessment") expect_equal(res5, warpbreaks) }) test_that('bad args', { expect_error(bootstraps(warpbreaks, strata = warpbreaks$tension)) expect_error(bootstraps(warpbreaks, strata = c("tension", "wool"))) }) test_that('printing', { expect_output(print(bootstraps(warpbreaks))) }) test_that('rsplit labels', { rs <- bootstraps(warpbreaks) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/helper-s3.R0000644000175000017500000000027213673171774017017 0ustar nileshnileshexpect_s3_class_rset <- function(x) { expect_s3_class(x, "rset") } expect_s3_class_bare_tibble <- function(x) { expect_s3_class(x, c("tbl_df", "tbl", "data.frame"), exact = TRUE) } 
rsample/tests/testthat/test-manual.R0000644000175000017500000000203213727757057017447 0ustar nileshnileshtest_that("can create a manual rset", { df <- data.frame(x = c(1, 2, 3, 4)) indices <- list( list(analysis = 1L, assessment = 2L), list(analysis = 3L, assessment = 4L) ) splits <- lapply(indices, make_splits, data = df) rset <- manual_rset(splits, c("Split 1", "Split 2")) expect_s3_class(rset, c("manual_rset", "rset")) expect_identical(rset$id, c("Split 1", "Split 2")) }) test_that("can use analysis/assessment functions", { df <- data.frame(x = c(1, 2, 3)) indices <- list(analysis = c(1L, 2L), assessment = 3L) splits <- list(make_splits(indices, df)) rset <- manual_rset(splits, "Split 1") expect_identical(analysis(rset$splits[[1]]), df[1:2, 1, drop = FALSE]) expect_identical(assessment(rset$splits[[1]]), df[3, 1, drop = FALSE]) }) test_that("`pretty()` works", { df <- data.frame(x = c(1, 2, 3)) indices <- list(analysis = c(1L, 2L), assessment = 3L) splits <- list(make_splits(indices, df)) rset <- manual_rset(splits, "Split 1") expect_identical(pretty(rset), "Manual resampling") }) rsample/tests/testthat/test-compat-vctrs.R0000644000175000017500000000767613673171774020633 0ustar nileshnilesh# ------------------------------------------------------------------------------ # vec_restore() test_that("vec_restore() returns an rset subclass if `x` retains rset structure", { for (x in rset_subclasses) { expect_identical(vec_restore(x, x), x) expect_s3_class_rset(vec_restore(x, x)) } }) test_that("vec_restore() returns bare tibble if `x` loses rset structure", { for (x in rset_subclasses) { col <- x[1] row <- x[0,] expect_s3_class_bare_tibble(vec_restore(col, x)) expect_s3_class_bare_tibble(vec_restore(row, x)) } }) test_that("vec_restore() retains extra attributes of `to` when not falling back", { for (x in rset_subclasses) { to <- x attr(to, "foo") <- "bar" x_tbl <- x[1] expect_identical(attr(vec_restore(x, to), "foo"), "bar") expect_identical(attr(vec_restore(x_tbl, 
to), "foo"), NULL) expect_s3_class_rset(vec_restore(x, to)) expect_s3_class_bare_tibble(vec_restore(x_tbl, to)) } }) # ------------------------------------------------------------------------------ # vec_ptype2() test_that("vec_ptype2() is working", { for (x in rset_subclasses) { tbl <- tibble::tibble(x = 1) df <- data.frame(x = 1) # rset-rset expect_identical(vec_ptype2(x, x), vec_ptype2(tib_upcast(x), tib_upcast(x))) # rset-tbl_df expect_identical(vec_ptype2(x, tbl), vec_ptype2(tib_upcast(x), tbl)) expect_identical(vec_ptype2(tbl, x), vec_ptype2(tbl, tib_upcast(x))) # rset-df expect_identical(vec_ptype2(x, df), vec_ptype2(tib_upcast(x), df)) expect_identical(vec_ptype2(df, x), vec_ptype2(df, tib_upcast(x))) } }) # ------------------------------------------------------------------------------ # vec_cast() test_that("vec_cast() is working", { for (x in rset_subclasses) { tbl <- tib_upcast(x) df <- as.data.frame(tbl) # rset-rset expect_error(vec_cast(x, x), class = "vctrs_error_incompatible_type") # rset-tbl_df expect_identical(vec_cast(x, tbl), tbl) expect_error(vec_cast(tbl, x), class = "vctrs_error_incompatible_type") # rset-df expect_identical(vec_cast(x, df), df) expect_error(vec_cast(df, x), class = "vctrs_error_incompatible_type") } }) # ------------------------------------------------------------------------------ # vctrs methods test_that("vec_ptype() returns a bare tibble", { for (x in rset_subclasses) { expect_identical(vec_ptype(x), vec_ptype(tib_upcast(x))) expect_s3_class_bare_tibble(vec_ptype(x)) } }) test_that("vec_slice() generally returns a bare tibble", { for (x in rset_subclasses) { expect_identical(vec_slice(x, 0), vec_slice(tib_upcast(x), 0)) expect_s3_class_bare_tibble(vec_slice(x, 0)) } }) test_that("vec_slice() can return an rset if all rows are selected", { for (x in rset_subclasses) { expect_identical(vec_slice(x, TRUE), x) expect_s3_class_rset(vec_slice(x, TRUE)) } }) test_that("vec_c() returns a bare tibble", { for (x in rset_subclasses) 
{ tbl <- tib_upcast(x) expect_identical(vec_c(x), vec_c(tbl)) expect_identical(vec_c(x, x), vec_c(tbl, tbl)) expect_identical(vec_c(x, tbl), vec_c(tbl, tbl)) expect_s3_class_bare_tibble(vec_c(x)) expect_s3_class_bare_tibble(vec_c(x, x)) } }) test_that("vec_rbind() returns a bare tibble", { for (x in rset_subclasses) { tbl <- tib_upcast(x) expect_identical(vec_rbind(x), vec_rbind(tbl)) expect_identical(vec_rbind(x, x), vec_rbind(tbl, tbl)) expect_identical(vec_rbind(x, tbl), vec_rbind(tbl, tbl)) expect_s3_class_bare_tibble(vec_rbind(x)) expect_s3_class_bare_tibble(vec_cbind(x, x)) } }) test_that("vec_cbind() returns a bare tibble", { for (x in rset_subclasses) { tbl <- tib_upcast(x) expect_identical(vec_cbind(x), vec_cbind(tbl)) expect_identical(vec_cbind(x, x), vec_cbind(tbl, tbl)) expect_identical(vec_cbind(x, tbl), vec_cbind(tbl, tbl)) expect_s3_class_bare_tibble(vec_cbind(x)) expect_s3_class_bare_tibble(vec_cbind(x, x)) } }) rsample/tests/testthat/test_permutations.R0000644000175000017500000000420213755042647021000 0ustar nileshnileshcontext("Permutations") library(testthat) library(rsample) library(purrr) library(dplyr) test_that('default param', { set.seed(11) rs1 <- permutations(mtcars, 1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == nrow(mtcars))) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, mtcars)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('apparent', { rs2 <- permutations(mtcars, 1, apparent = TRUE) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == nrow(mtcars))) expect_true(all(sizes2$assessment[nrow(sizes2)] == nrow(mtcars))) expect_equal(sizes2$assessment[sizes2$id == "Apparent"], nrow(mtcars)) }) test_that('no assessment set', { xx <- permutations(mtcars, 1) expect_error(assessment(xx$splits[[1]])) }) test_that('bad args', { expect_error(permutations(mtcars)) # no columns specified 
expect_error(permutations(mtcars, foo)) # column doesn't exist expect_error(permutations(mtcars, start_with("z"))) # column doesn't exist expect_error(permutations(mtcars, everything())) # all columns }) test_that('printing', { expect_output(print(permutations(mtcars, 1))) }) test_that('rsplit labels', { rs <- permutations(mtcars, 1) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) test_that('filtering/slicing rows', { x <- permutations(mtcars, 1:3) xf <- filter(x, id=="Permutations01") xs <- slice(x, 1) expect_identical(class(xf), c("tbl_df", "tbl", "data.frame")) expect_identical(class(xs), c("tbl_df", "tbl", "data.frame")) }) test_that('column binding', { x <- permutations(mtcars, 1:3) xcb1 <- bind_cols(x, y = LETTERS[1:nrow(x)]) xcb2 <- bind_cols(x, mtcars = tidyr::nest(mtcars, data = everything())) xcb3 <- bind_cols(y = LETTERS[1:nrow(x)], x) expect_identical(class(xcb1), class(x)) expect_identical(class(xcb2), class(x)) expect_false(identical(class(xcb3), class(x))) }) rsample/tests/testthat/test_group.R0000644000175000017500000000552513726435071017406 0ustar nileshnileshcontext("Group resampling") library(testthat) library(rsample) library(purrr) library(tibble) warpbreaks2 <- as_tibble(warpbreaks) get_id_left_out <- function(x) unique(as.character(assessment(x)$tension)) test_that('bad args', { expect_error(group_vfold_cv(warpbreaks, group = warpbreaks$tension)) expect_error(group_vfold_cv(warpbreaks, group = c("tension", "wool"))) expect_error(group_vfold_cv(warpbreaks, group = "tensio")) expect_error(group_vfold_cv(warpbreaks)) expect_error(group_vfold_cv(warpbreaks, group = "tension", v = 10)) }) test_that('default param', { set.seed(11) rs1 <- group_vfold_cv(warpbreaks, "tension") sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 36)) expect_true(all(sizes1$assessment == 18)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, warpbreaks)) 
expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map_chr(rs1$splits, get_id_left_out) expect_true(all(table(sp_out) == 1)) }) test_that('v < max v', { set.seed(11) rs2 <- group_vfold_cv(warpbreaks, "tension", v = 2) sizes2 <- dim_rset(rs2) expect_true(!all(sizes2$analysis == 36)) expect_true(!all(sizes2$assessment == 18)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, warpbreaks)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map(rs2$splits, get_id_left_out) expect_true(all(table(unlist(sp_out)) == 1)) }) test_that('tibble input', { set.seed(11) rs3 <- group_vfold_cv(warpbreaks2, "tension") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 36)) expect_true(all(sizes3$assessment == 18)) same_data <- map_lgl(rs3$splits, function(x) all.equal(x$data, warpbreaks2)) expect_true(all(same_data)) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map_chr(rs3$splits, get_id_left_out) expect_true(all(table(sp_out) == 1)) }) test_that('printing', { expect_output(print(group_vfold_cv(warpbreaks, "tension"))) }) test_that('printing with ...', { verify_output(test_path("test-print-groups.txt"), { print(group_vfold_cv(warpbreaks, "tension"), n = 2) }) }) test_that('rsplit labels', { rs <- group_vfold_cv(warpbreaks, "tension") all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_loo.R0000644000175000017500000000145413653053433017035 0ustar nileshnileshcontext("Leave-one-out CV") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:10, b = letters[1:10]) test_that('Loooooo', { loo1 <- loo_cv(dat1) expect_equal(nrow(loo1), 
nrow(dat1)) same_data <- map_lgl(loo1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) holdouts <- map_lgl(loo1$splits, function(x) length(x$out_id) == 1) expect_true(all(holdouts)) retained <- map_lgl(loo1$splits, function(x) length(x$in_id) == (nrow(dat1) - 1)) expect_true(all(retained)) }) test_that('printing', { expect_output(print(loo_cv(dat1))) }) test_that('rsplit labels', { rs <- loo_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_caret.R0000644000175000017500000002237713653053433017351 0ustar nileshnileshcontext("Conversions for caret") library(testthat) library(rsample) ################################################################### ## Test cases for caret -> rsample that mimic `trainControl` dat <- data.frame(y = 1:15, x = 15:1) lgo1 <- structure( list( method = "LGOCV", index = structure( list( Resample1 = c(1L, 4L, 5L, 6L, 7L, 9L, 10L, 14L), Resample2 = c(2L, 4L, 5L, 6L, 9L, 10L, 14L, 15L), Resample3 = c(1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L) ), .Names = c("Resample1", "Resample2", "Resample3") ), indexOut = structure( list( Resample1 = c(2L, 3L, 8L, 11L, 12L, 13L, 15L), Resample2 = c(1L, 3L, 7L, 8L, 11L, 12L, 13L), Resample3 = c(4L, 10L, 11L, 12L, 13L, 14L, 15L) ), .Names = c("Resample1", "Resample2", "Resample3") ), number = 3, p = 0.5 ), .Names = c("method", "index", "indexOut", "number", "p") ) cv_1 <- structure( list( method = "cv", index = structure( list( Fold1 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 13L), Fold2 = c(1L, 4L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold3 = c(1L, 2L, 3L, 5L, 7L, 9L, 12L, 14L, 15L) ), .Names = c("Fold1", "Fold2", "Fold3") ), indexOut = structure( list( Resample1 = c(1L, 9L, 12L, 14L, 15L), Resample2 = c(2L, 3L, 5L, 7L), Resample3 = c(4L, 6L, 8L, 10L, 11L, 13L) ), .Names = c("Resample1", "Resample2", "Resample3") ), number = 3, repeats = NA ), .Names = c("method", "index", 
"indexOut", "number", "repeats") ) cv_2 <- structure( list( method = "repeatedcv", index = structure( list( Fold1.Rep1 = c(1L, 3L, 4L, 6L, 9L, 10L, 12L, 13L, 14L, 15L), Fold2.Rep1 = c(2L, 5L, 7L, 8L, 10L, 11L, 13L, 14L, 15L), Fold3.Rep1 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L), Fold1.Rep2 = c(1L, 2L, 3L, 5L, 6L, 7L, 10L, 11L, 12L, 14L), Fold2.Rep2 = c(2L, 4L, 6L, 8L, 9L, 11L, 13L, 14L, 15L), Fold3.Rep2 = c(1L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 12L, 13L, 15L) ), .Names = c( "Fold1.Rep1", "Fold2.Rep1", "Fold3.Rep1", "Fold1.Rep2", "Fold2.Rep2", "Fold3.Rep2" ) ), indexOut = structure( list( Resample1 = c(2L, 5L, 7L, 8L, 11L), Resample2 = c(1L, 3L, 4L, 6L, 9L, 12L), Resample3 = c(10L, 13L, 14L, 15L), Resample4 = c(4L, 8L, 9L, 13L, 15L), Resample5 = c(1L, 3L, 5L, 7L, 10L, 12L), Resample6 = c(2L, 6L, 11L, 14L) ), .Names = c( "Resample1", "Resample2", "Resample3", "Resample4", "Resample5", "Resample6" ) ), number = 3, repeats = 2 ), .Names = c("method", "index", "indexOut", "number", "repeats") ) cv_3 <- cv_2 cv_3$method <- "adaptive_cv" bt_1 <- structure( list( method = "boot", index = structure( list( Resample1 = c(1L, 1L, 4L, 4L, 5L, 7L, 8L, 10L, 11L, 11L, 12L, 13L, 15L, 15L, 15L), Resample2 = c(1L, 2L, 3L, 5L, 5L, 5L, 6L, 7L, 8L, 9L, 9L, 9L, 10L, 10L, 12L) ), .Names = c("Resample1", "Resample2") ), indexOut = structure( list( Resample1 = c(2L, 3L, 6L, 9L, 14L), Resample2 = c(4L, 11L, 13L, 14L, 15L) ), .Names = c("Resample1", "Resample2") ), number = 2 ), .Names = c("method", "index", "indexOut", "number") ) bt_2 <- bt_1 bt_2$method <- "boot632" bt_3 <- bt_1 bt_3$method <- "optimism_boot" bt_4 <- bt_1 bt_4$method <- "boot_all" bt_5 <- bt_1 bt_5$method <- "adaptive_boot" loo_1 <- structure( list( method = "LOOCV", index = structure( list( Fold01 = 2:15, Fold02 = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold03 = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold04 = c(1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
15L), Fold05 = c(1L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold06 = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold07 = c(1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold08 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold09 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 13L, 14L, 15L), Fold10 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L), Fold11 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 15L), Fold12 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L), Fold13 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 15L), Fold14 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L), Fold15 = 1:14 ), .Names = c( "Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fold07", "Fold08", "Fold09", "Fold10", "Fold11", "Fold12", "Fold13", "Fold14", "Fold15" ) ), indexOut = structure( list( Resample01 = 1L, Resample02 = 2L, Resample03 = 3L, Resample04 = 4L, Resample05 = 5L, Resample06 = 6L, Resample07 = 7L, Resample08 = 8L, Resample09 = 9L, Resample10 = 10L, Resample11 = 11L, Resample12 = 12L, Resample13 = 13L, Resample14 = 14L, Resample15 = 15L ), .Names = c( "Resample01", "Resample02", "Resample03", "Resample04", "Resample05", "Resample06", "Resample07", "Resample08", "Resample09", "Resample10", "Resample11", "Resample12", "Resample13", "Resample14", "Resample15" ) ) ), .Names = c("method", "index", "indexOut") ) rof_1 <- structure( list( method = "timeSlice", index = structure( list( Training04 = 1:4, Training05 = 2:5, Training06 = 3:6, Training07 = 4:7, Training08 = 5:8, Training09 = 6:9, Training10 = 7:10 ), .Names = c( "Training04", "Training05", "Training06", "Training07", "Training08", "Training09", "Training10" ) ), indexOut = structure( list( Testing04 = 5:9, Testing05 = 6:10, Testing06 = 7:11, Testing07 = 8:12, Testing08 = 9:13, Testing09 = 10:14, Testing10 = 11:15 ), .Names = c( "Testing04", "Testing05", 
"Testing06", "Testing07", "Testing08", "Testing09", "Testing10" ) ), initialWindow = 4, horizon = 5, fixedWindow = TRUE, skip = 0 ), .Names = c( "method", "index", "indexOut", "initialWindow", "horizon", "fixedWindow", "skip" ) ) ################################################################### ## check_indices <- function(newer, orig) { for (i in seq_along(newer$splits)) { expect_equal(as.integer(newer$splits[[i]]), orig$index[[i]]) expect_equal(as.integer(newer$splits[[i]], "assessment"), orig$indexOut[[i]]) } invisible(NULL) } ################################################################### ## Tests test_that('basic v-fold', { vfold_obj_1 <- caret2rsample(cv_1, data = dat) check_indices(vfold_obj_1, cv_1) for (i in seq_along(vfold_obj_1$splits)) expect_equal(vfold_obj_1$id[[i]], names(cv_1$index)[i]) }) test_that('repeated v-fold', { vfold_obj_2 <- caret2rsample(cv_2, data = dat) check_indices(vfold_obj_2, cv_2) for (i in seq_along(vfold_obj_2$splits)) expect_equal(paste(vfold_obj_2$id2[[i]], vfold_obj_2$id[[i]], sep = "."), names(cv_2$index)[i]) }) test_that('basic boot', { bt_obj_1 <- caret2rsample(bt_1, data = dat) check_indices(bt_obj_1, bt_1) for (i in seq_along(bt_obj_1$splits)) expect_equal(bt_obj_1$id[[i]], names(bt_1$index)[i]) }) test_that('boot 632', { bt_obj_2 <- caret2rsample(bt_2, data = dat) check_indices(bt_obj_2, bt_2) for (i in seq_along(bt_obj_2$splits)) expect_equal(bt_obj_2$id[[i]], names(bt_2$index)[i]) }) test_that('boot optim', { bt_obj_3 <- caret2rsample(bt_3, data = dat) check_indices(bt_obj_3, bt_3) for (i in seq_along(bt_obj_3$splits)) expect_equal(bt_obj_3$id[[i]], names(bt_3$index)[i]) }) test_that('boot all', { bt_obj_4 <- caret2rsample(bt_4, data = dat) check_indices(bt_obj_4, bt_4) for (i in seq_along(bt_obj_4$splits)) expect_equal(bt_obj_4$id[[i]], names(bt_4$index)[i]) }) test_that('adaptive boot', { bt_obj_5 <- caret2rsample(bt_5, data = dat) check_indices(bt_obj_5, bt_5) for (i in seq_along(bt_obj_5$splits)) 
expect_equal(bt_obj_5$id[[i]], names(bt_5$index)[i]) }) test_that('loo', { loo_obj <- caret2rsample(loo_1, data = dat) check_indices(loo_obj, loo_1) for (i in seq_along(loo_obj$splits)) expect_equal(loo_obj$id[[i]], names(loo_1$index)[i]) }) test_that('mcv', { mcv_obj <- caret2rsample(lgo1, data = dat) check_indices(mcv_obj, lgo1) for (i in seq_along(mcv_obj$splits)) expect_equal(mcv_obj$id[[i]], names(lgo1$index)[i]) }) test_that('rolling origin', { rof_obj <- caret2rsample(rof_1, data = dat) check_indices(rof_obj, rof_1) for (i in seq_along(rof_obj$splits)) expect_equal(rof_obj$id[[i]], names(rof_1$index)[i]) }) rsample/tests/testthat/test_names.R0000644000175000017500000000041613653053433017344 0ustar nileshnileshcontext("Naming functions") library(testthat) library(rsample) test_that('basic naming sequences', { expect_equal(names0(2), c("x1", "x2")) expect_equal(names0(2, "y"), c("y1", "y2")) expect_equal(names0(10), c(paste0("x0", 1:9), "x10")) }) rsample/tests/testthat/test-rset.R0000644000175000017500000001574513673171774017162 0ustar nileshnilesh# ------------------------------------------------------------------------------ # `[]` test_that("subsetting with nothing returns rset", { for (x in rset_subclasses) { expect_s3_class_rset(x[]) } }) # ------------------------------------------------------------------------------ # `[i]` test_that("can subset with just `i` and return rset", { for (x in rset_subclasses) { loc <- seq_len(ncol(x)) expect_s3_class_rset(x[loc]) } }) test_that("removing any rset specific columns falls back", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(x[1]) } }) test_that("duplicating an rset column falls back", { for (x in rset_subclasses) { loc <- c(1, seq_len(ncol(x))) expect_s3_class_bare_tibble(x[loc]) } }) test_that("can reorder columns and keep rset class", { for (x in rset_subclasses) { loc <- rev(seq_len(ncol(x))) expect_s3_class_rset(x[loc]) } }) # 
------------------------------------------------------------------------------ # `[i,]` test_that("can row subset and drop to a tibble", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(x[0,]) } }) test_that("can row subset and keep rset class", { for (x in rset_subclasses) { loc <- seq_len(nrow(x)) expect_s3_class_rset(x[loc,]) } }) # ------------------------------------------------------------------------------ # `[,j]` # Most of these tests should be the same as `[i]`. test_that("can subset with just `j` and keep rset class", { for (x in rset_subclasses) { loc <- seq_len(ncol(x)) expect_s3_class_rset(x[,loc]) } }) test_that("removing an rset specific class drops the rset class", { for (x in rset_subclasses) { expect_s3_class_bare_tibble(x[,1]) } }) # ------------------------------------------------------------------------------ # `[i, j]` test_that("row subsetting mixed with col subsetting can drop to tibble", { for (x in rset_subclasses) { loc <- seq_len(ncol(x)) expect_s3_class_bare_tibble(x[0, loc]) } }) test_that("row subsetting mixed with col subsetting can keep rset subclass", { for (x in rset_subclasses) { row_loc <- seq_len(nrow(x)) col_loc <- seq_len(ncol(x)) expect_s3_class_rset(x[row_loc, col_loc]) } }) # ------------------------------------------------------------------------------ # Misc `[` tests test_that("additional attributes are kept when subsetting and rset class is kept", { for (x in rset_subclasses) { attr(x, "foo") <- "bar" loc <- seq_len(ncol(x)) result <- x[loc] expect_s3_class_rset(result) expect_identical(attr(result, "foo"), "bar") } }) test_that("additional attributes are dropped when subsetting and rset class is dropped", { for (x in rset_subclasses) { attr(x, "foo") <- "bar" result <- x[1] expect_s3_class_bare_tibble(result) expect_identical(attr(result, "foo"), NULL) } }) # ------------------------------------------------------------------------------ # `names<-` test_that("can modify non-rset names and keep the rset 
class", { for (x in rset_subclasses) { x <- mutate(x, a = 1) names <- names(x) names[names == "a"] <- "b" names(x) <- names expect_s3_class_rset(x) } }) test_that("touching the `splits` name removes the class", { for (x in rset_subclasses) { names <- names(x) names[names == "splits"] <- "splits2" names(x) <- names expect_s3_class_bare_tibble(x) } }) test_that("renaming an `id` column is not allowed", { for (x in rset_subclasses) { names <- names(x) names[names == "id"] <- "id9" names(x) <- names expect_s3_class_bare_tibble(x) } }) test_that("renaming an `id` column to something that doesn't start with `id` drops the class", { for (x in rset_subclasses) { names <- names(x) names[names == "id"] <- "foo" names(x) <- names expect_s3_class_bare_tibble(x) } }) test_that("`splits` and `id` columns can't just be switched", { for (x in rset_subclasses) { names <- names(x) new_names <- names new_names[names == "splits"] <- "id" new_names[names == "id"] <- "splits" names(x) <- new_names expect_s3_class_bare_tibble(x) } }) test_that("`id` column can't just be moved", { for (x in rset_subclasses) { x <- mutate(x, a = 1) names <- names(x) new_names <- names new_names[names == "a"] <- "id" new_names[names == "id"] <- "a" names(x) <- new_names expect_s3_class_bare_tibble(x) } }) test_that("`splits` column can't just be moved", { for (x in rset_subclasses) { x <- mutate(x, a = 1) names <- names(x) new_names <- names new_names[names == "a"] <- "splits" new_names[names == "splits"] <- "a" names(x) <- new_names expect_s3_class_bare_tibble(x) } }) # ------------------------------------------------------------------------------ # rset_reconstructable() test_that("two rset subclasses can be considered identical", { for (x in rset_subclasses) { expect_true(rset_reconstructable(x, x)) } }) test_that("order doesn't matter", { for (x in rset_subclasses) { y <- x[rev(names(x))] expect_true(rset_reconstructable(x, y)) } }) test_that("no longer identical if `splits` is lost", { for (to in 
rset_subclasses) { locs <- col_equals_splits(names(to)) x <- to[!locs] expect_false(rset_reconstructable(x, to)) } }) test_that("no longer identical if any `id` columns are lost", { for (to in rset_subclasses) { locs <- col_starts_with_id(names(to)) first_id <- which(locs)[[1]] x <- to[-first_id] expect_false(rset_reconstructable(x, to)) } }) test_that("no longer identical if rows are lost", { # Apparent/Validation only have 1 row subclasses <- rset_subclasses subclasses$apparent <- NULL subclasses$validation_split <- NULL for (to in subclasses) { x <- to[1,] expect_false(rset_reconstructable(x, to)) } }) test_that("still considered identical if rows are simply reordered", { for (to in rset_subclasses) { loc <- rev(seq_len(nrow(to))) x <- to[loc,] expect_true(rset_reconstructable(x, to)) } for (to in rset_subclasses) { loc <- sample(nrow(to)) x <- to[loc,] expect_true(rset_reconstructable(x, to)) } }) test_that("the `inner_resamples` column of `nested_cv` is handled specially", { to <- rset_subclasses$nested_cv x <- to[c("splits", "id")] expect_false(rset_reconstructable(x, to)) }) # ------------------------------------------------------------------------------ # col_starts_with_id() test_that("column can be just `id`", { expect_true(col_starts_with_id("id")) }) test_that("column can be `id1:9`", { ids <- paste0("id", 1:9) expect_true(all(col_starts_with_id(ids))) }) test_that("column cannot be outside the range of 1-9", { ids <- c("id0", "id10") expect_false(any(col_starts_with_id(ids))) }) test_that("column must start with `id`", { ids <- c("xid", "xid1") expect_false(any(col_starts_with_id(ids))) }) test_that("column can't have anything after the id part", { ids <- c("idx", "id1x") expect_false(any(col_starts_with_id(ids))) }) rsample/tests/testthat/test_rsplit.R0000644000175000017500000000336714012611402017550 0ustar nileshnileshcontext("Rsplit constructor") library(testthat) library(rsample) dat1 <- data.frame(a = 1:100, b = 101:200) size1 <- object.size(dat1) 
dat2 <- as.matrix(dat1) test_that('simple rsplit', { rs1 <- rsplit(dat1, 1:2, 4:5) expect_equal(rs1$data, dat1) expect_equal(rs1$in_id, 1:2) expect_equal(rs1$out_id, 4:5) }) test_that('simple rsplit with matrices', { rs2 <- rsplit(dat2, 1:2, 4:5) expect_equal(rs2$data, dat2) expect_equal(rs2$in_id, 1:2) expect_equal(rs2$out_id, 4:5) }) test_that('bad inputs', { expect_error(rsplit(as.list(dat1), 1:2, 4:5)) expect_error(rsplit(dat1, letters[1:2], 4:5)) expect_error(rsplit(as.list(dat1), 1:2, letters[4:5])) expect_error(rsplit(as.list(dat1), -1:2, 4:5)) expect_error(rsplit(as.list(dat1), 1:2, -4:5)) expect_error(rsplit(as.list(dat1), integer(0), 4:5)) }) test_that('as.data.frame', { rs3 <- rsplit(dat1, 1:2, 4:5) expect_equal(as.data.frame(rs3), dat1[1:2,]) expect_equal(as.data.frame(rs3, data = "assessment"), dat1[4:5,]) rs4 <- rsplit(dat1, rep(1:2, each = 3), rep(4:5, c(2, 1))) expect_equal(as.data.frame(rs4), dat1[c(1, 1, 1, 2, 2, 2),]) expect_equal(as.data.frame(rs4, data = "assessment"), dat1[c(4, 4, 5),]) }) test_that('print methods', { verify_output(test_path("print_test_output", "rsplit"), { set.seed(233) print(vfold_cv(mtcars)$splits[[1]]) }) verify_output(test_path("print_test_output", "val_split"), { set.seed(233) print(validation_split(mtcars)$splits[[1]]) }) verify_output(test_path("print_test_output", "obj_sum"), { set.seed(233) print(validation_split(mtcars)) }) }) test_that("default complement method errors", { expect_error( complement("a string"), "No `complement[(][)]` method for this class[(]es[)]" ) }) rsample/tests/testthat/print_test_output/0000755000175000017500000000000014010267142020661 5ustar nileshnileshrsample/tests/testthat/print_test_output/rsplit0000644000175000017500000000013114123136427022122 0ustar nileshnilesh> set.seed(233) > print(vfold_cv(mtcars)$splits[[1]]) <28/4/32> rsample/tests/testthat/print_test_output/obj_sum0000644000175000017500000000027614123136427022255 0ustar nileshnilesh> set.seed(233) > 
print(validation_split(mtcars)) # Validation Set Split (0.75/0.25) # A tibble: 1 x 2 splits id 1 validation rsample/tests/testthat/print_test_output/val_split0000644000175000017500000000014514123136427022607 0ustar nileshnilesh> set.seed(233) > print(validation_split(mtcars)$splits[[1]]) <24/8/32> rsample/tests/testthat/test_strata.R0000644000175000017500000000337214051253375017543 0ustar nileshnileshcontext("Strata constructor") library(testthat) library(rsample) library(purrr) test_that('simple numerics', { set.seed(13333) x1 <- rnorm(1000) str1a <- make_strata(x1) tab1a <- table(str1a) expect_equal(as.vector(tab1a), rep(250, 4)) str1b <- expect_warning(make_strata(x1, depth = 500), "2 breaks instead") tab1b <- table(str1b) expect_equal(as.vector(tab1b), rep(500, 2)) str1c <- make_strata(c(NA, x1[1:999])) tab1c <- table(str1c) expect_true(all(as.vector(tab1c) %in% 249:251)) }) test_that('simple character', { x2 <- factor(rep(LETTERS[1:12], each = 20)) expect_warning( str2a <- make_strata(x2, pool = 0.05), "Stratifying groups that make up 5%" ) expect_equal(table(str2a, dnn = ""), table(x2, dnn = "")) x2[5] <- NA expect_warning( str2b <- make_strata(x2, pool = 0.05), "Stratifying groups that make up 5%" ) expect_true(all(as.vector(table(str2b, dnn = "")) %in% 19:21)) }) test_that('bad data', { x3 <- factor(rep(LETTERS[1:15], each = 50)) expect_warning(make_strata(x3), "Too little data") expect_warning(make_strata(x3, pool = 0.06), "Stratifying groups that make up 6%") expect_warning(make_strata(mtcars$mpg)) expect_warning(make_strata(seq_len(50), breaks = -1)) }) # strata_check() ---------------------------------------------------------- test_that("don't stratify on Surv objects", { df <- data.frame( time = c(85, 79, 70, 6, 32, 8, 17, 93, 81, 76), event = c(0, 0, 1, 0, 0, 0, 1, 1, 1, 1) ) df$surv <- structure( c(85, 79, 70, 6, 32, 8, 17, 93, 81, 76, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1), .Dim = c(10L, 2L), .Dimnames = list(NULL, c("time", "status")), type = "right", class 
= "Surv") expect_error(strata_check("surv", df)) }) rsample/tests/testthat/test_tidy.R0000644000175000017500000000236513653053433017217 0ustar nileshnileshcontext("Tidy methods") library(testthat) library(rsample) library(purrr) check_ind <- function(x, tdat) { in_dat <- subset(tdat, Data == "Analysis") in_check <- all(sort(in_dat$Row) == x$in_ind) out_dat <- subset(tdat, Data == "Analysis") out_check <- all(sort(out_dat$Row) == x$out_ind) in_check & out_check } dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('simple boot', { set.seed(11) rs1 <- bootstraps(dat1) td1 <- tidy(rs1, unique_ind = FALSE) expect_gt(nrow(td1), nrow(dat1)) name_vals <- names0(nrow(rs1), "Bootstrap") for(i in 1:nrow(rs1)) { expect_true( check_ind(rs1$splits[[i]], subset(td1, Resample == name_vals[i]) ) ) } }) test_that('vfold', { set.seed(11) rs2 <- vfold_cv(dat1) td2 <- tidy(rs2, unique_ind = FALSE) for(i in 1:nrow(rs2)) { expect_true( check_ind(rs2$splits[[i]], subset(td2, Fold == rs2$id[i]) ) ) } }) test_that('vfold with repeats', { set.seed(11) rs3 <- vfold_cv(dat1, repeats = 2) td3 <- tidy(rs3, unique_ind = FALSE) for(i in 1:nrow(rs3)) { expect_true( check_ind(rs3$splits[[i]], subset(td3, Fold == rs3$id2[i] & Repeat == rs3$id[i]) ) ) } }) rsample/tests/testthat/test-slide.R0000644000175000017500000003401113710065601017247 0ustar nileshnilesh# ------------------------------------------------------------------------------ # `sliding_window()` test_that("defaults work", { df <- data.frame(x = 1:3) x <- sliding_window(df) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2L) expect_identical(split2[["in_id"]], 2L) expect_identical(split2[["out_id"]], 3L) }) test_that("lookback always uses complete windows", { df <- data.frame(x = 1:4) x <- sliding_window(df, lookback = 1) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1:2) 
expect_identical(split1[["out_id"]], 3L) expect_identical(split2[["in_id"]], 2:3) expect_identical(split2[["out_id"]], 4L) }) test_that("can step forward between slices", { df <- data.frame(x = 1:6) x <- sliding_window(df, lookback = 1, step = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1:2) expect_identical(split1[["out_id"]], 3L) expect_identical(split2[["in_id"]], 3:4) expect_identical(split2[["out_id"]], 5L) }) test_that("can generate assessment slices", { df <- data.frame(x = 1:4) x <- sliding_window(df, assess_stop = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2:3) expect_identical(split2[["in_id"]], 2L) expect_identical(split2[["out_id"]], 3:4) expect_identical(nrow(x), 2L) }) test_that("can add analysis / assessment gaps", { df <- data.frame(x = 1:7) x <- sliding_window(df, lookback = 1, assess_start = 3, assess_stop = 4) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1:2) expect_identical(split1[["out_id"]], 5:6) expect_identical(split2[["in_id"]], 2:3) expect_identical(split2[["out_id"]], 6:7) expect_identical(nrow(x), 2L) }) test_that("can create an expanding window", { df <- data.frame(x = 1:4) x <- sliding_window(df, lookback = Inf) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] split3 <- x[["splits"]][[3]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2L) expect_identical(split2[["in_id"]], 1:2) expect_identical(split2[["out_id"]], 3L) expect_identical(split3[["in_id"]], 1:3) expect_identical(split3[["out_id"]], 4L) expect_identical(nrow(x), 3L) }) test_that("can skip first few resampling slices", { df <- data.frame(x = 1:8) x <- sliding_window(df, lookback = 1, skip = 3) split1 <- x[["splits"]][[1]] expect_identical(split1[["in_id"]], 4:5) expect_identical(split1[["out_id"]], 6L) expect_identical(nrow(x), 3L) }) 
test_that("`skip` is applied before `step`", { df <- data.frame(x = 1:8) x <- sliding_window(df, lookback = 1, skip = 3, step = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 4:5) expect_identical(split1[["out_id"]], 6L) expect_identical(split2[["in_id"]], 6:7) expect_identical(split2[["out_id"]], 8L) expect_identical(nrow(x), 2L) }) test_that("can use incomplete windows at the beginning", { df <- data.frame(x = 1:5) x <- sliding_window(df, lookback = 2, complete = FALSE) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] split3 <- x[["splits"]][[3]] split4 <- x[["splits"]][[4]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2L) expect_identical(split2[["in_id"]], 1:2) expect_identical(split2[["out_id"]], 3L) expect_identical(split3[["in_id"]], 1:3) expect_identical(split3[["out_id"]], 4L) expect_identical(split4[["in_id"]], 2:4) expect_identical(split4[["out_id"]], 5L) expect_identical(nrow(x), 4L) }) test_that("`data` is validated", { expect_error(sliding_window(1), "`data` must be a data frame") }) test_that("`lookback` is validated", { expect_error(sliding_window(data.frame(), lookback = -1), "`lookback` must be positive, or zero") expect_error(sliding_window(data.frame(), lookback = "a"), "`lookback` must be an integer") expect_error(sliding_window(data.frame(), lookback = c(1, 2)), "`lookback` must have size 1") expect_error(sliding_window(data.frame(), lookback = NA), "`lookback` must be an integer") }) test_that("`assess_start` is validated", { expect_error(sliding_window(data.frame(), assess_start = -1), "`assess_start` must be positive") expect_error(sliding_window(data.frame(), assess_start = "a"), "`assess_start` must be an integer") expect_error(sliding_window(data.frame(), assess_start = c(1, 2)), "`assess_start` must have size 1") expect_error(sliding_window(data.frame(), assess_start = NA), "`assess_start` must be an integer") }) test_that("`assess_stop` is 
validated", { expect_error(sliding_window(data.frame(), assess_stop = -1), "`assess_stop` must be positive") expect_error(sliding_window(data.frame(), assess_stop = "a"), "`assess_stop` must be an integer") expect_error(sliding_window(data.frame(), assess_stop = c(1, 2)), "`assess_stop` must have size 1") expect_error(sliding_window(data.frame(), assess_stop = NA), "`assess_stop` must be an integer") }) test_that("`assess_start` must be before or equal to `assess_stop`", { expect_error(sliding_window(data.frame(), assess_start = 2, assess_stop = 1), "less than or equal to") }) # ------------------------------------------------------------------------------ # `sliding_index()` test_that("defaults works", { df <- data.frame(x = 1:3) x <- sliding_index(df, x) split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 1L) expect_identical(split1$out_id, 2L) expect_identical(split2$in_id, 2L) expect_identical(split2$out_id, 3L) expect_identical(nrow(x), 2L) }) test_that("can lookback over irregular index", { df <- data.frame(x = c(1, 3, 4, 5)) x <- sliding_index(df, x, lookback = 1) split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 2L) expect_identical(split1$out_id, 3L) expect_identical(split2$in_id, 2:3) expect_identical(split2$out_id, 4L) expect_identical(nrow(x), 2L) }) test_that("can compute assessment indices relative to irregular index", { df <- data.frame(x = c(1, 3, 4, 5, 7, 8)) x <- sliding_index(df, x, lookback = 1, assess_stop = 2) split1 <- x$splits[[1]] split2 <- x$splits[[2]] split3 <- x$splits[[3]] expect_identical(split1$in_id, 2L) expect_identical(split1$out_id, 3:4) expect_identical(split2$in_id, 2:3) expect_identical(split2$out_id, 4L) expect_identical(split3$in_id, 3:4) expect_identical(split3$out_id, 5L) expect_identical(nrow(x), 3L) }) test_that("it is possible to create empty assessment sets", { # Look forward 1->2 values from `5`, so creates a window with range of [6, 7]. 
# But no `x` values fall in this range. However, in theory it is "possible" # to make a complete window starting at `5`, which is why `complete = TRUE` # didn't remove it. df <- data.frame(x = c(1, 3, 4, 5, 8, 9)) x <- sliding_index(df, x, lookback = 1, assess_stop = 2) split3 <- x$splits[[3]] expect_identical(split3$in_id, 3:4) expect_identical(split3$out_id, integer()) expect_identical(nrow(x), 3L) }) test_that("can add a gap between the analysis and assessment set", { df <- data.frame(x = c(1, 3, 4, 5, 7, 8)) x <- sliding_index(df, x, lookback = 2, assess_start = 2, assess_stop = 3) split1 <- x$splits[[1]] split2 <- x$splits[[2]] split3 <- x$splits[[3]] expect_identical(split1$in_id, 1:2) expect_identical(split1$out_id, 4L) expect_identical(split2$in_id, 2:3) expect_identical(split2$out_id, 5L) expect_identical(split3$in_id, 2:4) expect_identical(split3$out_id, 5:6) expect_identical(nrow(x), 3L) }) test_that("can use `step` to thin results after calling `slide_index()`", { df <- data.frame(x = c(1, 3, 4, 6, 7, 10)) x <- sliding_index(df, x, lookback = 2, assess_stop = 2, step = 2) split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 1:2) expect_identical(split1$out_id, 3L) expect_identical(split2$in_id, 3:4) expect_identical(split2$out_id, 5L) expect_identical(nrow(x), 2L) }) test_that("can skip first few resampling slices", { df <- data.frame(x = c(1, 3, 4, 6, 7, 10)) x <- sliding_index(df, x, lookback = 1, skip = 2) split1 <- x[["splits"]][[1]] expect_identical(split1[["in_id"]], 4L) expect_identical(split1[["out_id"]], 5L) expect_identical(nrow(x), 2L) }) test_that("`skip` is applied before `step`", { df <- data.frame(x = c(1, 3, 4, 6, 7, 9, 11, 13, 14)) x <- sliding_index(df, x, lookback = 1, skip = 3, step = 2, assess_stop = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 4:5) expect_identical(split1[["out_id"]], 6L) expect_identical(split2[["in_id"]], 7L) 
expect_identical(split2[["out_id"]], 8L) expect_identical(nrow(x), 2L) }) test_that("can use incomplete windows at the beginning", { df <- data.frame(x = c(1, 3, 4, 5, 7)) x <- sliding_index(df, x, lookback = 2, complete = FALSE, assess_stop = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] split3 <- x[["splits"]][[3]] split4 <- x[["splits"]][[4]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2L) expect_identical(split2[["in_id"]], 1:2) expect_identical(split2[["out_id"]], 3:4) expect_identical(split3[["in_id"]], 2:3) expect_identical(split3[["out_id"]], 4L) expect_identical(split4[["in_id"]], 2:4) expect_identical(split4[["out_id"]], 5L) expect_identical(nrow(x), 4L) }) test_that("`data` is validated", { expect_error(sliding_index(1), "`data` must be a data frame") }) test_that("`index` is validated", { df <- data.frame(x = 1:2) expect_error(sliding_index(df, y)) }) # ------------------------------------------------------------------------------ # `sliding_period()` test_that("can group by month", { index <- vctrs::new_date(c(-1, 0, 1, 31)) df <- data.frame(index = index) x <- sliding_period(df, index, period = "month") split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 1L) expect_identical(split1$out_id, 2:3) expect_identical(split2$in_id, 2:3) expect_identical(split2$out_id, 4L) expect_identical(nrow(x), 2L) }) test_that("can group by year", { index <- vctrs::new_date(c(-1, 0, 1, 31)) df <- data.frame(index = index) x <- sliding_period(df, index, period = "year") split1 <- x$splits[[1]] expect_identical(split1$in_id, 1L) expect_identical(split1$out_id, 2:4) expect_identical(nrow(x), 1L) }) test_that("when looking back over multiple periods, only complete ones are used", { index <- vctrs::new_date(c(-32, -1, 0, 1, 31)) df <- data.frame(index = index) x <- sliding_period(df, index, period = "month", lookback = 1) split1 <- x$splits[[1]] expect_identical(split1$in_id, 1:2) 
expect_identical(split1$out_id, 3:4) expect_identical(nrow(x), 2L) }) test_that("can look forward to assess over multiple periods", { index <- vctrs::new_date(c(-32, -1, 0, 1, 31)) df <- data.frame(index = index) x <- sliding_period(df, index, period = "month", assess_stop = 2) split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 1L) expect_identical(split1$out_id, 2:4) expect_identical(split2$in_id, 2L) expect_identical(split2$out_id, 3:5) expect_identical(nrow(x), 2L) }) test_that("can use `step` to thin results after calling `slide_period()`", { df <- data.frame(x = vctrs::new_date(c(1, 3, 4, 6, 7, 10))) x <- sliding_period(df, x, "day", lookback = 2, assess_stop = 2, step = 2) split1 <- x$splits[[1]] split2 <- x$splits[[2]] expect_identical(split1$in_id, 1:2) expect_identical(split1$out_id, 3L) expect_identical(split2$in_id, 3:4) expect_identical(split2$out_id, 5L) expect_identical(nrow(x), 2L) }) test_that("can skip first few resampling slices", { index <- vctrs::new_date(c(-32, -1, 0, 1, 31, 59)) df <- data.frame(index = index) x <- sliding_period(df, index, "month", lookback = 1, skip = 2) split1 <- x[["splits"]][[1]] expect_identical(split1[["in_id"]], 3:5) expect_identical(split1[["out_id"]], 6L) expect_identical(nrow(x), 1L) }) test_that("can skip with expanding window", { index <- vctrs::new_date(c(-32, -1, 0, 1, 31, 59)) df <- data.frame(index = index) x <- sliding_period(df, index, "month", lookback = Inf, skip = 2) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1:4) expect_identical(split1[["out_id"]], 5L) expect_identical(split2[["in_id"]], 1:5) expect_identical(split2[["out_id"]], 6L) expect_identical(nrow(x), 2L) }) test_that("`skip` is applied before `step`", { index <- vctrs::new_date(c(-32, -1, 0, 1, 31, 59, 90)) df <- data.frame(index = index) x <- sliding_period(df, index, "month", lookback = Inf, skip = 2, step = 2) split1 <- x[["splits"]][[1]] split2 <- 
x[["splits"]][[2]] expect_identical(split1[["in_id"]], 1:4) expect_identical(split1[["out_id"]], 5L) expect_identical(split2[["in_id"]], 1:6) expect_identical(split2[["out_id"]], 7L) expect_identical(nrow(x), 2L) }) test_that("can use incomplete windows at the beginning", { index <- vctrs::new_date(c(-32, -1, 0, 1, 59, 90)) df <- data.frame(index = index) x <- sliding_period(df, index, "month", lookback = 2, complete = FALSE) split1 <- x[["splits"]][[1]] split2 <- x[["splits"]][[2]] split3 <- x[["splits"]][[3]] split4 <- x[["splits"]][[4]] expect_identical(split1[["in_id"]], 1L) expect_identical(split1[["out_id"]], 2L) expect_identical(split2[["in_id"]], 1:2) expect_identical(split2[["out_id"]], 3:4) expect_identical(split3[["in_id"]], 1:4) expect_identical(split3[["out_id"]], integer()) expect_identical(split4[["in_id"]], 3:5) expect_identical(split4[["out_id"]], 6L) expect_identical(nrow(x), 4L) }) test_that("`data` is validated", { expect_error(sliding_period(1), "`data` must be a data frame") }) test_that("`index` is validated", { df <- data.frame(x = 1:2) expect_error(sliding_period(df, y)) }) rsample/tests/testthat/test_vfold.R0000644000175000017500000000504314033426424017351 0ustar nileshnileshcontext("V-fold CV") library(testthat) library(rsample) library(purrr) library(modeldata) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- vfold_cv(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 18)) expect_true(all(sizes1$assessment == 2)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('repeated', { set.seed(11) rs2 <- vfold_cv(dat1, repeats = 4) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 18)) expect_true(all(sizes2$assessment == 2)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, 
dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('strata', { set.seed(11) data("mlc_churn", package = "modeldata") rs3 <- vfold_cv(mlc_churn, repeats = 2, strata = "voice_mail_plan") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis %in% 4499:4501)) expect_true(all(sizes3$assessment %in% 499:501)) rate <- map_dbl(rs3$splits, function(x) { dat <- as.data.frame(x)$voice_mail_plan mean(dat == "yes") }) expect_equal(mean(unique(rate)), 0.2645925848) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) expect_warning( rs4 <- vfold_cv(mlc_churn, strata = state, pool = 0.01), "Stratifying groups that make up 1%" ) }) test_that('bad args', { expect_error(vfold_cv(iris, strata = iris$Species)) expect_error(vfold_cv(iris, strata = c("Species", "Sepal.Width"))) }) test_that('printing', { expect_output(print(vfold_cv(mtcars))) }) test_that('rsplit labels', { rs <- vfold_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) rs2 <- vfold_cv(mtcars, repeats = 4) all_labs2 <- map_df(rs2$splits, labels) original_id2 <- rs2[, grepl("^id", names(rs2))] expect_equal(all_labs2, original_id2) }) rsample/tests/testthat/test_fingerprint.R0000644000175000017500000000065014010267142020560 0ustar nileshnilesh test_that("fingerprinting", { set.seed(1) rs_1 <- vfold_cv(mtcars) fp_1 <- .get_fingerprint(rs_1) set.seed(1) fp_2 <- .get_fingerprint(vfold_cv(mtcars)) set.seed(1) fp_3 <- .get_fingerprint(vfold_cv(mtcars, repeats = 2)) expect_true(class(fp_1) == "character") expect_true(class(fp_2) == "character") expect_true(class(fp_3) == "character") expect_equal(fp_1, fp_2) expect_false(fp_1 == fp_3) }) rsample/tests/testthat/test_nesting.R0000644000175000017500000000713413673171774017727 0ustar 
nileshnileshcontext("Nested CV") library(testthat) library(rsample) library(purrr) test_that('default param', { set.seed(11) rs1 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 27)) expect_true(all(sizes1$assessment == 3)) subsizes1 <- map(rs1$inner_resamples, dim_rset) subsizes1 <- do.call("rbind", subsizes1) expect_true(all(subsizes1$analysis == 18)) expect_true(all(subsizes1$assessment == 9)) set.seed(11) rs2 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = bootstraps(times = 3)) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 27)) expect_true(all(sizes2$assessment == 3)) subsizes2 <- map(rs2$inner_resamples, dim_rset) subsizes2 <- do.call("rbind", subsizes2) expect_true(all(subsizes2$analysis == 27)) set.seed(11) rs3 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = mc_cv(prop = 2/3, times = 3)) sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 27)) expect_true(all(sizes3$assessment == 3)) subsizes3 <- map(rs3$inner_resamples, dim_rset) subsizes3 <- do.call("rbind", subsizes3) expect_true(all(subsizes3$analysis == 18)) expect_true(all(subsizes3$assessment == 9)) }) test_that('bad args', { expect_warning( nested_cv(mtcars, outside = bootstraps(times = 5), inside = vfold_cv(V = 3)) ) folds <- vfold_cv(mtcars) expect_error( nested_cv(mtcars, outside = vfold_cv(), inside = folds) ) }) test_that('printing', { rs1 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) expect_output(print(rs1)) }) test_that('rsplit labels', { rs <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) # ------------------------------------------------------------------------------ # `[` test_that("can keep the rset class", { x <- rset_subclasses$nested_cv loc <- 
seq_len(ncol(x)) expect_s3_class_rset(x[loc]) }) test_that("drops the rset class if missing `inner_resamples`", { x <- rset_subclasses$nested_cv names <- names(x) names <- names[names != "inner_resamples"] expect_s3_class_bare_tibble(x[names]) }) test_that("drops the rset class if duplicating `inner_resamples`", { x <- rset_subclasses$nested_cv names <- names(x) names <- c(names, "inner_resamples") expect_s3_class_bare_tibble(x[names]) }) # ------------------------------------------------------------------------------ # `names<-` test_that("can keep the rset subclass when renaming doesn't touch rset columns", { x <- rset_subclasses$nested_cv x <- mutate(x, a = 1) names <- names(x) names[names == "a"] <- "b" names(x) <- names expect_s3_class_rset(x) }) test_that("drops the rset class if `inner_resamples` is renamed", { x <- rset_subclasses$nested_cv names <- names(x) names[names == "inner_resamples"] <- "inner_things" names(x) <- names expect_s3_class_bare_tibble(x) }) test_that("drops the rset class if `inner_resamples` is moved", { x <- rset_subclasses$nested_cv x <- mutate(x, a = 1) names <- names(x) new_names <- names new_names[names == "inner_resamples"] <- "a" new_names[names == "a"] <- "inner_resamples" names(x) <- new_names expect_s3_class_bare_tibble(x) }) rsample/tests/testthat/test_mc.R0000644000175000017500000000430414022034401016621 0ustar nileshnileshcontext("Monte Carlo CV") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- mc_cv(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 15)) expect_true(all(sizes1$assessment == 5)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('different percent', { set.seed(11) rs2 <- mc_cv(dat1, prop = .5) sizes2 <- 
dim_rset(rs2) expect_true(all(sizes2$analysis == 10)) expect_true(all(sizes2$assessment == 10)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('strata', { set.seed(11) rs3 <- mc_cv(warpbreaks, strata = "tension") sizes3 <- dim_rset(rs3) # sum(floor(table(warpbreaks$tension) * prop)) = 39 expect_true(all(sizes3$analysis == 39)) expect_true(all(sizes3$assessment == 15)) rate <- map_dbl(rs3$splits, function(x) { dat <- as.data.frame(x)$tension mean(dat == "M") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('bad args', { expect_error(mc_cv(warpbreaks, strata = warpbreaks$tension)) expect_error(mc_cv(warpbreaks, strata = c("tension", "wool"))) }) test_that('printing', { expect_output(print(mc_cv(warpbreaks))) }) test_that('rsplit labels', { rs <- mc_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_rolling.R0000644000175000017500000000563213653053433017714 0ustar nileshnileshcontext("Rolling window resampling") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { rs1 <- rolling_origin(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$assessment == 1)) expect_true(all(sizes1$analysis == 5:19)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) for (i in 1:nrow(rs1)) { expect_equal(rs1$splits[[i]]$in_id, 1:(i + attr(rs1, "initial") - 1)) expect_equal(rs1$splits[[i]]$out_id, i + attr(rs1, "initial")) } }) test_that('larger holdout', { rs2 <- rolling_origin(dat1, assess = 3) sizes2 <- dim_rset(rs2) 
expect_true(all(sizes2$assessment == 3)) expect_true(all(sizes2$analysis == 5:17)) for (i in 1:nrow(rs2)) { expect_equal(rs2$splits[[i]]$in_id, 1:(i + attr(rs2, "initial") - 1)) expect_equal(rs2$splits[[i]]$out_id, (i + attr(rs2, "initial")): (i + attr(rs2, "initial") + attr(rs2, "assess") - 1)) } }) test_that('fixed analysis size', { rs3 <- rolling_origin(dat1, cumulative = FALSE) sizes3 <- dim_rset(rs3) expect_true(all(sizes3$assessment == 1)) expect_true(all(sizes3$analysis == 5)) for (i in 1:nrow(rs3)) { expect_equal(rs3$splits[[i]]$in_id, i:(i + attr(rs3, "initial") - 1)) expect_equal(rs3$splits[[i]]$out_id, i + attr(rs3, "initial")) } }) test_that('skipping', { rs4 <- rolling_origin(dat1, cumulative = FALSE, skip = 2) sizes4 <- dim_rset(rs4) expect_true(all(sizes4$assessment == 1)) expect_true(all(sizes4$analysis == 5)) for (i in 1:nrow(rs4)) { expect_equal(rs4$splits[[i]]$in_id, (i + attr(rs4, "skip")*(i-1)): (i + attr(rs4, "skip")*(i-1) + attr(rs4, "initial") -1)) expect_equal(rs4$splits[[i]]$out_id, i + attr(rs4, "skip")*(i-1) + attr(rs4, "initial")) } }) test_that('lag', { rs5 <- rolling_origin(dat1, initial = 5, assess = 1, cumulative = FALSE, skip = 0, lag = 3) sizes5 <- dim_rset(rs5) expect_true(all(sizes5$assessment == attr(rs5, "assess") + attr(rs5, "lag"))) expect_true(all(sizes5$analysis == attr(rs5, "initial"))) for (i in 1:nrow(rs5)) { expect_equal(rs5$splits[[i]]$in_id, i:(i + attr(rs5, "initial") - 1)) expect_equal(rs5$splits[[i]]$out_id, (i + attr(rs5, "initial") - attr(rs5, "lag")):(i + attr(rs5, "initial") + attr(rs5, "assess") - 1)) } expect_error(rolling_origin(drinks, initial = 5, lag = 6)) # lag must be less than training observations expect_error(olling_origin(drinks, lag = 2.1)) # lag must be whole number }) test_that('rsplit labels', { rs <- rolling_origin(dat1) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) 
rsample/tests/testthat/test_labels.R0000644000175000017500000000330613653053433017504 0ustar nileshnileshlibrary(testthat) library(rsample) context("Labels") test_that('basic cv', { cv_obj <- vfold_cv(mtcars) expect_equal(cv_obj$id, labels(cv_obj)) expect_is(labels(cv_obj), "character") expect_s3_class(labels(cv_obj, TRUE), "factor") }) test_that('repeated cv', { rcv_obj <- vfold_cv(mtcars, repeats = 3) expect_equal(paste(rcv_obj$id, rcv_obj$id2, sep = "."), labels(rcv_obj)) expect_is(labels(rcv_obj), "character") expect_s3_class(labels(rcv_obj, TRUE), "factor") }) test_that('nested cv', { expect_error( labels( nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5) ) ) ) }) test_that('adding labels', { set.seed(363) car_folds <- vfold_cv(mtcars, repeats = 3) res <- analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]]) expect_equal(colnames(res), c(colnames(mtcars), "id", "id2")) car_bt <- bootstraps(mtcars) res <- analysis(car_bt$splits[[1]]) %>% add_resample_id(car_bt$splits[[1]]) expect_equal(colnames(res), c(colnames(mtcars), "id")) res <- analysis(car_bt$splits[[1]]) %>% add_resample_id(car_bt$splits[[1]], TRUE) expect_equal(colnames(res), c(colnames(mtcars), ".id")) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]], 7) ) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]], c(TRUE, TRUE)) ) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits) ) expect_error( analysis(car_folds$splits[[1]]) %>% as.matrix() %>% add_resample_id(car_folds$splits[[1]]) ) }) rsample/tests/testthat/test_rset.R0000644000175000017500000000236614010267142017214 0ustar nileshnileshcontext("rset constructor") library(testthat) library(rsample) cars_10fold <- vfold_cv(mtcars) test_that('bad args', { expect_error( new_rset(cars_10fold$splits[1:2], cars_10fold$id) ) expect_error( new_rset(cars_10fold$splits, cars_10fold[ "splits"]) ) expect_error( 
new_rset(cars_10fold$splits, cars_10fold$splits) ) expect_error( new_rset(list(1), "x"), "must be an `rsplit` object" ) args <- list(a = 1, b = 2, 3) expect_error( new_rset( cars_10fold$splits, cars_10fold$id, attrib = args ) ) }) test_that('rset with attributes', { args <- list(value = "potato") res3 <- new_rset( cars_10fold$splits, cars_10fold$id, attrib = args ) expect_equal(sort(names(attributes(res3))), c("class", "fingerprint", "names", "row.names", "value")) expect_equal(attr(res3, "value"), "potato") }) test_that('rset with additional classes', { res4 <- new_rset( cars_10fold$splits, cars_10fold$id, subclass = "potato" ) expect_equal(class(res4), c("potato", "tbl_df", "tbl", "data.frame")) }) test_that('not an rsplit', { folds <- vfold_cv(mtcars) expect_error(analysis(folds$splits[1])) expect_error(assessment(folds$splits[1])) }) rsample/tests/testthat/test-make-splits.R0000644000175000017500000000323114066675766020430 0ustar nileshnileshtest_that("can create a split with an empty assessment set (#188)", { df <- data.frame(x = c(1, 2, 3, 4)) indices <- list(analysis = 1:4, assessment = integer()) split <- make_splits(indices, df) expect_identical(split$out_id, integer()) expect_identical(assessment(split), df[0, , drop = FALSE]) }) test_that("cannot create a split with an empty analysis set", { df <- data.frame(x = c(1, 2, 3, 4)) indices <- list(analysis = integer(), assessment = 1:4) expect_error(make_splits(indices, df), "At least one row") }) test_that("create a split from training and testing dataframes", { training <- tibble(x = c(1, 2, 3, 4)) testing <- tibble(x = c(5, 6)) split <- make_splits(training, testing) expect_identical(analysis(split), training) expect_identical(assessment(split), testing) }) test_that("can create a split from empty testing dataframe", { training <- tibble(x = c(1, 2, 3, 4)) testing <- tibble() split <- make_splits(training, testing) expect_identical(split$out_id, integer()) expect_identical(analysis(split), training) }) 
test_that("cannot create a split from empty training dataframe", { training <- tibble() testing <- tibble(x = c(5, 6)) expect_error( make_splits(training, testing), "The analysis set must contain at least one row." ) }) test_that("cannot create a split from dataframes with different columns", { training <- tibble(x = c(1, 2, 3, 4)) testing <- tibble(y = c(5, 6)) expect_error( make_splits(training, testing), "The analysis and assessment sets must have" ) }) test_that("improper argument", { expect_error(make_splits("potato"), "There is no method available to") }) rsample/tests/testthat/test_bootci.R0000644000175000017500000002017614010267142017515 0ustar nileshnileshlibrary(rsample) library(testthat) library(purrr) library(tibble) library(dplyr) library(broom) data("attrition", package = "modeldata") context("Bootstrap intervals") # ------------------------------------------------------------------------------ get_stats <- function(split, ...) { dat <- analysis(split) x <- dat[[1]] tibble( term = "mean", estimate = mean(x, na.rm = TRUE), std.error = sqrt(var(x, na.rm = TRUE)/sum(!is.na(x))) ) } # ------------------------------------------------------------------------------ n <- 1000 mu <- 10 sigma <- 1 set.seed(888) rand_nums <- rnorm(n, mu, sigma) ttest <- tidy(t.test(rand_nums)) ttest_lower_conf <- tidy(t.test(rand_nums, conf.level = 0.8)) dat <- data.frame(x = rand_nums) set.seed(456765) bt_norm <- bootstraps(dat, times = 1000, apparent = TRUE) %>% dplyr::mutate( stats = map(splits, ~ get_stats(.x)) ) test_that('Bootstrap estimate of mean is close to estimate of mean from normal distribution',{ skip_on_cran() single_pct_res <- int_pctl(bt_norm, stats) single_t_res <- int_t(bt_norm, stats) single_bca_res <- int_bca(bt_norm, stats, .fn = get_stats) single_bca_res_lower_conf <- int_bca(bt_norm, stats, .fn = get_stats, alpha = 0.2) expect_equal(ttest$conf.low, single_pct_res$.lower, tolerance = 0.001) expect_equal(unname(ttest$estimate), single_pct_res$.estimate, 
tolerance = 0.001) expect_equal(ttest$conf.high, single_pct_res$.upper, tolerance = 0.001) expect_equal(ttest$conf.low, single_t_res$.lower, tolerance = 0.001) expect_equal(unname(ttest$estimate), single_t_res$.estimate, tolerance = 0.001) expect_equal(ttest$conf.high, single_pct_res$.upper, tolerance = 0.001) expect_equal(ttest$conf.low, single_bca_res$.lower, tolerance = 0.001) expect_equal(unname(ttest$estimate), single_bca_res$.estimate, tolerance = 0.001) expect_equal(ttest$conf.high, single_bca_res$.upper, tolerance = 0.001) expect_equal(ttest_lower_conf$conf.low, single_bca_res_lower_conf$.lower, tolerance = 0.001) expect_equal(unname(ttest_lower_conf$estimate), single_bca_res_lower_conf$.estimate, tolerance = 0.001) expect_equal(ttest_lower_conf$conf.high, single_bca_res_lower_conf$.upper, tolerance = 0.001) }) # ------------------------------------------------------------------------------ context("Wrapper Functions") test_that("Wrappers -- selection of multiple variables works", { func <- function(split, ...) 
{ lm(Age ~ HourlyRate + DistanceFromHome, data = analysis(split)) %>% tidy() } # generate boostrap resamples set.seed(888) bt_resamples <- bootstraps(attrition, times = 1000, apparent = TRUE) %>% mutate(res = map(splits, func)) attrit_tidy <- lm(Age ~ HourlyRate + DistanceFromHome, data = attrition) %>% tidy(conf.int = TRUE) %>% dplyr::arrange(term) pct_res <- int_pctl(bt_resamples, res) %>% inner_join(attrit_tidy, by = "term") expect_equal(pct_res$conf.low, pct_res$.lower, tolerance = .01) expect_equal(pct_res$conf.high, pct_res$.upper, tolerance = .01) t_res <- int_t(bt_resamples, res) %>% inner_join(attrit_tidy, by = "term") expect_equal(t_res$conf.low, t_res$.lower, tolerance = .01) expect_equal(t_res$conf.high, t_res$.upper, tolerance = .01) bca_res <- int_bca(bt_resamples, res, .fn = func) %>% inner_join(attrit_tidy, by = "term") expect_equal(bca_res$conf.low, bca_res$.lower, tolerance = .01) expect_equal(bca_res$conf.high, bca_res$.upper, tolerance = .01) }) # ------------------------------------------------------------------------------ context("boot_ci() Prompt Errors: Too Many NAs") test_that('Upper & lower confidence interval does not contain NA', { bad_stats <- function(split, ...) 
{ tibble( term = "mean", estimate = NA_real_, std.error = runif(1) ) } set.seed(888) bt_resamples <- bootstraps(data.frame(x = 1:100), times = 1000, apparent = TRUE) %>% mutate(res = map(splits, bad_stats)) expect_error( expect_warning( int_pctl(bt_resamples, res), "at least 1000 non-missing" ), "missing values" ) expect_error( expect_warning( int_t(bt_resamples, res), "at least 1000 non-missing" ), "missing values" ) expect_error( expect_warning( int_bca(bt_resamples, res, .fn = bad_stats), "at least 1000 non-missing" ), "missing values" ) }) # ------------------------------------------------------------------------------ context("boot_ci() Insufficient Number of Bootstrap Resamples") set.seed(456765) bt_small <- bootstraps(dat, times = 10, apparent = TRUE) %>% dplyr::mutate( stats = map(splits, ~ get_stats(.x)), junk = 1:11 ) test_that( "Sufficient replications needed to sufficiently reduce Monte Carlo sampling Error for BCa method", { expect_warning(int_pctl(bt_small, stats)) expect_warning(int_t(bt_small, stats)) expect_warning(int_bca(bt_small, stats, .fn = get_stats)) } ) context("boot_ci() Input Validation") test_that("bad input", { expect_error(int_pctl(bt_small, id)) expect_error(int_pctl(bt_small, junk)) expect_error(int_pctl(bt_small, stats, alpha = c(0.05, 0.2))) expect_error(int_t(bt_small, stats, alpha = "potato")) expect_error(int_bca(bt_small, stats, alpha = 1:2, .fn = get_stats)) bad_bt_norm <- bt_norm %>% mutate(stats = map(stats, ~ .x[, 1:2])) expect_error(int_t(bad_bt_norm, stats)) expect_error(int_bca(bad_bt_norm, stats)) no_dots <- function(split) { dat <- analysis(split) x <- dat[[1]] tibble( term = "mean", estimate = mean(x, na.rm = TRUE), std.error = sqrt(var(x, na.rm = TRUE)/sum(!is.na(x))) ) } expect_error( int_bca(bt_norm, stats, .fn = no_dots), "must have an argument" ) expect_error(int_pctl(as.data.frame(bt_norm), stats)) expect_error(int_t(as.data.frame(bt_norm), stats)) expect_error(int_bca(as.data.frame(bt_norm), stats, .fn = 
get_stats)) expect_error( int_t(bt_norm %>% dplyr::filter(id != "Apparent"), stats) ) expect_error( int_bca(bt_norm %>% dplyr::filter(id != "Apparent"), stats, .fn = get_stats) ) poo <- function(x) { x$estimate <- "a" x } badder_bt_norm <- bt_norm %>% mutate( bad_term = map(stats, ~ .x %>% setNames(c("a", "estimate", "std.err"))), bad_est = map(stats, ~ .x %>% setNames(c("term", "b", "std.err"))), bad_err = map(stats, ~ .x %>% setNames(c("term", "estimate", "c"))), bad_num = map(stats, ~ poo(.x)) ) expect_error(int_pctl(badder_bt_norm, bad_term)) expect_error(int_t(badder_bt_norm, bad_err)) expect_error(int_bca(badder_bt_norm, bad_est, .fn = get_stats)) expect_error(int_pctl(badder_bt_norm, bad_num)) }) # ------------------------------------------------------------------------------ context("regression intervals") test_that("regression intervals", { skip_on_cran() expect_error({ set.seed(1) int_1 <- reg_intervals(mpg ~ disp + wt, data = mtcars) }, regex = NA) expect_equal( names(int_1), c("term", ".lower", ".estimate", ".upper", ".alpha", ".method") ) expect_error({ set.seed(1) int_2 <- reg_intervals(mpg ~ disp + wt, data = mtcars, filter = term == "wt", model_fn = "glm", keep_reps = TRUE) }, regex = NA) expect_equal( names(int_2), c("term", ".lower", ".estimate", ".upper", ".alpha", ".method", ".replicates") ) expect_true(nrow(int_2) == 1) expect_true(all(int_2$term == "wt")) expect_error( reg_intervals(mpg ~ disp + wt, data = mtcars, model_fn = "potato"), "`model_fn` must be one of" ) expect_error( reg_intervals(mpg ~ disp + wt, data = mtcars, type = "random"), "`type` must be one of" ) expect_error( reg_intervals(mpg ~ disp + wt, data = mtcars, alpha = "a"), "must be a single numeric value" ) }) rsample/tests/testthat.R0000644000175000017500000000007214122413046015170 0ustar nileshnileshlibrary(testthat) library(rsample) test_check("rsample") rsample/R/0000755000175000017500000000000014136023335012250 5ustar 
nileshnileshrsample/R/reexports.R0000644000175000017500000000015613653053433014435 0ustar nileshnilesh#' @importFrom generics tidy #' @export generics::tidy #' @importFrom tidyr gather #' @export tidyr::gather rsample/R/manual.R0000644000175000017500000000255113727757057013677 0ustar nileshnilesh#' Manual resampling #' #' `manual_rset()` is used for constructing the most minimal rset possible. It #' can be useful when you have custom rsplit objects built from #' [make_splits()], or when you want to create a new rset from splits #' contained within an existing rset. #' #' @param splits A list of `"rsplit"` objects. It is easiest to create these #' using [make_splits()]. #' #' @param ids A character vector of ids. The length of `ids` must be the same #' as the length of `splits`. #' #' @export #' @examples #' df <- data.frame(x = c(1, 2, 3, 4, 5, 6)) #' #' # Create an rset from custom indices #' indices <- list( #' list(analysis = c(1L, 2L), assessment = 3L), #' list(analysis = c(4L, 5L), assessment = 6L) #' ) #' #' splits <- lapply(indices, make_splits, data = df) #' #' manual_rset(splits, c("Split 1", "Split 2")) #' #' # You can also use this to create an rset from a subset of an #' # existing rset #' resamples <- vfold_cv(mtcars) #' best_split <- resamples[5,] #' manual_rset(best_split$splits, best_split$id) manual_rset <- function(splits, ids) { new_manual_rset(splits, ids) } new_manual_rset <- function(splits, ids) { new_rset(splits, ids, subclass = c("manual_rset", "rset")) } #' @export print.manual_rset <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("manual_rset", "rset"))] print(x, ...) } rsample/R/validation_split.R0000644000175000017500000000635514122413046015746 0ustar nileshnilesh#' Create a Validation Set #' #' `validation_split()` takes a single random sample (without replacement) of #' the original data set to be used for analysis. 
All other data points are #' added to the assessment set (to be used as the validation set). #' `validation_time_split()` does the same, but takes the _first_ `prop` samples #' for training, instead of a random selection. #' @template strata_details #' @inheritParams vfold_cv #' @inheritParams make_strata #' @param prop The proportion of data to be retained for modeling/analysis. #' @export #' @return An tibble with classes `validation_split`, `rset`, `tbl_df`, `tbl`, #' and `data.frame`. The results include a column for the data split objects #' and a column called `id` that has a character string with the resample #' identifier. #' @examples #' validation_split(mtcars, prop = .9) #' #' data(drinks, package = "modeldata") #' validation_time_split(drinks) #' @export validation_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) { if (!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if (length(strata) == 0) { strata <- NULL } } strata_check(strata, data) split_objs <- mc_splits(data = data, prop = prop, times = 1, strata = strata, breaks = breaks, pool = pool) ## We remove the holdout indices since it will save space and we can ## derive them later when they are needed. split_objs$splits <- map(split_objs$splits, rm_out) class(split_objs$splits[[1]]) <- c("val_split", "rsplit") val_att <- list(prop = prop, strata = !is.null(strata)) new_rset(splits = split_objs$splits, ids = "validation", attrib = val_att, subclass = c("validation_split", "rset")) } #' @rdname validation_split #' @inheritParams vfold_cv #' @inheritParams initial_time_split #' @export validation_time_split <- function(data, prop = 3/4, lag = 0, ...) { if (!is.numeric(prop) | prop >= 1 | prop <= 0) { rlang::abort("`prop` must be a number on (0, 1).") } if (!is.numeric(lag) | !(lag%%1 == 0)) { stop("`lag` must be a whole number.", call. 
= FALSE) } n_train <- floor(nrow(data) * prop) if (lag > n_train) { stop("`lag` must be less than or equal to the number of training observations.", call. = FALSE) } split <- rsplit(data, 1:n_train, (n_train + 1 - lag):nrow(data)) split <- rm_out(split) class(split) <- c("val_split", "rsplit") splits <- list(split) val_att <- list(prop = prop, strata = FALSE) new_rset(splits = splits, ids = "validation", attrib = val_att, subclass = c("validation_split", "rset")) } #' @export print.validation_split <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("validation_split", "rset"))] print(x, ...) } #' @export print.val_split<- function(x, ...) { if (is_missing_out_id(x)) { out_char <- paste(length(complement(x))) } else { out_char <- paste(length(x$out_id)) } cat("\n") cat("<", length(x$in_id), "/", out_char, "/", nrow(x$data), ">\n", sep = "") } rsample/R/bootci.R0000644000175000017500000003121314010267142013647 0ustar nileshnilesh# Bootstrap confidence interval code # ------------------------------------------------------------------------------ # helpers check_rset <- function(x, app = TRUE) { if (!inherits(x, "bootstraps")) stop("`.data` should be an `rset` object generated from `bootstraps()`", call. = FALSE) if (app) { if(x %>% dplyr::filter(id == "Apparent") %>% nrow() != 1) stop("Please set `apparent = TRUE` in `bootstraps()` function", call. = FALSE) } invisible(NULL) } stat_fmt_err <- paste("`statistics` should select a list column of tidy results.") stat_nm_err <- paste("The tibble in `statistics` should have columns for", "'estimate' and 'term`") std_exp <- c("std.error", "robust.se") check_tidy_names <- function(x, std_col) { # check for proper columns if (sum(colnames(x) == "estimate") != 1) { stop(stat_nm_err, call. = FALSE) } if (sum(colnames(x) == "term") != 1) { stop(stat_nm_err, call. 
= FALSE) } if (std_col) { std_candidates <- colnames(x) %in% std_exp if (sum(std_candidates) != 1) { stop("`statistics` should select a single column for the standard ", "error.", call. = FALSE) } } invisible(TRUE) } check_tidy <- function(x, std_col = FALSE) { if (!is.list(x)) { stop(stat_fmt_err, call. = FALSE) } # convert to data frame from list has_id <- any(names(x) == "id") if (has_id) { list_cols <- names(x)[map_lgl(x, is_list)] x <- try(tidyr::unnest(x, cols = list_cols), silent = TRUE) } else { x <- try(map_dfr(x, ~ .x), silent = TRUE) } if (inherits(x, "try-error")) { stop(stat_fmt_err, call. = FALSE) } check_tidy_names(x, std_col) if (std_col) { std_candidates <- colnames(x) %in% std_exp std_candidates <- colnames(x)[std_candidates] if (has_id) { x <- dplyr::select(x, term, estimate, id, tidyselect::one_of(std_candidates)) %>% mutate(id = (id == "Apparent")) %>% setNames(c("term", "estimate", "orig", "std_err")) } else { x <- dplyr::select(x, term, estimate, tidyselect::one_of(std_candidates)) %>% setNames(c("term", "estimate", "std_err")) } } else { if (has_id) { x <- dplyr::select(x, term, estimate, id) %>% mutate(orig = (id == "Apparent")) %>% dplyr::select(-id) } else { x <- dplyr::select(x, term, estimate) } } x } get_p0 <- function(x, alpha = 0.05) { orig <- x %>% group_by(term) %>% dplyr::filter(orig) %>% dplyr::select(term, theta_0 = estimate) %>% ungroup() x %>% dplyr::filter(!orig) %>% inner_join(orig, by = "term") %>% group_by(term) %>% summarize(p0 = mean(estimate <= theta_0, na.rm = TRUE)) %>% mutate(Z0 = stats::qnorm(p0), Za = stats::qnorm(1 - alpha / 2)) } new_stats <- function(x, lo, hi) { res <- as.numeric(quantile(x, probs = c(lo, hi), na.rm = TRUE)) tibble(.lower = min(res), .estimate = mean(x, na.rm = TRUE), .upper = max(res)) } has_dots <- function(x) { nms <- names(formals(x)) if (!any(nms == "...")) { stop("`.fn` must have an argument `...`.", call. 
= FALSE) } invisible(NULL) } check_num_resamples <- function(x, B = 1000) { x <- x %>% dplyr::group_by(term) %>% dplyr::summarize(n = sum(!is.na(estimate))) %>% dplyr::filter(n < B) if (nrow(x) > 0) { terms <- paste0("`", x$term, "`") msg <- paste0( "Recommend at least ", B, " non-missing bootstrap resamples for ", ifelse(length(terms) > 1, "terms: ", "term "), paste0(terms, collapse = ", "), "." ) warning(msg, call. = FALSE) } invisible(NULL) } # ------------------------------------------------------------------------------ # percentile code pctl_single <- function(stats, alpha = 0.05) { if (all(is.na(stats))) stop("All statistics have missing values..", call. = FALSE) if (!is.numeric(stats)) stop("`stats` must be a numeric vector.", call. = FALSE) # stats is a numeric vector of values ci <- stats %>% quantile(probs = c(alpha / 2, 1 - alpha / 2), na.rm = TRUE) # return a tibble with .lower, .estimate, .upper res <- tibble( .lower = min(ci), .estimate = mean(stats, na.rm = TRUE), .upper = max(ci), .alpha = alpha, .method = "percentile" ) res } #' Bootstrap confidence intervals #' @description #' Calculate bootstrap confidence intervals using various methods. #' @param .data A data frame containing the bootstrap resamples created using #' `bootstraps()`. For t- and BCa-intervals, the `apparent` argument #' should be set to `TRUE`. Even if the `apparent` argument is set to #' `TRUE` for the percentile method, the apparent data is never used in calculating #' the percentile confidence interval. #' @param statistics An unquoted column name or `dplyr` selector that identifies #' a single column in the data set that contains the individual bootstrap #' estimates. This can be a list column of tidy tibbles (that contains columns #' `term` and `estimate`) or a simple numeric column. For t-intervals, a #' standard tidy column (usually called `std.err`) is required. #' See the examples below. 
#' @param alpha Level of significance #' @return Each function returns a tibble with columns `.lower`, #' `.estimate`, `.upper`, `.alpha`, `.method`, and `term`. #' `.method` is the type of interval (eg. "percentile", #' "student-t", or "BCa"). `term` is the name of the estimate. Note #' the `.estimate` returned from `int_pctl()` #' is the mean of the estimates from the bootstrap resamples #' and not the estimate from the apparent model. #' @details Percentile intervals are the standard method of #' obtaining confidence intervals but require thousands of #' resamples to be accurate. T-intervals may need fewer #' resamples but require a corresponding variance estimate. #' Bias-corrected and accelerated intervals require the original function #' that was used to create the statistics of interest and are #' computationally taxing. #' @seealso [reg_intervals()] #' @references Davison, A., & Hinkley, D. (1997). _Bootstrap Methods and their #' Application_. Cambridge: Cambridge University Press. #' doi:10.1017/CBO9780511802843 #' #' @examples #' \donttest{ #' library(broom) #' library(dplyr) #' library(purrr) #' library(tibble) #' #' lm_est <- function(split, ...) 
{ #' lm(mpg ~ disp + hp, data = analysis(split)) %>% #' tidy() #' } #' #' set.seed(52156) #' car_rs <- #' bootstraps(mtcars, 500, apparent = TRUE) %>% #' mutate(results = map(splits, lm_est)) #' #' int_pctl(car_rs, results) #' int_t(car_rs, results) #' int_bca(car_rs, results, .fn = lm_est) #' #' # putting results into a tidy format #' rank_corr <- function(split) { #' dat <- analysis(split) #' tibble( #' term = "corr", #' estimate = cor(dat$sqft, dat$price, method = "spearman"), #' # don't know the analytical std.err so no t-intervals #' std.err = NA_real_ #' ) #' } #' #' set.seed(69325) #' data(Sacramento, package = "modeldata") #' bootstraps(Sacramento, 1000, apparent = TRUE) %>% #' mutate(correlations = map(splits, rank_corr)) %>% #' int_pctl(correlations) #' } #' @export int_pctl <- function(.data, statistics, alpha = 0.05) { check_rset(.data, app = FALSE) if (length(alpha) != 1 || !is.numeric(alpha)) { abort("`alpha` must be a single numeric value.") } .data <- .data %>% dplyr::filter(id != "Apparent") column_name <- tidyselect::vars_select(names(.data), !!rlang::enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. = FALSE) } stats <- .data[[column_name]] stats <- check_tidy(stats, std_col = FALSE) check_num_resamples(stats, B = 1000) vals <- stats %>% dplyr::group_by(term) %>% dplyr::do(pctl_single(.$estimate, alpha = alpha)) %>% dplyr::ungroup() vals } # ------------------------------------------------------------------------------ # t interval code t_single <- function(stats, std_err, is_orig, alpha = 0.05) { # stats is a numeric vector of values # vars is a numeric vector of variances # return a tibble with .lower, .estimate, .upper # which_orig is the index of stats and std_err that has the original result if (all(is.na(stats))) stop("All statistics have missing values.", call. 
= FALSE) if (!is.logical(is_orig) || any(is.na(is_orig))) { stop("`is_orig` should be a logical column the same length as `stats` ", "with no missing values.", call. = FALSE) } if (length(stats) != length(std_err) && length(stats) != length(is_orig)) { stop("`stats`, `std_err`, and `is_orig` should have the same length.", call. = FALSE) } if (sum(is_orig) != 1) { stop("The original statistic must be in a single row.", call. = FALSE) } theta_obs <- stats[is_orig] std_err_obs <- std_err[is_orig] stats <- stats[!is_orig] std_err <- std_err[!is_orig] z_dist <- (stats - theta_obs) / std_err z_pntl <- quantile(z_dist, probs = c(alpha / 2, 1 - (alpha) / 2), na.rm = TRUE) ci <- theta_obs - z_pntl * std_err_obs tibble( .lower = min(ci), .estimate = mean(stats, na.rm = TRUE), .upper = max(ci), .alpha = alpha, .method = "student-t" ) } #' @rdname int_pctl #' @export int_t <- function(.data, statistics, alpha = 0.05) { check_rset(.data) if (length(alpha) != 1 || !is.numeric(alpha)) { abort("`alpha` must be a single numeric value.") } column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. = FALSE) } stats <- .data %>% dplyr::select(!!column_name, id) stats <- check_tidy(stats, std_col = TRUE) check_num_resamples(stats, B = 500) vals <- stats %>% dplyr::group_by(term) %>% dplyr::do(t_single(.$estimate, .$std_err, .$orig, alpha = alpha)) %>% dplyr::ungroup() vals } # ---------------------------------------------------------------- bca_calc <- function(stats, orig_data, alpha = 0.05, .fn, ...) { # TODO check per term if (all(is.na(stats$estimate))) { stop("All statistics have missing values.", call. = FALSE) } ### Estimating Z0 bias-correction bias_corr_stats <- get_p0(stats, alpha = alpha) # need the original data frame here loo_rs <- loo_cv(orig_data) # We can't be sure what we will get back from the analysis function. 
# To test, we run on the first LOO data set and see if it is a vector or df loo_test <- try(rlang::exec(.fn, loo_rs$splits[[1]], ...), silent = TRUE) if (inherits(loo_test, "try-error")) { cat("Running `.fn` on the LOO resamples produced an error:\n") print(loo_test) stop("`.fn` failed.", call. = FALSE) } loo_res <- furrr::future_map_dfr(loo_rs$splits, .fn, ...) loo_estimate <- loo_res %>% dplyr::group_by(term) %>% dplyr::summarize(loo = mean(estimate, na.rm = TRUE)) %>% dplyr::inner_join(loo_res, by = "term") %>% dplyr::group_by(term) %>% dplyr::summarize( cubed = sum((loo - estimate)^3), squared = sum((loo - estimate)^2) ) %>% dplyr::ungroup() %>% dplyr::inner_join(bias_corr_stats, by = "term") %>% dplyr::mutate( a = cubed/(6 * (squared^(3 / 2))), Zu = (Z0 + Za) / ( 1 - a * (Z0 + Za)) + Z0, Zl = (Z0 - Za) / (1 - a * (Z0 - Za)) + Z0, lo = stats::pnorm(Zl, lower.tail = TRUE), hi = stats::pnorm(Zu, lower.tail = TRUE) ) terms <- loo_estimate$term stats <- stats %>% dplyr::filter(!orig) for (i in seq_along(terms)) { tmp <- new_stats(stats$estimate[ stats$term == terms[i] ], lo = loo_estimate$lo[i], hi = loo_estimate$hi[i]) tmp$term <- terms[i] if (i == 1) { ci_bca <- tmp } else { ci_bca <- bind_rows(ci_bca, tmp) } } ci_bca <- ci_bca %>% dplyr::select(term, .lower, .estimate, .upper) %>% dplyr::mutate( .alpha = alpha, .method = "BCa" ) } #' @rdname int_pctl #' @param .fn A function to calculate statistic of interest. The #' function should take an `rsplit` as the first argument and the `...` are #' required. #' @param ... Arguments to pass to `.fn`. #' @references \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html} #' @export int_bca <- function(.data, statistics, alpha = 0.05, .fn, ...) { check_rset(.data) if (length(alpha) != 1 || !is.numeric(alpha)) { abort("`alpha` must be a single numeric value.") } has_dots(.fn) column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. 
= FALSE) } stats <- .data %>% dplyr::select(!!column_name, id) stats <- check_tidy(stats) check_num_resamples(stats, B = 1000) vals <- bca_calc(stats, .data$splits[[1]]$data, alpha = alpha, .fn = .fn, ...) vals } rsample/R/compat-dplyr.R0000644000175000017500000001447313673171773015036 0ustar nileshnilesh#' Compatibility with dplyr #' #' @description #' rsample should be fully compatible with dplyr 1.0.0. #' #' With older versions of dplyr, there is partial support for the following #' verbs: `mutate()`, `arrange()`, `filter()`, `rename()`, `select()`, and #' `slice()`. We strongly recommend updating to dplyr 1.0.0 if possible to #' get more complete integration with dplyr. #' #' @section Version Specific Behavior: #' #' rsample performs somewhat differently depending on whether you have #' dplyr >= 1.0.0 (new) or dplyr < 1.0.0 (old). Additionally, version #' 0.0.7 of rsample (new) introduced some changes to how rsample objects #' work with dplyr, even on old dplyr. Most of these changes influence the #' return value of a dplyr verb and determine whether it will be a tibble #' or an rsample rset subclass. #' #' The table below attempts to capture most of these changes. These examples #' are not exhaustive and may not capture some edge-cases. #' #' ## Joins #' #' The following affect all of the dplyr joins, such as `left_join()`, #' `right_join()`, `full_join()`, and `inner_join()`. #' #' Joins that alter the rows of the original rset object: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :------------------------- | :---------------------: | :---------------------: | :---------------------: #' | `join(rset, tbl)` | error | error | tibble #' #' The idea here is that, if there are less rows in the result, the result should #' not be an rset object. For example, you can't have a 10-fold CV object #' without 10 rows. 
#' #' Joins that keep the rows of the original rset object: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :------------------------- | :---------------------: | :---------------------: | :---------------------: #' | `join(rset, tbl)` | error | error | rset #' #' As with the logic above, if the original rset object (defined by the split #' column and the id column(s)) is left intact, the results should be an rset. #' #' ## Row Subsetting #' #' As mentioned above, this should result in a tibble if any rows are removed #' or added. Simply reordering rows still results in a valid rset with new #' rsample. #' #' Cases where rows are removed or added: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `rset[ind,]` | tibble | tibble | tibble #' | `slice(rset)` | rset | tibble | tibble #' | `filter(rset)` | rset | tibble | tibble #' #' Cases where all rows are kept, but are possibly reordered: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `rset[ind,]` | tibble | rset | rset #' | `slice(rset)` | rset | rset | rset #' | `filter(rset)` | rset | rset | rset #' | `arrange(rset)` | rset | rset | rset #' #' ## Column Subsetting #' #' When the `splits` column or any `id` columns are dropped or renamed, #' the result should no longer be considered a valid rset. 
#' #' Cases when the required columns are removed or renamed: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `rset[,ind]` | tibble | tibble | tibble #' | `select(rset)` | rset | tibble | tibble #' | `rename(rset)` | tibble | tibble | tibble #' #' Cases when no required columns are affected: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `rset[,ind]` | tibble | rset | rset #' | `select(rset)` | rset | rset | rset #' | `rename(rset)` | rset | rset | rset #' #' ## Other Column Operations #' #' Cases when the required columns are altered: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `mutate(rset)` | rset | tibble | tibble #' #' Cases when no required columns are affected: #' #' | operation | old rsample + old dplyr | new rsample + old dplyr | new rsample + new dplyr #' | :-------------- | :---------------------: | :---------------------: | :---------------------: #' | `mutate(rset)` | rset | rset | rset #' #' @name rsample-dplyr NULL # `dplyr_reconstruct()` # # `dplyr_reconstruct()` is called: # - After a complex dplyr operation, like a `left_join()`, to restore to the # type of the first input, `x`. # - At the end of a call to `dplyr_col_modify()` # - At the end of a call to `dplyr_row_slice()` # - See `?dplyr_reconstruct` for the full list. # # Because `dplyr_reconstruct()` is called at the end of `dplyr_col_modify()` # and `dplyr_row_slice()`, we don't need methods for them. 
The default methods # in dplyr do the right thing automatically, and then our reconstruction # method decides whether or not the result should still be an rset. # # The implementation for rsample is the same as `vec_restore()`. Generally # it will fall back to reconstructing a bare tibble, unless the rset structure # is still completely intact. This happens when rset specific rows and columns # (splits, id cols) are still exactly identical to how they were before the # dplyr operation (with the exception of column reordering). # Registered in `.onLoad()` dplyr_reconstruct_rset <- function(data, template) { rset_reconstruct(data, template) } rsample/R/pkg.R0000644000175000017500000000262013727757057013200 0ustar nileshnilesh#' rsample: General Resampling Infrastructure for R #' #'\pkg{rsample} has functions to create variations of a data set #' that can be used to evaluate models or to estimate the #' sampling distribution of some statistic. #' #' @section Terminology: #'\itemize{ #' \item A **resample** is the result of a two-way split of a #' data set. For example, when bootstrapping, one part of the #' resample is a sample with replacement of the original data. #' The other part of the split contains the instances that were #' not contained in the bootstrap sample. The data structure #' `rsplit` is used to store a single resample. #' \item When the data are split in two, the portion that is #' used to estimate the model or calculate the statistic is #' called the **analysis** set here. In machine learning this #' is sometimes called the "training set" but this would be #' poorly named since it might conflict with any initial split #' of the original data. #' \item Conversely, the other data in the split are called the #' **assessment** data. In bootstrapping, these data are #' often called the "out-of-bag" samples. #' \item A collection of resamples is contained in an #' `rset` object. 
#'} #' #' @section Basic Functions: #' The main resampling functions are: [vfold_cv()], #' [bootstraps()], [mc_cv()], #' [rolling_origin()], and [nested_cv()]. #' @docType package #' @name rsample NULL rsample/R/tidyselect.R0000644000175000017500000000126113755042647014561 0ustar nileshnilesh# Alias required for help links in downstream packages #' @aliases select_helpers #' @importFrom tidyselect contains #' @export tidyselect::contains #' @importFrom tidyselect ends_with #' @export tidyselect::ends_with #' @importFrom tidyselect everything #' @export tidyselect::everything #' @importFrom tidyselect matches #' @export tidyselect::matches #' @importFrom tidyselect num_range #' @export tidyselect::num_range #' @importFrom tidyselect starts_with #' @export tidyselect::starts_with #' @importFrom tidyselect last_col #' @export tidyselect::last_col #' @importFrom tidyselect any_of #' @export tidyselect::any_of #' @importFrom tidyselect all_of #' @export tidyselect::all_of rsample/R/nest.R0000644000175000017500000000661714012611402013345 0ustar nileshnilesh#' Nested or Double Resampling #' #' `nested_cv` can be used to take the results of one resampling procedure #' and conduct further resamples within each split. Any type of resampling #' used in `rsample` can be used. #' #' @details #' It is a bad idea to use bootstrapping as the outer resampling procedure (see #' the example below) #' @param data A data frame. #' @param outside The initial resampling specification. This can be an already #' created object or an expression of a new object (see the examples below). #' If the latter is used, the `data` argument does not need to be #' specified and, if it is given, will be ignored. #' @param inside An expression for the type of resampling to be conducted #' within the initial procedure. #' @return An tibble with `nested_cv` class and any other classes that #' outer resampling process normally contains. 
The results include a #' column for the outer data split objects, one or more `id` columns, #' and a column of nested tibbles called `inner_resamples` with the #' additional resamples. #' @examples #' ## Using expressions for the resampling procedures: #' nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5)) #' #' ## Using an existing object: #' folds <- vfold_cv(mtcars) #' nested_cv(mtcars, folds, inside = bootstraps(times = 5)) #' #' ## The dangers of outer bootstraps: #' set.seed(2222) #' bad_idea <- nested_cv(mtcars, #' outside = bootstraps(times = 5), #' inside = vfold_cv(v = 3)) #' #' first_outer_split <- bad_idea$splits[[1]] #' outer_analysis <- as.data.frame(first_outer_split) #' sum(grepl("Volvo 142E", rownames(outer_analysis))) #' #' ## For the 3-fold CV used inside of each bootstrap, how are the replicated #' ## `Volvo 142E` data partitioned? #' first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]] #' inner_analysis <- as.data.frame(first_inner_split) #' inner_assess <- as.data.frame(first_inner_split, data = "assessment") #' #' sum(grepl("Volvo 142E", rownames(inner_analysis))) #' sum(grepl("Volvo 142E", rownames(inner_assess))) #' @export nested_cv <- function(data, outside, inside) { nest_args <- formalArgs(nested_cv) cl <- match.call() boot_msg <- paste0( "Using bootstrapping as the outer resample is dangerous ", "since the inner resample might have the same data ", "point in both the analysis and assessment set." ) outer_cl <- cl[["outside"]] if (is_call(outer_cl)) { if (grepl("^bootstraps", deparse(outer_cl))) warning(boot_msg, call. = FALSE) outer_cl$data <- quote(data) outside <- eval(outer_cl) } else { if (inherits(outside, "bootstraps")) warning(boot_msg, call. = FALSE) } inner_cl <- cl[["inside"]] if (!is_call(inner_cl)) stop( "`inside` should be a expression such as `vfold()` or ", "bootstraps(times = 10)` instead of a existing object.", call. 
= FALSE ) inside <- map(outside$splits, inside_resample, cl = inner_cl) out <- dplyr::mutate(outside, inner_resamples = inside) out <- add_class(out, cls = "nested_cv") attr(out, "outside") <- cl$outside attr(out, "inside") <- cl$inside out } inside_resample <- function(src, cl) { cl$data <- quote(as.data.frame(src)) eval(cl) } #' @export print.nested_cv <- function(x, ...) { char_x <- paste("#", pretty(x)) cat(char_x, sep = "\n") class(x) <- class(tibble()) print(x, ...) } rsample/R/zzz.R0000644000175000017500000000116013673171774013246 0ustar nileshnilesh# nocov start .onLoad <- function(libname, pkgname) { if (dplyr_pre_1.0.0()) { vctrs::s3_register("dplyr::mutate", "rset", method = mutate_rset) vctrs::s3_register("dplyr::arrange", "rset", method = arrange_rset) vctrs::s3_register("dplyr::filter", "rset", method = filter_rset) vctrs::s3_register("dplyr::rename", "rset", method = rename_rset) vctrs::s3_register("dplyr::select", "rset", method = select_rset) vctrs::s3_register("dplyr::slice", "rset", method = slice_rset) } else { vctrs::s3_register("dplyr::dplyr_reconstruct", "rset", method = dplyr_reconstruct_rset) } } # nocov end rsample/R/permutations.R0000644000175000017500000000764613755042647015157 0ustar nileshnilesh#' Permutation sampling #' #' @description #' A permutation sample is the same size as the original data set and is made #' by permuting/shuffling one or more columns. This results in analysis #' samples where some columns are in their original order and some columns #' are permuted to a random order. Unlike other sampling functions in #' `rsample`, there is no assessment set and calling `assessment()` on a #' permutation split will throw an error. #' #' @param data A data frame. #' @param permute One or more columns to shuffle. This argument supports #' `tidyselect` selectors. Multiple expressions can be combined with `c()`. 
#' Variable names can be used as if they were positions in the data frame, so #' expressions like `x:y` can be used to select a range of variables. #' See \code{\link[tidyselect]{language}} for more details. #' @param times The number of permutation samples. #' @param apparent A logical. Should an extra resample be added where the #' analysis is the standard data set. #' @param ... Not currently used. #' #' @details The argument `apparent` enables the option of an additional #' "resample" where the analysis data set is the same as the original data #' set. Permutation-based resampling can be especially helpful for computing #' a statistic under the null hypothesis (e.g. t-statistic). This forms the #' basis of a permutation test, which computes a test statistic under all #' possible permutations of the data. #' #' @return A `tibble` with classes `permutations`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and a #' column called `id` that has a character string with the resample #' identifier. #' #' @examples #' permutations(mtcars, mpg, times = 2) #' permutations(mtcars, mpg, times = 2, apparent = TRUE) #' #' library(purrr) #' resample1 <- permutations(mtcars, starts_with("c"), times = 1) #' resample1$splits[[1]] %>% analysis() #' #' resample2 <- permutations(mtcars, hp, times = 10, apparent = TRUE) #' map_dbl(resample2$splits, function(x) { #' t.test(hp ~ vs, data = analysis(x))$statistic #' }) #' #' @export permutations <- function( data, permute = NULL, times = 25, apparent = FALSE, ... ) { permute <- rlang::enquo(permute) if (is.null(permute)) { rlang::abort("You must specify at least one column to permute!") } col_id <- tidyselect::eval_select(permute, data) if (identical(length(col_id), 0L)) { rlang::abort("You must specify at least one column to permute!") } else if (identical(length(col_id), ncol(data))) { rlang::abort("You have selected all columns to permute. 
This effectively reorders the rows in the original data without changing the data structure. Please select fewer columns to permute.") } split_objs <- perm_splits(data, times) if (apparent) split_objs <- dplyr::bind_rows(split_objs, apparent(data)) split_objs$splits <- purrr::map(split_objs$splits, function(x) { x$col_id <- col_id; x }) perm_att <- list( times = times, apparent = apparent, col_id = col_id ) new_rset( splits = split_objs$splits, ids = split_objs$id, attrib = perm_att, subclass = c("permutations", "rset") ) } perm_complement <- function(ind, n) { list(analysis = ind, assessment = NA) } perm_splits <- function(data, times = 25) { n <- nrow(data) indices <- purrr::map(rep(n, times), sample, replace = FALSE) indices <- lapply(indices, perm_complement, n = n) split_objs <- purrr::map(indices, make_splits, data = data, class = "perm_split") list( splits = split_objs, id = names0(length(split_objs), "Permutations") ) } #' @export print.permutations <- function(x, ...) { shuffled_cols <- paste(names(attr(x, "col_id")), collapse = ", ") cat("#", pretty(x), "\n") cat("# Permuted columns: [", shuffled_cols, "] \n", sep = "") class(x) <- class(x)[!(class(x) %in% c("permutations", "rset"))] print(x, ...) } rsample/R/rsplit.R0000644000175000017500000001064714010267142013715 0ustar nileshnileshrsplit <- function(data, in_id, out_id) { if (!is.data.frame(data) & !is.matrix(data)) stop("`data` must be a data frame.", call. = FALSE) if (!is.integer(in_id) | any(in_id < 1)) stop("`in_id` must be a positive integer vector.", call. = FALSE) if(!all(is.na(out_id))) { if (!is.integer(out_id) | any(out_id < 1)) stop("`out_id` must be a positive integer vector.", call. = FALSE) } if (length(in_id) == 0) stop("At least one row should be selected for the analysis set.", call. = FALSE) structure( list( data = data, in_id = in_id, out_id = out_id ), class = "rsplit" ) } #' @export print.rsplit <- function(x, ...) 
{ out_char <- if (is_missing_out_id(x)) paste(length(complement(x))) else paste(length(x$out_id)) cat("\n") cat("<", length(x$in_id), "/", out_char, "/", nrow(x$data), ">\n", sep = "") } #' @export as.integer.rsplit <- function(x, data = c("analysis", "assessment"), ...) { data <- match.arg(data) if (data == "analysis") out <- x$in_id else { out <- if (is_missing_out_id(x)) complement(x) else x$out_id } out } #' Convert an `rsplit` object to a data frame #' #' The analysis or assessment code can be returned as a data #' frame (as dictated by the `data` argument) using #' `as.data.frame.rsplit`. `analysis` and #' `assessment` are shortcuts. #' @param x An `rsplit` object. #' @param row.names `NULL` or a character vector giving the row names for the data frame. Missing values are not allowed. #' @param optional A logical: should the column names of the data be checked for legality? #' @param data Either "analysis" or "assessment" to specify which data are returned. #' @param ... Additional arguments to be passed to or from methods. Not currently used. #' @examples #' library(dplyr) #' set.seed(104) #' folds <- vfold_cv(mtcars) #' #' model_data_1 <- folds$splits[[1]] %>% analysis() #' holdout_data_1 <- folds$splits[[1]] %>% assessment() #' @export as.data.frame.rsplit <- function(x, row.names = NULL, optional = FALSE, data = "analysis", ...) { if (!is.null(row.names)) warning( "`row.names` is kept for consistency with the ", "underlying class but non-NULL values will be ", "ignored.", call. = FALSE) if (optional) warning( "`optional` is kept for consistency with the ", "underlying class but TRUE values will be ", "ignored.", call. 
= FALSE) if (!is.null(x$col_id)) { if (identical(data, "assessment")) { rsplit_class <- class(x)[[2]] msg <- paste0("There is no assessment data set for an `rsplit` object", " with class `", rsplit_class, "`.") rlang::abort(msg) } permuted_col <- x$data[as.integer(x, data = data, ...), x$col_id, drop = FALSE] x$data[, x$col_id] <- permuted_col return(x$data) } x$data[as.integer(x, data = data, ...), , drop = FALSE] } #' @rdname as.data.frame.rsplit #' @export analysis <- function(x, ...) { if (!inherits(x, "rsplit")) stop("`x` should be an `rsplit` object", call. = FALSE) as.data.frame(x, data = "analysis", ...) } #' @rdname as.data.frame.rsplit #' @export assessment <- function(x, ...){ if (!inherits(x, "rsplit")) stop("`x` should be an `rsplit` object", call. = FALSE) as.data.frame(x, data = "assessment", ...) } #' @export dim.rsplit <- function(x, ...) { c( analysis = length(x$in_id), assessment = length(complement(x)), n = nrow(x$data), p = ncol(x$data) ) } #' @method obj_sum rsplit #' @export obj_sum.rsplit <- function(x, ...) { out_char <- if (is_missing_out_id(x)) paste(length(complement(x))) else paste(length(x$out_id)) paste0("split [", length(x$in_id), "/", out_char, "]") } #' @method type_sum rsplit #' @export type_sum.rsplit <- function(x, ...) { out_char <- if (is_missing_out_id(x)) format_n(length(complement(x))) else format_n(length(x$out_id)) paste0( "split [", format_n(length(x$in_id)), "/", out_char, "]" ) } format_n <- function(x, digits = 1) { case_when( log10(x) < 3 ~ paste(x), log10(x) >= 3 & log10(x) < 6 ~ paste0(round(x/1000, digits = digits), "K"), TRUE ~ paste0(round(x/1000000, digits = digits), "M"), ) } is_rsplit <- function(x) { inherits(x, "rsplit") } rsample/R/slide.R0000644000175000017500000004354214045275374013516 0ustar nileshnilesh#' Time-based Resampling #' #' @description #' These resampling functions are focused on various forms of _time series_ #' resampling. 
#' #' - `sliding_window()` uses the row number when computing the resampling #' indices. It is independent of any time index, but is useful with #' completely regular series. #' #' - `sliding_index()` computes resampling indices relative to the `index` #' column. This is often a Date or POSIXct column, but doesn't have to be. #' This is useful when resampling irregular series, or for using irregular #' lookback periods such as `lookback = lubridate::years(1)` with daily #' data (where the number of days in a year may vary). #' #' - `sliding_period()` first breaks up the `index` into less granular groups #' based on `period`, and then uses that to construct the resampling indices. #' This is extremely useful for constructing rolling monthly or yearly #' windows from daily data. #' #' @inheritParams ellipsis::dots_empty #' #' @param data A data frame. #' #' @param index The index to compute resampling indices relative to, specified #' as a bare column name. This must be an existing column in `data`. #' #' - For `sliding_index()`, this is commonly a date vector, but is not #' required. #' #' - For `sliding_period()`, it is required that this is a Date or POSIXct #' vector. #' #' The `index` must be an _increasing_ vector, but duplicate values are #' allowed. Additionally, the index cannot contain any missing values. #' #' @param period The period to group the `index` by. This is specified as a #' single string, such as `"year"` or `"month"`. See the `.period` argument #' of [slider::slide_index()] for the full list of options and further #' explanation. #' #' @param lookback The number of elements to look back from the current element #' when computing the resampling indices of the analysis set. The current #' row is always included in the analysis set. #' #' - For `sliding_window()`, a single integer defining the number of rows to #' look back from the current row. 
#' #' - For `sliding_index()`, a single object that will be subtracted from the #' `index` as `index - lookback` to define the boundary of where to start #' searching for rows to include in the current resample. This is often #' an integer value corresponding to the number of days to look back, #' or a lubridate Period object. #' #' - For `sliding_period()`, a single integer defining the number of groups #' to look back from the current group, where the groups were defined from #' breaking up the `index` according to the `period`. #' #' In all cases, `Inf` is also allowed to force an expanding window. #' #' @param assess_start,assess_stop This combination of arguments determines #' how far into the future to look when constructing the assessment set. #' Together they construct a range of #' `[index + assess_start, index + assess_stop]` to search for rows to include #' in the assessment set. #' #' Generally, `assess_start` will always be `1` to indicate that the first #' value to potentially include in the assessment set should start one element #' after the current row, but it can be increased to a larger value to #' create "gaps" between the analysis and assessment set if you are worried #' about high levels of correlation in short term forecasting. #' #' - For `sliding_window()`, these are both single integers defining the #' number of rows to look forward from the current row. #' #' - For `sliding_index()`, these are single objects that will be added #' to the `index` to compute the range to search for rows to include #' in the assessment set. This is often an integer value corresponding to #' the number of days to look forward, or a lubridate Period object. #' #' - For `sliding_period()`, these are both single integers defining the #' number of groups to look forward from the current group, where the groups #' were defined from breaking up the `index` according to the `period`. #' #' @param complete A single logical. 
When using `lookback` to compute the #' analysis sets, should only complete windows be considered? If set to #' `FALSE`, partial windows will be used until it is possible to create #' a complete window (based on `lookback`). This is a way to use an #' expanding window up to a certain point, and then switch to a sliding #' window. #' #' @param step A single positive integer. After computing the resampling #' indices, `step` is used to thin out the results by selecting every #' `step`-th result by subsetting the indices with #' `seq(1L, n_indices, by = step)`. `step` is applied after `skip`. #' Note that `step` is independent of any time `index` used. #' #' @param skip A single positive integer, or zero. After computing the #' resampling indices, the first `skip` results will be dropped by subsetting #' the indices with `seq(skip + 1L, n_indices)`. This can be especially #' useful when combined with `lookback = Inf`, which creates an expanding #' window starting from the first row. By skipping forward, you can drop #' the first few windows that have very few data points. `skip` is #' applied before `step`. Note that `skip` is independent of any time #' `index` used. #' #' @param every A single positive integer. The number of periods to group #' together. #' #' For example, if the `period` was set to `"year"` with an `every` #' value of 2, then the years 1970 and 1971 would be placed in the same #' group. #' #' @param origin The reference date time value. The default when left #' as `NULL` is the epoch time of `1970-01-01 00:00:00`, #' _in the time zone of the index_. #' #' This is generally used to define the anchor time to count from, #' which is relevant when the `every` value is `> 1`. #' #' @seealso #' [rolling_origin()] #' #' [slider::slide()], [slider::slide_index()], and [slider::slide_period()], #' which power these resamplers. 
#' #' @name slide-resampling #' #' @examples #' library(vctrs) #' library(tibble) #' library(modeldata) #' data("Chicago") #' #' index <- new_date(c(1, 3, 4, 7, 8, 9, 13, 15, 16, 17)) #' df <- tibble(x = 1:10, index = index) #' df #' #' # Look back two rows beyond the current row, for a total of three rows #' # in each analysis set. Each assessment set is composed of the two rows after #' # the current row. #' sliding_window(df, lookback = 2, assess_stop = 2) #' #' # Same as before, but step forward by 3 rows between each resampling slice, #' # rather than just by 1. #' rset <- sliding_window(df, lookback = 2, assess_stop = 2, step = 3) #' rset #' #' analysis(rset$splits[[1]]) #' analysis(rset$splits[[2]]) #' #' # Now slide relative to the `index` column in `df`. This time we look back #' # 2 days from the current row's `index` value, and 2 days forward from #' # it to construct the assessment set. Note that this series is irregular, #' # so it produces different results than `sliding_window()`. Additionally, #' # note that it is entirely possible for the assessment set to contain no #' # data if you have a highly irregular series and "look forward" into a #' # date range where no data points actually exist! #' sliding_index(df, index, lookback = 2, assess_stop = 2) #' #' # With `sliding_period()`, we can break up our date index into less granular #' # chunks, and slide over them instead of the index directly. Here we'll use #' # the Chicago data, which contains daily data spanning 16 years, and we'll #' # break it up into rolling yearly chunks. Three years worth of data will #' # be used for the analysis set, and one years worth of data will be held out #' # for performance assessment. #' sliding_period( #' Chicago, #' date, #' "year", #' lookback = 2, #' assess_stop = 1 #' ) #' #' # Because `lookback = 2`, three years are required to form a "complete" #' # window of data. To allow partial windows, set `complete = FALSE`. 
#' # Here that first constructs two expanding windows until a complete three #' # year window can be formed, at which point we switch to a sliding window. #' sliding_period( #' Chicago, #' date, #' "year", #' lookback = 2, #' assess_stop = 1, #' complete = FALSE #' ) #' #' # Alternatively, you could break the resamples up by month. Here we'll #' # use an expanding monthly window by setting `lookback = Inf`, and each #' # assessment set will contain two months of data. To ensure that we have #' # enough data to fit our models, we'll `skip` the first 4 expanding windows. #' # Finally, to thin out the results, we'll `step` forward by 2 between #' # each resample. #' sliding_period( #' Chicago, #' date, #' "month", #' lookback = Inf, #' assess_stop = 2, #' skip = 4, #' step = 2 #' ) NULL #' @export #' @rdname slide-resampling sliding_window <- function(data, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L) { ellipsis::check_dots_empty() if (!is.data.frame(data)) { rlang::abort("`data` must be a data frame.") } lookback <- check_lookback(lookback) assess_start <- check_assess(assess_start, "assess_start") assess_stop <- check_assess(assess_stop, "assess_stop") step <- check_step(step) skip <- check_skip(skip) if (assess_start > assess_stop) { rlang::abort("`assess_start` must be less than or equal to `assess_stop`.") } seq <- vctrs::vec_seq_along(data) id_in <- slider::slide( .x = seq, .f = identity, .before = lookback, .after = 0L, .step = 1L, .complete = complete ) id_out <- slider::slide( .x = seq, .f = identity, .before = -assess_start, .after = assess_stop, .step = 1L, .complete = TRUE ) indices <- compute_complete_indices(id_in, id_out) if (!identical(skip, 0L)) { indices <- slice_skip(indices, skip) } if (!identical(step, 1L)) { indices <- slice_step(indices, step) } splits <- purrr::map( indices, ~ make_splits(.x, data = data, class = "sliding_window_split") ) ids <- names0(length(indices), prefix = "Slice") attrib 
<- list( lookback = lookback, assess_start = assess_start, assess_stop = assess_stop, complete = complete, step = step, skip = skip ) new_rset( splits = splits, ids = ids, attrib = attrib, subclass = c("sliding_window", "rset") ) } #' @export print.sliding_window <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("sliding_window", "rset"))] print(x, ...) } # ------------------------------------------------------------------------------ #' @export #' @rdname slide-resampling sliding_index <- function(data, index, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L) { ellipsis::check_dots_empty() if (!is.data.frame(data)) { rlang::abort("`data` must be a data frame.") } step <- check_step(step) skip <- check_skip(skip) index <- rlang::enexpr(index) loc <- tidyselect::eval_select(index, data) if (length(loc) != 1L) { rlang::abort("`index` must specify exactly one column in `data`.") } index <- data[[loc]] seq <- vctrs::vec_seq_along(data) id_in <- slider::slide_index( .x = seq, .i = index, .f = identity, .before = lookback, .after = 0L, .complete = complete ) id_out <- slider::slide_index( .x = seq, .i = index, .f = identity, .before = -assess_start, .after = assess_stop, .complete = TRUE ) indices <- compute_complete_indices(id_in, id_out) if (!identical(skip, 0L)) { indices <- slice_skip(indices, skip) } if (!identical(step, 1L)) { indices <- slice_step(indices, step) } splits <- purrr::map( indices, ~ make_splits(.x, data = data, class = "sliding_index_split") ) ids <- names0(length(indices), prefix = "Slice") attrib <- list( lookback = lookback, assess_start = assess_start, assess_stop = assess_stop, complete = complete, step = step, skip = skip ) new_rset( splits = splits, ids = ids, attrib = attrib, subclass = c("sliding_index", "rset") ) } #' @export print.sliding_index <- function(x, ...) 
{ cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("sliding_index", "rset"))] print(x, ...) } # ------------------------------------------------------------------------------ #' @export #' @rdname slide-resampling sliding_period <- function(data, index, period, ..., lookback = 0L, assess_start = 1L, assess_stop = 1L, complete = TRUE, step = 1L, skip = 0L, every = 1L, origin = NULL) { ellipsis::check_dots_empty() if (!is.data.frame(data)) { rlang::abort("`data` must be a data frame.") } lookback <- check_lookback(lookback) assess_start <- check_assess(assess_start, "assess_start") assess_stop <- check_assess(assess_stop, "assess_stop") step <- check_step(step) if (assess_start > assess_stop) { rlang::abort("`assess_start` must be less than or equal to `assess_stop`.") } index <- rlang::enexpr(index) loc <- tidyselect::eval_select(index, data) if (length(loc) != 1L) { rlang::abort("`index` must specify exactly one column in `data`.") } index <- data[[loc]] seq <- vctrs::vec_seq_along(data) id_in <- slider::slide_period( .x = seq, .i = index, .period = period, .f = identity, .every = every, .origin = origin, .before = lookback, .after = 0L, .complete = complete ) id_out <- slider::slide_period( .x = seq, .i = index, .period = period, .f = identity, .every = every, .origin = origin, .before = -assess_start, .after = assess_stop, .complete = TRUE ) indices <- compute_complete_indices(id_in, id_out) if (!identical(skip, 0L)) { indices <- slice_skip(indices, skip) } if (!identical(step, 1L)) { indices <- slice_step(indices, step) } splits <- purrr::map( indices, ~ make_splits(.x, data = data, class = "sliding_period_split") ) ids <- names0(length(indices), prefix = "Slice") attrib <- list( period = period, lookback = lookback, assess_start = assess_start, assess_stop = assess_stop, complete = complete, step = step, skip = skip, every = every, origin = origin ) new_rset( splits = splits, ids = ids, attrib = attrib, subclass = c("sliding_period", "rset") ) } 
#' @export
print.sliding_period <- function(x, ...) {
  cat("#", pretty(x), "\n")
  # Strip the rset subclasses so the underlying tibble print method runs.
  class(x) <- class(x)[!(class(x) %in% c("sliding_period", "rset"))]
  print(x, ...)
}

# ------------------------------------------------------------------------------

# Validate `lookback`: size-1, non-negative integer-ish, or `Inf`
# (expanding window). Returns an integer, or `Inf` unchanged.
check_lookback <- function(x) {
  if (vctrs::vec_size(x) != 1L) {
    rlang::abort(paste0("`lookback` must have size 1."))
  }
  if (identical(x, Inf)) {
    return(x)
  }
  if (!rlang::is_integerish(x, finite = TRUE)) {
    rlang::abort(paste0("`lookback` must be an integer of size 1, or `Inf`."))
  }
  if (x < 0L) {
    rlang::abort(paste0("`lookback` must be positive, or zero."))
  }
  vctrs::vec_cast(x, integer(), x_arg = "lookback")
}

# Validate `assess_start` / `assess_stop`: size-1, strictly positive
# integer-ish, or `Inf`. `arg` is the user-facing argument name for errors.
check_assess <- function(x, arg) {
  if (vctrs::vec_size(x) != 1L) {
    rlang::abort(paste0("`", arg, "` must have size 1."))
  }
  if (identical(x, Inf)) {
    return(x)
  }
  if (!rlang::is_integerish(x, finite = TRUE)) {
    rlang::abort(paste0("`", arg, "` must be an integer of size 1, or `Inf`."))
  }
  if (x <= 0L) {
    rlang::abort(paste0("`", arg, "` must be positive."))
  }
  vctrs::vec_cast(x, integer(), x_arg = arg)
}

# Validate `step`: size-1, strictly positive integer.
check_step <- function(x) {
  if (vctrs::vec_size(x) != 1L) {
    rlang::abort(paste0("`step` must have size 1."))
  }
  if (!rlang::is_integerish(x, finite = TRUE)) {
    rlang::abort(paste0("`step` must be an integer of size 1."))
  }
  if (x <= 0L) {
    rlang::abort(paste0("`step` must be positive."))
  }
  vctrs::vec_cast(x, integer(), x_arg = "step")
}

# Validate `skip`: size-1, non-negative integer.
check_skip <- function(x) {
  if (vctrs::vec_size(x) != 1L) {
    rlang::abort(paste0("`skip` must have size 1."))
  }
  if (!rlang::is_integerish(x, finite = TRUE)) {
    rlang::abort(paste0("`skip` must be an integer of size 1."))
  }
  if (x < 0L) {
    rlang::abort(paste0("`skip` must be positive, or zero."))
  }
  vctrs::vec_cast(x, integer(), x_arg = "skip")
}

# Pair up analysis/assessment window locations, dropping any position where
# either side is missing (an incomplete window).
compute_complete_indices <- function(id_in, id_out) {
  # Remove where either list has a `NULL` element.
  # These are incomplete windows.
  id_in_na <- vctrs::vec_equal_na(id_in)
  id_out_na <- vctrs::vec_equal_na(id_out)
  id_either_na <- id_in_na | id_out_na
  id_in <- vctrs::vec_slice(id_in, !id_either_na)
  id_out <- vctrs::vec_slice(id_out, !id_either_na)
  purrr::map2(id_in, id_out, merge_lists)
}

# Drop the first `skip` resamples.
slice_skip <- function(indices, skip) {
  n_indices <- length(indices)
  slicer <- rlang::seq2(skip + 1L, n_indices)
  vctrs::vec_slice(indices, slicer)
}

# Keep every `step`-th resample, starting from the first.
slice_step <- function(indices, step) {
  n_indices <- length(indices)
  slicer <- seq2_by(1L, n_indices, by = step)
  indices <- vctrs::vec_slice(indices, slicer)
}

# Like `rlang::seq2()` but with a positive `by` increment; returns an empty
# integer vector when `from > to` instead of erroring like `seq.int()`.
seq2_by <- function(from, to, by) {
  if (length(from) != 1) {
    rlang::abort("`from` must be length one")
  }
  if (length(to) != 1) {
    rlang::abort("`to` must be length one")
  }
  by <- as.integer(by)
  if (length(by) != 1) {
    rlang::abort("`by` must be length one")
  }
  if (by <= 0L) {
    rlang::abort("`by` must be positive")
  }

  if (from > to) {
    integer()
  } else {
    seq.int(from, to, by = by)
  }
}
rsample/R/complement.R0000644000175000017500000000606114012611402014530 0ustar  nileshnilesh
#' Determine the Assessment Samples
#'
#' This method and function help find which data belong in the analysis and
#' assessment sets.
#'
#' Given an `rsplit` object, `complement()` will determine which
#' of the data rows are contained in the assessment set. To save space,
#' many of the `rsplit` objects will not contain indices for the
#' assessment split.
#'
#' @param x An `rsplit` object
#' @param ... Not currently used
#' @return An integer vector.
#' @seealso [populate()]
#' @examples
#' set.seed(28432)
#' fold_rs <- vfold_cv(mtcars)
#' head(fold_rs$splits[[1]]$in_id)
#' fold_rs$splits[[1]]$out_id
#' complement(fold_rs$splits[[1]])
#' @export
complement <- function(x, ...) UseMethod("complement")

#' @export
#' @rdname complement
complement.rsplit <- function(x, ...) {
  if (!is_missing_out_id(x)) {
    return(x$out_id)
  } else {
    # Fall back to every row not used in the analysis set.
    (1:nrow(x$data))[-unique(x$in_id)]
  }
}

#' @export
#' @rdname complement
complement.rof_split <- function(x, ...) {
  get_stored_out_id(x)
}

#' @export
#' @rdname complement
complement.sliding_window_split <- function(x, ...) {
  get_stored_out_id(x)
}

#' @export
#' @rdname complement
complement.sliding_index_split <- function(x, ...) {
  get_stored_out_id(x)
}

#' @export
#' @rdname complement
complement.sliding_period_split <- function(x, ...) {
  get_stored_out_id(x)
}

# For time-based splits the assessment set cannot be derived from the analysis
# set, so it must have been stored explicitly at creation time.
get_stored_out_id <- function(x) {
  out_id <- x$out_id

  if (length(out_id) == 0L) {
    return(out_id)
  }

  if (all(is.na(out_id))) {
    rlang::abort("Cannot derive the assessment set for this type of resampling.")
  }

  out_id
}

#' @export
#' @rdname complement
complement.apparent_split <- function(x, ...) {
  if (!is_missing_out_id(x)) {
    return(x$out_id)
  } else {
    # Apparent resamples assess on the whole data set.
    1:nrow(x$data)
  }
}

#' @export
complement.default <- function(x, ...) {
  cls <- paste0("'", class(x), "'", collapse = ", ")
  rlang::abort(
    paste("No `complement()` method for this class(es)", cls)
  )
}

# Get the indices of the analysis set from the assessment set
default_complement <- function(ind, n) {
  list(analysis = setdiff(1:n, ind),
       assessment = unique(ind))
}

#' Add Assessment Indices
#'
#' Many `rsplit` and `rset` objects do not contain indicators for
#' the assessment samples. `populate()` can be used to fill the slot
#' for the appropriate indices.
#' @param x A `rsplit` and `rset` object.
#' @param ... Not currently used
#' @return An object of the same kind with the integer indices.
#' @examples
#' set.seed(28432)
#' fold_rs <- vfold_cv(mtcars)
#'
#' fold_rs$splits[[1]]$out_id
#' complement(fold_rs$splits[[1]])
#'
#' populate(fold_rs$splits[[1]])$out_id
#'
#' fold_rs_all <- populate(fold_rs)
#' fold_rs_all$splits[[1]]$out_id
#' @export
populate <- function (x, ...) UseMethod("populate")

#' @export
populate.rsplit <- function(x, ...) {
  x$out_id <- complement(x, ...)
  x
}

#' @export
populate.rset <- function(x, ...) {
  x$splits <- map(x$splits, populate)
  x
}

## This will remove the assessment indices from an rsplit object
rm_out <- function(x) {
  x$out_id <- NA
  x
}

# `NA` in `out_id` is the sentinel meaning "not stored; derive on demand".
is_missing_out_id <- function(x) {
  identical(x$out_id, NA)
}
rsample/R/misc.R0000644000175000017500000000757414066703504013340 0ustar  nileshnilesh
#' Constructors for split objects
#' @export
make_splits <- function(x, ...) UseMethod("make_splits")

#' @rdname make_splits
#' @param x A list of integers with names "analysis" and "assessment", or a
#' data frame of analysis or training data.
#' @export
make_splits.default <- function(x, ...) {
  rlang::abort("There is no method available to make an rsplit from `x`.")
}

#' @rdname make_splits
#' @param data A data frame.
#' @param class An optional class to give the object.
#' @param ... Further arguments passed to or from other methods (not currently
#' used).
#' @export
make_splits.list <- function(x, data, class = NULL, ...) {
  ellipsis::check_dots_empty()
  res <- rsplit(data, x$analysis, x$assessment)
  if (!is.null(class)) {
    res <- add_class(res, class)
  }
  res
}

#' @rdname make_splits
#' @param assessment A data frame of assessment or testing data, which can be empty.
#' @export
make_splits.data.frame <- function(x, assessment, ...) {
  ellipsis::check_dots_empty()
  if (nrow(x) == 0) {
    rlang::abort("The analysis set must contain at least one row.")
  }
  ind_analysis <- 1:nrow(x)
  if (nrow(assessment) == 0) {
    ind_assessment <- integer()
  } else {
    if (!identical(colnames(x), colnames(assessment))) {
      rlang::abort("The analysis and assessment sets must have the same columns.")
    }
    # Assessment rows are appended after the analysis rows.
    ind_assessment <- nrow(x) + 1:nrow(assessment)
  }
  data <- bind_rows(x, assessment)
  ind <- list(
    analysis = ind_analysis,
    assessment = ind_assessment
  )
  make_splits(ind, data)
}

merge_lists <- function(a, b) list(analysis = a, assessment = b)

# Per-split dimensions of an rset, as a tibble with the id columns attached.
dim_rset <- function(x, ...) {
  dims <- purrr::map(x$splits, dim)
  dims <- do.call("rbind", dims)
  dims <- tibble::as_tibble(dims)
  id_cols <- grep("(^id$)|(^id[1-9]$)", colnames(x), value = TRUE)
  for (i in seq_along(id_cols)) {
    dims[id_cols[i]] <- getElement(x, id_cols[i])
  }
  dims
}

# Zero-padded sequential names, e.g. names0(3, "Fold") -> "Fold1" ... "Fold3"
# (padding width follows the number of digits in `num`).
names0 <- function(num, prefix = "x") {
  if (num == 0L) {
    return(character())
  }
  ind <- format(1:num)
  ind <- gsub(" ", "0", ind)
  paste0(prefix, ind)
}

# Prepend class(es) `cls` to `x`'s class vector.
add_class <- function(x, cls) {
  class(x) <- c(cls, class(x))
  x
}

# Validate a `strata` specification against `data`; called for its errors.
strata_check <- function(strata, data) {
  if (!is.null(strata)) {
    if (!is.character(strata) | length(strata) != 1) {
      rlang::abort("`strata` should be a single character value.")
    }
    if (inherits(data[, strata], "Surv")) {
      rlang::abort("`strata` cannot be a `Surv` object. Use the time or event variable directly.")
    }
    if (!(strata %in% names(data))) {
      # Fix: `rlang::abort()` does not paste its arguments together the way
      # `stop()` does; the second string was being treated as the condition
      # `class` argument, dropping it from the message.
      rlang::abort(paste0(strata, " is not in `data`."))
    }
  }
  invisible(NULL)
}

# `split()` without the names that base R attaches to the result.
split_unnamed <- function(x, f) {
  out <- split(x, f)
  unname(out)
}

#' Obtain an identifier for the resamples
#'
#' This function returns a hash (or NA) for an attribute that is created when
#' the `rset` was initially constructed. This can be used to compare with other
#' resampling objects to see if they are the same.
#' @param x An `rset` or `tune_results` object.
#' @param ... Not currently used.
#' @return A character value or `NA_character_` if the object was created prior
#' to `rsample` version 0.1.0.
#' @rdname get_fingerprint
#' @aliases .get_fingerprint
#' @examples
#' set.seed(1)
#' .get_fingerprint(vfold_cv(mtcars))
#'
#' set.seed(1)
#' .get_fingerprint(vfold_cv(mtcars))
#'
#' set.seed(2)
#' .get_fingerprint(vfold_cv(mtcars))
#'
#' set.seed(1)
#' .get_fingerprint(vfold_cv(mtcars, repeats = 2))
#' @export
.get_fingerprint <- function(x, ...) {
  UseMethod(".get_fingerprint")
}

#' @export
#' @rdname get_fingerprint
.get_fingerprint.default <- function(x, ...) {
  cls <- paste0("'", class(x), "'", collapse = ", ")
  rlang::abort(
    paste("No `.get_fingerprint()` method for this class(es)", cls)
  )
}

#' @export
#' @rdname get_fingerprint
.get_fingerprint.rset <- function(x, ...) {
  att <- attributes(x)
  if (any(names(att) == "fingerprint")) {
    res <- att$fingerprint
  } else {
    # Objects made before rsample 0.1.0 have no fingerprint attribute.
    res <- NA_character_
  }
  res
}
rsample/R/rset.R0000644000175000017500000000574514124432157013366 0ustar  nileshnilesh
#' Constructor for new rset objects
#' @param splits A list column of `rsplits` or a tibble with a single column
#' called "splits" with a list column of `rsplits`.
#' @param ids A character vector or a tibble with one or more columns that
#' begin with "id".
#' @param attrib An optional named list of attributes to add to the object.
#' @param subclass A character vector of subclasses to add.
#' @return An `rset` object.
#' @details Once the new `rset` is constructed, an additional attribute called
#' "fingerprint" is added that is a hash of the `rset`. This can be used to
#' make sure other objects have the exact same resamples.
#' @keywords internal
#' @export
new_rset <-  function(splits, ids, attrib = NULL,
                      subclass = character()) {
  stopifnot(is.list(splits))
  if (!is_tibble(ids)) {
    ids <- tibble(id = ids)
  } else {
    if (!all(grepl("^id", names(ids)))) {
      rlang::abort("The `ids` tibble column names should start with 'id'.")
    }
  }
  either_type <- function(x)
    is.character(x) | is.factor(x)
  ch_check <- vapply(ids, either_type, c(logical = TRUE))
  if (!all(ch_check)) {
    rlang::abort("All ID columns should be character or factor vectors.")
  }

  if (!is_tibble(splits)) {
    splits <- tibble(splits = splits)
  } else {
    if (ncol(splits) > 1 | names(splits)[1] != "splits") {
      rlang::abort(
        "The `splits` tibble should have a single column named `splits`."
      )
    }
  }

  where_rsplits <- vapply(splits[["splits"]], is_rsplit, logical(1))

  if (!all(where_rsplits)) {
    rlang::abort("Each element of `splits` must be an `rsplit` object.")
  }

  if (nrow(ids) != nrow(splits)) {
    rlang::abort("Split and ID vectors have different lengths.")
  }

  # Create another element to the splits that is a tibble containing
  # an identifier for each id column so that, in isolation, the resample
  # id can be known just based on the `rsplit` object. This can then be
  # accessed using the `labels` method for `rsplits`

  splits$splits <- map2(splits$splits,
                        split_unnamed(ids, rlang::seq2(1L, nrow(ids))),
                        add_id)

  res <- bind_cols(splits, ids)

  if (!is.null(attrib)) {
    if (any(names(attrib) == "")) {
      rlang::abort("`attrib` should be a fully named list.")
    }
    for (i in names(attrib)) {
      attr(res, i) <- attrib[[i]]
    }
  }

  if (length(subclass) > 0) {
    res <- add_class(res, cls = subclass)
  }

  # Hash the in/out indices of every split so that identical resamples can be
  # detected later via `.get_fingerprint()`.
  fingerprint <- map(res$splits, function(x) list(x$in_id, x$out_id))
  fingerprint <- rlang::hash(fingerprint)
  attr(res, "fingerprint") <- fingerprint

  res
}

add_id <- function(split, id) {
  split$id <- id
  split
}

# ------------------------------------------------------------------------------

#' @export
`[.rset` <- function(x, i, j, drop = FALSE, ...) {
  out <- NextMethod()
  # Restore rset attributes/class when subsetting keeps a valid rset shape.
  rset_reconstruct(out, x)
}

# ------------------------------------------------------------------------------

#' @export
`names<-.rset` <- function(x, value) {
  out <- NextMethod()
  rset_reconstruct(out, x)
}
rsample/R/boot.R0000644000175000017500000001032214066674573013356 0ustar  nileshnilesh
#' Bootstrap Sampling
#'
#' A bootstrap sample is a sample that is the same size as the original data
#' set that is made using replacement. This results in analysis samples that
#' have multiple replicates of some of the original rows of the data. The
#' assessment set is defined as the rows of the original data that were not
#' included in the bootstrap sample. This is often referred to as the
#' "out-of-bag" (OOB) sample.
#' @details The argument `apparent` enables the option of an additional
#' "resample" where the analysis and assessment data sets are the same as the
#' original data set. This can be required for some types of analysis of the
#' bootstrap results.
#'
#' @template strata_details
#' @inheritParams vfold_cv
#' @inheritParams make_strata
#' @param times The number of bootstrap samples.
#' @param apparent A logical. Should an extra resample be added where the
#' analysis and holdout subset are the entire data set. This is required for
#' some estimators used by the `summary` function that require the apparent
#' error rate.
#' @export
#' @return A tibble with classes `bootstraps`, `rset`, `tbl_df`, `tbl`, and
#' `data.frame`. The results include a column for the data split objects and a
#' column called `id` that has a character string with the resample identifier.
#' @examples
#' bootstraps(mtcars, times = 2)
#' bootstraps(mtcars, times = 2, apparent = TRUE)
#'
#' library(purrr)
#' library(modeldata)
#' data(wa_churn)
#'
#' set.seed(13)
#' resample1 <- bootstraps(wa_churn, times = 3)
#' map_dbl(resample1$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' resample2 <- bootstraps(wa_churn, strata = churn, times = 3)
#' map_dbl(resample2$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' resample3 <- bootstraps(wa_churn, strata = tenure, breaks = 6, times = 3)
#' map_dbl(resample3$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#' @export
bootstraps <- function(data, times = 25,
                       strata = NULL, breaks = 4,
                       pool = 0.1,
                       apparent = FALSE, ...) {
  # Resolve the tidyselect `strata` specification to a column name (or NULL).
  if(!missing(strata)) {
    strata <- tidyselect::vars_select(names(data), !!enquo(strata))
    if(length(strata) == 0) strata <- NULL
  }

  strata_check(strata, data)

  split_objs <- boot_splits(data = data, times = times, strata = strata,
                            breaks = breaks, pool = pool)
  # Optionally append the "apparent" resample (analysis == assessment == data).
  if(apparent)
    split_objs <- bind_rows(split_objs, apparent(data))

  boot_att <- list(times = times,
                   apparent = apparent,
                   strata = !is.null(strata))

  new_rset(splits = split_objs$splits,
           ids = split_objs$id,
           attrib = boot_att,
           subclass = c("bootstraps", "rset"))
}

# Get the indices of the analysis set from the analysis set (= bootstrap sample)
# `assessment = NA` defers the OOB computation to `complement()`.
boot_complement <- function(ind, n) {
  list(analysis = ind, assessment = NA)
}

# Draw `times` bootstrap samples (optionally stratified) and wrap each as a
# "boot_split" rsplit. Returns a list with `splits` and `id`.
boot_splits <- function(data, times = 25, strata = NULL,
                        breaks = 4, pool = 0.1) {
  n <- nrow(data)

  if (is.null(strata)) {
    # Plain bootstrap: sample n row indices with replacement, `times` times.
    indices <- purrr::map(rep(n, times), sample, replace = TRUE)
  } else {
    # Stratified bootstrap: sample with replacement within each stratum.
    stratas <- tibble::tibble(idx = 1:n,
                              strata = make_strata(getElement(data, strata),
                                                   breaks = breaks,
                                                   pool = pool))
    stratas <- split_unnamed(stratas, stratas$strata)
    stratas <-
      purrr::map_df(stratas,
                    strat_sample,
                    prop = 1,
                    times = times,
                    replace = TRUE)
    indices <- split_unnamed(stratas$idx, stratas$rs_id)
  }

  indices <- lapply(indices, boot_complement, n = n)

  split_objs <-
    purrr::map(indices,
               make_splits,
               data = data,
               class = "boot_split")

  list(splits = split_objs,
       id = names0(length(split_objs), "Bootstrap"))
}

#' @export
print.bootstraps <- function(x, ...) {
  cat("#", pretty(x), "\n")
  # Strip the rset subclasses so the underlying tibble print method runs.
  class(x) <- class(x)[!(class(x) %in% c("bootstraps", "rset"))]
  print(x, ...)
}
rsample/R/loo.R0000644000175000017500000000245313727757057013204 0ustar  nileshnilesh
#' Leave-One-Out Cross-Validation
#'
#' Leave-one-out (LOO) cross-validation uses one data point in the original
#' set as the assessment data and all other data points as the analysis set. A
#' LOO resampling set has as many resamples as rows in the original data set.
#' @inheritParams vfold_cv
#' @return A tibble with classes `loo_cv`, `rset`, `tbl_df`, `tbl`, and
#' `data.frame`. The results include a column for the data split objects and
#' one column called `id` that has a character string with the resample
#' identifier.
#' @examples
#' loo_cv(mtcars)
#' @export
loo_cv <- function(data, ...) {
  # LOO is v-fold CV with v equal to the number of rows.
  split_objs <- vfold_splits(data = data, v = nrow(data))
  split_objs <- list(splits = map(split_objs$splits, change_class),
                     id = paste0("Resample", seq_along(split_objs$id)))

  ## We remove the holdout indices since it will save space and we can
  ## derive them later when they are needed.
  split_objs$splits <- map(split_objs$splits, rm_out)

  new_rset(splits = split_objs$splits,
           ids = split_objs$id,
           subclass = c("loo_cv", "rset"))
}

#' @export
print.loo_cv <- function(x, ...) {
  cat("#", pretty(x), "\n")
  # Strip the rset subclasses so the underlying tibble print method runs.
  class(x) <- class(x)[!(class(x) %in% c("loo_cv", "rset"))]
  print(x, ...)
}

# Re-class a vfold split as a LOO split.
# NOTE(review): the subclass ordering `c("rsplit", "loo_split")` puts "rsplit"
# first, unlike the other split constructors here which put the subclass
# first — confirm this ordering is intentional before changing dispatch.
change_class <- function(x) {
  class(x) <- c("rsplit", "loo_split")
  x
}
rsample/R/data.R0000644000175000017500000000212713727757057013322 0ustar  nileshnilesh
#' Job Attrition
#'
#' @details These data are from the IBM Watson Analytics Lab.
#' The website describes the data with \dQuote{Uncover the
#' factors that lead to employee attrition and explore important
#' questions such as \sQuote{show me a breakdown of distance
#' from home by job role and attrition} or \sQuote{compare
#' average monthly income by education and attrition}. This is a
#' fictional data set created by IBM data scientists.}. There
#' are 1470 rows.
#'
#' These data are now in the `modeldata` package.
#'
#' @name attrition
NULL

#' Two Class Data
#'
#' @details There are artificial data with two predictors (`A` and `B`) and
#' a factor outcome variable (`Class`).
#'
#' These data are now in the `modeldata` package.
#'
#' @name two_class_dat
NULL

#' Sample Time Series Data
#'
#' @details Drink sales. The exact name of the series from FRED is:
#' "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices
#' Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales"
#'
#' These data are now in the `modeldata` package.
#' @name drinks
NULL
rsample/R/groups.R0000644000175000017500000000753114124441103013712 0ustar  nileshnilesh
#' Group V-Fold Cross-Validation
#'
#' Group V-fold cross-validation creates splits of the data based
#' on some grouping variable (which may have more than a single row
#' associated with it). The function can create as many splits as
#' there are unique values of the grouping variable or it can
#' create a smaller set of splits where more than one value is left
#' out at a time. A common use of this kind of resampling is when you have
#' repeated measures of the same subject.
#'
#' @param data A data frame.
#' @param group A variable in `data` (single character or name) used for
#'  grouping observations with the same value to either the analysis or
#'  assessment set within a fold.
#' @param v The number of partitions of the data set. If let
#'  `NULL`, `v` will be set to the number of unique values
#'  in the group.
#' @param ... Not currently used.
#' @export
#' @return A tibble with classes `group_vfold_cv`,
#'  `rset`, `tbl_df`, `tbl`, and `data.frame`.
#'  The results include a column for the data split objects and an
#'  identification variable.
#' @examples
#' set.seed(3527)
#' test_data <- data.frame(id = sort(sample(1:20, size = 80, replace = TRUE)))
#' test_data$dat <- runif(nrow(test_data))
#'
#' set.seed(5144)
#' split_by_id <- group_vfold_cv(test_data, group = "id")
#'
#' get_id_left_out <- function(x)
#'   unique(assessment(x)$id)
#'
#' library(purrr)
#' table(map_int(split_by_id$splits, get_id_left_out))
#'
#' set.seed(5144)
#' split_by_some_id <- group_vfold_cv(test_data, group = "id", v = 7)
#' held_out <- map(split_by_some_id$splits, get_id_left_out)
#' table(unlist(held_out))
#' # number held out per resample:
#' map_int(held_out, length)
#' @export
group_vfold_cv <- function(data, group = NULL, v = NULL, ...) {
  # Resolve the tidyselect `group` specification to a column name (or NULL).
  if(!missing(group)) {
    group <- tidyselect::vars_select(names(data), !!enquo(group))
    if(length(group) == 0) {
      group <- NULL
    }
  }

  if (is.null(group) || !is.character(group) || length(group) != 1)
    stop(
      "`group` should be a single character value for the column ",
      "that will be used for splitting.",
      call. = FALSE
    )
  if (!any(names(data) == group))
    stop("`group` should be a column in `data`.", call. = FALSE)

  split_objs <- group_vfold_splits(data = data, group = group, v = v)

  ## We remove the holdout indices since it will save space and we can
  ## derive them later when they are needed.
  split_objs$splits <- map(split_objs$splits, rm_out)

  # Update `v` if not supplied directly
  if (is.null(v)) {
    v <- length(split_objs$splits)
  }

  ## Save some overall information
  cv_att <- list(v = v, group = group)

  new_rset(splits = split_objs$splits,
           ids = split_objs[, grepl("^id", names(split_objs))],
           attrib = cv_att,
           subclass = c("group_vfold_cv", "rset"))
}

# Assign each unique group value to one of `v` folds, then hold out one fold's
# rows at a time. Returns a tibble of "group_vfold_split" rsplits plus ids.
group_vfold_splits <- function(data, group, v = NULL) {
  uni_groups <- unique(getElement(data, group))
  max_v <- length(uni_groups)

  if (is.null(v)) {
    v <- max_v
  } else {
    # NOTE(review): this triggers when v > max_v, so the wording should
    # arguably be "less than or equal to" — confirm before rewording.
    if (v > max_v)
      stop("`v` should be less than ", max_v, call. = FALSE)
  }

  data_ind <- data.frame(..index = 1:nrow(data),
                         ..group = getElement(data, group))
  keys <- data.frame(..group = uni_groups)
  n <- nrow(keys)
  # Randomly spread the groups across the v folds.
  keys$..folds <- sample(rep(1:v, length.out = n))
  data_ind <- data_ind %>%
    full_join(keys, by = "..group") %>%
    arrange(..index)
  indices <- split_unnamed(data_ind$..index, data_ind$..folds)
  indices <- lapply(indices, default_complement, n = nrow(data))
  split_objs <-
    purrr::map(indices,
               make_splits,
               data = data,
               class = "group_vfold_split")
  tibble::tibble(splits = split_objs,
                 id = names0(length(split_objs), "Resample"))
}

#' @export
print.group_vfold_cv <- function(x, ...) {
  cat("#", pretty(x), "\n")
  # Strip the rset subclasses so the underlying tibble print method runs.
  class(x) <- class(x)[!(class(x) %in% c("group_vfold_cv", "rset"))]
  print(x, ...)
}
rsample/R/reg_intervals.R0000644000175000017500000000724314010267142015242 0ustar  nileshnilesh
#' A convenience function for confidence intervals with linear-ish parametric models
#'
#' @param formula An R model formula with one outcome and at least one predictor.
#' @param data A data frame.
#' @param model_fn The model to fit. Allowable values are "lm", "glm",
#' "survreg", and "coxph". The latter two require that the `survival` package
#' be installed.
#' @param type The type of bootstrap confidence interval. Values of "student-t" and
#' "percentile" are allowed.
#' @param times A single integer for the number of bootstrap samples. If left
#' NULL, 1,001 are used for t-intervals and 2,001 for percentile intervals.
#' @param alpha Level of significance.
#' @param filter A logical expression used to remove rows from the final result, or `NULL` to keep all rows.
#' @param keep_reps Should the individual parameter estimates for each bootstrap
#' sample be retained?
#' @param ... Options to pass to the model function (such as `family` for `glm()`).
#' @return A tibble with columns "term", ".lower", ".estimate", ".upper",
#' ".alpha", and ".method". If `keep_reps = TRUE`, an additional list column
#' called ".replicates" is also returned.
#' @export
#' @seealso [int_pctl()], [int_t()]
#' @references
#' Davison, A., & Hinkley, D. (1997). _Bootstrap Methods and their
#'  Application_. Cambridge: Cambridge University Press.
#'  doi:10.1017/CBO9780511802843
#'
#' _Bootstrap Confidence Intervals_,
#' \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html}
#' @examples
#' \donttest{
#' set.seed(1)
#' reg_intervals(mpg ~ I(1/sqrt(disp)), data = mtcars)
#'
#' set.seed(1)
#' reg_intervals(mpg ~ I(1/sqrt(disp)), data = mtcars, keep_reps = TRUE)
#' }
reg_intervals <-
  function(formula, data, model_fn = "lm", type = "student-t",
           times = NULL, alpha = 0.05, filter = term != "(Intercept)",
           keep_reps = FALSE, ...) {
    model_fn <- rlang::arg_match(model_fn, c("lm", "glm", "survreg", "coxph"))
    type <- rlang::arg_match(type, c("student-t", "percentile"))
    # Capture the filter expression unevaluated; it is applied per resample.
    filter <- rlang::enexpr(filter)

    # Default number of resamples depends on the interval type.
    if (is.null(times)) {
      if (type == "student-t") {
        times <- 1001
      } else {
        times <- 2001
      }
    } else {
      times <- times[1]
      if (!is.numeric(times)) {
        rlang::abort("'times' should be a single integer.")
      }
    }

    if (length(alpha) != 1 || !is.numeric(alpha)) {
      abort("`alpha` must be a single numeric value.")
    }

    if (model_fn %in% c("survreg", "coxph")) {
      pkg <- "survival"
      rlang::check_installed("survival")
    } else {
      pkg <- NULL
    }
    # Build the model call once; it is evaluated against each analysis set.
    fn_call <- rlang::call2(model_fn, formula = formula,
                            data = rlang::expr(data), .ns = pkg, ...)

    # t-intervals need the apparent resample; percentile intervals do not.
    bt <- rsample::bootstraps(data, times = times,
                              apparent = type %in% c("student-t"))
    bt <-
      dplyr::mutate(bt,
                    models = purrr::map(splits,
                                        ~ model_results(rsample::analysis(.x),
                                                        fn_call, filter)))
    if (type == "student-t") {
      res <- int_t(bt, models, alpha = alpha)
    } else {
      res <- int_pctl(bt, models, alpha = alpha)
    }

    if (keep_reps) {
      # Drop the apparent fit; nest the per-bootstrap estimates by term.
      bt <- bt[bt$id != "Apparent",]
      reps <- purrr::map_dfr(bt$models, I)
      reps <- dplyr::group_nest(reps, term, .key = ".replicates")
      res <- dplyr::full_join(res, reps, by = "term")
    }
    res
  }

# TODO add handler for survival models to catch warnings? That seems to be the
# only way to know about convergence.

# Fit the captured model call on `data` and return a tidied tibble of
# term/estimate/std.error, optionally filtered by the expression `flt`.
model_results <- function(data, cl, flt) {
  mod <- broom::tidy(rlang::eval_tidy(cl, data))
  mod <- mod[, c("term", "estimate", "std.error")]
  if (is.language(flt)) {
    mod <- dplyr::filter(mod, !!flt)
  }
  mod
}
rsample/R/mc.R0000644000175000017500000000772614066674573013020 0ustar  nileshnilesh
#' Monte Carlo Cross-Validation
#'
#' One resample of Monte Carlo cross-validation takes a random sample (without
#' replacement) of the original data set to be used for analysis. All other
#' data points are added to the assessment set.
#' @template strata_details
#' @inheritParams vfold_cv
#' @inheritParams make_strata
#' @param prop The proportion of data to be retained for modeling/analysis.
#' @param times The number of times to repeat the sampling.
#' @export
#' @return A tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and
#' `data.frame`. The results include a column for the data split objects and a
#' column called `id` that has a character string with the resample identifier.
#' @examples
#' mc_cv(mtcars, times = 2)
#' mc_cv(mtcars, prop = .5, times = 2)
#'
#' library(purrr)
#' data(wa_churn, package = "modeldata")
#'
#' set.seed(13)
#' resample1 <- mc_cv(wa_churn, times = 3, prop = .5)
#' map_dbl(resample1$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' resample2 <- mc_cv(wa_churn, strata = churn, times = 3, prop = .5)
#' map_dbl(resample2$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' resample3 <- mc_cv(wa_churn, strata = tenure, breaks = 6, times = 3, prop = .5)
#' map_dbl(resample3$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#' @export
mc_cv <- function(data, prop = 3/4, times = 25,
                  strata = NULL, breaks = 4, pool = 0.1, ...) {
  # Resolve the tidyselect `strata` specification to a column name (or NULL).
  if(!missing(strata)) {
    strata <- tidyselect::vars_select(names(data), !!enquo(strata))
    if(length(strata) == 0) strata <- NULL
  }

  strata_check(strata, data)

  split_objs <-
    mc_splits(data = data,
              prop = prop,
              times = times,
              strata = strata,
              breaks = breaks,
              pool = pool)

  ## We remove the holdout indices since it will save space and we can
  ## derive them later when they are needed.
  split_objs$splits <- map(split_objs$splits, rm_out)

  mc_att <- list(prop = prop,
                 times = times,
                 strata = !is.null(strata))

  new_rset(splits = split_objs$splits,
           ids = split_objs$id,
           attrib = mc_att,
           subclass = c("mc_cv", "rset"))
}

# Get the indices of the assessment set from the analysis set
mc_complement <- function(ind, n) {
  list(analysis = ind,
       assessment = setdiff(1:n, ind))
}

# Draw `times` random analysis sets of size floor(n * prop) without
# replacement (optionally stratified) and wrap each as an "mc_split".
mc_splits <- function(data, prop = 3/4, times = 25,
                      strata = NULL, breaks = 4, pool = 0.1) {
  if (!is.numeric(prop) | prop >= 1 | prop <= 0)
    stop("`prop` must be a number on (0, 1).", call. = FALSE)

  n <- nrow(data)
  if (is.null(strata)) {
    indices <- purrr::map(rep(n, times), sample, size = floor(n * prop))
  } else {
    # Stratified: sample the proportion within each stratum.
    stratas <- tibble::tibble(idx = 1:n,
                              strata = make_strata(getElement(data, strata),
                                                   breaks = breaks,
                                                   pool = pool))
    stratas <- split_unnamed(stratas, stratas$strata)
    stratas <- purrr::map_df(stratas, strat_sample, prop = prop, times = times)
    indices <- split_unnamed(stratas$idx, stratas$rs_id)
  }
  indices <- lapply(indices, mc_complement, n = n)
  split_objs <-
    purrr::map(indices, make_splits, data = data, class = "mc_split")
  list(splits = split_objs,
       id = names0(length(split_objs), "Resample"))
}

# Sample floor(n * prop) rows from one stratum, `times` times; tags each
# draw with `rs_id` so draws can be regrouped into resamples.
strat_sample <- function(x, prop, times, ...) {
  n <- nrow(x)
  idx <- purrr::map(rep(n, times), sample, size = floor(n*prop), ...)
  out <- purrr::map_df(idx, function(ind, x) x[sort(ind), "idx"], x = x)
  out$rs_id <- rep(1:times, each = floor(n*prop))
  out
}

#' @export
print.mc_cv <- function(x, ...) {
  cat("#", pretty(x), "\n")
  # Strip the rset subclasses so the underlying tibble print method runs.
  class(x) <- class(x)[!(class(x) %in% c("mc_cv", "rset"))]
  print(x, ...)
}
rsample/R/tidy.R0000644000175000017500000000763513653053433013364 0ustar  nileshnilesh
#' Tidy Resampling Object
#'
#' The `tidy` function from the \pkg{broom} package can be used on `rset` and
#' `rsplit` objects to generate tibbles with which rows are in the analysis and
#' assessment sets.
#' @param x A `rset` or `rsplit` object
#' @param unique_ind Should unique row identifiers be returned? For example,
#' if `FALSE` then bootstrapping results will include multiple rows in the
#' sample for the same row in the original data.
#' @param ... Not currently used.
#' @return A tibble with columns `Row` and `Data`. The latter has possible
#' values "Analysis" or "Assessment". For `rset` inputs, identification columns
#' are also returned but their names and values depend on the type of
#' resampling. `vfold_cv` contains a column "Fold" and, if repeats are used,
#' another called "Repeats". `bootstraps` and `mc_cv` use the column
#' "Resample".
#' @details Note that for nested resampling, the rows of the inner resample,
#' named `inner_Row`, are *relative* row indices and do not correspond to the
#' rows in the original data set.
#' @examples #' library(ggplot2) #' theme_set(theme_bw()) #' #' set.seed(4121) #' cv <- tidy(vfold_cv(mtcars, v = 5)) #' ggplot(cv, aes(x = Fold, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' set.seed(4121) #' rcv <- tidy(vfold_cv(mtcars, v = 5, repeats = 2)) #' ggplot(rcv, aes(x = Fold, y = Row, fill = Data)) + #' geom_tile() + facet_wrap(~Repeat) + scale_fill_brewer() #' #' set.seed(4121) #' mccv <- tidy(mc_cv(mtcars, times = 5)) #' ggplot(mccv, aes(x = Resample, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' set.seed(4121) #' bt <- tidy(bootstraps(mtcars, time = 5)) #' ggplot(bt, aes(x = Resample, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' dat <- data.frame(day = 1:30) #' # Resample by week instead of day #' ts_cv <- rolling_origin(dat, initial = 7, assess = 7, #' skip = 6, cumulative = FALSE) #' ts_cv <- tidy(ts_cv) #' ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) + #' geom_tile() + scale_fill_brewer() #' @export tidy.rsplit <- function(x, unique_ind = TRUE, ...) { if (unique_ind) x$in_id <- unique(x$in_id) out <- tibble(Row = c(x$in_id, complement(x)), Data = rep(c("Analysis", "Assessment"), c(length(x$in_id), length(complement(x))))) out <- dplyr::arrange(.data = out, Data, Row) out } #' @rdname tidy.rsplit #' @export tidy.rset <- function(x, ...) { dots <- list(...) unique_ind <- dplyr::if_else(is.null(dots$unique_ind), TRUE, dots$unique_ind) stacked <- purrr::map(x$splits, tidy, unique_ind = unique_ind) for (i in seq(along = stacked)) stacked[[i]]$Resample <- x$id[i] stacked <- dplyr::bind_rows(stacked) stacked <- dplyr::arrange(.data = stacked, Data, Row) stacked } #' @rdname tidy.rsplit #' @export tidy.vfold_cv <- function(x, ...) 
{
  stacked <- purrr::map(x$splits, tidy)
  # With repeats, each split carries both a repeat id (`id`) and a fold id
  # (`id2`); otherwise `id` is the fold identifier.
  for (i in seq(along = stacked)) {
    if (attr(x, "repeats") > 1) {
      stacked[[i]]$Repeat <- x$id[i]
      stacked[[i]]$Fold <- x$id2[i]
    } else
      stacked[[i]]$Fold <- x$id[i]
  }
  stacked <- dplyr::bind_rows(stacked)
  stacked <- dplyr::arrange(.data = stacked, Data, Row)
  stacked
}

#' @rdname tidy.rsplit
#' @export
tidy.nested_cv <- function(x, ...) {
  # Tidy the inner resamples first; their columns are prefixed "inner_" so
  # they do not collide with the outer resample columns.
  x$inner_tidy <- purrr::map(x$inner_resamples, tidy_wrap)
  inner_tidy <- tidyr::unnest(x, inner_tidy)
  # Strip the nested_cv class so `tidy()` dispatches on the outer scheme.
  class(x) <- class(x)[class(x) != "nested_cv"]
  outer_tidy <- tidy(x)
  id_cols <- names(outer_tidy)
  id_cols <- id_cols[!(id_cols %in% c("Row", "Data"))]
  inner_id <- grep("^id", names(inner_tidy))
  if (length(inner_id) != length(id_cols))
    # fix: error message previously misspelled "tidy" as "tidt"
    stop("Cannot merge tidy data sets", call. = FALSE)
  names(inner_tidy)[inner_id] <- id_cols
  full_join(outer_tidy, inner_tidy, by = id_cols)
}

# Tidy one inner resample and prefix its columns with "inner_".
tidy_wrap <- function(x) {
  x <- tidy(x)
  names(x) <- paste0("inner_", names(x))
  x
}
rsample/R/form_pred.R0000644000175000017500000000216413726435071014363 0ustar nileshnilesh#' Extract Predictor Names from Formula or Terms
#'
#' `all.vars` returns all variables used in a formula. This
#' function only returns the variables explicitly used on the
#' right-hand side (i.e., it will not resolve dots unless the
#' object is terms with a data set specified).
#' @param object A model formula or [stats::terms()]
#' object.
#' @param ... Arguments to pass to [all.vars()]
#' @return A character vector of names
#' @export
#' @examples
#' form_pred(y ~ x + z)
#' form_pred(terms(y ~ x + z))
#'
#' form_pred(y ~ x + log(z))
#' form_pred(log(y) ~ x + z)
#'
#' form_pred(y1 + y2 ~ x + z)
#' form_pred(log(y1) + y2 ~ x + z)
#'
#' # will fail:
#' # form_pred(y ~ .)
#'
#' form_pred(terms(mpg ~ (.)^2, data = mtcars))
#' form_pred(terms( ~ (.)^2, data = mtcars))
#' @importFrom stats terms
form_pred <- function(object, ...)
{ if(inherits(object, "formula")) { object <- terms(object) } y_index <- attr(object, "response") ## If there is something on the lhs of the formula, ## remove it and get vars if(y_index != 0) { object[[2]] <- NULL object <- terms(object) } all.vars(object, ...) } rsample/R/labels.R0000644000175000017500000001625113755042647013657 0ustar nileshnilesh#' Find Labels from rset Object #' #' Produce a vector of resampling labels (e.g. "Fold1") from #' an `rset` object. Currently, `nested_cv` #' is not supported. #' #' @param object An `rset` object #' @param make_factor A logical for whether the results should be #' a character or a factor. #' @param ... Not currently used. #' @return A single character or factor vector. #' @export #' @examples #' labels(vfold_cv(mtcars)) labels.rset <- function(object, make_factor = FALSE, ...) { if (inherits(object, "nested_cv")) stop("`labels` not implemented for nested resampling", call. = FALSE) if (make_factor) as.factor(object$id) else as.character(object$id) } #' @rdname labels.rset #' @export labels.vfold_cv <- function(object, make_factor = FALSE, ...) { if (inherits(object, "nested_cv")) stop("`labels` not implemented for nested resampling", call. = FALSE) is_repeated <- attr(object, "repeats") > 1 if (is_repeated) { out <- as.character(paste(object$id, object$id2, sep = ".")) } else out <- as.character(object$id) if (make_factor) out <- as.factor(out) out } #' Find Labels from rsplit Object #' #' Produce a tibble of identification variables so that single #' splits can be linked to a particular resample. #' #' @param object An `rsplit` object #' @param ... Not currently used. #' @return A tibble. #' @seealso add_resample_id #' @export #' @examples #' cv_splits <- vfold_cv(mtcars) #' labels(cv_splits$splits[[1]]) labels.rsplit <- function(object, ...) { out <- if ("id" %in% names(object)) object$id else tibble() out } ## The `pretty` methods below are good for when you need to ## textually describe the resampling procedure. 
Note that they ## can have more than one element (in the case of nesting) #' Short Descriptions of rsets #' #' Produce a character vector describing the resampling method. #' #' @param x An `rset` object #' @param ... Not currently used. #' @return A character vector. #' @export pretty.vfold_cv #' @export #' @method pretty vfold_cv #' @keywords internal pretty.vfold_cv <- function(x, ...) { details <- attributes(x) res <- paste0(details$v, "-fold cross-validation") if (details$repeats > 1) res <- paste(res, "repeated", details$repeats, "times") if (details$strata) res <- paste(res, "using stratification") res } #' @export pretty.loo_cv #' @export #' @method pretty loo_cv #' @rdname pretty.vfold_cv pretty.loo_cv <- function(x, ...) "Leave-one-out cross-validation" #' @export pretty.apparent #' @export #' @method pretty apparent #' @rdname pretty.vfold_cv pretty.apparent <- function(x, ...) "Apparent sampling" #' @export pretty.rolling_origin #' @export #' @method pretty rolling_origin #' @rdname pretty.vfold_cv pretty.rolling_origin <- function(x, ...) "Rolling origin forecast resampling" #' @export pretty.sliding_window #' @export #' @method pretty sliding_window #' @rdname pretty.vfold_cv pretty.sliding_window <- function(x, ...) "Sliding window resampling" #' @export pretty.sliding_index #' @export #' @method pretty sliding_index #' @rdname pretty.vfold_cv pretty.sliding_index <- function(x, ...) "Sliding index resampling" #' @export pretty.sliding_period #' @export #' @method pretty sliding_period #' @rdname pretty.vfold_cv pretty.sliding_period <- function(x, ...) "Sliding period resampling" #' @export pretty.mc_cv #' @export #' @method pretty mc_cv #' @rdname pretty.vfold_cv pretty.mc_cv <- function(x, ...) 
{ details <- attributes(x) res <- paste0( "Monte Carlo cross-validation (", signif(details$prop, 2), "/", signif(1 - details$prop, 2), ") with ", details$times, " resamples " ) if (details$strata) res <- paste(res, "using stratification") res } #' @export pretty.validation_split #' @export #' @method pretty validation_split #' @rdname pretty.vfold_cv pretty.validation_split <- function(x, ...) { details <- attributes(x) res <- paste0( "Validation Set Split (", signif(details$prop, 2), "/", signif(1 - details$prop, 2), ") " ) if (details$strata) res <- paste(res, "using stratification") res } #' @export pretty.nested_cv #' @export #' @method pretty nested_cv #' @rdname pretty.vfold_cv pretty.nested_cv <- function(x, ...) { details <- attributes(x) if (is_call(details$outside)) { class(x) <- class(x)[!(class(x) == "nested_cv")] outer_label <- pretty(x) } else { outer_label <- paste0("`", deparse(details$outside), "`") } inner_label <- if (is_call(details$inside)) pretty(x$inner_resamples[[1]]) else paste0("`", deparse(details$inside), "`") res <- c("Nested resampling:", paste(" outer:", outer_label), paste(" inner:", inner_label)) res } #' @export pretty.bootstraps #' @export #' @method pretty bootstraps #' @rdname pretty.vfold_cv pretty.bootstraps <- function(x, ...) { details <- attributes(x) res <- "Bootstrap sampling" if (details$strata) res <- paste(res, "using stratification") if (details$apparent) res <- paste(res, "with apparent sample") res } #' @export pretty.permutations #' @export #' @method pretty permutations #' @rdname pretty.vfold_cv pretty.permutations <- function(x, ...) { details <- attributes(x) res <- "Permutation sampling" if (details$apparent) res <- paste(res, "with apparent sample") res } #' @export pretty.group_vfold_cv #' @export #' @method pretty group_vfold_cv #' @rdname pretty.vfold_cv pretty.group_vfold_cv <- function(x, ...) 
{
  details <- attributes(x)
  paste0("Group ", details$v, "-fold cross-validation")
}

#' @export pretty.manual_rset
#' @export
#' @method pretty manual_rset
#' @rdname pretty.vfold_cv
pretty.manual_rset <- function(x, ...) {
  "Manual resampling"
}

#' Augment a data set with resampling identifiers
#'
#' For a data set, `add_resample_id()` will add at least one new column that
#' identifies which resample that the data came from. In most cases, a single
#' column is added but for some resampling methods, two or more are added.
#' @param .data A data frame
#' @param split A single `rsplit` object.
#' @param dots A single logical: should the id columns be prefixed with a "."
#' to avoid name conflicts with `.data`?
#' @return An updated data frame.
#' @examples
#' library(dplyr)
#'
#' set.seed(363)
#' car_folds <- vfold_cv(mtcars, repeats = 3)
#'
#' analysis(car_folds$splits[[1]]) %>%
#'   add_resample_id(car_folds$splits[[1]]) %>%
#'   head()
#'
#' car_bt <- bootstraps(mtcars)
#'
#' analysis(car_bt$splits[[1]]) %>%
#'   add_resample_id(car_bt$splits[[1]]) %>%
#'   head()
#' @seealso labels.rsplit
#' @export
add_resample_id <- function(.data, split, dots = FALSE) {
  if (!inherits(dots, "logical") || length(dots) > 1) {
    stop("`dots` should be a single logical.", call. = FALSE)
  }
  if (!inherits(.data, "data.frame")) {
    stop("`.data` should be a data frame.", call. = FALSE)
  }
  if (!inherits(split, "rsplit")) {
    stop("`split` should be a single 'rset' object.", call. = FALSE)
  }

  labs <- labels(split)

  # fix: the check was inverted (`&& nrow(labs) == 1` only failed when the
  # labels were NOT a tibble but DID have one row). The intent is to reject
  # anything that is not a single-row tibble of identification columns.
  if (!tibble::is_tibble(labs) || nrow(labs) != 1) {
    stop("`split` should be a single 'rset' object.", call. = FALSE)
  }

  if (dots) {
    colnames(labs) <- paste0(".", colnames(labs))
  }

  cbind(.data, labs)
}
rsample/R/vfold.R0000644000175000017500000001135214136023335013507 0ustar nileshnilesh#' V-Fold Cross-Validation
#'
#' V-fold cross-validation (also known as k-fold cross-validation) randomly
#' splits the data into V groups of roughly equal size (called "folds").
A
#' resample of the analysis data consists of V-1 of the folds while the
#' assessment set contains the final fold. In basic V-fold cross-validation
#' (i.e. no repeats), the number of resamples is equal to V.
#' @details With more than one repeat, the basic V-fold cross-validation is
#' conducted each time. For example, if three repeats are used with `v = 10`,
#' there are a total of 30 splits: three groups of 10 that are generated
#' separately.
#' @template strata_details
#' @inheritParams make_strata
#' @param data A data frame.
#' @param v The number of partitions of the data set.
#' @param repeats The number of times to repeat the V-fold partitioning.
#' @param strata A variable in `data` (single character or name) used to conduct
#' stratified sampling. When not `NULL`, each resample is created within the
#' stratification variable. Numeric `strata` are binned into quartiles.
#' @param ... Not currently used.
#' @export
#' @return A tibble with classes `vfold_cv`, `rset`, `tbl_df`, `tbl`, and
#' `data.frame`. The results include a column for the data split objects and
#' one or more identification variables. For a single repeat, there will be
#' one column called `id` that has a character string with the fold identifier.
#' For repeats, `id` is the repeat number and an additional column called `id2`
#' that contains the fold information (within repeat).
#' @examples
#' vfold_cv(mtcars, v = 10)
#' vfold_cv(mtcars, v = 10, repeats = 2)
#'
#' library(purrr)
#' data(wa_churn, package = "modeldata")
#'
#' set.seed(13)
#' folds1 <- vfold_cv(wa_churn, v = 5)
#' map_dbl(folds1$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' folds2 <- vfold_cv(wa_churn, strata = churn, v = 5)
#' map_dbl(folds2$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#'
#' set.seed(13)
#' folds3 <- vfold_cv(wa_churn, strata = tenure, breaks = 6, v = 5)
#' map_dbl(folds3$splits,
#'         function(x) {
#'           dat <- as.data.frame(x)$churn
#'           mean(dat == "Yes")
#'         })
#' @export
vfold_cv <- function(data, v = 10, repeats = 1,
                     strata = NULL, breaks = 4, pool = 0.1, ...) {
  # Resolve the (optionally unquoted) stratification column name.
  if (!missing(strata)) {
    strata <- tidyselect::vars_select(names(data), !!enquo(strata))
    if (length(strata) == 0) strata <- NULL
  }

  strata_check(strata, data)

  if (repeats == 1) {
    split_objs <- vfold_splits(data = data, v = v, strata = strata,
                               breaks = breaks, pool = pool)
  } else {
    for (i in 1:repeats) {
      # fix: `breaks` was previously not forwarded here, so repeated CV with
      # a numeric `strata` silently ignored the user's `breaks` value and
      # always binned with the default of 4.
      tmp <- vfold_splits(data = data, v = v, strata = strata,
                          breaks = breaks, pool = pool)
      tmp$id2 <- tmp$id
      tmp$id <- names0(repeats, "Repeat")[i]
      split_objs <- if (i == 1) tmp else rbind(split_objs, tmp)
    }
  }

  ## We remove the holdout indices since it will save space and we can
  ## derive them later when they are needed.
  split_objs$splits <- map(split_objs$splits, rm_out)

  ## Save some overall information
  cv_att <- list(v = v, repeats = repeats, strata = !is.null(strata))

  new_rset(splits = split_objs$splits,
           ids = split_objs[, grepl("^id", names(split_objs))],
           attrib = cv_att,
           subclass = c("vfold_cv", "rset"))
}

vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4, pool = 0.1) {
  if (!is.numeric(v) || length(v) != 1)
    stop("`v` must be a single integer.", call.
= FALSE) n <- nrow(data) if (is.null(strata)) { folds <- sample(rep(1:v, length.out = n)) idx <- seq_len(n) indices <- split_unnamed(idx, folds) } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), breaks = breaks, pool = pool)) stratas <- split_unnamed(stratas, stratas$strata) stratas <- purrr::map(stratas, add_vfolds, v = v) stratas <- dplyr::bind_rows(stratas) indices <- split_unnamed(stratas$idx, stratas$folds) } indices <- lapply(indices, default_complement, n = n) split_objs <- purrr::map(indices, make_splits, data = data, class = "vfold_split") tibble::tibble(splits = split_objs, id = names0(length(split_objs), "Fold")) } add_vfolds <- function(x, v) { x$folds <- sample(rep(1:v, length.out = nrow(x))) x } #' @export print.vfold_cv <- function(x, ...) { cat("# ", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("vfold_cv", "rset"))] print(x, ...) } rsample/R/0_imports.R0000644000175000017500000000173314010267142014310 0ustar nileshnilesh#' @importFrom utils globalVariables #' @importFrom purrr map map_df map2 map_dfr map_dbl pluck map_lgl #' @importFrom tibble tibble is_tibble as_tibble obj_sum type_sum #' @importFrom tidyr gather unnest #' @importFrom dplyr select %>% bind_cols bind_rows arrange_ arrange full_join #' @importFrom dplyr mutate last ungroup group_by inner_join summarize do case_when #' @importFrom rlang !! 
is_call is_string enquo quos exec is_list abort warn #' @importFrom methods formalArgs #' @importFrom stats quantile setNames qnorm pnorm #' @importFrom tidyselect vars_select one_of #' @importFrom furrr future_map_dfr #' @importFrom tidyr gather # ------------------------------------------------------------------------------ utils::globalVariables( c("model", "splits", "statistic", "Data", "Row", "id", ".", ".estimate", ".lower", ".upper", "Z0", "Za", "Zl", "Zu", "a", "cubed", "estimate", "orig", "p0", "squared", "term", "theta_0", "loo", "n", "..index", "models" ) ) rsample/R/rolling_origin.R0000644000175000017500000001051413727757057015435 0ustar nileshnilesh#' Rolling Origin Forecast Resampling #' #' This resampling method is useful when the data set has a strong time #' component. The resamples are not random and contain data points that are #' consecutive values. The function assumes that the original data set are #' sorted in time order. #' @details The main options, `initial` and `assess`, control the number of #' data points from the original data that are in the analysis and assessment #' set, respectively. When `cumulative = TRUE`, the analysis set will grow as #' resampling continues while the assessment set size will always remain #' static. #' `skip` enables the function to not use every data point in the resamples. #' When `skip = 0`, the resampling data sets will increment by one position. #' Suppose that the rows of a data set are consecutive days. Using `skip = 6` #' will make the analysis data set to operate on *weeks* instead of days. The #' assessment set size is not affected by this option. #' @seealso #' [sliding_window()], [sliding_index()], and [sliding_period()] for additional #' time based resampling functions. #' @inheritParams vfold_cv #' @param initial The number of samples used for analysis/modeling in the #' initial resample. #' @param assess The number of samples used for each assessment resample. #' @param cumulative A logical. 
Should the analysis resample grow beyond the #' size specified by `initial` at each resample?. #' @param skip A integer indicating how many (if any) _additional_ resamples #' to skip to thin the total amount of data points in the analysis resample. #' See the example below. #' @param lag A value to include a lag between the assessment #' and analysis set. This is useful if lagged predictors will be used #' during training and testing. #' @export #' @return An tibble with classes `rolling_origin`, `rset`, `tbl_df`, `tbl`, #' and `data.frame`. The results include a column for the data split objects #' and a column called `id` that has a character string with the resample #' identifier. #' @examples #' set.seed(1131) #' ex_data <- data.frame(row = 1:20, some_var = rnorm(20)) #' dim(rolling_origin(ex_data)) #' dim(rolling_origin(ex_data, skip = 2)) #' dim(rolling_origin(ex_data, skip = 2, cumulative = FALSE)) #' #' # You can also roll over calendar periods by first nesting by that period, #' # which is especially useful for irregular series where a fixed window #' # is not useful. This example slides over 5 years at a time. #' library(dplyr) #' library(tidyr) #' data(drinks, package = "modeldata") #' #' drinks_annual <- drinks %>% #' mutate(year = as.POSIXlt(date)$year + 1900) %>% #' nest(-year) #' #' multi_year_roll <- rolling_origin(drinks_annual, cumulative = FALSE) #' #' analysis(multi_year_roll$splits[[1]]) #' assessment(multi_year_roll$splits[[1]]) #' #' @export rolling_origin <- function(data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, lag = 0, ...) { n <- nrow(data) if (n < initial + assess) stop("There should be at least ", initial + assess, " nrows in `data`", call. = FALSE) if (!is.numeric(lag) | !(lag%%1==0)) { stop("`lag` must be a whole number.", call. = FALSE) } if (lag > initial) { stop("`lag` must be less than or equal to the number of training observations.", call. 
= FALSE) } stops <- seq(initial, (n - assess), by = skip + 1) starts <- if (!cumulative) { stops - initial + 1 } else { starts <- rep(1, length(stops)) } in_ind <- mapply(seq, starts, stops, SIMPLIFY = FALSE) out_ind <- mapply(seq, stops + 1 - lag, stops + assess, SIMPLIFY = FALSE) indices <- mapply(merge_lists, in_ind, out_ind, SIMPLIFY = FALSE) split_objs <- purrr::map(indices, make_splits, data = data, class = "rof_split") split_objs <- list(splits = split_objs, id = names0(length(split_objs), "Slice")) roll_att <- list(initial = initial, assess = assess, cumulative = cumulative, skip = skip, lag = lag) new_rset(splits = split_objs$splits, ids = split_objs$id, attrib = roll_att, subclass = c("rolling_origin", "rset")) } #' @export print.rolling_origin <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("rolling_origin", "rset"))] print(x, ...) } rsample/R/apparent.R0000644000175000017500000000255614020222440014203 0ustar nileshnilesh#' Sampling for the Apparent Error Rate #' #' When building a model on a data set and re-predicting the same data, the #' performance estimate from those predictions is often called the #' "apparent" performance of the model. This estimate can be wildly #' optimistic. "Apparent sampling" here means that the analysis and #' assessment samples are the same. These resamples are sometimes used in #' the analysis of bootstrap samples and should otherwise be #' avoided like old sushi. #' #' @inheritParams vfold_cv #' @return A tibble with a single row and classes `apparent`, #' `rset`, `tbl_df`, `tbl`, and `data.frame`. The #' results include a column for the data split objects and one column #' called `id` that has a character string with the resample identifier. #' @examples #' apparent(mtcars) #' @export apparent <- function(data, ...) 
{ splits <- rsplit(data, in_id = 1:nrow(data), out_id = 1:nrow(data)) # splits <- rm_out(splits) class(splits) <- c("rsplit", "apparent_split") split_objs <- tibble::tibble(splits = list(splits), id = "Apparent") new_rset(splits = split_objs$splits, ids = split_objs$id, attrib = NULL, subclass = c("apparent", "rset")) } #' @export print.apparent <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("apparent", "rset"))] print(x, ...) } rsample/R/initial_split.R0000644000175000017500000000526014122413046015237 0ustar nileshnilesh#' Simple Training/Test Set Splitting #' #' `initial_split` creates a single binary split of the data into a training #' set and testing set. `initial_time_split` does the same, but takes the #' _first_ `prop` samples for training, instead of a random selection. #' `training` and `testing` are used to extract the resulting data. #' @template strata_details #' @inheritParams vfold_cv #' @inheritParams make_strata #' @param prop The proportion of data to be retained for modeling/analysis. #' @export #' @return An `rsplit` object that can be used with the `training` and `testing` #' functions to extract the data in each split. #' @examples #' set.seed(1353) #' car_split <- initial_split(mtcars) #' train_data <- training(car_split) #' test_data <- testing(car_split) #' #' data(drinks, package = "modeldata") #' drinks_split <- initial_time_split(drinks) #' train_data <- training(drinks_split) #' test_data <- testing(drinks_split) #' c(max(train_data$date), min(test_data$date)) # no lag #' #' # With 12 period lag #' drinks_lag_split <- initial_time_split(drinks, lag = 12) #' train_data <- training(drinks_lag_split) #' test_data <- testing(drinks_lag_split) #' c(max(train_data$date), min(test_data$date)) # 12 period lag #' #' @export #' initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) 
{ if (!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if (length(strata) == 0) { strata <- NULL } } res <- mc_cv( data = data, prop = prop, strata = strata, breaks = breaks, pool = pool, times = 1, ... ) res$splits[[1]] } #' @rdname initial_split #' @param lag A value to include a lag between the assessment #' and analysis set. This is useful if lagged predictors will be used #' during training and testing. #' @export initial_time_split <- function(data, prop = 3/4, lag = 0, ...) { if (!is.numeric(prop) | prop >= 1 | prop <= 0) { rlang::abort("`prop` must be a number on (0, 1).") } if (!is.numeric(lag) | !(lag%%1 == 0)) { stop("`lag` must be a whole number.", call. = FALSE) } n_train <- floor(nrow(data) * prop) if (lag > n_train) { stop("`lag` must be less than or equal to the number of training observations.", call. = FALSE) } split <- rsplit(data, 1:n_train, (n_train + 1 - lag):nrow(data)) splits <- list(split) ids <- "Resample1" rset <- new_rset(splits, ids) rset$splits[[1]] } #' @rdname initial_split #' @export #' @param x An `rsplit` object produced by `initial_split` training <- function(x) analysis(x) #' @rdname initial_split #' @export testing <- function(x) assessment(x) rsample/R/gather.R0000644000175000017500000000500214122424444013643 0ustar nileshnilesh#' Gather an `rset` Object #' #' @description #' `r lifecycle::badge("deprecated")` #' #' This method uses `gather` on an `rset` object to stack all of #' the non-ID or split columns in the data and is useful for #' stacking model evaluation statistics. The resulting data frame #' has a column based on the column names of `data` and another for #' the values. This method is now deprecated in favor of using #' [tidyr::pivot_longer()] directly. #' #' @param data An `rset` object. #' @param key,value,... Not specified in this method and will be #' ignored. Note that this means that selectors are ignored if #' they are passed to the function. 
#' @param na.rm If `TRUE`, will remove rows from output where the #' value column in `NA`. #' @param convert If `TRUE` will automatically run #' `type.convert()` on the key column. This is useful if the column #' names are actually numeric, integer, or logical. #' @param factor_key If FALSE, the default, the key values will be #' stored as a character vector. If `TRUE`, will be stored as a #' factor, which preserves the original ordering of the columns. #' @return A data frame with the ID columns, a column called #' `model` (with the previous column names), and a column called #' `statistic` (with the values). #' @examples #' library(rsample) #' cv_obj <- vfold_cv(mtcars, v = 10) #' cv_obj$lm_rmse <- rnorm(10, mean = 2) #' cv_obj$nnet_rmse <- rnorm(10, mean = 1) #' #' ## now deprecated for rset objects: #' ## gather(cv_obj) #' #' ## instead of gather, use tidyr::pivot_longer: #' library(tidyr) #' library(dplyr) #' cv_obj %>% #' select(-splits) %>% #' pivot_longer(-id) #' #' @export gather.rset #' @export #' @keywords internal #' @method gather rset gather.rset <- function(data, key = NULL, value = NULL, ..., na.rm = TRUE, convert = FALSE, factor_key = TRUE) { lifecycle::deprecate_stop("0.1.0", "gather.rset()", "tidyr::pivot_longer()") if (any(names(data) == "splits")) { data <- data %>% dplyr::select(-splits) } data <- as.data.frame(data) id_vars <- grep("^id", names(data), value = TRUE) other_vars <- names(data)[!(names(data) %in% id_vars)] if (length(other_vars) < 2) { rlang::abort( paste0( "There should be at least two other columns ", "(besides `id` variables) in the data set to ", "use `gather.rset()`." ) ) } tidyr::gather( data, key = model, value = statistic, - !!id_vars, na.rm = na.rm, convert = convert, factor_key = factor_key ) } rsample/R/caret.R0000644000175000017500000001122714033610403013466 0ustar nileshnilesh#' Convert Resampling Objects to Other Formats #' #' These functions can convert resampling objects between #' \pkg{rsample} and \pkg{caret}. 
#' #' @param object An `rset` object. Currently, #' `nested_cv` is not supported. #' @return `rsample2caret` returns a list that mimics the #' `index` and `indexOut` elements of a #' `trainControl` object. `caret2rsample` returns an #' `rset` object of the appropriate class. #' @export rsample2caret <- function(object, data = c("analysis", "assessment")) { if(!inherits(object, "rset")) stop("`object` must be an `rset`", call. = FALSE) data <- rlang::arg_match(data) in_ind <- purrr::map(object$splits, as.integer, data = "analysis") names(in_ind) <- labels(object) out_ind <- purrr::map(object$splits, as.integer, data = "assessment") names(out_ind) <- names(in_ind) list(index = in_ind, indexOut = out_ind) } #' @rdname rsample2caret #' @param ctrl An object produced by `trainControl` that has #' had the `index` and `indexOut` elements populated by #' integers. One method of getting this is to extract the #' `control` objects from an object produced by `train`. #' @param data The data that was originally used to produce the #' `ctrl` object. #' @export caret2rsample <- function(ctrl, data = NULL) { if (is.null(data)) stop("Must supply original data", call. = FALSE) if (!any(names(ctrl) == "index")) stop("`ctrl` should have an element `index`", call. = FALSE) if (!any(names(ctrl) == "indexOut")) stop("`ctrl` should have an element `indexOut`", call. = FALSE) if (is.null(ctrl$index)) stop("`ctrl$index` should be populated with integers", call. = FALSE) if (is.null(ctrl$indexOut)) stop("`ctrl$indexOut` should be populated with integers", call. 
= FALSE) indices <- purrr::map2(ctrl$index, ctrl$indexOut, extract_int) id_data <- names(indices) indices <- unname(indices) indices <- purrr::map(indices, add_data, y = data) indices <- map(indices, add_rsplit_class, cl = map_rsplit_method(ctrl$method)) indices <- tibble::tibble(splits = indices) if (ctrl$method %in% c("repeatedcv", "adaptive_cv")) { id_data <- strsplit(id_data, split = ".", fixed = TRUE) id_data <- tibble::tibble( id = vapply(id_data, function(x) x[2], character(1)), id2 = vapply(id_data, function(x) x[1], character(1)) ) } else { id_data <- tibble::tibble(id = id_data) } new_rset(splits = indices$splits, ids = id_data[, grepl("^id", names(id_data))], attrib = map_attr(ctrl), subclass = c(map_rset_method(ctrl$method), "rset")) } extract_int <- function(x, y) list(in_id = x, out_id = y) add_data <- function(x, y) c(list(data = y), x) add_rsplit_class <- function(x, cl) { class(x) <- c("rsplit", cl) x } add_rset_class <- function(x, cl) { class(x) <- c(cl, "rset", "tbl_df", "tbl", "data.frame") x } map_rsplit_method <- function(method) { out <- switch( method, cv = , repeatedcv = , adaptive_cv = "vfold_split", boot = , boot_all =, boot632 = , optimism_boot = , adaptive_boot = "boot_split", LOOCV = "loo_split", LGOCV = , adaptive_LGOCV = "mc_split", timeSlice = "rof_split", "error" ) if (out == "error") stop("Resampling method `", method, "` cannot be converted into an `rsplit` object", call. = FALSE) out } map_rset_method <- function(method) { out <- switch( method, cv = , repeatedcv = , adaptive_cv = "vfold_cv", boot = , boot_all =, boot632 = , optimism_boot = , adaptive_boot = "bootstraps", LOOCV = "loo_cv", LGOCV = , adaptive_LGOCV = "mc_cv", timeSlice = "rolling_origin", "error" ) if (out == "error") stop("Resampling method `", method, "` cannot be converted into an `rset` object", call. 
= FALSE) out } map_attr <- function(object) { if (grepl("cv$", object$method)) { out <- list(v = object$number, repeats = ifelse(!is.na(object$repeats), object$repeats, 1), strata = TRUE) } else if (grepl("boot", object$method)) { out <- list(times = object$number, apparent = FALSE, strata = FALSE) } else if (grepl("LGOCV$", object$method)) { out <- list(times = object$number, prop = object$p, strata = FALSE) } else if (object$method == "LOOCV") { out <- list() } else if (object$method == "timeSlice") { out <- list( initial = object$initialWindow, assess = object$horizon, cumulative = !object$fixedWindow, skip = object$skip ) } else { stop("Method", object$method, "cannot be converted") } out } rsample/R/make_strata.R0000644000175000017500000001103114066674573014704 0ustar nileshnilesh#' Create or Modify Stratification Variables #' #' This function can create strata from numeric data and make non-numeric data #' more conducive for stratification. #' #' @details #' For numeric data, if the number of unique levels is less than #' `nunique`, the data are treated as categorical data. #' #' For categorical inputs, the function will find levels of `x` than #' occur in the data with percentage less than `pool`. The values from #' these groups will be randomly assigned to the remaining strata (as will #' data points that have missing values in `x`). #' #' For numeric data with more unique values than `nunique`, the data #' will be converted to being categorical based on percentiles of the data. #' The percentile groups will have no more than 20 percent of the data in #' each group. Again, missing values in `x` are randomly assigned #' to groups. #' #' @param x An input vector. #' @param breaks A single number giving the number of bins desired to stratify a #' numeric stratification variable. #' @param nunique An integer for the number of unique value threshold in the #' algorithm. 
#' @param pool A proportion of data used to determine if a particular group is
#' too small and should be pooled into another group. We do not recommend
#' decreasing this argument below its default of 0.1 because of the dangers
#' of stratifying groups that are too small.
#' @param depth An integer that is used to determine the best number of
#' percentiles that should be used. The number of bins are based on
#' `min(5, floor(n / depth))` where `n = length(x)`.
#' If `x` is numeric, there must be at least 40 rows in the data set
#' (when `depth = 20`) to conduct stratified sampling.
#'
#' @export
#' @return A factor vector.
#' @examples
#' set.seed(61)
#' x1 <- rpois(100, lambda = 5)
#' table(x1)
#' table(make_strata(x1))
#'
#' set.seed(554)
#' x2 <- rpois(100, lambda = 1)
#' table(x2)
#' table(make_strata(x2))
#'
#' # small groups are randomly assigned
#' x3 <- factor(x2)
#' table(x3)
#' table(make_strata(x3))
#'
#' # `oilType` data from `caret`
#' x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2))
#' table(x4)
#' table(make_strata(x4))
#' table(make_strata(x4, pool = 0.1))
#' table(make_strata(x4, pool = 0.0))
#'
#' # not enough data to stratify
#' x5 <- rnorm(20)
#' table(make_strata(x5))
#'
#' set.seed(483)
#' x6 <- rnorm(200)
#' quantile(x6, probs = (0:10)/10)
#' table(make_strata(x6, breaks = 10))
#' @export
make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
  default_pool <- 0.1
  num_vals <- unique(stats::na.omit(x))
  n <- length(x)
  # Categorical path: low-cardinality numeric, character, or factor input is
  # stratified on its distinct values.
  if (length(num_vals) <= nunique | is.character(x) | is.factor(x)) {
    x <- factor(x)
    xtab <- sort(table(x))
    # per-level frequencies as proportions of the full vector
    pcts <- xtab / n
    ## This should really be based on some combo of rate and number.
    # Every level is below the pooling threshold: nothing can be stratified,
    # so return a single constant stratum.
    if (all(pcts < pool)) {
      rlang::warn(c("Too little data to stratify.", "Resampling will be unstratified."))
      return(factor(rep("strata1", n)))
    }
    # The caller lowered `pool` below the recommended default and small
    # groups would actually be kept as strata; warn but proceed.
    if (pool < default_pool & any(pcts < default_pool))
      rlang::warn(c(
        paste0("Stratifying groups that make up ",
               round(100 * pool), "% of the data may be ",
               "statistically risky."),
        "Consider increasing `pool` to at least 0.1"
      ))
    ## Small groups will be randomly allocated to stratas at end
    ## These should probably go into adjacent groups but this works for now
    if (any(pcts < pool)) x[x %in% names(pcts)[pcts < pool]] <- NA
    ## The next line will also relevel the data if `x` was a factor
    out <- factor(as.character(x))
  } else {
    # Numeric path: bin by (approximate) quantiles into `breaks` groups.
    if (breaks < 2) {
      rlang::warn(c("The bins specified by `breaks` must be >=2.", "Resampling will be unstratified."))
      return(factor(rep("strata1", n)))
    } else if (floor(n / breaks) < depth) {
      # Requested bins would each hold fewer than `depth` observations;
      # the number of breaks is reduced below.
      rlang::warn(c(
        paste0("The number of observations in each quantile is ",
               "below the recommended threshold of ", depth, "."),
        paste0("Stratification will use ", floor(n/depth), " breaks instead.")
      ))
    }
    breaks <- min(breaks, floor(n/depth))
    if (breaks < 2) {
      rlang::warn(c("Too little data to stratify.", "Resampling will be unstratified."))
      return(factor(rep("strata1", n)))
    }
    pctls <- quantile(x, probs = (0:breaks) / breaks, na.rm = TRUE)
    # duplicate percentiles (ties in `x`) would make `cut()` fail
    pctls <- unique(pctls)
    out <- cut(x, breaks = pctls, include.lowest = TRUE)
  }
  # Missing values (including pooled small groups set to NA above) are
  # randomly assigned to one of the remaining strata.
  num_miss <- sum(is.na(x))
  if (num_miss > 0)
    out[is.na(x)] <- sample(levels(out), size = num_miss, replace = TRUE)
  out
}
rsample/R/compat-dplyr-old.R0000644000175000017500000000353113673171773015573 0ustar nileshnilesh# These compat methods are for dplyr < 1.0.0.
# They are conditionally registered in `zzz.R` if an old version of dplyr is
# being used. Otherwise the methods in `compat-dplyr.R` are registered.
# Eventually these should be deprecated and dplyr 1.0.0 should be required.
# Only verbs that were previously supported in rsample are continually
# supported now.
This means that there is more complete support when a user
# upgrades to dplyr >= 1.0.0. When support for dplyr < 1.0.0 is deprecated,
# move the tests in `compat-dplyr-old.R` to `compat-dplyr.R`. The methods
# are named with underscores between the generic and the class, not with dots,
# because when working interactively with `load_all()`, just having
# a function named `mutate.rset` will register a dplyr method internally in
# the package, even if you are on dplyr 1.0.0, which is highly confusing and
# undesirable. So we work around that by naming them in such a way that they
# are not picked up by S3 registration, and then we manually register them
# with the `method` argument of `s3_register()`.

# TRUE when the installed dplyr predates the 1.0.0 reconstruction generics
# (i.e. version <= 0.8.5).
dplyr_pre_1.0.0 <- function() {
  utils::packageVersion("dplyr") <= "0.8.5"
}

# Registered in `.onLoad()`
mutate_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}

# Registered in `.onLoad()`
arrange_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}

# Registered in `.onLoad()`
filter_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}

# Registered in `.onLoad()`
rename_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}

# Registered in `.onLoad()`
select_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}

# Registered in `.onLoad()`
slice_rset <- function(.data, ...) {
  out <- NextMethod()
  rset_reconstruct(out, .data)
}
rsample/R/compat-vctrs.R0000644000175000017500000006321013727757057015033 0ustar nileshnilesh#' @import vctrs
NULL

# `vec_restore()`
#
# Called at the end of `vec_slice()` and `vec_ptype()` after all slicing has
# been done on the proxy object.
#
# If no changes have been made to the row/column structure of rset specific
# columns, then we can return an rset subclass. Otherwise, the resulting
# object is no longer guaranteed to return a valid rset, and we have to
# fallback to a bare tibble.
#
# It is very important that the result of `vec_ptype()` is a bare tibble.
# This ensures that the `vec_ptype2.<rset-subclass>.<rset-subclass>()` methods
# never get called. `vec_ptype()` is able to return a bare tibble because it
# essentially takes a 0-row slice of the rset, and then calls `vec_restore()`.
# Because the row structure has been modified, we return a bare tibble from
# `vec_restore.<rset-subclass>()`.
#
# Currently `vec_restore()` uses inheritance in vctrs, but I don't expect this
# to be the case in the future. For that reason, I use explicit methods for
# each individual rset subclass, rather than implementing `vec_restore.rset()`.

# `vec_ptype2()`
#
# The purpose of `vec_ptype2()` methods is generally to determine the type
# of the output in operations like `vec_c()` or `vec_rbind()`. However, this
# implementation does not use any custom `vec_ptype2()` methods at all. This is
# explained below.
#
# `vec_ptype2()` internally works by calling `vec_ptype()` on both `x` and `y`,
# and then looking up the common type of those two ptypes. Generally, the
# ptype of a vector `x` returned from `vec_ptype()` has the same class as `x`.
# However, for rsample objects it makes more sense for the ptype of an rset to
# be a bare tibble. The reason for this is because a ptype of a data frame is
# generally a 0-row slice. However for rsample rsets this doesn't make sense
# (you can't have 0 rows in a 10-fold cv rset), so instead we return a bare
# tibble as the ptype.
# So when `vec_ptype2()` is called on an rset, that rset is downgraded to a
# bare tibble _before_ the search for a `vec_ptype2()` method occurs. This
# means that it will never find a ptype2 method such as
# `vec_ptype2.bootstraps.tbl_df()`, because the `<bootstraps>` will become a
# tbl_df first. This means that we rely entirely on the `tbl_df` ptype2 methods,
# which are already implemented in vctrs to return another tbl_df.
#
# The implications of this are that whenever a rset subclass is combined with
# another rset subclass or bare tibble, the resulting common type is always
# another bare tibble. So if you `vec_c(<bootstraps>, <bootstraps>)` the result
# will always be a tibble, never a bootstraps object. This makes sense because
# you might be adding rows, which would invalidate the structure of the
# bootstraps object.

# `vec_cast()`
#
# The `vec_cast()` methods for rset objects really only serve 1 purpose. They
# cast an rset subclass to a tibble or data frame. The cast to tibble is most
# useful. Most of the operations in vctrs work by finding a common type
# with `vec_ptype2()`, and then casting all of the inputs to that common type.
# Because `vec_ptype2()` returns a bare tibble anytime a rset-subclass is
# involved, we will always be casting the rset subclass to a tibble.
# The cast method uses `vctrs::tib_cast()`, which always returns a bare tibble
# with all of the data in `x` cast to the type of `to`.

# ------------------------------------------------------------------------------
# bootstraps

#' @export
vec_restore.bootstraps <- function(x, to, ...)
{ rset_reconstruct(x, to) } #' @export vec_ptype2.bootstraps.bootstraps <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.bootstraps.bootstraps") } #' @export vec_ptype2.bootstraps.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.bootstraps.tbl_df") } #' @export vec_ptype2.tbl_df.bootstraps <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.bootstraps") } #' @export vec_ptype2.bootstraps.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.bootstraps.data.frame") } #' @export vec_ptype2.data.frame.bootstraps <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.bootstraps") } #' @export vec_cast.bootstraps.bootstraps <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.bootstraps.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.bootstraps <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.bootstraps.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.bootstraps <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # vfold_cv #' @export vec_restore.vfold_cv <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.vfold_cv.vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.vfold_cv.vfold_cv") } #' @export vec_ptype2.vfold_cv.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.vfold_cv.tbl_df") } #' @export vec_ptype2.tbl_df.vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.vfold_cv") } #' @export vec_ptype2.vfold_cv.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.vfold_cv.data.frame") } #' @export vec_ptype2.data.frame.vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.vfold_cv") } #' @export vec_cast.vfold_cv.vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.vfold_cv.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.vfold_cv.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # group_vfold_cv #' @export vec_restore.group_vfold_cv <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.group_vfold_cv.group_vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.group_vfold_cv.group_vfold_cv") } #' @export vec_ptype2.group_vfold_cv.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.group_vfold_cv.tbl_df") } #' @export vec_ptype2.tbl_df.group_vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.group_vfold_cv") } #' @export vec_ptype2.group_vfold_cv.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.group_vfold_cv.data.frame") } #' @export vec_ptype2.data.frame.group_vfold_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.group_vfold_cv") } #' @export vec_cast.group_vfold_cv.group_vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.group_vfold_cv.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.group_vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.group_vfold_cv.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.group_vfold_cv <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # loo_cv #' @export vec_restore.loo_cv <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.loo_cv.loo_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.loo_cv.loo_cv") } #' @export vec_ptype2.loo_cv.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.loo_cv.tbl_df") } #' @export vec_ptype2.tbl_df.loo_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.loo_cv") } #' @export vec_ptype2.loo_cv.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.loo_cv.data.frame") } #' @export vec_ptype2.data.frame.loo_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.loo_cv") } #' @export vec_cast.loo_cv.loo_cv <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.loo_cv.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.loo_cv <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.loo_cv.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.loo_cv <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # mc_cv #' @export vec_restore.mc_cv <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.mc_cv.mc_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.mc_cv.mc_cv") } #' @export vec_ptype2.mc_cv.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.mc_cv.tbl_df") } #' @export vec_ptype2.tbl_df.mc_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.mc_cv") } #' @export vec_ptype2.mc_cv.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.mc_cv.data.frame") } #' @export vec_ptype2.data.frame.mc_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.mc_cv") } #' @export vec_cast.mc_cv.mc_cv <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.mc_cv.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.mc_cv <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.mc_cv.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.mc_cv <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # nested_cv #' @export vec_restore.nested_cv <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.nested_cv.nested_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.nested_cv.nested_cv") } #' @export vec_ptype2.nested_cv.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.nested_cv.tbl_df") } #' @export vec_ptype2.tbl_df.nested_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.nested_cv") } #' @export vec_ptype2.nested_cv.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.nested_cv.data.frame") } #' @export vec_ptype2.data.frame.nested_cv <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.nested_cv") } #' @export vec_cast.nested_cv.nested_cv <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.nested_cv.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.nested_cv <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.nested_cv.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.nested_cv <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # validation_split #' @export vec_restore.validation_split <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.validation_split.validation_split <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.validation_split.validation_split") } #' @export vec_ptype2.validation_split.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.validation_split.tbl_df") } #' @export vec_ptype2.tbl_df.validation_split <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.validation_split") } #' @export vec_ptype2.validation_split.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.validation_split.data.frame") } #' @export vec_ptype2.data.frame.validation_split <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.validation_split") } #' @export vec_cast.validation_split.validation_split <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.validation_split.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.validation_split <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.validation_split.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.validation_split <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # rolling_origin #' @export vec_restore.rolling_origin <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.rolling_origin.rolling_origin <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.rolling_origin.rolling_origin") } #' @export vec_ptype2.rolling_origin.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.rolling_origin.tbl_df") } #' @export vec_ptype2.tbl_df.rolling_origin <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.rolling_origin") } #' @export vec_ptype2.rolling_origin.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.rolling_origin.data.frame") } #' @export vec_ptype2.data.frame.rolling_origin <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.rolling_origin") } #' @export vec_cast.rolling_origin.rolling_origin <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.rolling_origin.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.rolling_origin <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.rolling_origin.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.rolling_origin <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # sliding_window #' @export vec_restore.sliding_window <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.sliding_window.sliding_window <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_window.sliding_window") } #' @export vec_ptype2.sliding_window.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_window.tbl_df") } #' @export vec_ptype2.tbl_df.sliding_window <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.sliding_window") } #' @export vec_ptype2.sliding_window.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_window.data.frame") } #' @export vec_ptype2.data.frame.sliding_window <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.sliding_window") } #' @export vec_cast.sliding_window.sliding_window <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_window.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.sliding_window <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_window.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.sliding_window <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # sliding_index #' @export vec_restore.sliding_index <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.sliding_index.sliding_index <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_index.sliding_index") } #' @export vec_ptype2.sliding_index.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_index.tbl_df") } #' @export vec_ptype2.tbl_df.sliding_index <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.sliding_index") } #' @export vec_ptype2.sliding_index.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_index.data.frame") } #' @export vec_ptype2.data.frame.sliding_index <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.sliding_index") } #' @export vec_cast.sliding_index.sliding_index <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_index.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.sliding_index <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_index.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.sliding_index <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # sliding_period #' @export vec_restore.sliding_period <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.sliding_period.sliding_period <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_period.sliding_period") } #' @export vec_ptype2.sliding_period.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_period.tbl_df") } #' @export vec_ptype2.tbl_df.sliding_period <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.sliding_period") } #' @export vec_ptype2.sliding_period.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.sliding_period.data.frame") } #' @export vec_ptype2.data.frame.sliding_period <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.sliding_period") } #' @export vec_cast.sliding_period.sliding_period <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_period.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.sliding_period <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.sliding_period.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.sliding_period <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # manual_rset #' @export vec_restore.manual_rset <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.manual_rset.manual_rset <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.manual_rset.manual_rset") } #' @export vec_ptype2.manual_rset.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.manual_rset.tbl_df") } #' @export vec_ptype2.tbl_df.manual_rset <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.manual_rset") } #' @export vec_ptype2.manual_rset.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.manual_rset.data.frame") } #' @export vec_ptype2.data.frame.manual_rset <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.manual_rset") } #' @export vec_cast.manual_rset.manual_rset <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.manual_rset.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.manual_rset <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.manual_rset.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.manual_rset <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ # apparent #' @export vec_restore.apparent <- function(x, to, ...) 
{ rset_reconstruct(x, to) } #' @export vec_ptype2.apparent.apparent <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.apparent.apparent") } #' @export vec_ptype2.apparent.tbl_df <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.apparent.tbl_df") } #' @export vec_ptype2.tbl_df.apparent <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.tbl_df.apparent") } #' @export vec_ptype2.apparent.data.frame <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.apparent.data.frame") } #' @export vec_ptype2.data.frame.apparent <- function(x, y, ..., x_arg = "", y_arg = "") { stop_never_called("vec_ptype2.data.frame.apparent") } #' @export vec_cast.apparent.apparent <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.apparent.tbl_df <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.tbl_df.apparent <- function(x, to, ..., x_arg = "", to_arg = "") { tib_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.apparent.data.frame <- function(x, to, ..., x_arg = "", to_arg = "") { stop_incompatible_cast_rset(x, to, x_arg = x_arg, to_arg = to_arg) } #' @export vec_cast.data.frame.apparent <- function(x, to, ..., x_arg = "", to_arg = "") { df_cast(x, to, ..., x_arg = x_arg, to_arg = to_arg) } # ------------------------------------------------------------------------------ stop_incompatible_cast_rset <- function(x, to, ..., x_arg, to_arg) { details <- "Can't cast to an rset because attributes are likely incompatible." 
stop_incompatible_cast(x, to, x_arg = x_arg, to_arg = to_arg, details = details) } stop_never_called <- function(fn) { rlang::abort(paste0("Internal error: `", fn, "()` should never be called.")) } rsample/R/compat-vctrs-helpers.R0000644000175000017500000001335514010267142016461 0ustar nileshnilesh #' Extending rsample with new rset subclasses #' #' `rset_reconstruct()` encapsulates the logic for allowing new rset #' subclasses to work properly with vctrs (through `vctrs::vec_restore()`) and #' dplyr (through `dplyr::dplyr_reconstruct()`). It is intended to be a #' developer tool, and is not required for normal usage of rsample. #' #' rset objects are considered "reconstructable" after a vctrs/dplyr operation #' if: #' #' - `x` and `to` both have an identical column named `"splits"` (column #' and row order do not matter). #' #' - `x` and `to` both have identical columns prefixed with `"id"` (column #' and row order do not matter). #' #' @param x A data frame to restore to an rset subclass. #' @param to An rset subclass to restore to. #' #' @return `x` restored to the rset subclass of `to`. #' #' @export #' @examples #' to <- bootstraps(mtcars, times = 25) #' #' # Imitate a vctrs/dplyr operation, #' # where the class might be lost along the way #' x <- tibble::as_tibble(to) #' #' # Say we added a new column to `x`. Here we mock a `mutate()`. #' x$foo <- "bar" #' #' # This is still reconstructable to `to` #' rset_reconstruct(x, to) #' #' # Say we lose the first row #' x <- x[-1,] #' #' # This is no longer reconstructable to `to`, as `x` is no longer an rset #' # bootstraps object with 25 bootstraps if one is lost! #' rset_reconstruct(x, to) rset_reconstruct <- function(x, to) { if (rset_reconstructable(x, to)) { df_reconstruct(x, to) } else { tib_upcast(x) } } # ------------------------------------------------------------------------------ # Two data frames are considered identical by `rset_reconstructable()` if the rset # sub-data-frames are identical. 
This means that if we select out the rset # specific columns, they should be exactly the same (modulo reordering). # It is expected that `to` is an rset object already, but `x` can be a # bare data frame, or even a named list. rset_reconstructable <- function(x, to) { x_names <- names(x) to_names <- names(to) x_rset_indicator <- col_equals_splits(x_names) | col_starts_with_id(x_names) to_rset_indicator <- col_equals_splits(to_names) | col_starts_with_id(to_names) # Special casing of `nested_cv` to also look for `inner_resamples` if (inherits(to, "nested_cv")) { x_rset_indicator <- x_rset_indicator | col_equals_inner_resamples(x_names) to_rset_indicator <- to_rset_indicator | col_equals_inner_resamples(to_names) } x_rset_names <- x_names[x_rset_indicator] to_rset_names <- to_names[to_rset_indicator] # Ignore ordering x_rset_names <- sort(x_rset_names) to_rset_names <- sort(to_rset_names) # Early return if names aren't identical if (!identical(x_rset_names, to_rset_names)) { return(FALSE) } # Avoid all non-bare-data-frame S3 dispatch and # don't compare outer data frame attributes. # Only look at column names and actual column data. x <- new_data_frame(x) to <- new_data_frame(to) # Early return if number of rows doesn't match if (!identical(vec_size(x), vec_size(to))) { return(FALSE) } x_rset_cols <- x[x_rset_names] to_rset_cols <- to[to_rset_names] # Row order doesn't matter x_rset_cols <- vec_sort(x_rset_cols) to_rset_cols <- vec_sort(to_rset_cols) # Check identical structures identical(x_rset_cols, to_rset_cols) } # ------------------------------------------------------------------------------ test_data <- function() { data.frame( x = 1:50, y = rep(c(1, 2), each = 25), index = as.Date(0:49, origin = "1970-01-01") ) } # Keep this list up to date with known rset subclasses for testing. # Delay assignment because we are creating this directly in the R script # and not all of the required helpers might have been sourced yet. 
delayedAssign("rset_subclasses", { list( bootstraps = bootstraps(test_data()), vfold_cv = vfold_cv(test_data(), v = 10, repeats = 2), group_vfold_cv = group_vfold_cv(test_data(), y), loo_cv = loo_cv(test_data()), mc_cv = mc_cv(test_data()), nested_cv = nested_cv(test_data(), outside = vfold_cv(v = 3), inside = bootstraps(times = 5)), validation_split = validation_split(test_data()), rolling_origin = rolling_origin(test_data()), sliding_window = sliding_window(test_data()), sliding_index = sliding_index(test_data(), index), sliding_period = sliding_period(test_data(), index, "week"), manual_rset = manual_rset(bootstraps(test_data())$splits[1:2], c("ID1", "ID2")), apparent = apparent(test_data()) ) }) # ------------------------------------------------------------------------------ col_equals_splits <- function(x) { vec_equal(x, "splits") } col_starts_with_id <- function(x) { grepl("(^id$)|(^id[1-9]$)", x) } col_equals_inner_resamples <- function(x) { vec_equal(x, "inner_resamples") } # ------------------------------------------------------------------------------ # Maybe this should live in vctrs? # Fallback to a tibble from the current data frame subclass. # Removes subclass specific attributes and additional ones added by the user. tib_upcast <- function(x) { size <- df_size(x) # Strip all attributes except names to construct # a bare list to build the tibble back up from. attributes(x) <- list(names = names(x)) tibble::new_tibble(x, nrow = size) } df_size <- function(x) { if (!is.list(x)) { rlang::abort("Cannot get the df size of a non-list.") } if (length(x) == 0L) { return(0L) } col <- x[[1L]] vec_size(col) } # ------------------------------------------------------------------------------ # Maybe this should live in vctrs? 
df_reconstruct <- function(x, to) { attrs <- attributes(to) attrs$names <- names(x) attrs$row.names <- .row_names_info(x, type = 0L) attributes(x) <- attrs x } rsample/LICENSE0000644000175000017500000000005514020224531013045 0ustar nileshnileshYEAR: 2021 COPYRIGHT HOLDER: rsample authors rsample/inst/0000755000175000017500000000000014142304331013017 5ustar nileshnileshrsample/inst/doc/0000755000175000017500000000000014142304331013564 5ustar nileshnileshrsample/inst/doc/rsample.Rmd0000644000175000017500000000625513743327400015713 0ustar nileshnilesh--- title: "Introduction to rsample" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Introduction to rsample} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ``` ## Terminology We define a _resample_ as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling. ## `rset` Objects Contain Many Resamples The main class in the package (`rset`) is for a _set_ or _collection_ of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data. Like [`modelr`](https://cran.r-project.org/package=modelr), the resamples are stored in data-frame-like `tibble` object. As a simple example, here is a small set of bootstraps of the `mtcars` data: ```{r mtcars_bt, message=FALSE} library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ``` ## Individual Resamples are `rsplit` Objects The resamples are stored in the `splits` column in an object that has class `rsplit`. 
In this package we use the following terminology for the two partitions that comprise a resample: * The _analysis_ data are those that we selected in the resample. For a bootstrap, this is the sample with replacement. For 10-fold cross-validation, this is the 90% of the data. These data are often used to fit a model or calculate a statistic in traditional bootstrapping. * The _assessment_ data are usually the section of the original data not covered by the analysis set. Again, in 10-fold CV, this is the 10% held out. These data are often used to evaluate the performance of a model that was fit to the analysis data. (Aside: While some might use the term "training" and "testing" for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done _before_ resampling. The training/test split can be conducted using the `initial_split` function in this package.) Let's look at one of the `rsplit` objects ```{r rsplit} first_resample <- bt_resamples$splits[[1]] first_resample ``` This indicates that there were `r dim(bt_resamples$splits[[1]])["analysis"]` data points in the analysis set, `r dim(bt_resamples$splits[[1]])["assessment"]` instances were in the assessment set, and that the original data contained `r dim(bt_resamples$splits[[1]])["n"]` data points. These results can also be determined using the `dim` function on an `rsplit` object. To obtain either of these data sets from an `rsplit`, the `as.data.frame` function can be used. By default, the analysis set is returned but the `data` option can be used to return the assessment data: ```{r rsplit_df} head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") ``` Alternatively, you can use the shortcuts `analysis(first_resample)` and `assessment(first_resample)`. 
rsample/inst/doc/Working_with_rsets.R0000644000175000017500000001023614142304330017603 0ustar nileshnilesh## ----ex_setup, include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ## ----attrition, message=FALSE----------------------------------------------------------- library(rsample) data("attrition", package = "modeldata") names(attrition) table(attrition$Attrition) ## ----form, message=FALSE---------------------------------------------------------------- mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ## ----model_vfold, message=FALSE--------------------------------------------------------- library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ## ----lm_func---------------------------------------------------------------------------- ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) 
{ # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ## ----onefold, warning = FALSE----------------------------------------------------------- example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ## ----model_purrr, warning=FALSE--------------------------------------------------------- library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ## ----model_acc-------------------------------------------------------------------------- rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ## ----type_plot-------------------------------------------------------------------------- ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ## ----mean_diff-------------------------------------------------------------------------- median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ## ----boot_mean_diff--------------------------------------------------------------------- set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ## ----stats------------------------------------------------------------------------------ bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ## 
----stats_plot------------------------------------------------------------------------- ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ## ----ci--------------------------------------------------------------------------------- quantile(bt_resamples$wage_diff, probs = c(0.025, 0.975)) ## ----coefs------------------------------------------------------------------------------ glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ## ----tidy_rsplit------------------------------------------------------------------------ first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ## ----tidy_rset-------------------------------------------------------------------------- class(bt_resamples) tidy(bt_resamples) rsample/inst/doc/rsample.html0000644000175000017500000005435214142304331016126 0ustar nileshnilesh Introduction to rsample

Introduction to rsample

Terminology

We define a resample as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling.

rset Objects Contain Many Resamples

The main class in the package (rset) is for a set or collection of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data.

Like modelr, the resamples are stored in data-frame-like tibble object. As a simple example, here is a small set of bootstraps of the mtcars data:

library(rsample)
set.seed(8584)
bt_resamples <- bootstraps(mtcars, times = 3)
bt_resamples
#> # Bootstrap sampling 
#> # A tibble: 3 × 2
#>   splits          id        
#>   <list>          <chr>     
#> 1 <split [32/14]> Bootstrap1
#> 2 <split [32/12]> Bootstrap2
#> 3 <split [32/14]> Bootstrap3

Individual Resamples are rsplit Objects

The resamples are stored in the splits column in an object that has class rsplit.

In this package we use the following terminology for the two partitions that comprise a resample:

  • The analysis data are those that we selected in the resample. For a bootstrap, this is the sample with replacement. For 10-fold cross-validation, this is the 90% of the data. These data are often used to fit a model or calculate a statistic in traditional bootstrapping.
  • The assessment data are usually the section of the original data not covered by the analysis set. Again, in 10-fold CV, this is the 10% held out. These data are often used to evaluate the performance of a model that was fit to the analysis data.

(Aside: While some might use the term “training” and “testing” for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done before resampling. The training/test split can be conducted using the initial_split function in this package.)

Let’s look at one of the rsplit objects

first_resample <- bt_resamples$splits[[1]]
first_resample
#> <Analysis/Assess/Total>
#> <32/14/32>

This indicates that there were 32 data points in the analysis set, 14 instances were in the assessment set, and that the original data contained 32 data points. These results can also be determined using the dim function on an rsplit object.

To obtain either of these data sets from an rsplit, the as.data.frame function can be used. By default, the analysis set is returned but the data option can be used to return the assessment data:

head(as.data.frame(first_resample))
#>                   mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> Fiat 128         32.4   4  78.7  66 4.08 2.20 19.5  1  1    4    1
#> Toyota Corolla   33.9   4  71.1  65 4.22 1.83 19.9  1  1    4    1
#> Toyota Corolla.1 33.9   4  71.1  65 4.22 1.83 19.9  1  1    4    1
#> AMC Javelin      15.2   8 304.0 150 3.15 3.44 17.3  0  0    3    2
#> Valiant          18.1   6 225.0 105 2.76 3.46 20.2  1  0    3    1
#> Merc 450SLC      15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3
as.data.frame(first_resample, data = "assessment")
#>                     mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4 Wag      21.0   6 160.0 110 3.90 2.88 17.0  0  1    4    4
#> Hornet 4 Drive     21.4   6 258.0 110 3.08 3.21 19.4  1  0    3    1
#> Merc 240D          24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
#> Merc 230           22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
#> Merc 280           19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
#> Merc 280C          17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
#> Merc 450SE         16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
#> Merc 450SL         17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
#> Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.25 18.0  0  0    3    4
#> Chrysler Imperial  14.7   8 440.0 230 3.23 5.34 17.4  0  0    3    4
#> Honda Civic        30.4   4  75.7  52 4.93 1.61 18.5  1  1    4    2
#> Fiat X1-9          27.3   4  79.0  66 4.08 1.94 18.9  1  1    4    1
#> Lotus Europa       30.4   4  95.1 113 3.77 1.51 16.9  1  1    5    2
#> Volvo 142E         21.4   4 121.0 109 4.11 2.78 18.6  1  1    4    2

Alternatively, you can use the shortcuts analysis(first_resample) and assessment(first_resample).

rsample/inst/doc/rsample.R0000644000175000017500000000132014142304331015346 0ustar nileshnilesh## ----ex_setup, include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ## ----mtcars_bt, message=FALSE----------------------------------------------------------- library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ## ----rsplit----------------------------------------------------------------------------- first_resample <- bt_resamples$splits[[1]] first_resample ## ----rsplit_df-------------------------------------------------------------------------- head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") rsample/inst/doc/Working_with_rsets.html0000644000175000017500000065470314142304331020364 0ustar nileshnilesh Working with rsets

Working with rsets

Introduction

rsample can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis.

For illustration, the attrition data are used. From the help file:

These data are from the IBM Watson Analytics Lab. The website describes the data with “Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’. This is a fictional data set created by IBM data scientists.” There are 1470 rows.

The data can be accessed using

library(rsample)
data("attrition", package = "modeldata")
names(attrition)
#>  [1] "Age"                      "Attrition"                "BusinessTravel"          
#>  [4] "DailyRate"                "Department"               "DistanceFromHome"        
#>  [7] "Education"                "EducationField"           "EnvironmentSatisfaction" 
#> [10] "Gender"                   "HourlyRate"               "JobInvolvement"          
#> [13] "JobLevel"                 "JobRole"                  "JobSatisfaction"         
#> [16] "MaritalStatus"            "MonthlyIncome"            "MonthlyRate"             
#> [19] "NumCompaniesWorked"       "OverTime"                 "PercentSalaryHike"       
#> [22] "PerformanceRating"        "RelationshipSatisfaction" "StockOptionLevel"        
#> [25] "TotalWorkingYears"        "TrainingTimesLastYear"    "WorkLifeBalance"         
#> [28] "YearsAtCompany"           "YearsInCurrentRole"       "YearsSinceLastPromotion" 
#> [31] "YearsWithCurrManager"
table(attrition$Attrition)
#> 
#>   No  Yes 
#> 1233  237

Model Assessment

Let’s fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income.

If we were fitting the model to the entire data set, we might model attrition using

glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial)

For convenience, we’ll create a formula object that will be used later:

mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome)

To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model.

First, let’s make the splits of the data:

library(rsample)
set.seed(4622)
rs_obj <- vfold_cv(attrition, v = 10, repeats = 10)
rs_obj
#> #  10-fold cross-validation repeated 10 times 
#> # A tibble: 100 × 3
#>    splits             id       id2   
#>    <list>             <chr>    <chr> 
#>  1 <split [1323/147]> Repeat01 Fold01
#>  2 <split [1323/147]> Repeat01 Fold02
#>  3 <split [1323/147]> Repeat01 Fold03
#>  4 <split [1323/147]> Repeat01 Fold04
#>  5 <split [1323/147]> Repeat01 Fold05
#>  6 <split [1323/147]> Repeat01 Fold06
#>  7 <split [1323/147]> Repeat01 Fold07
#>  8 <split [1323/147]> Repeat01 Fold08
#>  9 <split [1323/147]> Repeat01 Fold09
#> 10 <split [1323/147]> Repeat01 Fold10
#> # … with 90 more rows

Now let’s write a function that will, for each resample:

  1. obtain the analysis data set (i.e. the 90% used for modeling)
  2. fit a logistic regression model
  3. predict the assessment data (the other 10% not used for the model) using the broom package
  4. determine if each sample was predicted correctly.

Here is our function:

## splits will be the `rsplit` object with the 90/10 partition
holdout_results <- function(splits, ...) {
  # Fit the model to the 90%
  mod <- glm(..., data = analysis(splits), family = binomial)
  # Save the 10%
  holdout <- assessment(splits)
  # `augment` will save the predictions with the holdout data set
  res <- broom::augment(mod, newdata = holdout)
  # Class predictions on the assessment set from class probs
  lvls <- levels(holdout$Attrition)
  predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]),
                        levels = lvls)
  # Calculate whether the prediction was correct
  res$correct <- predictions == holdout$Attrition
  # Return the assessment data set with the additional columns
  res
}

For example:

example <- holdout_results(rs_obj$splits[[1]],  mod_form)
dim(example)
#> [1] 147  34
dim(assessment(rs_obj$splits[[1]]))
#> [1] 147  31
## newly added columns:
example[1:10, setdiff(names(example), names(attrition))]
#> # A tibble: 10 × 3
#>    .rownames .fitted correct
#>    <chr>       <dbl> <lgl>  
#>  1 11          -1.20 TRUE   
#>  2 24          -1.78 TRUE   
#>  3 30          -1.45 TRUE   
#>  4 39          -1.60 TRUE   
#>  5 53          -1.54 TRUE   
#>  6 72          -1.93 TRUE   
#>  7 73          -3.06 TRUE   
#>  8 80          -3.28 TRUE   
#>  9 83          -2.23 TRUE   
#> 10 90          -1.28 FALSE

For this model, the .fitted value is the linear predictor in log-odds units.

To compute this data set for each of the 100 resamples, we’ll use the map function from the purrr package:

library(purrr)
rs_obj$results <- map(rs_obj$splits,
                      holdout_results,
                      mod_form)
rs_obj
#> #  10-fold cross-validation repeated 10 times 
#> # A tibble: 100 × 4
#>    splits             id       id2    results            
#>    <list>             <chr>    <chr>  <list>             
#>  1 <split [1323/147]> Repeat01 Fold01 <tibble [147 × 34]>
#>  2 <split [1323/147]> Repeat01 Fold02 <tibble [147 × 34]>
#>  3 <split [1323/147]> Repeat01 Fold03 <tibble [147 × 34]>
#>  4 <split [1323/147]> Repeat01 Fold04 <tibble [147 × 34]>
#>  5 <split [1323/147]> Repeat01 Fold05 <tibble [147 × 34]>
#>  6 <split [1323/147]> Repeat01 Fold06 <tibble [147 × 34]>
#>  7 <split [1323/147]> Repeat01 Fold07 <tibble [147 × 34]>
#>  8 <split [1323/147]> Repeat01 Fold08 <tibble [147 × 34]>
#>  9 <split [1323/147]> Repeat01 Fold09 <tibble [147 × 34]>
#> 10 <split [1323/147]> Repeat01 Fold10 <tibble [147 × 34]>
#> # … with 90 more rows

Now we can compute the accuracy values for all of the assessment data sets:

rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct))
summary(rs_obj$accuracy)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   0.776   0.821   0.840   0.839   0.859   0.905

Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is 0.839. Not a great model so far.

Using the Bootstrap to Make Comparisons

Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals).

For example, are there differences in the median monthly income between genders?

ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + 
  geom_boxplot() + 
  scale_y_log10()

If we wanted to compare the genders, we could conduct a t-test or rank-based test. Instead, let’s use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample:

median_diff <- function(splits) {
  x <- analysis(splits)
  median(x$MonthlyIncome[x$Gender == "Female"]) - 
      median(x$MonthlyIncome[x$Gender == "Male"])     
}

Now we would create a large number of bootstrap samples (say 2000+). For illustration, we’ll only do 500 in this document.

set.seed(353)
bt_resamples <- bootstraps(attrition, times = 500)

This function is then computed across each resample:

bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff)

The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution:

ggplot(bt_resamples, aes(x = wage_diff)) + 
  geom_line(stat = "density", adjust = 1.25) + 
  xlab("Difference in Median Monthly Income (Female - Male)")

The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be:

quantile(bt_resamples$wage_diff, 
         probs = c(0.025, 0.975))
#>  2.5% 97.5% 
#>  -189   615

The calculated 95% confidence interval contains zero, so we don’t have evidence for a difference in median income between these genders at a confidence level of 95%.

Bootstrap Estimates of Model Coefficients

Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The broom package package has a tidy function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, purrr::map can be used to estimate and save these values for each split.

glm_coefs <- function(splits, ...) {
  ## use `analysis` or `as.data.frame` to get the analysis data
  mod <- glm(..., data = analysis(splits), family = binomial)
  as.data.frame(t(coef(mod)))
}
bt_resamples$betas <- map(.x = bt_resamples$splits, 
                          .f = glm_coefs, 
                          mod_form)
bt_resamples
#> # Bootstrap sampling 
#> # A tibble: 500 × 4
#>    splits             id           wage_diff betas       
#>    <list>             <chr>            <dbl> <list>      
#>  1 <split [1470/558]> Bootstrap001      136  <df [1 × 6]>
#>  2 <split [1470/528]> Bootstrap002      282. <df [1 × 6]>
#>  3 <split [1470/541]> Bootstrap003      470  <df [1 × 6]>
#>  4 <split [1470/561]> Bootstrap004     -213  <df [1 × 6]>
#>  5 <split [1470/518]> Bootstrap005      453  <df [1 × 6]>
#>  6 <split [1470/539]> Bootstrap006      684  <df [1 × 6]>
#>  7 <split [1470/542]> Bootstrap007       60  <df [1 × 6]>
#>  8 <split [1470/536]> Bootstrap008      286  <df [1 × 6]>
#>  9 <split [1470/552]> Bootstrap009      -30  <df [1 × 6]>
#> 10 <split [1470/517]> Bootstrap010      410  <df [1 × 6]>
#> # … with 490 more rows
bt_resamples$betas[[1]]
#>   (Intercept) JobSatisfaction.L JobSatisfaction.Q JobSatisfaction.C GenderMale
#> 1      -0.939            -0.501            -0.272            0.0842     0.0989
#>   MonthlyIncome
#> 1     -0.000129

Keeping Tidy

As previously mentioned, the broom package contains a class called tidy that created representations of objects that can be easily used for analysis, plotting, etc. rsample contains tidy methods for rset and rsplit objects. For example:

first_resample <- bt_resamples$splits[[1]]
class(first_resample)
#> [1] "boot_split" "rsplit"
tidy(first_resample)
#> # A tibble: 1,470 × 2
#>      Row Data    
#>    <int> <chr>   
#>  1     2 Analysis
#>  2     3 Analysis
#>  3     4 Analysis
#>  4     7 Analysis
#>  5     9 Analysis
#>  6    10 Analysis
#>  7    11 Analysis
#>  8    13 Analysis
#>  9    18 Analysis
#> 10    19 Analysis
#> # … with 1,460 more rows

and

class(bt_resamples)
#> [1] "bootstraps" "rset"       "tbl_df"     "tbl"        "data.frame"
tidy(bt_resamples)
#> # A tibble: 735,000 × 3
#>      Row Data     Resample    
#>    <int> <chr>    <chr>       
#>  1     1 Analysis Bootstrap002
#>  2     1 Analysis Bootstrap004
#>  3     1 Analysis Bootstrap007
#>  4     1 Analysis Bootstrap008
#>  5     1 Analysis Bootstrap009
#>  6     1 Analysis Bootstrap010
#>  7     1 Analysis Bootstrap011
#>  8     1 Analysis Bootstrap013
#>  9     1 Analysis Bootstrap015
#> 10     1 Analysis Bootstrap016
#> # … with 734,990 more rows
rsample/inst/doc/Working_with_rsets.Rmd0000644000175000017500000001673613673171774020165 0ustar nileshnilesh--- title: "Working with rsets" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Working with rsets} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ``` ## Introduction `rsample` can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis. For illustration, the `attrition` data are used. From the help file: > These data are from the IBM Watson Analytics Lab. The website describes the data with "Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or 'compare average monthly income by education and attrition'. This is a fictional data set created by IBM data scientists." There are 1470 rows. The data can be accessed using ```{r attrition, message=FALSE} library(rsample) data("attrition", package = "modeldata") names(attrition) table(attrition$Attrition) ``` ## Model Assessment Let's fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income. If we were fitting the model to the entire data set, we might model attrition using ```r glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial) ``` For convenience, we'll create a formula object that will be used later: ```{r form, message=FALSE} mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ``` To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model. 
First, let's make the splits of the data: ```{r model_vfold, message=FALSE} library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ``` Now let's write a function that will, for each resample: 1. obtain the analysis data set (i.e. the 90% used for modeling) 1. fit a logistic regression model 1. predict the assessment data (the other 10% not used for the model) using the `broom` package 1. determine if each sample was predicted correctly. Here is our function: ```{r lm_func} ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) { # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ``` For example: ```{r onefold, warning = FALSE} example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ``` For this model, the `.fitted` value is the linear predictor in log-odds units. 
To compute this data set for each of the 100 resamples, we'll use the `map` function from the `purrr` package: ```{r model_purrr, warning=FALSE} library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ``` Now we can compute the accuracy values for all of the assessment data sets: ```{r model_acc} rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ``` Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is `r round(mean(attrition$Attrition == "No"), 3)`. Not a great model so far. ## Using the Bootstrap to Make Comparisons Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals). For example, are there differences in the median monthly income between genders? ```{r type_plot} ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ``` If we wanted to compare the genders, we could conduct a _t_-test or rank-based test. Instead, let's use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample: ```{r mean_diff} median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ``` Now we would create a large number of bootstrap samples (say 2000+). For illustration, we'll only do 500 in this document. 
```{r boot_mean_diff} set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ``` This function is then computed across each resample: ```{r stats} bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ``` The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution: ```{r stats_plot} ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ``` The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be: ```{r ci} quantile(bt_resamples$wage_diff, probs = c(0.025, 0.975)) ``` The calculated 95% confidence interval contains zero, so we don't have evidence for a difference in median income between these genders at a confidence level of 95%. ## Bootstrap Estimates of Model Coefficients Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The [`broom` package](https://cran.r-project.org/package=broom) package has a `tidy` function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, `purrr::map` can be used to estimate and save these values for each split. ```{r coefs} glm_coefs <- function(splits, ...) 
{ ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ``` ## Keeping Tidy As previously mentioned, the [`broom` package](https://cran.r-project.org/package=broom) contains a class called `tidy` that created representations of objects that can be easily used for analysis, plotting, etc. `rsample` contains `tidy` methods for `rset` and `rsplit` objects. For example: ```{r tidy_rsplit} first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ``` and ```{r tidy_rset} class(bt_resamples) tidy(bt_resamples) ``` rsample/NAMESPACE0000644000175000017500000002431614142301371013270 0ustar nileshnilesh# Generated by roxygen2: do not edit by hand S3method("[",rset) S3method("names<-",rset) S3method(.get_fingerprint,default) S3method(.get_fingerprint,rset) S3method(as.data.frame,rsplit) S3method(as.integer,rsplit) S3method(complement,apparent_split) S3method(complement,default) S3method(complement,rof_split) S3method(complement,rsplit) S3method(complement,sliding_index_split) S3method(complement,sliding_period_split) S3method(complement,sliding_window_split) S3method(dim,rsplit) S3method(gather,rset) S3method(labels,rset) S3method(labels,rsplit) S3method(labels,vfold_cv) S3method(make_splits,data.frame) S3method(make_splits,default) S3method(make_splits,list) S3method(obj_sum,rsplit) S3method(populate,rset) S3method(populate,rsplit) S3method(pretty,apparent) S3method(pretty,bootstraps) S3method(pretty,group_vfold_cv) S3method(pretty,loo_cv) S3method(pretty,manual_rset) S3method(pretty,mc_cv) S3method(pretty,nested_cv) S3method(pretty,permutations) S3method(pretty,rolling_origin) S3method(pretty,sliding_index) S3method(pretty,sliding_period) S3method(pretty,sliding_window) S3method(pretty,validation_split) S3method(pretty,vfold_cv) 
S3method(print,apparent) S3method(print,bootstraps) S3method(print,group_vfold_cv) S3method(print,loo_cv) S3method(print,manual_rset) S3method(print,mc_cv) S3method(print,nested_cv) S3method(print,permutations) S3method(print,rolling_origin) S3method(print,rsplit) S3method(print,sliding_index) S3method(print,sliding_period) S3method(print,sliding_window) S3method(print,val_split) S3method(print,validation_split) S3method(print,vfold_cv) S3method(tidy,nested_cv) S3method(tidy,rset) S3method(tidy,rsplit) S3method(tidy,vfold_cv) S3method(type_sum,rsplit) S3method(vec_cast,apparent.apparent) S3method(vec_cast,apparent.data.frame) S3method(vec_cast,apparent.tbl_df) S3method(vec_cast,bootstraps.bootstraps) S3method(vec_cast,bootstraps.data.frame) S3method(vec_cast,bootstraps.tbl_df) S3method(vec_cast,data.frame.apparent) S3method(vec_cast,data.frame.bootstraps) S3method(vec_cast,data.frame.group_vfold_cv) S3method(vec_cast,data.frame.loo_cv) S3method(vec_cast,data.frame.manual_rset) S3method(vec_cast,data.frame.mc_cv) S3method(vec_cast,data.frame.nested_cv) S3method(vec_cast,data.frame.rolling_origin) S3method(vec_cast,data.frame.sliding_index) S3method(vec_cast,data.frame.sliding_period) S3method(vec_cast,data.frame.sliding_window) S3method(vec_cast,data.frame.validation_split) S3method(vec_cast,data.frame.vfold_cv) S3method(vec_cast,group_vfold_cv.data.frame) S3method(vec_cast,group_vfold_cv.group_vfold_cv) S3method(vec_cast,group_vfold_cv.tbl_df) S3method(vec_cast,loo_cv.data.frame) S3method(vec_cast,loo_cv.loo_cv) S3method(vec_cast,loo_cv.tbl_df) S3method(vec_cast,manual_rset.data.frame) S3method(vec_cast,manual_rset.manual_rset) S3method(vec_cast,manual_rset.tbl_df) S3method(vec_cast,mc_cv.data.frame) S3method(vec_cast,mc_cv.mc_cv) S3method(vec_cast,mc_cv.tbl_df) S3method(vec_cast,nested_cv.data.frame) S3method(vec_cast,nested_cv.nested_cv) S3method(vec_cast,nested_cv.tbl_df) S3method(vec_cast,rolling_origin.data.frame) 
S3method(vec_cast,rolling_origin.rolling_origin) S3method(vec_cast,rolling_origin.tbl_df) S3method(vec_cast,sliding_index.data.frame) S3method(vec_cast,sliding_index.sliding_index) S3method(vec_cast,sliding_index.tbl_df) S3method(vec_cast,sliding_period.data.frame) S3method(vec_cast,sliding_period.sliding_period) S3method(vec_cast,sliding_period.tbl_df) S3method(vec_cast,sliding_window.data.frame) S3method(vec_cast,sliding_window.sliding_window) S3method(vec_cast,sliding_window.tbl_df) S3method(vec_cast,tbl_df.apparent) S3method(vec_cast,tbl_df.bootstraps) S3method(vec_cast,tbl_df.group_vfold_cv) S3method(vec_cast,tbl_df.loo_cv) S3method(vec_cast,tbl_df.manual_rset) S3method(vec_cast,tbl_df.mc_cv) S3method(vec_cast,tbl_df.nested_cv) S3method(vec_cast,tbl_df.rolling_origin) S3method(vec_cast,tbl_df.sliding_index) S3method(vec_cast,tbl_df.sliding_period) S3method(vec_cast,tbl_df.sliding_window) S3method(vec_cast,tbl_df.validation_split) S3method(vec_cast,tbl_df.vfold_cv) S3method(vec_cast,validation_split.data.frame) S3method(vec_cast,validation_split.tbl_df) S3method(vec_cast,validation_split.validation_split) S3method(vec_cast,vfold_cv.data.frame) S3method(vec_cast,vfold_cv.tbl_df) S3method(vec_cast,vfold_cv.vfold_cv) S3method(vec_ptype2,apparent.apparent) S3method(vec_ptype2,apparent.data.frame) S3method(vec_ptype2,apparent.tbl_df) S3method(vec_ptype2,bootstraps.bootstraps) S3method(vec_ptype2,bootstraps.data.frame) S3method(vec_ptype2,bootstraps.tbl_df) S3method(vec_ptype2,data.frame.apparent) S3method(vec_ptype2,data.frame.bootstraps) S3method(vec_ptype2,data.frame.group_vfold_cv) S3method(vec_ptype2,data.frame.loo_cv) S3method(vec_ptype2,data.frame.manual_rset) S3method(vec_ptype2,data.frame.mc_cv) S3method(vec_ptype2,data.frame.nested_cv) S3method(vec_ptype2,data.frame.rolling_origin) S3method(vec_ptype2,data.frame.sliding_index) S3method(vec_ptype2,data.frame.sliding_period) S3method(vec_ptype2,data.frame.sliding_window) 
S3method(vec_ptype2,data.frame.validation_split) S3method(vec_ptype2,data.frame.vfold_cv) S3method(vec_ptype2,group_vfold_cv.data.frame) S3method(vec_ptype2,group_vfold_cv.group_vfold_cv) S3method(vec_ptype2,group_vfold_cv.tbl_df) S3method(vec_ptype2,loo_cv.data.frame) S3method(vec_ptype2,loo_cv.loo_cv) S3method(vec_ptype2,loo_cv.tbl_df) S3method(vec_ptype2,manual_rset.data.frame) S3method(vec_ptype2,manual_rset.manual_rset) S3method(vec_ptype2,manual_rset.tbl_df) S3method(vec_ptype2,mc_cv.data.frame) S3method(vec_ptype2,mc_cv.mc_cv) S3method(vec_ptype2,mc_cv.tbl_df) S3method(vec_ptype2,nested_cv.data.frame) S3method(vec_ptype2,nested_cv.nested_cv) S3method(vec_ptype2,nested_cv.tbl_df) S3method(vec_ptype2,rolling_origin.data.frame) S3method(vec_ptype2,rolling_origin.rolling_origin) S3method(vec_ptype2,rolling_origin.tbl_df) S3method(vec_ptype2,sliding_index.data.frame) S3method(vec_ptype2,sliding_index.sliding_index) S3method(vec_ptype2,sliding_index.tbl_df) S3method(vec_ptype2,sliding_period.data.frame) S3method(vec_ptype2,sliding_period.sliding_period) S3method(vec_ptype2,sliding_period.tbl_df) S3method(vec_ptype2,sliding_window.data.frame) S3method(vec_ptype2,sliding_window.sliding_window) S3method(vec_ptype2,sliding_window.tbl_df) S3method(vec_ptype2,tbl_df.apparent) S3method(vec_ptype2,tbl_df.bootstraps) S3method(vec_ptype2,tbl_df.group_vfold_cv) S3method(vec_ptype2,tbl_df.loo_cv) S3method(vec_ptype2,tbl_df.manual_rset) S3method(vec_ptype2,tbl_df.mc_cv) S3method(vec_ptype2,tbl_df.nested_cv) S3method(vec_ptype2,tbl_df.rolling_origin) S3method(vec_ptype2,tbl_df.sliding_index) S3method(vec_ptype2,tbl_df.sliding_period) S3method(vec_ptype2,tbl_df.sliding_window) S3method(vec_ptype2,tbl_df.validation_split) S3method(vec_ptype2,tbl_df.vfold_cv) S3method(vec_ptype2,validation_split.data.frame) S3method(vec_ptype2,validation_split.tbl_df) S3method(vec_ptype2,validation_split.validation_split) S3method(vec_ptype2,vfold_cv.data.frame) 
S3method(vec_ptype2,vfold_cv.tbl_df) S3method(vec_ptype2,vfold_cv.vfold_cv) S3method(vec_restore,apparent) S3method(vec_restore,bootstraps) S3method(vec_restore,group_vfold_cv) S3method(vec_restore,loo_cv) S3method(vec_restore,manual_rset) S3method(vec_restore,mc_cv) S3method(vec_restore,nested_cv) S3method(vec_restore,rolling_origin) S3method(vec_restore,sliding_index) S3method(vec_restore,sliding_period) S3method(vec_restore,sliding_window) S3method(vec_restore,validation_split) S3method(vec_restore,vfold_cv) export(.get_fingerprint) export(add_resample_id) export(all_of) export(analysis) export(any_of) export(apparent) export(assessment) export(bootstraps) export(caret2rsample) export(complement) export(contains) export(ends_with) export(everything) export(form_pred) export(gather) export(gather.rset) export(group_vfold_cv) export(initial_split) export(initial_time_split) export(int_bca) export(int_pctl) export(int_t) export(last_col) export(loo_cv) export(make_splits) export(make_strata) export(manual_rset) export(matches) export(mc_cv) export(nested_cv) export(new_rset) export(num_range) export(permutations) export(populate) export(pretty.apparent) export(pretty.bootstraps) export(pretty.group_vfold_cv) export(pretty.loo_cv) export(pretty.manual_rset) export(pretty.mc_cv) export(pretty.nested_cv) export(pretty.permutations) export(pretty.rolling_origin) export(pretty.sliding_index) export(pretty.sliding_period) export(pretty.sliding_window) export(pretty.validation_split) export(pretty.vfold_cv) export(reg_intervals) export(rolling_origin) export(rsample2caret) export(rset_reconstruct) export(sliding_index) export(sliding_period) export(sliding_window) export(starts_with) export(testing) export(tidy) export(training) export(validation_split) export(validation_time_split) export(vfold_cv) import(vctrs) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,arrange_) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) 
importFrom(dplyr,do) importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,inner_join) importFrom(dplyr,last) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(furrr,future_map_dfr) importFrom(generics,tidy) importFrom(methods,formalArgs) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_dbl) importFrom(purrr,map_df) importFrom(purrr,map_dfr) importFrom(purrr,map_lgl) importFrom(purrr,pluck) importFrom(rlang,"!!") importFrom(rlang,abort) importFrom(rlang,enquo) importFrom(rlang,exec) importFrom(rlang,is_call) importFrom(rlang,is_list) importFrom(rlang,is_string) importFrom(rlang,quos) importFrom(rlang,warn) importFrom(stats,pnorm) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,setNames) importFrom(stats,terms) importFrom(tibble,as_tibble) importFrom(tibble,is_tibble) importFrom(tibble,obj_sum) importFrom(tibble,tibble) importFrom(tibble,type_sum) importFrom(tidyr,gather) importFrom(tidyr,unnest) importFrom(tidyselect,all_of) importFrom(tidyselect,any_of) importFrom(tidyselect,contains) importFrom(tidyselect,ends_with) importFrom(tidyselect,everything) importFrom(tidyselect,last_col) importFrom(tidyselect,matches) importFrom(tidyselect,num_range) importFrom(tidyselect,one_of) importFrom(tidyselect,starts_with) importFrom(tidyselect,vars_select) importFrom(utils,globalVariables)