modeldata/ 0000755 0001762 0000144 00000000000 14634575727 012230 5 ustar ligges users modeldata/tests/ 0000755 0001762 0000144 00000000000 14534612370 013353 5 ustar ligges users modeldata/tests/testthat/ 0000755 0001762 0000144 00000000000 14634575727 015222 5 ustar ligges users modeldata/tests/testthat/test-simulations.R 0000644 0001762 0000144 00000013467 14534612370 020675 0 ustar ligges users test_that("classification simulation", {
# Checks sim_classification(): column naming under several `num_linear`
# settings, `keep_truth`, row counts, predictor types, class levels,
# and rejection of an unknown `method`.
set.seed(1)
dat_1 <- sim_classification(500, num_linear = 0)
dat_2 <- sim_classification(10, num_linear = 11)
dat_3 <- sim_classification(1000, num_linear = 1, intercept = 50)
dat_4 <- sim_classification(500, num_linear = 0, keep_truth = TRUE)
# With num_linear = 0, only the outcome and the fixed predictor set remain.
expect_equal(
names(dat_1),
c(
"class", "two_factor_1", "two_factor_2", "non_linear_1", "non_linear_2",
"non_linear_3"
)
)
# Linear predictors are appended with zero-padded names (linear_01, ...).
expect_equal(
names(dat_2),
c(
"class", "two_factor_1", "two_factor_2", "non_linear_1", "non_linear_2",
"non_linear_3", modeldata:::names0(11, "linear_")
)
)
# A single linear term gets an unpadded name.
expect_equal(
names(dat_3),
c(
"class", "two_factor_1", "two_factor_2", "non_linear_1", "non_linear_2",
"non_linear_3", "linear_1"
)
)
# keep_truth = TRUE adds a trailing `.truth` column.
expect_equal(
names(dat_4),
c(
"class", "two_factor_1", "two_factor_2", "non_linear_1", "non_linear_2",
"non_linear_3", ".truth"
)
)
expect_equal(nrow(dat_1), 500)
expect_equal(nrow(dat_2), 10)
expect_equal(nrow(dat_3), 1000)
# All predictors (everything except the first column) are numeric.
expect_true(all(vapply(dat_1[, -1], is.numeric, logical(1))))
# A large intercept is expected to produce no class_2 samples at all,
# while both factor levels are still present in the outcome.
expect_equal(sum(dat_3 == "class_2"), 0)
expect_equal(levels(dat_3$class), paste0("class_", 1:2))
# Unknown methods should fail via argument matching.
expect_error(
sim_classification(5, method = "potato"),
"must be one of"
)
})
test_that("sapp_2014_1 simulation", {
  # Simulate regression data with and without the retained truth column.
  set.seed(1)
  sim_plain <- sim_regression(10, method = "sapp_2014_1")
  sim_truth <- sim_regression(10, method = "sapp_2014_1", keep_truth = TRUE)

  # This method produces 20 predictors plus a numeric outcome.
  base_cols <- c("outcome", modeldata:::names0(20, "predictor_"))
  expect_equal(names(sim_plain), base_cols)
  expect_equal(names(sim_truth), c(base_cols, ".truth"))
  expect_equal(nrow(sim_plain), 10)
  expect_true(all(vapply(sim_plain, is.numeric, logical(1))))

  # Unknown methods should be rejected via argument matching.
  expect_error(sim_regression(5, method = "potato"), "must be one of")
})
test_that("sapp_2014_2 simulation", {
  # Simulate regression data with and without the retained truth column.
  set.seed(1)
  sim_plain <- sim_regression(10, method = "sapp_2014_2")
  sim_truth <- sim_regression(10, method = "sapp_2014_2", keep_truth = TRUE)

  # This method produces 200 predictors plus a numeric outcome.
  base_cols <- c("outcome", modeldata:::names0(200, "predictor_"))
  expect_equal(names(sim_plain), base_cols)
  expect_equal(names(sim_truth), c(base_cols, ".truth"))
  expect_equal(nrow(sim_plain), 10)
  expect_true(all(vapply(sim_plain, is.numeric, logical(1))))
})
test_that("van_der_laan_2007_1 simulation", {
  # Binary predictors come back as integers by default or as factors
  # when factors = TRUE; keep_truth appends a `.truth` column.
  set.seed(1)
  int_dat <- sim_regression(10, method = "van_der_laan_2007_1")
  fct_dat <- sim_regression(10, method = "van_der_laan_2007_1", factors = TRUE)
  truth_dat <- sim_regression(10, method = "van_der_laan_2007_1", keep_truth = TRUE)

  base_cols <- c("outcome", modeldata:::names0(10, "predictor_"))
  expect_equal(names(int_dat), base_cols)
  expect_equal(names(truth_dat), c(base_cols, ".truth"))
  expect_equal(nrow(int_dat), 10)

  expect_true(all(vapply(int_dat, is.numeric, logical(1))))
  expect_true(all(vapply(int_dat[, -1], is.integer, logical(1))))
  expect_true(all(vapply(fct_dat[, -1], is.factor, logical(1))))
  expect_equal(levels(fct_dat$predictor_01), c("yes", "no"))
})
test_that("van_der_laan_2007_2 simulation", {
  # Simulate regression data with and without the retained truth column.
  set.seed(1)
  sim_plain <- sim_regression(10, method = "van_der_laan_2007_2")
  sim_truth <- sim_regression(10, method = "van_der_laan_2007_2", keep_truth = TRUE)

  # This method produces 20 predictors plus a numeric outcome.
  base_cols <- c("outcome", modeldata:::names0(20, "predictor_"))
  expect_equal(names(sim_plain), base_cols)
  expect_equal(names(sim_truth), c(base_cols, ".truth"))
  expect_equal(nrow(sim_plain), 10)
  expect_true(all(vapply(sim_plain, is.numeric, logical(1))))
})
test_that("hooker_2004 simulation", {
  # Simulate regression data with and without the retained truth column.
  set.seed(1)
  sim_plain <- sim_regression(10, method = "hooker_2004")
  sim_truth <- sim_regression(10, method = "hooker_2004", keep_truth = TRUE)

  # This method produces 10 predictors plus a numeric outcome.
  base_cols <- c("outcome", modeldata:::names0(10, "predictor_"))
  expect_equal(names(sim_plain), base_cols)
  expect_equal(names(sim_truth), c(base_cols, ".truth"))
  expect_equal(nrow(sim_plain), 10)
  expect_true(all(vapply(sim_plain, is.numeric, logical(1))))
})
test_that("noise simulation", {
  # NOTE: keep the sim_noise() calls in this exact order so the RNG
  # stream after set.seed(1) matches the original test.
  set.seed(1)
  indep <- sim_noise(1000, num_vars = 10)
  exch <- sim_noise(1000, num_vars = 3, cov_param = .5)
  toep <- sim_noise(1000, num_vars = 3, cov_type = "toeplitz", cov_param = .99)
  cls_default <- sim_noise(10, num_vars = 3, outcome = "classification")
  cls_ten <- sim_noise(10, num_vars = 3, outcome = "classification", num_classes = 10)
  reg <- sim_noise(10, num_vars = 3, outcome = "regression")

  # Column naming, dimensions, and column types.
  expect_equal(names(indep), modeldata:::names0(10, "noise_"))
  expect_equal(names(exch), modeldata:::names0(3, "noise_"))
  expect_equal(nrow(indep), 1000)
  expect_equal(nrow(cls_default), 10)
  expect_true(all(vapply(indep, is.numeric, logical(1))))
  expect_true(all(vapply(indep[, -1], is.numeric, logical(1))))
  expect_true(is.factor(cls_ten$class))
  expect_true(all(vapply(reg, is.numeric, logical(1))))

  # Off-diagonal correlations should track the requested covariance
  # structure: near zero by default, near cov_param for "exchangeable",
  # and near the Toeplitz base for "toeplitz".
  cor_indep <- cor(indep)
  off_indep <- cor_indep[upper.tri(cor_indep)]
  expect_true(all(off_indep <= 0.1 & off_indep >= -0.1))

  cor_exch <- cor(exch)
  off_exch <- cor_exch[upper.tri(cor_exch)]
  expect_true(all(off_exch <= 0.6 & off_exch >= 0.4))

  cor_toep <- cor(toep)
  off_toep <- cor_toep[upper.tri(cor_toep)]
  expect_true(all(off_toep >= 0.95))

  # Class levels: two by default, or num_classes zero-padded names.
  expect_equal(levels(cls_default$class), paste0("class_", 1:2))
  expect_equal(levels(cls_ten$class), modeldata:::names0(10, "class_"))
})
test_that("logistic simulation", {
  # The equation can be given as a bare formula or a quoted expression;
  # keep_truth adds the linear predictor and truth columns.
  set.seed(1)
  from_formula <- sim_logistic(10, ~ A)
  from_expr <- sim_logistic(10, rlang::expr(~ B), keep_truth = TRUE)

  expect_equal(names(from_formula), c(LETTERS[1:2], "class"))
  expect_equal(names(from_expr), c(LETTERS[1:2], ".linear_pred", ".truth", "class"))
  expect_equal(nrow(from_formula), 10)
})
test_that("multinomial simulation", {
  # Equations may only reference A and B; using C must error
  # (checked against the recorded snapshot). This call intentionally
  # precedes set.seed() to preserve the original RNG stream.
  expect_snapshot_error(sim_multinomial(10, ~ A + C, ~ B, ~ A + B))

  set.seed(1)
  basic <- sim_multinomial(10, ~ A, ~ B, ~ A + B)
  with_truth <- sim_multinomial(10, ~ A, ~ B, ~ A + B, keep_truth = TRUE)

  expect_equal(names(basic), c(LETTERS[1:2], "class"))
  expect_equal(
    names(with_truth),
    c(LETTERS[1:2], "class", ".truth_one", ".truth_two", ".truth_three")
  )
  expect_equal(nrow(basic), 10)
})
modeldata/tests/testthat.R 0000644 0001762 0000144 00000000076 14534612370 015341 0 ustar ligges users library(testthat)
# Standard testthat entry point: discovers and runs everything under
# tests/testthat/ for the modeldata package.
library(modeldata)
test_check("modeldata")
modeldata/MD5 0000644 0001762 0000144 00000017210 14634575727 012541 0 ustar ligges users 686d73d3f809469c03481407907c547a *DESCRIPTION
a39f3682becaeb31a25af9d9e76f4504 *LICENSE
9d89d505dcfe61755cc4336d8bb5830e *NAMESPACE
f10fa88b35b553fb22ddfc70bb681ccb *NEWS.md
da59251dad6a0d68563c54111a7ea91e *R/Chicago.R
540cf5f818f50dbda8e2fa96069682af *R/Smithsonian.R
3d8a342edc3e896b3f7ea1b22f49ca80 *R/ad_data.R
5871ae54bb9380759d8aacaf6419090f *R/ames.R
b38510f96b36e0ad9b5ecf94c867fd9b *R/attrition.R
686582393047b676fff1e50b4a14ee6f *R/biomass.R
f9188ee978356c55ae8088405442b7ba *R/bivariate.R
b32ce35710019dcfcea823fe8dce2bcc *R/car_prices.R
f9ea90b73708de3fcc7ed499b60c1753 *R/cat_adoption.R
9106b13109c3ba0bb4d67b065c873d72 *R/cells.R
b62a6c4538c5f6576585765950615666 *R/check_times.R
fbbfe12cf6480066afdb22f2472b6d63 *R/chem_proc_yield.R
a5d3e77050cf6a26481ab227846e33dc *R/churn.R
9a95ab144cdf05ddc9ae6f8e22ac9c31 *R/concrete.R
6d8b0d1e44553c22a9a5c25c03ee96fc *R/covers.R
d164358c94aad14c959f2a383563c83d *R/credit_data.R
df9bf90249ae62b671fdc03a09cb62a9 *R/crickets.R
8451737e2b08be953f4c7c58aa31648f *R/deliveries.R
020a5319e4ee32ea134f91e74ede731d *R/drinks.R
bec2ffe714652563c310b55737ddee1c *R/fine_foods.R
37d2feb312ff8173089420c67ef364fa *R/grants.R
e4f1602fef91617b2ae1c7196dc38ddb *R/hepatic_injury_qsar.R
508eed26d31dbaec114f7780f9d40614 *R/hotel_rates.R
931e009add0654680844cefa82f21a08 *R/hpc_cv.R
9fb3028395bf15e185b60d49bc07aa1a *R/hpc_data.R
12f5c52188cd036f1a1cdf9619b6b88a *R/ischemic_stroke.R
37378ad30777d041580ea63ed02be34f *R/leaf_id_flavia.R
bb2dc8a98d999164aa22b7764875e82a *R/lending_club.R
8d01e787468effd71c08bdc40fed8921 *R/meats.R
c8bad8c9134fb7309d22e864be6fb229 *R/modeldata-package.R
d7a69ab2ab0dc970123983cda6337419 *R/oils.R
54780c36d8715ed72eadd92bcf5696ae *R/parabolic.R
e018567346a1139915b3533db0e130f1 *R/pathology.R
bd848447331e9cdb5dde901e493da54d *R/pd_speech.R
50e56cbb2fcd2c81eedaddb2d5f7c5fa *R/penguins.R
f1aafe8c7f81971da941612f7e1f7796 *R/permeability_qsar.R
d97789e088bf4a5c04b9a9626842c4a0 *R/reexports.R
19ef2bebeee54b50814c7054e7e70847 *R/sacremento.R
bf50595dd8c7317003f9593d4a9f5b71 *R/scat.R
446e8b66ca709f251778884f3dbdd417 *R/simulations.R
2eb872a5145efaa5561f0ca678dd2b59 *R/solubility.R
2b5fddae8728d85857ce1862eac1155c *R/stackoverflow.R
d1d485c16ccc64561f70bbc13ceae71b *R/steroidogenic_toxicity.R
c50b0b351187d7847172975acb3c4885 *R/tate_text.R
a69d55abd3ad106411d7b9b0b18575fd *R/taxi.R
96045b2cd6e57effe9a32c6966371642 *R/two_class_dat.R
1b38fb8a1e3c5d67c22b80ca810c787b *R/wa_churn.R
98b5ac8223b65e602d60992fdfafdd64 *README.md
f61db891eb928123aba4412ca322f508 *build/partial.rdb
590493d381409f05a467616ce039a786 *data/Chicago.rda
6b5fbfc2c53fdbfad061381207472e8b *data/Sacramento.RData
f1b618ee84ff76424f9fc2bf115198b4 *data/Smithsonian.RData
f3ba9c9bf6550cb165507706a6ba70f6 *data/ad_data.RData
d8728a5f02457b4c0da32c0573559dc9 *data/ames.rda
c63c6ce057163fcd18d2266147327b63 *data/attrition.RData
6e5be8779e1421983ce678317a357f98 *data/biomass.RData
8b9ea644bff0360d1b712959790d67b0 *data/bivariate.RData
93656e1436b24ae14595cb70ff81c784 *data/car_prices.RData
bbad41c4992afbb1897c9e7dd03f9b73 *data/cat_adoption.rda
487bde50c24a69acd1900c743e87e56c *data/cells.RData
161b69b4a8c5795dd01e1b11a74345e8 *data/check_times.rda
99b67eeee82707d131962198debd9b86 *data/chem_proc_yield.rda
7e0fc1d028ec9b7ac369757ff7767a61 *data/concrete.RData
284d16b49994189ad9ca68acb8370188 *data/covers.RData
64dddbafa12b116014ee324d478d1cb0 *data/credit_data.RData
97c1572e3483c222c6c798296e4f2e91 *data/crickets.rda
1ea66a8cb5b0f92d7c2a77d48597cf72 *data/datalist
fef56f1dc74fbdb2f07aa0048cbf9cc8 *data/deliveries.rda
adb3496011f0d4346cb8f2bd3d7a5a18 *data/drinks.rda
b4857b4763939ade4675ad559060b5e7 *data/grants.rda
4cbbfc56690d0e0f5160f5b33dcd4815 *data/hepatic_injury_qsar.rda
ea47fd07c08fe77749be3eb32460609c *data/hotel_rates.rda
c367f3840cfdc20e731d788648e29f8e *data/hpc_cv.rda
2615a95abd1963f56498e52d8a0bed24 *data/hpc_data.RData
7d2777190bebc52cdc733d0565da0e23 *data/ischemic_stroke.rda
988484a4569534958ea84f046897a079 *data/leaf_id_flavia.rda
1cec446b8e805c2e85a754dd859edaa2 *data/lending_club.rda
29d81fda141b85f90dc70651abd8c811 *data/meats.RData
04328d682ca97967930f7d0cd3b112f3 *data/mlc_churn.RData
328ae38e9030202b79341ec7e51ea8d2 *data/oils.RData
824ed4c858e77a001be83714bd92c73c *data/parabolic.rda
2a606ed519bae832d55166a2d0236899 *data/pathology.rda
f8ce20e02f6430dee048871e0037eb69 *data/pd_speech.rda
9dba1f6bb0fa3c914a563e26d925dc1c *data/penguins.rda
a2623c9fc4cbca43082814a9a50180db *data/permeability_qsar.rda
3f4ba2b8b7322a0080281945d6299ca7 *data/scat.RData
5d522c392655e8d1f4d54ab82527c4ed *data/small_fine_foods.RData
8a39da94ef41767ff5e3f524474a02b5 *data/solubility_test.rda
cd0d726b26be2228ab68c3488997efc1 *data/stackoverflow.rda
c1e8b46586163a7866d3dc3586bed526 *data/steroidogenic_toxicity.rda
413a9ebed9ec53580f753dd296cce227 *data/tate_text.rda
f3f56da5de044832a40f04f0a68df0b4 *data/taxi.rda
1bdd5dc55abc3b2f6997267df31d9b04 *data/two_class_dat.RData
2a5a5aa9196fe2284406a54f285e5882 *data/two_class_example.rda
08cf85ed74458a55e81a61d1a8f799fa *data/wa_churn.rda
591d5dd25150fa5d8d35fa337e251751 *man/Chicago.Rd
c9fef38db24668845f8fbf076316ff30 *man/Sacramento.Rd
a5b0b1de31ebbecb2bf7485779d1769a *man/Smithsonian.Rd
f5dbdfcc9c276f5a03b0afa175cd7943 *man/ad_data.Rd
c7ac71e1e34b8c99c47dba77328ab63c *man/ames.Rd
e28df6db1be1c0e4eaf8d5f3976652c5 *man/attrition.Rd
06de25ddabb9120fbac6d336a4306b3b *man/biomass.Rd
ab6d35b0ab54de0b2e3bad0e607994c5 *man/bivariate.Rd
fd1125aeb008045d60bb3cf33862c5b3 *man/car_prices.Rd
d55921d9f2e9a88c8455d5691b347401 *man/cat_adoption.Rd
8d0415b81281c65879c40cc685c86dd8 *man/cells.Rd
d201479e7dcc230a6439b4f27453e224 *man/check_times.Rd
38224d32ee0625cf3061f4746f658c59 *man/chem_proc_yield.Rd
a34cfeb3712717f8d8f3fc19c12a3263 *man/concrete.Rd
b0b0dc98cbfa5e159293cc67ce0215f9 *man/covers.Rd
1a5155f4aeb226097605b3effab8e6be *man/credit_data.Rd
8216b591f9300b6faacce223ae698bb7 *man/crickets.Rd
fbbbdf246ff0d0f7a1ded8cc2e36f6cb *man/deliveries.Rd
2b265b7e8f9f74fbdcf1a30e769c7176 *man/drinks.Rd
a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg
5c8591123d6d8d84b13b4eee2fc3f8d5 *man/grants.Rd
7a1f99ff1f83e9b228b10a97d9485834 *man/hepatic_injury_qsar.Rd
814ea6125ccf539957415c2f0814c508 *man/hotel_rates.Rd
694cfe7a3678428bc2e8b4474023248a *man/hpc_cv.Rd
f2f5a10d70193ddcd52e4cd6643759a8 *man/hpc_data.Rd
6c01639f8b019a5d9670a2b59c4ecee3 *man/ischemic_stroke.Rd
fcddcf41cf9f64dc1926c2ddfb3c3880 *man/leaf_id_flavia.Rd
93a1d984c794c1b13549dc58257b359d *man/lending_club.Rd
7aaec813e9f2310c1b52e475e1947f03 *man/meats.Rd
150b1f377152f0834e5e45c8add9e010 *man/mlc_churn.Rd
67741dc96e9e85a547669b4afa6ef1c6 *man/modeldata-package.Rd
bc5781f03986ae206db9b0d963c1bab8 *man/oils.Rd
302bb6e82efdbf6e5e4c90ec32024111 *man/parabolic.Rd
10ce10aa1b20707b3a3b3527a656a5fd *man/pathology.Rd
5cb84ef1c4d03e35034fc9929072c180 *man/pd_speech.Rd
ebe466adc1443ccb1f8ed5d133a9f8b6 *man/penguins.Rd
7cb40e8bb4786663188864618675379f *man/permeability_qsar.Rd
772c0db24485b252f175bad78554930c *man/reexports.Rd
568c0ce1cf7d4f74bbda8a139b41fe42 *man/rmd/ames.md
bc365d3d329ca038f4acb91379c52b6a *man/scat.Rd
240fae01073a878d4ce0e9d1ab0a4725 *man/sim_classification.Rd
a307b14278c84f680d54199681499eba *man/small_fine_foods.Rd
b6d77e683d4ec39729f6fd4bec160821 *man/solubility_test.Rd
789b27e129fa9d10c38b4510ab22e827 *man/stackoverflow.Rd
e81b6324ccdf8b06ce9723c9956eed19 *man/steroidogenic_toxicity.Rd
4c96a596b92ea3847c17fc0734a1b213 *man/tate_text.Rd
c558ff064d04188ad0dc208e85d6e839 *man/taxi.Rd
3eb4b3d614ad86fa9e558c9abaaa2861 *man/two_class_dat.Rd
4d02b053e56950753f2bfa99269d9c07 *man/two_class_example.Rd
ffd9b5cdc97b2b80d887d8e925d8bbc8 *man/wa_churn.Rd
19c826362e8686135f4bf67905291272 *tests/testthat.R
83a1a4bcc4bace1c04998861699ab8a4 *tests/testthat/test-simulations.R
modeldata/R/ 0000755 0001762 0000144 00000000000 14634545620 012416 5 ustar ligges users modeldata/R/taxi.R 0000644 0001762 0000144 00000002370 14534612370 013504 0 ustar ligges users #' Chicago taxi data set
#'
#' @description
#'
#' A data set containing information on a subset of taxi trips in the city
#' of Chicago in 2022.
#'
#' @name taxi
#' @aliases taxi
#' @docType data
#'
#' @return tibble
#'
#' @details
#'
#' The source data are originally described on the linked City of Chicago
#' data portal. The data exported here are a pre-processed subset motivated by
#' the modeling problem of predicting whether a rider will tip or not.
#'
#' \describe{
#' \item{tip}{Whether the rider left a tip. A factor with levels
#' "yes" and "no".}
#' \item{distance}{The trip distance, in odometer miles.}
#' \item{company}{The taxi company, as a factor. Companies that occurred
#' few times were binned as "other".}
#' \item{local}{Whether the trip's starting and ending locations are in the
#' same community. See the source data for community area values.}
#' \item{dow}{The day of the week in which the trip began, as a
#' factor.}
#' \item{month}{The month in which the trip began, as a factor.}
#' \item{hour}{The hour of the day in which the trip began, as a
#' numeric.}
#' }
#'
#' @source
#'
#' \url{https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew}
#'
#' @examples
#' \donttest{
#' taxi
#' }
NULL
modeldata/R/pathology.R 0000644 0001762 0000144 00000001126 14534612370 014543 0 ustar ligges users #' Liver pathology data
#'
#' @details These data have the results of an _x_-ray examination
#' to determine whether liver is abnormal or not (in the `scan`
#' column) versus the more extensive pathology results that
#' approximate the truth (in `pathology`).
#'
#' @name pathology
#' @aliases pathology
#' @docType data
#' @return \item{pathology}{a data frame}
#'
#' @source Altman, D.G., Bland, J.M. (1994) ``Diagnostic tests 1:
#' sensitivity and specificity,'' *British Medical Journal*,
#' vol 308, 1552.
#'
#'
#' @keywords datasets
#' @examples
#' data(pathology)
#' str(pathology)
NULL
modeldata/R/Smithsonian.R 0000644 0001762 0000144 00000000525 14534612370 015033 0 ustar ligges users #' Smithsonian museums
#'
#' Geocodes for the Smithsonian museums (circa 2018).
#'
#' @name Smithsonian
#' @aliases Smithsonian
#' @docType data
#' @return \item{Smithsonian}{a tibble}
#'
#' @source https://en.wikipedia.org/wiki/List_of_Smithsonian_museums
#'
#' @keywords datasets
#' @examples
#' data(Smithsonian)
#' str(Smithsonian)
NULL
modeldata/R/churn.R 0000644 0001762 0000144 00000001762 14534612370 013662 0 ustar ligges users #' Customer churn data
#'
#' A data set from the MLC++ machine learning software for modeling customer
#' churn. There are 19 predictors, mostly numeric: `state` (categorical),
#' `account_length` `area_code` `international_plan` (yes/no),
#' `voice_mail_plan` (yes/no), `number_vmail_messages`
#' `total_day_minutes` `total_day_calls` `total_day_charge`
#' `total_eve_minutes` `total_eve_calls` `total_eve_charge`
#' `total_night_minutes` `total_night_calls`
#' `total_night_charge` `total_intl_minutes`
#' `total_intl_calls` `total_intl_charge`, and
#' `number_customer_service_calls`.
#'
#' The outcome is contained in a column called `churn` (also yes/no).
#' A note in one of the source files states that the data are "artificial based
#' on claims similar to real world".
#'
#' @name mlc_churn
#' @aliases mlc_churn
#' @docType data
#' @return \item{mlc_churn}{a tibble}
#' @source Originally at `http://www.sgi.com/tech/mlc/`
#' @keywords datasets
#' @examples
#' data(mlc_churn)
#' str(mlc_churn)
NULL
modeldata/R/permeability_qsar.R 0000644 0001762 0000144 00000005126 14534612370 016255 0 ustar ligges users #' Predicting permeability from chemical information
#'
#' @description
#' A quantitative structure-activity relationship (QSAR) data set to predict
#' when a molecule can permeate cells.
#'
#' @name permeability_qsar
#' @aliases permeability_qsar
#' @docType data
#' @return \item{permeability_qsar}{a data frame}
#'
#' @details
#' This pharmaceutical data set was used to develop a model for predicting
#' compounds' permeability. In short, permeability is the measure of a
#' molecule's ability to cross a membrane. The body, for example, has notable
#' membranes between the body and brain, known as the blood-brain barrier, and
#' between the gut and body in the intestines. These membranes help the body
#' guard critical regions from receiving undesirable or detrimental substances.
#' For an orally taken drug to be effective in the brain, it first must pass
#' through the intestinal wall and then must pass through the blood-brain
#' barrier in order to be present for the desired neurological target.
#' Therefore, a compound's ability to permeate relevant biological membranes
#' is critically important to understand early in the drug discovery process.
#' Compounds that appear to be effective for a particular disease in research
#' screening experiments, but appear to be poorly permeable may need to be
#' altered in order to improve permeability, and thus the compound's ability to
#' reach the desired target. Identifying permeability problems can help guide
#' chemists towards better molecules.
#'
#' Permeability assays such as PAMPA and Caco-2 have been developed to help
#' measure compounds' permeability (Kansy et al, 1998). These screens are
#' effective at quantifying a compound's permeability, but the assay is
#' expensive and labor intensive. Given a sufficient number of compounds that have
#' been screened, we could develop a predictive model for permeability in an
#' attempt to potentially reduce the need for the assay. In this project there
#' were 165 unique compounds; 1107 molecular fingerprints were determined for
#' each. A molecular fingerprint is a binary sequence of numbers that
#' represents the presence or absence of a specific molecular sub-structure.
#' The response is highly skewed, the predictors are sparse (15.5% are present),
#' and many predictors are strongly associated.
#'
#' Columns:
#' \itemize{
#' \item \code{permeability}: numeric
#' \item \code{chem_fp_0001} - \code{chem_fp_1107}: numeric
#' }
#'
#' @source
#' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York:
#' Springer, 2013.
#'
#' @examples
#' data(permeability_qsar)
#' str(permeability_qsar)
#'
NULL
modeldata/R/solubility.R 0000644 0001762 0000144 00000001116 14534612370 014733 0 ustar ligges users #' Solubility predictions from MARS model
#'
#' @details For the solubility data in Kuhn and Johnson (2013),
#' these data are the test set results for the MARS model. The
#' observed solubility (in column `solubility`) and the model
#' results (`prediction`) are contained in the data.
#'
#' @name solubility_test
#' @aliases solubility_test
#' @docType data
#' @return \item{solubility_test}{a data frame}
#'
#' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive
#' Modeling*, Springer
#'
#' @keywords datasets
#' @examples
#' data(solubility_test)
#' str(solubility_test)
NULL
modeldata/R/hotel_rates.R 0000644 0001762 0000144 00000002374 14534612370 015054 0 ustar ligges users #' Daily Hotel Rate Data
#'
#' @description
#' A data set to predict the average daily rate for a hotel in Lisbon Portugal.
#'
#' @name hotel_rates
#' @aliases hotel_rates
#' @docType data
#'
#' @details
#'
#' Data are originally described in Antonio, de Almeida, and Nunes (2019).
#' This version of the data is filtered for one hotel (the "Resort Hotel") and
#' is intended as regression data set for predicting the average daily rate for
#' a room. The data are post-2016; the 2016 data were used to have a predictor
#' for the historical daily rates. See the `hotel_rates.R` file in the
#' `data-raw` directory of the package to understand other filters used when
#' creating this version of the data.
#'
#' The `agent` and `company` fields were changed from random characters to use
#' a set of random names.
#'
#' The outcome column is `avg_price_per_room`.
#'
#' ## License
#'
#' No license was given for the data; See the reference below for source.
#'
#' @source
#' \url{https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11}
#'
#' @references
#' Antonio, N., de Almeida, A., and Nunes, L. (2019). Hotel booking demand
#' datasets. _Data in Brief_, 22, 41-49.
#'
#' @keywords datasets
#' @examples
#' \dontrun{
#' str(hotel_rates)
#' }
NULL
modeldata/R/ames.R 0000644 0001762 0000144 00000001562 14534612370 013466 0 ustar ligges users #' Ames Housing Data
#'
#' A data set from De Cock (2011) in which 82 fields were recorded for 2,930
#' properties in Ames, IA. This version is copied from the `AmesHousing` package
#' but does not include a few quality columns that appear to be outcomes
#' rather than predictors.
#'
#' See the source links below for more information, as well as
#' `?AmesHousing::make_ames`.
#'
#' @includeRmd man/rmd/ames.md details
#'
#' @name ames
#' @aliases ames
#' @docType data
#' @return \item{ames}{a tibble}
#' @source De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," \emph{Journal of Statistics Education}, Volume 19, Number 3.
#'
#' \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt}
#'
#' \url{http://jse.amstat.org/v19n3/decock.pdf}
#' @keywords datasets
#' @examples
#' data(ames)
#' str(ames)
NULL
modeldata/R/scat.R 0000644 0001762 0000144 00000001151 14534612370 013465 0 ustar ligges users #' Morphometric data on scat
#'
#' Reid (2015) collected data on animal feces in coastal California. The data
#' consist of DNA verified species designations as well as fields related to
#' the time and place of the collection and the scat itself. The data are on
#' the three main species.
#'
#'
#' @name scat
#' @aliases scat
#' @docType data
#' @return \item{scat}{a tibble}
#' @source Reid, R. E. B. (2015). A morphometric modeling approach to
#' distinguishing among bobcat, coyote and gray fox scats. \emph{Wildlife
#' Biology}, 21(5), 254-262
#' @keywords datasets
#' @examples
#' data(scat)
#' str(scat)
NULL
modeldata/R/wa_churn.R 0000644 0001762 0000144 00000001311 14534612370 014337 0 ustar ligges users #' Watson churn data
#'
#' @details These data were downloaded from the IBM Watson site
#' (see below) in September 2018. The data contain a factor for
#' whether a customer churned or not. Alternatively, the `tenure`
#' column presumably contains information on how long the customer
#' has had an account. A survival analysis can be done on this
#' column using the `churn` outcome as the censoring information. A
#' data dictionary can be found on the source website.
#'
#' @name wa_churn
#' @aliases wa_churn
#' @docType data
#' @return \item{wa_churn}{a data frame}
#'
#' @source IBM Watson Analytics https://ibm.co/2sOvyvy
#'
#' @keywords datasets
#' @examples
#' data(wa_churn)
#' str(wa_churn)
NULL
modeldata/R/attrition.R 0000644 0001762 0000144 00000001442 14534612370 014553 0 ustar ligges users #' Job attrition
#'
#' @details These data are from the IBM Watson Analytics Lab.
#' The website describes the data with \dQuote{Uncover the
#' factors that lead to employee attrition and explore important
#' questions such as \sQuote{show me a breakdown of distance
#' from home by job role and attrition} or \sQuote{compare
#' average monthly income by education and attrition}. This is a
#' fictional data set created by IBM data scientists.}. There
#' are 1470 rows.
#'
#' @name attrition
#' @aliases attrition
#' @docType data
#' @return \item{attrition}{a data frame}
#'
#' @source The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/
#'
#'
#' @keywords datasets
#' @examples
#' data(attrition)
#' str(attrition)
NULL
modeldata/R/car_prices.R 0000644 0001762 0000144 00000001577 14534612370 014661 0 ustar ligges users #' Kelly Blue Book resale data for 2005 model year GM cars
#'
#' Kuiper (2008) collected data on Kelly Blue Book resale data for 804 GM cars (2005 model year).
#'
#' @name car_prices
#' @docType data
#' @return \item{car_prices}{data frame of the suggested retail price (column \code{Price}) and various
#' characteristics of each car (columns \code{Mileage}, \code{Cylinder}, \code{Doors}, \code{Cruise},
#' \code{Sound}, \code{Leather}, \code{Buick}, \code{Cadillac}, \code{Chevy}, \code{Pontiac}, \code{Saab},
#' \code{Saturn}, \code{convertible}, \code{coupe}, \code{hatchback}, \code{sedan} and \code{wagon})}
#' @source Kuiper, S. (2008). Introduction to Multiple Regression: How Much Is Your Car Worth?,
#' \emph{Journal of Statistics Education}, Vol. 16
#' \url{http://jse.amstat.org/jse_archive.htm#2008}.
#' @keywords datasets
#' @examples
#' data(car_prices)
#' str(car_prices)
NULL
modeldata/R/tate_text.R 0000644 0001762 0000144 00000001354 14534612370 014541 0 ustar ligges users #' Tate Gallery modern artwork metadata
#'
#' Metadata such as artist, title, and year created for recent artworks owned
#' by the Tate Gallery. Only artworks created during or after 1990 are
#' included, and the metadata source was last updated in 2014. The Tate Gallery
#' provides these data but requests users to be respectful of their
#' [guidelines for use](https://github.com/tategallery/collection#usage-guidelines-for-open-data).
#'
#' @name tate_text
#' @aliases tate_text
#' @docType data
#' @return \item{tate_text}{a tibble}
#'
#' @source \itemize{
#' \item \url{https://github.com/tategallery/collection}
#' \item \url{https://www.tate.org.uk/}
#' }
#'
#' @keywords datasets
#' @examples
#' data(tate_text)
#' str(tate_text)
NULL
modeldata/R/meats.R 0000644 0001762 0000144 00000002451 14634543650 013655 0 ustar ligges users #' Fat, water and protein content of meat samples
#'
#' "These data are recorded on a Tecator Infratec Food and Feed Analyzer
#' working in the wavelength range 850 - 1050 nm by the Near Infrared
#' Transmission (NIT) principle. Each sample contains finely chopped pure meat
#' with different moisture, fat and protein contents.
#'
#' If results from these data are used in a publication we want you to mention
#' the instrument and company name (Tecator) in the publication. In addition,
#' please send a preprint of your article to:
#'
#' Karin Thente, Tecator AB, Box 70, S-263 21 Hoganas, Sweden
#'
#' The data are available in the public domain with no responsibility from the
#' original data source. The data can be redistributed as long as this
#' permission note is attached."
#'
#' "For each meat sample the data consists of a 100 channel spectrum of
#' absorbances and the contents of moisture (water), fat and protein. The
#' absorbance is -log10 of the transmittance measured by the spectrometer. The
#' three contents, measured in percent, are determined by analytic chemistry."
#'
#' Included here are the training, monitoring and test sets.
#'
#'
#' @name meats
#' @aliases meats
#' @docType data
#' @return \item{meats}{a tibble}
#' @keywords datasets
#' @examples
#'
#' data(meats)
#' str(meats)
NULL
modeldata/R/hpc_cv.R 0000644 0001762 0000144 00000001427 14534612370 014003 0 ustar ligges users #' Class probability predictions
#'
#' @details This data frame contains the predicted classes and
#' class probabilities for a linear discriminant analysis model fit
#' to the HPC data set from Kuhn and Johnson (2013). These data are
#' the assessment sets from a 10-fold cross-validation scheme. The
#' data column columns for the true class (`obs`), the class
#' prediction (`pred`) and columns for each class probability
#' (columns `VF`, `F`, `M`, and `L`). Additionally, a column for
#' the resample indicator is included.
#'
#' @name hpc_cv
#' @aliases hpc_cv
#' @docType data
#' @return \item{hpc_cv}{a data frame}
#'
#' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive
#' Modeling*, Springer
#'
#' @keywords datasets
#' @examples
#' data(hpc_cv)
#' str(hpc_cv)
NULL
modeldata/R/credit_data.R 0000644 0001762 0000144 00000000756 14534612370 015010 0 ustar ligges users #' Credit data
#'
#' These data are from the website of Dr. Lluís A. Belanche Muñoz by way of a
#' github repository of Dr. Gaston Sanchez. One data point with a missing outcome
#' was removed from the original data.
#'
#' @name credit_data
#' @aliases credit_data
#' @docType data
#' @return \item{credit_data}{a data frame}
#'
#' @source https://github.com/gastonstat/CreditScoring,
#' http://bit.ly/2kkBFrk
#'
#' @keywords datasets
#' @examples
#' data(credit_data)
#' str(credit_data)
NULL
modeldata/R/simulations.R 0000644 0001762 0000144 00000053521 14534612370 015112 0 ustar ligges users #' Simulate datasets
#'
#' These functions can be used to generate simulated data for supervised
#' (classification and regression) and unsupervised modeling applications.
#'
#' @param num_samples Number of data points to simulate.
#' @param method A character string for the simulation method. For
#' classification, the single current option is "caret". For regression,
#' values can be "sapp_2014_1", "sapp_2014_2", "van_der_laan_2007_1", or
#' "van_der_laan_2007_2". See Details below.
#' @param intercept The intercept for the linear predictor.
#' @param num_linear Number of diminishing linear effects.
#' @param std_dev Gaussian distribution standard deviation for residuals.
#' Default values are shown below in Details.
#' @param num_vars Number of noise predictors to create.
#' @param cov_type The multivariate normal correlation structure of the
#' predictors. Possible values are "exchangeable" and "toeplitz".
#' @param cov_param A single numeric value for the exchangeable correlation
#' value or the base of the Toeplitz structure. See Details below.
#' @param factors A single logical for whether the binary indicators should be
#' encoded as factors or not.
#' @param outcome A single character string for what type of independent outcome
#' should be simulated (if any). The default value of "none" produces no extra
#' columns. Using "classification" will generate a `class` column with
#' `num_classes` values, equally distributed. A value of "regression" results
#' in an `outcome` column that contains independent standard normal values.
#' @param num_classes When `outcome = "classification"`, the number of classes
#' to simulate.
#' @param keep_truth A logical: should the true outcome value be retained for
#' the data? If so, the column name is `.truth`.
#' @param eqn,eqn_1,eqn_2,eqn_3 An R expression or (one sided) formula that
#' only involves variables `A` and `B` that is used to compute the linear
#' predictor. External objects should not be used as symbols; see the examples
#' below on how to use external objects in the equations.
#' @param correlation A single numeric value for the correlation between variables
#' `A` and `B`.
#'
#' @details
#'
#' ## Specific Regression and Classification methods
#'
#' These functions provide several supervised simulation methods (and one
#' unsupervised). Learn more by `method`:
#'
#' ### `method = "caret"`
#'
#' This is a simulated classification problem with two classes, originally
#' implemented in [caret::twoClassSim()] with all numeric predictors. The
#' predictors are simulated in different sets. First, two multivariate normal
#' predictors (denoted here as `two_factor_1` and `two_factor_2`) are created
#' with a correlation of about 0.65. They change the log-odds using main
#' effects and an interaction:
#'
#' \preformatted{ intercept - 4 * two_factor_1 + 4 * two_factor_2 + 2 * two_factor_1 * two_factor_2 }
#'
#' The intercept is a parameter for the simulation and can be used to control
#' the amount of class imbalance.
#'
#' The second set of effects are linear with coefficients that alternate signs
#' and have a sequence of values between 2.5 and 0.25. For example, if there
#' were four predictors in this set, their contribution to the log-odds would
#' be
#'
#' \preformatted{ -2.5 * linear_1 + 1.75 * linear_2 -1.00 * linear_3 + 0.25 * linear_4}
#'
#' (Note that these column names may change based on the value of `num_linear`).
#'
#' The third set is a nonlinear function of a single predictor ranging between
#' `[0, 1]` called `non_linear_1` here:
#'
#' \preformatted{ (non_linear_1^3) + 2 * exp(-6 * (non_linear_1 - 0.3)^2) }
#'
#' The fourth set of informative predictors are copied from one of Friedman's
#' systems and use two more predictors (`non_linear_2` and `non_linear_3`):
#'
#' \preformatted{ 2 * sin(non_linear_2 * non_linear_3) }
#'
#' All of these effects are added up to model the log-odds.
#'
#' ### `method = "sapp_2014_1"`
#'
#' This regression simulation is from Sapp et al. (2014). There are 20
#' independent Gaussian random predictors with mean zero and a variance of 9.
#' The prediction equation is:
#'
#' \preformatted{
#' predictor_01 + sin(predictor_02) + log(abs(predictor_03)) +
#' predictor_04^2 + predictor_05 * predictor_06 +
#' ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) +
#' ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) +
#' sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) +
#' ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) -
#' 2 * predictor_18 - predictor_19 * predictor_20
#' }
#'
#' The error is Gaussian with mean zero and variance 9.
#'
#' ### `method = "sapp_2014_2"`
#'
#' This regression simulation is also from Sapp et al. (2014). There are 200
#' independent Gaussian predictors with mean zero and variance 16. The
#' prediction equation has an intercept of one and identical linear effects of
#' `log(abs(predictor))`.
#'
#' The error is Gaussian with mean zero and variance 25.
#'
#' ### `method = "van_der_laan_2007_1"`
#'
#' This is a regression simulation from van der Laan et al. (2007) with ten
#' random Bernoulli variables that have a 40% probability of being a value of
#' one. The true regression equation is:
#'
#' \preformatted{
#' 2 * predictor_01 * predictor_10 + 4 * predictor_02 * predictor_07 +
#' 3 * predictor_04 * predictor_05 - 5 * predictor_06 * predictor_10 +
#' 3 * predictor_08 * predictor_09 + predictor_01 * predictor_02 * predictor_04 -
#' 2 * predictor_07 * (1 - predictor_06) * predictor_02 * predictor_09 -
#' 4 * (1 - predictor_10) * predictor_01 * (1 - predictor_04)
#' }
#'
#' The error term is standard normal.
#'
#' ### `method = "van_der_laan_2007_2"`
#'
#' This is another regression simulation from van der Laan et al. (2007) with
#' twenty Gaussians with mean zero and variance 16. The prediction equation is:
#'
#' \preformatted{
#' predictor_01 * predictor_02 + predictor_10^2 - predictor_03 * predictor_17 -
#' predictor_15 * predictor_04 + predictor_09 * predictor_05 + predictor_19 -
#' predictor_20^2 + predictor_09 * predictor_08
#' }
#'
#' The error term is also Gaussian with mean zero and variance 16.
#'
#' ### `method = "hooker_2004"`
#'
#' Hooker (2004) and Sorokina _at al_ (2008) used the following:
#'
#' \preformatted{
#' pi ^ (predictor_01 * predictor_02) * sqrt( 2 * predictor_03 ) -
#' asin(predictor_04) + log(predictor_03 + predictor_05) -
#' (predictor_09 / predictor_10) * sqrt (predictor_07 / predictor_08) -
#' predictor_02 * predictor_07
#' }
#'
#' Predictors 1, 2, 3, 6, 7, and 9 are standard uniform while the others are
#' uniform on `[0.6, 1.0]`. The errors are normal with mean zero and default
#' standard deviation of 0.25.
#'
#' ## `sim_noise()`
#'
#' This function simulates a number of random normal variables with mean zero.
#' The values can be independent if `cov_param = 0`. Otherwise the values are
#' multivariate normal with non-diagonal covariance matrices. For
#' `cov_type = "exchangeable"`, the structure has unit variances and covariances
#' of `cov_param`. With `cov_type = "toeplitz"`, the covariances have an
#' exponential pattern (see example below).
#'
#' ## Logistic simulation
#'
#' `sim_logistic()` provides a flexible interface to simulating a logistic
#' regression model with two multivariate normal variables `A` and `B` (with
#' zero mean, unit variances and correlation determined by the `correlation`
#' argument).
#'
#' For example, using `eqn = A + B` would specify that the true probability of
#' the event was
#'
#' \preformatted{
#' prob = 1 / (1 + exp(-(A + B)))
#' }
#'
#' The class levels for the outcome column are `"one"` and `"two"`.
#'
#' ## Multinomial simulation
#'
#' `sim_multinomial()` can generate data with classes `"one"`, `"two"`, and
#' `"three"` based on the values in arguments `eqn_1`, `eqn_2`, and `eqn_3`,
#' respectfully. Like [sim_logistic()] these equations use predictors `A` and
#' `B`.
#'
#' The individual equations are evaluated and exponentiated. After this, their
#' values are, for each row of data, normalized to add up to one. These
#' probabilities are them passed to [stats::rmultinom()] to generate the outcome
#' values.
#'
#' @references
#' Van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). Super learner.
#' _Statistical applications in genetics and molecular biology_, 6(1).
#' DOI: 10.2202/1544-6115.1309.
#'
#' Sapp, S., van der Laan, M. J., & Canny, J. (2014). Subsemble: an ensemble
#' method for combining subset-specific algorithm fits. _Journal of applied
#' statistics_, 41(6), 1247-1259. DOI: 10.1080/02664763.2013.864263
#'
#' Hooker, G. (2004, August). Discovering additive structure in black box
#' functions. In _Proceedings of the tenth ACM SIGKDD international conference
#' on Knowledge discovery and data mining_ (pp. 575-580).
#' DOI: 10.1145/1014052.1014122
#'
#' Sorokina, D., Caruana, R., Riedewald, M., & Fink, D. (2008, July). Detecting
#' statistical interactions with additive groves of trees. In _Proceedings of
#' the 25th international conference on Machine learning_ (pp. 1000-1007).
#' DOI: 10.1145/1390156.1390282
#'
#' @examples
#' set.seed(1)
#' sim_regression(100)
#' sim_classification(100)
#'
#' # Flexible logistic regression simulation
#' if (rlang::is_installed("ggplot2")) {
#' library(dplyr)
#' library(ggplot2)
#'
#' sim_logistic(1000, ~ .1 + 2 * A - 3 * B + 1 * A *B, corr = .7) %>%
#' ggplot(aes(A, B, col = class)) +
#' geom_point(alpha = 1/2) +
#' coord_equal()
#'
#' f_xor <- ~ 10 * xor(A > 0, B < 0)
#' # or
#' f_xor <- rlang::expr(10 * xor(A > 0, B < 0))
#'
#' sim_logistic(1000, f_xor, keep_truth = TRUE) %>%
#' ggplot(aes(A, B, col = class)) +
#' geom_point(alpha = 1/2) +
#' coord_equal() +
#' theme_bw()
#' }
#'
#' ## How to use external symbols:
#'
#' a_coef <- 2
#' # splice the value in using rlang's !! operator
#' lp_eqn <- rlang::expr(!!a_coef * A+B)
#' lp_eqn
#' sim_logistic(5, lp_eqn)
#'
#' # Flexible multinomial regression simulation
#' if (rlang::is_installed("ggplot2")) {
#'
#' set.seed(2)
#' three_classes <-
#'   sim_multinomial(
#'     1000,
#'     ~ -0.5 + 0.6 * abs(A),
#'     ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),
#'     ~ -0.6 * A + 0.50 * B - A * B)
#'
#' three_classes %>%
#'   ggplot(aes(A, B, col = class, pch = class)) +
#'   geom_point(alpha = 3/4) +
#'   facet_wrap(~ class) +
#'   coord_equal() +
#'   theme_bw()
#' }
#' @export
sim_classification <- function(num_samples = 100, method = "caret",
                               intercept = -5, num_linear = 10,
                               keep_truth = FALSE) {
  # Simulate a two-class data set in the style of caret::twoClassSim().
  #
  # num_samples: number of rows to generate.
  # method:      simulation method; only "caret" is implemented.
  # intercept:   intercept of the linear predictor (controls class imbalance).
  # num_linear:  number of extra linear predictors with diminishing effects.
  # keep_truth:  if TRUE, retain the true event probability as `.truth`.
  method <- rlang::arg_match0(method, "caret", arg_nm = "method")
  if (method == "caret") {
    # Two correlated normal predictors for the main effects + interaction.
    var_cov <- matrix(c(2, 1.3, 1.3, 2), 2, 2)
    dat <- MASS::mvrnorm(n = num_samples, c(0, 0), var_cov)
    # One uniform on [-1, 1) for the first nonlinear term.
    dat <- cbind(dat, stats::runif(num_samples, min = -1))
    # Two standard uniforms for the Friedman-style sin() term.
    dat <- cbind(dat, matrix(stats::runif(num_samples * 2), ncol = 2))
    colnames(dat) <- c(paste0("two_factor_", 1:2), paste0("non_linear_", 1:3))
    linear_pred <-
      rlang::expr(
        !!intercept - 4 * two_factor_1 + 4 * two_factor_2 +
          2 * two_factor_1 * two_factor_2 +
          (non_linear_1^3) + 2 * exp(-6 * (non_linear_1 - 0.3)^2) +
          2 * sin(pi * non_linear_2 * non_linear_3)
      )
    if (num_linear > 0) {
      # Linear effects with alternating signs; coefficients run from 2.5
      # down to 0.25.
      dat_linear <- matrix(stats::rnorm(num_samples * num_linear), ncol = num_linear)
      lin_names <- names0(num_linear, "linear_")
      colnames(dat_linear) <- lin_names
      lin_symbols <- rlang::syms(lin_names)
      lin_coefs <-
        seq(10, 1, length = num_linear) / 4 *
          rep_len(c(-1, 1), length.out = num_linear)
      lin_expr <-
        purrr::map2(lin_coefs, lin_symbols, ~ rlang::expr(!!.x * !!.y)) %>%
        purrr::reduce(function(l, r) rlang::expr(!!l + !!r))
      # Bug fix: fold the linear terms into `linear_pred`. Previously this
      # was assigned to an unused local `.truth`, so the linear predictors
      # never contributed to the simulated log-odds.
      linear_pred <- rlang::expr(!!linear_pred + !!lin_expr)
      dat <- cbind(dat, dat_linear)
    }
  }
  dat <-
    tibble::as_tibble(dat) %>%
    dplyr::mutate(
      linear_pred = rlang::eval_tidy(linear_pred, data = .),
      # Convert the log-odds to an event probability (logistic inverse link).
      .truth = stats::binomial()$linkinv(linear_pred),
      rand = stats::runif(num_samples),
      class = ifelse(rand <= .truth, "class_1", "class_2"),
      class = factor(class, levels = c("class_1", "class_2"))
    ) %>%
    dplyr::select(-linear_pred, -rand) %>%
    dplyr::relocate(class)
  if (!keep_truth) {
    dat <- dplyr::select(dat, -.truth)
  }
  dat
}
#' @export
#' @rdname sim_classification
sim_regression <-
  function(num_samples = 100, method = "sapp_2014_1", std_dev = NULL, factors = FALSE, keep_truth = FALSE) {
    # Simulate regression data using one of several published systems.
    #
    # `method` selects the generator; `std_dev` overrides that generator's
    # default residual SD; `factors` is only used by "van_der_laan_2007_1";
    # `keep_truth` retains the noiseless `.truth` column.
    allowed <- c(
      "sapp_2014_1", "sapp_2014_2", "van_der_laan_2007_1",
      "van_der_laan_2007_2", "hooker_2004"
    )
    method <- rlang::arg_match0(method, allowed, arg_nm = "method")
    # Dispatch to the generator for the requested simulation system.
    res <-
      switch(
        method,
        sapp_2014_1 = sapp_2014_1(num_samples, std_dev),
        sapp_2014_2 = sapp_2014_2(num_samples, std_dev),
        van_der_laan_2007_1 = van_der_laan_2007_1(num_samples, std_dev, factors = factors),
        van_der_laan_2007_2 = van_der_laan_2007_2(num_samples, std_dev),
        hooker_2004 = hooker_2004(num_samples, std_dev)
      )
    if (keep_truth) {
      res
    } else {
      dplyr::select(res, -.truth)
    }
  }
sapp_2014_1 <- function(num_samples = 100, std_dev = NULL) {
  # Regression simulation from Sapp et al. (2014): twenty independent
  # N(0, sd = 3) predictors with a nonlinear/interaction truth equation.
  # The residual SD defaults to 3.
  if (is.null(std_dev)) {
    std_dev <- 3
  }
  preds <- matrix(stats::rnorm(num_samples * 20, sd = 3), ncol = 20)
  colnames(preds) <- names0(20, "predictor_")
  # The true regression function from the paper.
  true_eqn <- rlang::expr(
    predictor_01 + sin(predictor_02) + log(abs(predictor_03)) +
      predictor_04^2 + predictor_05 * predictor_06 +
      ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) +
      ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) +
      sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) +
      ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) -
      2 * predictor_18 - predictor_19 * predictor_20
  )
  tibble::as_tibble(preds) %>%
    dplyr::mutate(
      .truth = rlang::eval_tidy(true_eqn, data = .),
      outcome = .truth + stats::rnorm(num_samples, sd = std_dev)
    ) %>%
    dplyr::relocate(outcome)
}
sapp_2014_2 <- function(num_samples = 100, std_dev = NULL) {
  # Regression simulation from Sapp et al. (2014): 200 independent
  # N(0, sd = 4) predictors with identical log(abs(x)) linear effects; the
  # noisy outcome has 1 subtracted. Residual SD defaults to 5 (variance 25).
  #
  # Fix: the default was previously `std_dev = 4`, which made the
  # `is.null()` branch (the documented default of 5) unreachable when the
  # function is called without `std_dev`, and was inconsistent with every
  # other generator in this file (all default to NULL).
  if (is.null(std_dev)) {
    std_dev <- 5
  }
  dat <- matrix(stats::rnorm(num_samples * 200, sd = 4), ncol = 200)
  colnames(dat) <- names0(200, "predictor_")
  # Identical linear effects of log(abs(x)) for every predictor.
  slc_14 <- function(x) sum(log(abs(x)))
  .truth <- apply(dat, 1, slc_14)
  y <- .truth + stats::rnorm(num_samples, sd = std_dev) - 1
  dat <- tibble::as_tibble(dat)
  dat$outcome <- y
  dat$.truth <- .truth
  dplyr::relocate(dat, outcome)
}
van_der_laan_2007_1 <- function(num_samples = 100, std_dev = NULL, factors = FALSE) {
  # Regression simulation from van der Laan et al. (2007): ten Bernoulli(0.4)
  # predictors and an interaction-heavy truth equation. Residual SD defaults
  # to 1. When `factors = TRUE`, the 0/1 predictors become yes/no factors.
  if (is.null(std_dev)) {
    std_dev <- 1
  }
  binary <- matrix(stats::rbinom(num_samples * 10, size = 1, prob = .4), ncol = 10)
  colnames(binary) <- names0(10, "predictor_")
  truth_eqn <- rlang::expr(
    2 * predictor_01 * predictor_10 + 4 * predictor_02 * predictor_07 + 3 * predictor_04 *
      predictor_05 - 5 * predictor_06 * predictor_10 + 3 * predictor_08 * predictor_09 +
      predictor_01 * predictor_02 * predictor_04 -
      2 * predictor_07 * (1 - predictor_06) * predictor_02 *
      predictor_09 - 4 * (1 - predictor_10) * predictor_01 * (1 - predictor_04)
  )
  res <-
    tibble::as_tibble(binary) %>%
    dplyr::mutate(
      .truth = rlang::eval_tidy(truth_eqn, data = .),
      outcome = .truth + stats::rnorm(num_samples, sd = std_dev)
    ) %>%
    dplyr::relocate(outcome)
  if (factors) {
    # Columns 2:11 are the predictors (column 1 is `outcome` after relocate).
    res <-
      res %>%
      dplyr::mutate(
        dplyr::across(2:11, ~ ifelse(.x == 1, "yes", "no")),
        dplyr::across(2:11, ~ factor(.x, levels = c("yes", "no")))
      )
  }
  res
}
van_der_laan_2007_2 <- function(num_samples = 100, std_dev = NULL) {
  # Regression simulation from van der Laan et al. (2007): twenty independent
  # N(0, sd = 4) predictors. Residual SD defaults to 4 (variance 16).
  if (is.null(std_dev)) {
    std_dev <- 4
  }
  preds <- matrix(stats::rnorm(num_samples * 20, sd = 4), ncol = 20)
  colnames(preds) <- names0(20, "predictor_")
  truth_eqn <- rlang::expr(
    predictor_01 * predictor_02 + predictor_10^2 - predictor_03 * predictor_17 -
      predictor_15 * predictor_04 + predictor_09 * predictor_05 + predictor_19 -
      predictor_20^2 + predictor_09 * predictor_08
  )
  tibble::as_tibble(preds) %>%
    dplyr::mutate(
      .truth = rlang::eval_tidy(truth_eqn, data = .),
      outcome = .truth + stats::rnorm(num_samples, sd = std_dev)
    ) %>%
    dplyr::relocate(outcome)
}
# TODO see table 1 of Detecting Statistical Interactions from Neural Network Weights for more
hooker_2004 <- function(num_samples = 100, std_dev = NULL) {
  # Regression simulation from Hooker (2004) / Sorokina et al. (2008).
  # Predictors 1, 2, 3, 6, 7, 9 are standard uniform; 4, 5, 8, 10 are
  # uniform on [0.6, 1]. Residual SD defaults to 0.25.
  if (is.null(std_dev)) {
    std_dev <- 1 / 4
  }
  std_unif <- matrix(stats::runif(num_samples * 6), ncol = 6)
  shifted_unif <- matrix(stats::runif(num_samples * 4, min = 0.6), ncol = 4)
  all_names <- names0(10, "predictor_")
  colnames(std_unif) <- all_names[c(1, 2, 3, 6, 7, 9)]
  colnames(shifted_unif) <- all_names[c(4, 5, 8, 10)]
  # Reassemble columns in predictor_01 ... predictor_10 order.
  combined <-
    tibble::as_tibble(cbind(std_unif, shifted_unif)) %>%
    dplyr::select(dplyr::all_of(all_names))
  # Named to avoid shadowing the function itself (the original reused the
  # function's own name for this local expression).
  truth_eqn <- rlang::expr(
    pi ^ (predictor_01 * predictor_02) * sqrt( 2 * predictor_03 ) -
      asin(predictor_04) + log(predictor_03 + predictor_05) -
      (predictor_09 / predictor_10) * sqrt (predictor_07 / predictor_08) -
      predictor_02 * predictor_07
  )
  combined %>%
    dplyr::mutate(
      .truth = rlang::eval_tidy(truth_eqn, data = .),
      outcome = .truth + stats::rnorm(num_samples, sd = std_dev)
    ) %>%
    dplyr::relocate(outcome)
}
# ------------------------------------------------------------------------------
#' @export
#' @rdname sim_classification
sim_noise <- function(num_samples, num_vars, cov_type = "exchangeable",
                      outcome = "none", num_classes = 2, cov_param = 0) {
  # Simulate `num_vars` (possibly correlated) standard-normal noise columns,
  # optionally with an independent classification or regression outcome.
  cov_type <- rlang::arg_match0(cov_type, c("exchangeable", "toeplitz"),
    arg_nm = "cov_type"
  )
  outcome <- rlang::arg_match0(outcome, c("none", "classification", "regression"),
    arg_nm = "outcome"
  )
  # Build the covariance matrix for the requested structure.
  if (cov_type == "exchangeable") {
    # Unit variances with a constant off-diagonal covariance of `cov_param`.
    sigma <- matrix(cov_param, ncol = num_vars, nrow = num_vars)
    diag(sigma) <- 1
  } else {
    # Toeplitz: covariance decays as cov_param^|i - j|.
    sigma <- stats::toeplitz(cov_param^(seq(0, num_vars - 1, by = 1)))
  }
  noise <- MASS::mvrnorm(num_samples, mu = rep(0, num_vars), Sigma = sigma)
  colnames(noise) <- names0(num_vars, "noise_")
  res <- tibble::as_tibble(noise)
  if (outcome == "classification") {
    if (num_classes <= 0) {
      rlang::abort("'num_classes' should be a positive integer.")
    }
    # Classes are sampled independently of the noise columns.
    lvls <- names0(num_classes, "class_")
    res <-
      res %>%
      dplyr::mutate(
        class = sample(lvls, num_samples, replace = TRUE),
        class = factor(class, levels = lvls)
      ) %>%
      dplyr::relocate(class)
  } else if (outcome == "regression") {
    # An independent standard-normal outcome column.
    res <-
      res %>%
      dplyr::mutate(outcome = stats::rnorm(num_samples)) %>%
      dplyr::relocate(outcome)
  }
  res
}
# ------------------------------------------------------------------------------
#' @export
#' @rdname sim_classification
sim_logistic <- function(num_samples, eqn, correlation = 0, keep_truth = FALSE) {
  # Simulate a two-class outcome ("one"/"two") from a logistic model. `eqn`
  # is an expression or one-sided formula in the predictors A and B only;
  # A and B are bivariate normal with unit variances and the supplied
  # correlation.
  var_cov <- matrix(c(1, correlation, correlation, 1), 2, 2)
  lp_expr <- rlang::get_expr(eqn)
  check_equations(lp_expr)
  res <-
    data.frame(MASS::mvrnorm(n = num_samples, c(0, 0), var_cov)) %>%
    stats::setNames(LETTERS[1:2]) %>%
    dplyr::mutate(
      .linear_pred = rlang::eval_tidy(lp_expr, data = .),
      .linear_pred = as.numeric(.linear_pred),
      # Event probability via the logistic inverse link.
      .truth = stats::binomial()$linkinv(.linear_pred),
      .rand = stats::runif(num_samples),
      class = ifelse(.rand <= .truth, "one", "two"),
      class = factor(class, levels = c("one", "two"))
    ) %>%
    dplyr::select(-.rand) %>%
    tibble::as_tibble()
  if (!keep_truth) {
    res <- res %>% dplyr::select(-.truth, -.linear_pred)
  }
  res
}
# ------------------------------------------------------------------------------
#' @export
#' @rdname sim_classification
sim_multinomial <- function(num_samples, eqn_1, eqn_2, eqn_3, correlation = 0, keep_truth = FALSE) {
  # Simulate a three-class outcome ("one", "two", "three"). Each eqn_* gives
  # the log-scale, unnormalized preference for one class as a function of
  # the bivariate-normal predictors A and B.
  var_cov <- matrix(c(1, correlation, correlation, 1), 2, 2)
  class_exprs <- list(
    rlang::get_expr(eqn_1),
    rlang::get_expr(eqn_2),
    rlang::get_expr(eqn_3)
  )
  # Each equation may only reference A and B.
  purrr::map_lgl(class_exprs, check_equations)
  res <-
    data.frame(MASS::mvrnorm(n = num_samples, c(0, 0), var_cov)) %>%
    stats::setNames(LETTERS[1:2]) %>%
    dplyr::mutate(
      .formula_1 = rlang::eval_tidy(class_exprs[[1]], data = .),
      .formula_2 = rlang::eval_tidy(class_exprs[[2]], data = .),
      .formula_3 = rlang::eval_tidy(class_exprs[[3]], data = .),
      # Exponentiate so each row can be normalized into probabilities.
      dplyr::across(c(dplyr::starts_with(".formula_")), ~ exp(.x))
    )
  raw <- as.matrix(dplyr::select(res, dplyr::starts_with(".formula_")))
  # Normalize each row to sum to one, then draw one class per row.
  probs <- t(apply(raw, 1, function(x) x / sum(x)))
  pick_class <- function(p) which.max(stats::rmultinom(1, 1, p))
  chosen <- apply(probs, 1, pick_class)
  lvls <- c("one", "two", "three")
  res$class <- factor(lvls[chosen], levels = lvls)
  res <- res %>% dplyr::select(-dplyr::starts_with(".formula_"))
  if (keep_truth) {
    # Retain the per-class true probabilities as .truth_* columns.
    colnames(probs) <- paste0(".truth_", lvls)
    res <- dplyr::bind_cols(res, tibble::as_tibble(probs))
  }
  tibble::as_tibble(res)
}
# ------------------------------------------------------------------------------
check_equations <- function(x, expected = LETTERS[1:2]) {
  # Ensure an expression references no symbols beyond the allowed set
  # (A and B by default); aborts otherwise. Returns TRUE invisibly.
  extra <- setdiff(sort(all.vars(x)), expected)
  if (length(extra) > 0) {
    rlang::abort("The model equations should only use variables/objects `A` and `B`")
  }
  invisible(TRUE)
}
names0 <- function(num, prefix = "x") {
  # Generate names `prefix` + zero-padded index (width set by `num`),
  # e.g. names0(10, "v") gives "v01" ... "v10". Errors when num < 1.
  if (num < 1) {
    rlang::abort("`num` should be > 0")
  }
  idx <- gsub(" ", "0", format(1:num))
  paste0(prefix, idx)
}
modeldata/R/drinks.R 0000644 0001762 0000144 00000001020 14534612370 014020 0 ustar ligges users #' Sample time series data
#'
#' @details Drink sales. The exact name of the series from FRED is:
#' "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices
#' Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales"
#'
#' @name drinks
#' @aliases drinks
#' @docType data
#' @return \item{drinks}{a tibble}
#'
#' @source The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN
#'
#' @keywords datasets
#' @examples
#' data(drinks)
#' str(drinks)
NULL
modeldata/R/ad_data.R 0000644 0001762 0000144 00000002762 14634543650 014126 0 ustar ligges users #' Alzheimer's disease data
#'
#' @details
#' Craig-Schapiro et al. (2011) describe a clinical study of 333 patients,
#' including some with mild (but well-characterized) cognitive impairment as
#' well as healthy individuals. CSF samples were taken from all subjects. The
#' goal of the study was to determine if subjects in the early states of
#' impairment could be differentiated from cognitively healthy individuals.
#' Data collected on each subject included:
#' \itemize{
#' \item Demographic characteristics such as age and gender
#' \item Apolipoprotein E genotype
#' \item Protein measurements of Abeta, Tau, and a phosphorylated version of Tau (called pTau)
#' \item Protein measurements of 124 exploratory biomarkers, and
#' \item Clinical dementia scores
#' }
#'
#' For these analyses, we have converted the scores to two classes: impaired
#' and healthy. The goal of this analysis is to create classification models
#' using the demographic and assay data to predict which patients have early
#' stages of disease.
#'
#' @name ad_data
#' @aliases ad_data
#' @docType data
#' @return \item{ad_data}{a tibble}
#'
#' @source
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#'
#' Craig-Schapiro R, Kuhn M, Xiong C, Pickering EH, Liu J, Misko TP, et al.
#' (2011) Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for
#' Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE 6(4): e18850.
#'
#'
#' @keywords datasets
#' @examples
#' data(ad_data)
#' str(ad_data)
NULL
modeldata/R/concrete.R 0000644 0001762 0000144 00000001217 14534612370 014340 0 ustar ligges users #' Compressive strength of concrete mixtures
#'
#' Yeh (2006) describes an aggregated data set for experimental designs used to
#' test the compressive strength of concrete mixtures. The data are used by
#' Kuhn and Johnson (2013).
#'
#'
#' @name concrete
#' @aliases concrete
#' @docType data
#' @return \item{concrete}{a tibble}
#' @keywords datasets
#' @source
#' Yeh I (2006). "Analysis of Strength of Concrete Using Design of Experiments
#' and Neural Networks." *Journal of Materials in Civil Engineering*, 18, 597-604.
#'
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#' @examples
#' data(concrete)
#' str(concrete)
NULL
modeldata/R/two_class_dat.R 0000644 0001762 0000144 00000001445 14634543650 015374 0 ustar ligges users #' Two class data
#'
#' @details There are artificial data with two predictors (`A` and `B`) and
#' a factor outcome variable (`Class`).
#'
#' @name two_class_dat
#' @aliases two_class_dat
#' @docType data
#' @return \item{two_class_dat}{a data frame}
#'
#' @keywords datasets
#' @examples
#' data(two_class_dat)
#' str(two_class_dat)
NULL
#' Two class predictions
#'
#' @details These data are a test set from a model built for two
#' classes ("Class1" and "Class2"). There are columns for the true
#' and predicted classes and column for the probabilities for each
#' class.
#'
#' @name two_class_example
#' @aliases two_class_example
#' @docType data
#' @return \item{two_class_example}{a data frame}
#'
#' @keywords datasets
#' @examples
#' data(two_class_example)
#' str(two_class_example)
NULL
modeldata/R/pd_speech.R 0000644 0001762 0000144 00000002430 14534612370 014466 0 ustar ligges users #' Parkinson's disease speech classification data set
#'
#' @details From the UCI ML archive, the description is "The data used in this
#' study were gathered from 188 patients with PD (107 men and 81 women) with
#' ages ranging from 33 to 87 (65.1 p/m 10.9) at the Department of Neurology
#' in Cerrahpaşa Faculty of Medicine, Istanbul University. The control group
#' consists of 64 healthy individuals (23 men and 41 women) with ages varying
#' between 41 and 82 (61.1 p/m 8.9). During the data collection process,
#' the microphone is set to 44.1 KHz and following the physician's examination,
#' the sustained phonation of the vowel `/a/` was collected from each subject
#' with three repetitions."
#'
#' The data here are averaged over the replicates.
#'
#' @name pd_speech
#' @aliases pd_speech
#' @docType data
#' @return \item{pd_speech}{a data frame}
#'
#' @source UCI ML repository (data) https://archive.ics.uci.edu/ml/datasets/Parkinson%27s+Disease+Classification#,
#'
#' Sakar et al (2019), "A comparative analysis of speech signal processing
#' algorithms for Parkinson’s disease classification and the use of the tunable
#' Q-factor wavelet transform", _Applied Soft Computing_, V74, pg 255-263.
#'
#' @keywords datasets
#' @examples
#' data(pd_speech)
#' str(pd_speech)
NULL
modeldata/R/leaf_id_flavia.R 0000644 0001762 0000144 00000007311 14534612370 015444 0 ustar ligges users #' Leaf identification data (Flavia)
#'
#' @description
#' Image analysis of leaves to predict species.
#'
#' @name leaf_id_flavia
#' @aliases leaf_id_flavia
#' @docType data
#' @return \item{leaf_id_flavia}{a data frame}
#'
#' @details
#' From the original manuscript: "The Flavia dataset contains 1907 leaf images.
#' There are 32 different species and each has 50-77 images. Scanners and
#' digital cameras are used to acquire the leaf images on a plain background.
#' The isolated leaf images contain blades only, without a petiole. These leaf
#' images are collected from the most common plants in Yangtze, Delta,
#' China. Those leaves were sampled on the campus of the Nanjing University and
#' the Sun Yat-Sen arboretum, Nanking, China."
#'
#' The reference below has details information on the features used for
#' prediction.
#'
#' Columns:
#' \itemize{
#' \item \code{species}: factor (32 levels)
#' \item \code{apex}: factor (9 levels)
#' \item \code{base}: factor (6 levels)
#' \item \code{shape}: factor (5 levels)
#' \item \code{denate_edge}: factor (levels: 'no' and 'yes')
#' \item \code{lobed_edge}: factor (levels: 'no' and 'yes')
#' \item \code{smooth_edge}: factor (levels: 'no' and 'yes')
#' \item \code{toothed_edge}: factor (levels: 'no' and 'yes')
#' \item \code{undulate_edge}: factor (levels: 'no' and 'yes')
#' \item \code{outlying_polar}: numeric
#' \item \code{skewed_polar}: numeric
#' \item \code{clumpy_polar}: numeric
#' \item \code{sparse_polar}: numeric
#' \item \code{striated_polar}: numeric
#' \item \code{convex_polar}: numeric
#' \item \code{skinny_polar}: numeric
#' \item \code{stringy_polar}: numeric
#' \item \code{monotonic_polar}: numeric
#' \item \code{outlying_contour}: numeric
#' \item \code{skewed_contour}: numeric
#' \item \code{clumpy_contour}: numeric
#' \item \code{sparse_contour}: numeric
#' \item \code{striated_contour}: numeric
#' \item \code{convex_contour}: numeric
#' \item \code{skinny_contour}: numeric
#' \item \code{stringy_contour}: numeric
#' \item \code{monotonic_contour}: numeric
#' \item \code{num_max_ponits}: numeric
#' \item \code{num_min_points}: numeric
#' \item \code{diameter}: numeric
#' \item \code{area}: numeric
#' \item \code{perimeter}: numeric
#' \item \code{physiological_length}: numeric
#' \item \code{physiological_width}: numeric
#' \item \code{aspect_ratio}: numeric
#' \item \code{rectangularity}: numeric
#' \item \code{circularity}: numeric
#' \item \code{compactness}: numeric
#' \item \code{narrow_factor}: numeric
#' \item \code{perimeter_ratio_diameter}: numeric
#' \item \code{perimeter_ratio_length}: numeric
#' \item \code{perimeter_ratio_lw}: numeric
#' \item \code{num_convex_points}: numeric
#' \item \code{perimeter_convexity}: numeric
#' \item \code{area_convexity}: numeric
#' \item \code{area_ratio_convexity}: numeric
#' \item \code{equivalent_diameter}: numeric
#' \item \code{eccentriciry}: numeric
#' \item \code{contrast}: numeric
#' \item \code{correlation_texture}: numeric
#' \item \code{inverse_difference_moments}: numeric
#' \item \code{entropy}: numeric
#' \item \code{mean_red_val}: numeric
#' \item \code{mean_green_val}: numeric
#' \item \code{mean_blue_val}: numeric
#' \item \code{std_red_val}: numeric
#' \item \code{std_green_val}: numeric
#' \item \code{std_blue_val}: numeric
#' \item \code{correlation}: numeric
#' }
#' @source
#' Lakshika, Jayani PG, and Thiyanga S. Talagala. "Computer-aided interpretable
#' features for leaf image classification." _arXiv preprint_ arXiv:2106.08077
#' (2021).
#'
#' \url{https://github.com/SMART-Research/leaffeatures_paper}
#'
#' @examples
#' data(leaf_id_flavia)
#' str(leaf_id_flavia)
#'
NULL
modeldata/R/deliveries.R 0000644 0001762 0000144 00000001454 14634543650 014701 0 ustar ligges users #' Food Delivery Time Data
#'
#' @details
#' These data are from a study of food delivery times in minutes (i.e., the time from the
#' initial order to receiving the food) for a single restaurant. The data
#' contains 10,012 orders from a specific restaurant. The predictors include:
#' \itemize{
#' \item The time, in decimal hours, of the order.
#' \item The day of the week for the order.
#' \item The approximate distance in miles between the restaurant and the delivery
#' location.
#' \item A set of 27 predictors that count the number of distinct menu items
#' in the order.
#' }
#'
#' No times are censored.
#'
#' @name deliveries
#' @aliases deliveries
#' @docType data
#' @return \item{deliveries}{a tibble}
#'
#' @keywords datasets
#' @examples
#' data(deliveries)
#' str(deliveries)
NULL
modeldata/R/covers.R 0000644 0001762 0000144 00000001042 14534612370 014033 0 ustar ligges users #' Raw cover type data
#'
#' These data are raw data describing different types of forest cover-types
#' from the UCI Machine Learning Database (see link below). There is one
#' column in the data that has a few different pieces of textual
#' information (of variable lengths).
#'
#' @name covers
#' @aliases covers
#' @docType data
#' @return \item{covers}{a data frame}
#'
#' @source https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
#'
#' @keywords datasets
#' @examples
#' data(covers)
#' str(covers)
NULL
modeldata/R/cells.R 0000644 0001762 0000144 00000002651 14534612370 013643 0 ustar ligges users #' Cell body segmentation
#'
#' Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a
#' high content screen were well segmented. The data consists of 119 imaging
#' measurements on 2019. The original analysis used 1009 for training and 1010
#' as a test set (see the column called \code{case}).
#'
#' The outcome class is contained in a factor variable called \code{class} with
#' levels "PS" for poorly segmented and "WS" for well segmented.
#'
#' The raw data used in the paper can be found at the Biomedcentral website.
#' The version
#' contained in \code{cells} is modified. First, several discrete
#' versions of some of the predictors (with the suffix "Status") were removed.
#' Second, there are several skewed predictors with minimum values of zero
#' (that would benefit from some transformation, such as the log). A constant
#' value of 1 was added to these fields: \code{avg_inten_ch_2},
#' \code{fiber_align_2_ch_3}, \code{fiber_align_2_ch_4}, \code{spot_fiber_count_ch_4} and
#' \code{total_inten_ch_2}.
#'
#' @name cells
#' @docType data
#' @return \item{cells}{a tibble}
#' @source Hill, LaPan, Li and Haney (2007). Impact of image segmentation on
#' high-content screening data quality for SK-BR-3 cells, \emph{BMC
#' Bioinformatics}, Vol. 8, pg. 340,
#' \url{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-340}.
#' @keywords datasets
#' @examples
#' data(cells)
#' str(cells)
NULL
modeldata/R/grants.R 0000644 0001762 0000144 00000004401 14534612370 014032 0 ustar ligges users #' Grant acceptance data
#'
#' A data set related to the success or failure of academic grants.
#'
#' The data are discussed in Kuhn and Johnson (2013):
#'
#' "These data are from a 2011 Kaggle competition sponsored by the University
#' of Melbourne where there was interest in predicting whether or not a grant
#' application would be accepted. Since public funding of grants had decreased
#' over time, triaging grant applications based on their likelihood of success
#' could be important for estimating the amount of potential funding to the
#' university. In addition to predicting grant success, the university sought
#' to understand factors that were important in predicting success."
#'
#' The data range from 2005 to 2008, and the data spending strategy was
#' driven by the date of the grant. Kuhn and Johnson (2013) describe:
#'
#' "The compromise taken here is to build models on the pre-2008 data and
#' tune them by evaluating a random sample of 2,075 grants from 2008. Once the
#' optimal parameters are determined, the final model is built using these
#' parameters and the entire training set (i.e., the data prior to 2008 and the
#' additional 2,075 grants). A small holdout set of 518 grants from 2008 will
#' be used to ensure that no gross methodology errors occur from repeatedly
#' evaluating the 2008 data during model tuning. In the text, this set of
#' samples is called the 2008 holdout set. This small set of year 2008
#' grants will be referred to as the test set and will not be evaluated until
#' a set of candidate models are identified."
#'
#' To emulate this, `grants_other` contains the training (pre-2008, n = 6,633)
#' and holdout/validation data (2008, n = 1,557). `grants_test` has 518 grant
#' samples from 2008. The object `grants_2008` is an integer vector that can
#' be used to separate the modeling with the holdout/validation sets.
#'
#'
#' @name grants
#' @aliases grants_other grants_test grants_2008
#' @docType data
#' @return \item{grants_other,grants_test,grants_2008}{two tibbles and an integer
#' vector of data points used for training}
#' @source Kuhn and Johnson (2013). _Applied Predictive Modeling_. Springer.
#' @keywords datasets
#' @examples
#' data(grants)
#' str(grants_other)
#' str(grants_test)
#' str(grants_2008)
NULL
modeldata/R/modeldata-package.R 0000644 0001762 0000144 00000001241 14534612370 016056 0 ustar ligges users #' @keywords internal
"_PACKAGE"
## usethis namespace: start
## usethis namespace: end
NULL
# Register variable names that appear in non-standard evaluation (the
# simulation documentation/examples) so `R CMD check` does not flag them
# as undefined global variables.
utils::globalVariables(
  c(
    ".",
    ".linear_pred",
    ".rand",
    ".truth",
    "linear_pred",
    "non_linear_1",
    "non_linear_2",
    "non_linear_3",
    "outcome",
    # predictor_01 ... predictor_20, zero-padded to two digits
    sprintf("predictor_%02d", 1:20),
    "rand",
    "true_prob",
    "two_factor_1",
    "two_factor_2"
  )
)
modeldata/R/parabolic.R 0000644 0001762 0000144 00000000520 14534612370 014466 0 ustar ligges users #' Parabolic class boundary data
#'
#' @details These data were simulated. There are two correlated predictors and
#' two classes in the factor outcome.
#'
#' @name parabolic
#' @aliases parabolic
#' @docType data
#' @return \item{parabolic}{a data frame}
#'
#' @keywords datasets
#' @examples
#' data(parabolic)
#' str(parabolic)
NULL
modeldata/R/penguins.R 0000644 0001762 0000144 00000001324 14534612370 014365 0 ustar ligges users #' Palmer Station penguin data
#'
#' A data set from Gorman, Williams, and Fraser (2014) containing measurements
#' from different types of penguins. This version of the data was retrieved from
#' Allison Horst's `palmerpenguins` package on 2020-06-22.
#'
#' @name penguins
#' @aliases penguins
#' @docType data
#' @return \item{penguins}{a tibble}
#' @source Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism
#' and Environmental Variability within a Community of Antarctic Penguins
#' (_Genus Pygoscelis_). PLoS ONE 9(3): e90081.
#' \doi{10.1371/journal.pone.0090081}
#'
#' \url{https://github.com/allisonhorst/palmerpenguins}
#' @keywords datasets
#' @examples
#' data(penguins)
#' str(penguins)
NULL
modeldata/R/steroidogenic_toxicity.R 0000644 0001762 0000144 00000003377 14534612370 017341 0 ustar ligges users #' Predicting steroidogenic toxicity with assay data
#'
#' @description
#' A set of _in vitro_ assays are used to quantify the risk of reproductive
#' toxicity via the disruption of steroidogenic pathways.
#'
#' @name steroidogenic_toxicity
#' @aliases steroidogenic_toxicity
#' @docType data
#' @return A tibble with columns
#' - `class`: factor (levels: 'toxic' and 'nontoxic')
#' - `cyp_11a1`: numeric
#' - `cyp_11b1`: numeric
#' - `cyp_11b2`: numeric
#' - `cyp_17a1`: numeric
#' - `cyp_19a1`: numeric
#' - `cyp_21a1`: numeric
#' - `hsd3b2`: numeric
#' - `star`: numeric
#' - `progesterone`: numeric
#' - `testosterone`: numeric
#' - `dhea`: numeric
#' - `cortisol`: numeric
#' @details
#' H295R cells were used to measure the effect with two sets of assay results.
#' The first includes a set of protein measurements on: cytochrome P450 enzymes
#' ("cyp"s), STAR, and 3BHSD2. The second include hormone measurements for
#' DHEA, progesterone, testosterone, and cortisol.
#'
#' Columns:
#' \itemize{
#' \item \code{class}: factor (levels: 'toxic' and 'nontoxic')
#' \item \code{cyp_11a1}: numeric
#' \item \code{cyp_11b1}: numeric
#' \item \code{cyp_11b2}: numeric
#' \item \code{cyp_17a1}: numeric
#' \item \code{cyp_19a1}: numeric
#' \item \code{cyp_21a1}: numeric
#' \item \code{hsd3b2}: numeric
#' \item \code{star}: numeric
#' \item \code{progesterone}: numeric
#' \item \code{testosterone}: numeric
#' \item \code{dhea}: numeric
#' \item \code{cortisol}: numeric
#' }
#'
#' @source
#' Maglich, J. M., Kuhn, M., Chapin, R. E., & Pletcher, M. T. (2014). More than
#' just hormones: H295R cells as predictors of reproductive toxicity.
#' _Reproductive Toxicology_, 45, 77-86.
#'
#' @examples
#' data(steroidogenic_toxicity)
#' str(steroidogenic_toxicity)
#'
NULL
modeldata/R/fine_foods.R 0000644 0001762 0000144 00000002462 14534612370 014654 0 ustar ligges users #' Fine foods example data
#'
#' @details
#' These data are from Amazon, who describe it as "This dataset consists of
#' reviews of fine foods from amazon. The data span a period of more than 10
#' years, including all ~500,000 reviews up to October 2012. Reviews include
#' product and user information, ratings, and a plaintext review."
#'
#' A subset of the data are contained here and are split into a training and
#' test set. The training set sampled 10 products and retained all of their
#' individual reviews. Since the reviews within these products are correlated,
#' we recommend resampling the data using a leave-one-product-out approach. The
#' test set sampled 500 products that were not included in the training set
#' and selected a single review at random for each.
#'
#' There is a column for the product, a column for the text of the review, and
#' a factor column for a class variable. The outcome is whether the reviewer
#' gave the product a 5-star rating or not.
#'
#' @name small_fine_foods
#' @aliases small_fine_foods training_data testing_data
#' @docType data
#' @return \item{training_data,testing_data}{tibbles}
#'
#' @source https://snap.stanford.edu/data/web-FineFoods.html
#'
#'
#' @keywords datasets
#' @examples
#' data(small_fine_foods)
#' str(training_data)
#' str(testing_data)
NULL
modeldata/R/cat_adoption.R 0000644 0001762 0000144 00000002506 14634545620 015210 0 ustar ligges users #' Cat Adoption
#'
#' @description
#' A subset of the cats at the animal shelter in Long Beach, California, USA.
#'
#' @return tibble
#' @aliases cat_adoption
#' @name cat_adoption
#' @docType data
#' @details
#'
#' A data frame with 2257 rows and 19 columns:
#' \describe{
#' \item{time}{The time the cat spent at the shelter.}
#' \item{event}{The event of interest is the cat being homed or returned to
#' its original location (i.e., owner or community). The non-event is the cat
#' being transferred to another shelter or dying. Zero indicates a non-event
#' (censored), and one corresponds to the event occurring.}
#' \item{sex}{The sex of the cat.}
#' \item{neutered}{Whether the cat is neutered.}
#' \item{intake_condition}{The intake condition of the cat.}
#' \item{intake_type}{The type of intake.}
#' \item{latitude}{Latitude of the intersection/cross street of intake or capture.}
#' \item{longitude}{Longitude of the intersection/cross street of intake or capture.}
#' \item{black,brown,brown_tabby,calico,cream,gray,gray_tabby,orange,orange_tabby,tan,tortie,white}{Indicators for the color/pattern of the cat's fur.}
#' }
#' @source
#' City of Long Beach (California) open data portal, "Animal Shelter Intakes
#' and Outcomes" data set:
#' \url{https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/}
#' accessed on 2024-06-17
#'
#' @examples
#' str(cat_adoption)
#' @keywords datasets
NULL
modeldata/R/hpc_data.R 0000644 0001762 0000144 00000001075 14534612370 014303 0 ustar ligges users #' High-performance computing system data
#'
#' Kuhn and Johnson (2013) describe a data set where characteristics of unix
#' jobs were used to classify their completion times as either very fast
#' (1 min or less, `VF`), fast (1–5 min, `F`), moderate (5–30 min, `M`), or
#' long (greater than 30 min, `L`).
#'
#'
#' @name hpc_data
#' @aliases hpc_data
#' @docType data
#' @return \item{hpc_data}{a tibble}
#' @keywords datasets
#' @source
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#' @examples
#'
#' data(hpc_data)
#' str(hpc_data)
NULL
modeldata/R/ischemic_stroke.R 0000644 0001762 0000144 00000006250 14534612370 015713 0 ustar ligges users #' Clinical data used to predict ischemic stroke
#'
#' @description
#' A data set to predict a binary outcome using imaging and patient data.
#'
#' @name ischemic_stroke
#' @aliases ischemic_stroke
#' @docType data
#' @return \item{ischemic_stroke}{a tibble}
#'
#' @details
#' These data were gathered to predict patient risk for ischemic stroke. A
#' historical set of patients with a range of carotid artery blockages were
#' selected. The data consisted of 126 patients, 44 of which had blockages
#' greater than 70\%. All patients had undergone Computed Tomography Angiography
#' (CTA) to generate a detailed three-dimensional visualization and
#' characterization of the blockage. These images were then analyzed in order to
#' compute several features related to the disease, including: percent stenosis,
#' arterial wall thickness, and tissue characteristics such as lipid-rich
#' necrotic core and calcification.
#'
#' The group of patients in this study also had follow-up information on
#' whether or not a stroke occurred at a subsequent point in time. The data for
#' each patient also included commonly collected clinical characteristics for
#' risk of stroke such as whether or not the patient had atrial fibrillation,
#' coronary artery disease, and a history of smoking. Demographics of gender and
#' age were included as well. These readily available risk factors can be
#' thought of as another potentially useful predictor set that can be evaluated.
#' In fact, this set of predictors should be evaluated first to assess their
#' ability to predict stroke since these predictors are easy to collect, are
#' acquired at patient presentation, and do not require an expensive imaging
#' technique.
#'
#' Columns:
#' \itemize{
#' \item \code{stroke}: factor (levels: 'yes' and 'no')
#' \item \code{nascet_scale}: numeric
#' \item \code{calc_vol}: numeric
#' \item \code{calc_vol_prop}: numeric
#' \item \code{matx_vol}: numeric
#' \item \code{matx_vol_prop}: numeric
#' \item \code{lrnc_vol}: numeric
#' \item \code{lrnc_vol_prop}: numeric
#' \item \code{max_calc_area}: numeric
#' \item \code{max_calc_area_prop}: numeric
#' \item \code{max_dilation_by_area}: numeric
#' \item \code{max_matx_area}: numeric
#' \item \code{max_matx_area_prop}: numeric
#' \item \code{max_lrnc_area}: numeric
#' \item \code{max_lrnc_area_prop}: numeric
#' \item \code{max_max_wall_thickness}: numeric
#' \item \code{max_remodeling_ratio}: numeric
#' \item \code{max_stenosis_by_area}: numeric
#' \item \code{max_wall_area}: numeric
#' \item \code{wall_vol}: numeric
#' \item \code{max_stenosis_by_diameter}: numeric
#' \item \code{age}: integer
#' \item \code{male}: integer
#' \item \code{smoking_history}: integer
#' \item \code{atrial_fibrillation}: integer
#' \item \code{coronary_artery_disease}: integer
#' \item \code{diabetes_history}: integer
#' \item \code{hypercholesterolemia_history}: integer
#' \item \code{hypertension_history}: integer
#' }
#' @source
#' Kuhn, Max, and Kjell Johnson. _Feature Engineering and Selection: A Practical
#' Approach for Predictive Models_. Chapman and Hall/CRC, 2019.
#'
#' @examples
#' data(ischemic_stroke)
#' str(ischemic_stroke)
#'
NULL
modeldata/R/check_times.R 0000644 0001762 0000144 00000004451 14534612370 015017 0 ustar ligges users #' Execution time data
#'
#' These data were collected from the CRAN web page for 13,626 R
#' packages. The time to complete the standard package checking
#' routine was collected In some cases, the package checking
#' process is stopped due to errors and these data are treated as
#' censored. It is less than 1 percent.
#'
#' As predictors, the associated package source code were
#' downloaded and parsed to create predictors, including
#'
#' * `authors`: The number of authors in the author field.
#' * `imports`: The number of imported packages.
#' * `suggests`: The number of packages suggested.
#' * `depends`: The number of hard dependencies.
#' * `Roxygen`: a binary indicator for whether Roxygen was used
#' for documentation.
#' * `gh`: a binary indicator for whether the URL field contained
#' a GitHub link.
#' * `rforge`: a binary indicator for whether the URL field
#' contained a link to R-forge.
#' * `descr`: The number of characters (or, in some cases, bytes)
#' in the description field.
#' * `r_count`: The number of R files in the R directory.
#' * `r_size`: The total disk size of the R files.
#' * `ns_import`: Estimated number of imported functions or methods.
#' * `ns_export`: Estimated number of exported functions or methods.
#' * `s3_methods`: Estimated number of S3 methods.
#' * `s4_methods`: Estimated number of S4 methods.
#' * `doc_count`: How many Rmd or Rnw files in the vignettes
#' directory.
#' * `doc_size`: The disk size of the Rmd or Rnw files.
#' * `src_count`: The number of files in the `src` directory.
#' * `src_size`: The size on disk of files in the `src` directory.
#' * `data_count`: The number of files in the `data` directory.
#' * `data_size`: The size on disk of files in the `data` directory.
#' * `testthat_count`: The number of files in the `testthat`
#' directory.
#' * `testthat_size`: The size on disk of files in the `testthat`
#' directory.
#' * `check_time`: The time (in seconds) to run `R CMD check`
#' using the `r-devel-windows-ix86+x86_64` flavor.
#' * `status`: An indicator for whether the tests completed.
#'
#' Data were collected on 2019-01-20.
#' @name check_times
#' @aliases check_times
#' @docType data
#' @return \item{check_times}{a data frame}
#'
#' @source CRAN
#'
#' @keywords datasets
#' @examples
#' data(check_times)
#' str(check_times)
NULL
modeldata/R/chem_proc_yield.R 0000644 0001762 0000144 00000003623 14534612370 015666 0 ustar ligges users #' Chemical manufacturing process data set
#'
#' @description
#' A data set that models yield as a function of biological material predictors
#' and chemical structure predictors.
#'
#' @name chem_proc_yield
#' @aliases chem_proc_yield
#' @docType data
#' @return \item{chem_proc_yield}{a tibble}
#'
#' @details
#' This data set contains information about a chemical manufacturing
#' process, in which the goal is to understand the relationship between
#' the process and the resulting final product yield. Raw material in
#' this process is put through a sequence of 27 steps to generate the
#' final pharmaceutical product. The starting material is generated from
#' a biological unit and has a range of quality and characteristics. The
#' objective in this project was to develop a model to predict percent
#' yield of the manufacturing process. The data set consisted of 177
#' samples of biological material for which 57 characteristics were
#' measured. Of the 57 characteristics, there were 12 measurements of
#' the biological starting material, and 45 measurements of the
#' manufacturing process. The process variables included measurements
#' such as temperature, drying time, washing time, and concentrations of
#' by-products at various steps. Some of the process measurements can
#' be controlled, while others are observed. Predictors are continuous,
#' count, categorical; some are correlated, and some contain missing
#' values. Samples are not independent because sets of samples come from
#' the same batch of biological starting material.
#'
#' Columns:
#' \itemize{
#' \item \code{yield}: numeric
#' \item \code{bio_material_01} - \code{bio_material_12}: numeric
#' \item \code{man_proc_01} - \code{man_proc_45}: numeric
#' }
#' @source
#' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York:
#' Springer, 2013.
#'
#' @examples
#' data(chem_proc_yield)
#' str(chem_proc_yield)
#'
NULL
modeldata/R/stackoverflow.R 0000644 0001762 0000144 00000001173 14634546373 015442 0 ustar ligges users #' Annual Stack Overflow Developer Survey Data
#'
#' @details These data are a collection of 5,594 data points collected on
#' developers. These data could be used to try to predict who works remotely
#' (as used in the source listed below).
#'
#' @name stackoverflow
#' @aliases stackoverflow
#' @docType data
#' @return \item{stackoverflow}{a tibble}
#'
#' @source
#' Julia Silge, _Supervised Machine Learning Case Studies in R_
#'
#' `https://supervised-ml-course.netlify.com/chapter2`
#'
#' Raw data: `https://insights.stackoverflow.com/survey/`
#' @keywords datasets
#' @examples
#' data(stackoverflow)
#' str(stackoverflow)
NULL
modeldata/R/reexports.R 0000644 0001762 0000144 00000000061 14534612370 014565 0 ustar ligges users #' @importFrom dplyr %>%
#' @export
dplyr::`%>%`
modeldata/R/oils.R 0000644 0001762 0000144 00000001344 14534612370 013505 0 ustar ligges users #' Fatty acid composition of commercial oils
#'
#' Fatty acid concentrations of commercial oils were measured using gas
#' chromatography. The data is used to predict the type of oil. Note that
#' only the known oils are in the data set. Also, the authors state that there
#' are 95 samples of known oils. However, we count 96 in Table 1 (pgs. 33-35).
#'
#'
#' @name oils
#' @aliases oils
#' @docType data
#' @return \item{oils}{a tibble}
#' @source Brodnjak-Voncina et al. (2005). Multivariate data analysis in
#' classification of vegetable oils characterized by the content of fatty
#' acids, \emph{Chemometrics and Intelligent Laboratory Systems}, Vol.
#' 75:31-45.
#' @keywords datasets
#' @examples
#' data(oils)
#' str(oils)
NULL
modeldata/R/Chicago.R 0000644 0001762 0000144 00000002114 14534612370 014070 0 ustar ligges users #' Chicago ridership data
#'
#' @details These data are from Kuhn and Johnson (2020) and contain an
#' _abbreviated_ training set for modeling the number of people (in thousands)
#' who enter the Clark and Lake L station.
#'
#' The `date` column corresponds to the current date. The columns with station
#' names (`Austin` through `California`) are a _sample_ of the columns used in
#' the original analysis (for file size reasons). These are 14 day lag
#' variables (i.e. `date - 14 days`). There are columns related to weather and
#' sports team schedules.
#'
#' The station at 35th and Archer is contained in the column `Archer_35th` to
#' make it a valid R column name.
#'
#'
#' @name Chicago
#' @aliases Chicago stations
#' @docType data
#' @return \item{Chicago}{a tibble} \item{stations}{a vector of station names}
#'
#' @source Kuhn and Johnson (2020), _Feature Engineering and Selection_,
#' Chapman and Hall/CRC. \url{https://bookdown.org/max/FES/} and
#' \url{https://github.com/topepo/FES}
#'
#'
#' @keywords datasets
#' @examples
#' data(Chicago)
#' str(Chicago)
#' stations
NULL
modeldata/R/biomass.R 0000644 0001762 0000144 00000001306 14534612370 014172 0 ustar ligges users #' Biomass data
#'
#' Ghugare et al (2014) contains a data set where different biomass fuels are
#' characterized by the amount of certain molecules (carbon, hydrogen, oxygen,
#' nitrogen, and sulfur) and the corresponding higher heating value (HHV).
#' These data are from their Table S.2 of the Supplementary Materials
#'
#' @name biomass
#' @aliases biomass
#' @docType data
#' @return \item{biomass}{a data frame}
#'
#' @source Ghugare, S. B., Tiwary, S., Elangovan, V., and Tambe, S. S. (2013).
#' Prediction of Higher Heating Value of Solid Biomass Fuels Using Artificial
#' Intelligence Formalisms. *BioEnergy Research*, 1-12.
#'
#' @keywords datasets
#' @examples
#' data(biomass)
#' str(biomass)
NULL
modeldata/R/crickets.R 0000644 0001762 0000144 00000001427 14634543650 014355 0 ustar ligges users #' Rates of Cricket Chirps
#'
#' These data are from McDonald (2009), by way of Mangiafico (2015), on
#' the relationship between the ambient temperature and the rate of cricket
#' chirps per minute. Data were collected for two species of the genus _Oecanthus_: _O. exclamationis_
#' and _O. niveus_. The data are contained in a data frame called `crickets` with
#' a total of 31 data points.
#'
#' @name crickets
#' @aliases crickets
#' @docType data
#' @return \item{crickets}{a tibble}
#' @source Mangiafico, S. 2015. "An R Companion for the Handbook of Biological
#' Statistics." \url{https://rcompanion.org/handbook/}.
#'
#' McDonald, J. 2009. _Handbook of Biological Statistics_. Sparky House Publishing.
#' @keywords datasets
#' @examples
#' data(crickets)
#' str(crickets)
NULL
modeldata/R/sacremento.R 0000644 0001762 0000144 00000001323 14534612370 014674 0 ustar ligges users #' Sacramento CA home prices
#'
#' This data frame contains house and sale price data for 932 homes in
#' Sacramento CA. The original data were obtained from the website for the
#' SpatialKey software. From their website: "The Sacramento real estate
#' transactions file is a list of 985 real estate transactions in the
#' Sacramento area reported over a five-day period, as reported by the
#' Sacramento Bee." Google was used to fill in missing/incorrect data.
#'
#'
#' @name Sacramento
#' @docType data
#' @return \item{Sacramento}{a tibble}
#' @source SpatialKey website:
#' \url{https://support.spatialkey.com/spatialkey-sample-csv-data/}
#' @keywords datasets
#' @examples
#' data(Sacramento)
#' str(Sacramento)
NULL
modeldata/R/lending_club.R 0000644 0001762 0000144 00000001350 14534612370 015161 0 ustar ligges users #' Loan data
#'
#' @details These data were downloaded from the Lending Club
#' access site (see below) and are from the first quarter of 2016.
#' A subset of the rows and variables are included here. The
#' outcome is in the variable `Class` and is either "good" (meaning
#' that the loan was fully paid back or currently on-time) or "bad"
#' (charged off, defaulted, of 21-120 days late). A data dictionary
#' can be found on the source website.
#'
#' @name lending_club
#' @aliases lending_club
#' @docType data
#' @return \item{lending_club}{a data frame}
#'
#' @source Lending Club Statistics https://www.lendingclub.com/info/download-data.action
#'
#' @keywords datasets
#' @examples
#' data(lending_club)
#' str(lending_club)
NULL
modeldata/R/hepatic_injury_qsar.R 0000644 0001762 0000144 00000003353 14534612370 016604 0 ustar ligges users #' Predicting hepatic injury from chemical information
#'
#' @description
#' A quantitative structure-activity relationship (QSAR) data set to predict
#' when a molecule has risk associated with liver function.
#'
#' @name hepatic_injury_qsar
#' @aliases hepatic_injury_qsar
#' @docType data
#' @return \item{hepatic_injury_qsar}{a tibble}
#'
#' @details
#' This data set was used to develop a model for predicting compounds'
#' probability of causing hepatic injury (i.e. liver damage). This data set
#' consisted of 281 unique compounds; 376 predictors were measured or computed
#' for each. The response was categorical (either "none", "mild", or "severe"),
#' and was highly unbalanced.
#'
#' This kind of response often occurs in pharmaceutical data because companies
#' steer away from creating molecules that have undesirable characteristics.
#' Therefore, well-behaved molecules often greatly outnumber undesirable
#' molecules. The predictors consisted of measurements from 184 biological
#' screens and 192 chemical feature predictors. The biological predictors
#' represent activity for each screen and take values between 0 and 10 with a
#' mode of 4. The chemical feature predictors represent counts of important
#' sub-structures as well as measures of physical properties that are thought to
#' be associated with hepatic injury.
#'
#' Columns:
#' \itemize{
#' \item \code{class}: ordered and factor (levels: 'none', 'mild', and 'severe')
#' \item \code{bio_assay_001} - \code{bio_assay_184}: numeric
#' \item \code{chem_fp_001} - \code{chem_fp_192}: numeric
#' }
#' @source
#' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York:
#' Springer, 2013.
#'
#' @examples
#' data(hepatic_injury_qsar)
#' str(hepatic_injury_qsar)
#'
NULL
modeldata/R/bivariate.R 0000644 0001762 0000144 00000001233 14534612370 014502 0 ustar ligges users #' Example bivariate classification data
#'
#' @details These data are a simplified version of the segmentation data contained
#' in `caret`. There are three columns: `A` and `B` are predictors and the column
#' `Class` is a factor with levels "One" and "Two". There are three data sets:
#' one for training (n = 1009), validation (n = 300), and testing (n = 710).
#'
#' @name bivariate
#' @aliases bivariate_train bivariate_test bivariate_val
#' @docType data
#' @return \item{bivariate_train, bivariate_test, bivariate_val}{tibbles}
#'
#' @keywords datasets
#' @examples
#' data(bivariate)
#' str(bivariate_train)
#' str(bivariate_val)
#' str(bivariate_test)
NULL
modeldata/data/ 0000755 0001762 0000144 00000000000 14634545620 013126 5 ustar ligges users modeldata/data/cells.RData 0000644 0001762 0000144 00002617536 14534612370 015165 0 ustar ligges users U6sL!TDd<7|>gZZi-'E&K#K8ED'g..ȇqh~oyosi~ow}$MHe$7I?tlgj$f#t8Ii7#o07OGӑl4
a$;WH8'X,gIWw8c4>O?
bd8_h6VpGsm$ϑt5kl24hm>7ZmH?psH
Gg4:]&HsIz8}o:e$'X'fnm8<'X7 g_I?#HS:pXO:G$H>>?1Z]Mhq0ZͧMnh7F>M#7[5<dˑ>Z_M?#q_pϏtHxoH7bu4:0?Ip<ܼFhuOt8p~:mFp8Xky4?#ao6aq4d8&Hr
dWFc|ڔ6"D;\ؿRWySkK6ᄁ"R%~a$$hP0G]5o1g[?6ڱGw4sG#h9
'?h^D#p=\ߣpv
>G`rhu0G#|>p4?pIk?gFΣot7m;Fcjs4pv8??- F/ϣ`?1m7}[M02ZCI-ooo3GM3Zyd4><ܽ#fox8>o=|c_|4}/D'hl3ZGhu2Z]ۿ?ſ7/F+hm6l+pmchKpOu4rH
ÿpc2?Ց0>s8fIpsK6ZGH4bi4I'o|dM9?o6Ms>^?_|{x%J#M+3lW_ {B4cS XR^Kh@dk:HUD-7CD ڽ-M.:!"z!fy{?3GܑTp|7 `ݴmޗPLb܂#B;*yNGUGLOU /uv&jf3^ybk5Q}@P$,
.&diz<^}38-aYzk+L^^=M͟/ދ6V*._1%Վ͠jtWd
1guK` Vz5snxw>Wu^"ixÀ*;c#"eSOM}\|=kj_ryׯ%"jmA.Dڷ#2ǚ
ϫcS:4JAjG.0zTϞ~98~":WNP-w'Ç_`ze\-2xj0\{~
X0nXҪ)7_\wқb!e+~]w@{weD:zL}3rVF&1^AdEe!yx*ǧycߗ#$$'Dxu,;S A~e[-iO2ɏ&,,я?_iey^'//ԹBwɯg1vo!Mߋqś(sQ;^bx'j*Y!X]l0bO憲jK?X3]Z4
նf %,L#Yϗ{ʇ N'O7ău4e,x֟I+Ɓ1_sZx+^SyϼS2;|Cg竽C9D9(3
~Oh?'eBKLeUU2a'
a)3a?x>@ծSaX0Iu[-\iGjJ-I/7 Դeg~(V~ܒi,m-k=Eм&,xLUrQC`y0#֊0#ZHڒ!
-Ћc˙$l1,4zӄUbm_pN _SX~jsܾr 1WNq<
ۿuՌs78V|TxL:߯ˏw^=|DLemAc~`8pLje'8Jt]mMu]4Ro|PibemRsL)F7AJ_*Gň;a3 YCpSu)Y4./V\_ᒋK0;z1]qU|z.-R1^o3l->3\&APCS!rۺ=PTJNkwnY暍<ƞJ2WMM1äKh~O|lz5m5<{s`i5KYi{7$u"4*M]{tCѺ\&fûɫ3=5$+Vacvt{e`K:V`x8 #\-=^|>
gO3*o"hǧKo01LX֭e;c^bxgCޝWǪמ>9o%N7+4t${_i)u=67P1XNgL /k`['\\J:{gךG"`%@ņ>VCȻ7L-yEGz4QftYV@w7$
`i[icMja /˞ıۛ`@N%ׯfj9N6ɗ\ޫ5#wի%+flsߺ""?{IhT]|9xsw0 kO+#(`[qWSu+вLV1yR1NLȲ=O.z>d-j*"|4D~A?.I~|V4x:گ'ǦEۓ}O8tϹ0xlZ89GMS'K{>k?HReaMm:;iy;Sev,D&i\[`!\"Ozu{ ]sfM cIx9S4%cyA?1Ifl ^_?m
Ԧ*)Ŭ3^
/&IW_ߞ 4b3~ˋڳ8!AX^L<2p
Sw}k?Dk#!yBi-v!䃗9LRB@,M]xWgwe"@YI"oU֞(G^CGa1L}Fd#QD|qrꀍLO1)[QQ*(m% p>X~H!f:0tf",p8lB:w] `.}r;=}lU{QL0Wz|^ Ls[@54S)mj2oDm%cv
GW4t|ޘuCpm7JڏcS> Co
aU 4ux7BSMӔk8f8D*J4]@nQ!#\]+V,APA0w@K$$oy߅5a7MGbѳAb+cC`ȞXQɑ8mhm.2W͂L+d,
U=nҼ?)}-NYyFCү=>eLt_jzfJTuVD|/e]ލPx,)q~KY.'K3kt@u3*w8xu
zm`-2|)}vCjBSaκ2a^ؙKF}\_
y+ϚM.EfK"{A5DZz؟vb=Ey>kfHǛ4v]<5
W]'yJzNOೋg]K=4\᧫ZF$
?&;Wu߳[)|Ù5ge¸$K/('Y.po
=2elמ3.]U$(ZM/3{ώ.y^#TߖS"h}[wH
v
"0L?}xnϷ@GW%@u. `4[0#V?-߆^#"~KcEaR[9ek&5Թ.
p/L,ܾnGapL^TE]GF`ʙI$tU/u[a<2CIS Wٲ'詤6!l%-C3!0"| ֿ,EwglY<7:
VꖽI@NGr=NAQn3ݏ+U#y՝u6Wkf:dzY^͖`x<q58܉ؽ;vy+Yמh
[%1ja2_>JD.scD0p> 7;bqKҽ|Q-)4BPb
p('.w@ŷ/N)G9=x-~D~̈I;Ǽ#ݨo|Ww6݄Ќw-[z~^3\hq{ g_diS M?pXX,\><gJJa|\|śܘw0%Uu&[t=֚Ce1v'JE'7ƲhJXz88yfD>^bUQ+yH(i䝇`2*s߮6wIB~"8zXaDx&k͇n|]
"N9U7C_$$.H&]D^)\ٻTwO'Eg!ic!'2I
;dCwiQObUlHq?Ay۬OD$2ɹAefDL"=``X/}zcX&E6/(